From 6878ccb2e765bbef852f6893b0a010cae580f8fe Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Fri, 12 Jul 2024 09:06:36 -0700 Subject: [PATCH 001/261] Create transform-R-to-Rmd --- .github/workflows/transform-R-to-Rmd | 38 ++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 .github/workflows/transform-R-to-Rmd diff --git a/.github/workflows/transform-R-to-Rmd b/.github/workflows/transform-R-to-Rmd new file mode 100644 index 00000000..aa2dbcb1 --- /dev/null +++ b/.github/workflows/transform-R-to-Rmd @@ -0,0 +1,38 @@ +name: Convert R Notebooks to Rmd + +on: + push + +jobs: + convert-ipynb-to-rmd: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.8' # Specify your Python version here + + - name: Install nbconvert + run: | + python -m pip install --upgrade pip + pip install nbconvert + + - name: Convert .ipynb to .Rmd + run: | + for notebook in PM1/*.irnb; do # Update this path to your directory + jupyter nbconvert --to rmarkdown "$notebook" + done + + - name: Commit and push changes + run: | + git config --global user.name 'github-actions[bot]' + git config --global user.email 'github-actions[bot]@users.noreply.github.com' + git add PM1/*.Rmd # Update this path to your directory + git commit -m 'Convert .ipynb to .Rmd' + git push + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From eb7b54a68639d366442c90a22028099804af8805 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Fri, 12 Jul 2024 09:07:50 -0700 Subject: [PATCH 002/261] Rename transform-R-to-Rmd to transform-R-to-Rmd.yml --- .github/workflows/{transform-R-to-Rmd => transform-R-to-Rmd.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{transform-R-to-Rmd => transform-R-to-Rmd.yml} (100%) diff --git a/.github/workflows/transform-R-to-Rmd b/.github/workflows/transform-R-to-Rmd.yml similarity index 100% rename from .github/workflows/transform-R-to-Rmd rename to .github/workflows/transform-R-to-Rmd.yml From d6a043b9b2dd0daff595aee8561c774c5605339a Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Fri, 12 Jul 2024 23:37:56 -0700 Subject: [PATCH 003/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 46 ++++++++++++++++-------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index aa2dbcb1..f3bc1547 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -1,7 +1,9 @@ -name: Convert R Notebooks to Rmd +name: Convert R Notebooks to Rmd and R Scripts on: - push + push: + paths: + - 'path/to/directory/*.ipynb' # Update this path to your directory jobs: convert-ipynb-to-rmd: @@ -11,27 +13,41 @@ jobs: - name: Checkout repository uses: actions/checkout@v2 - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: '3.8' # Specify your Python version here + - name: Set up R + uses: r-lib/actions/setup-r@v2 + + - name: Install rmarkdown and knitr packages + run: | + R -e 'install.packages(c("rmarkdown", "knitr", "xfun"), repos="https://cloud.r-project.org")' - - name: Install nbconvert + - name: Convert .ipynb to .Rmd and .R run: | - python -m pip install --upgrade pip - pip install nbconvert + R -e ' + files <- list.files(path = "path/to/directory", pattern = "\\.ipynb$", full.names = TRUE, recursive = FALSE) + lapply(files, function(input) { + 
rmarkdown::convert_ipynb(input) + rmd_file <- xfun::with_ext(input, "Rmd") + knitr::purl(rmd_file, output = xfun::with_ext(input, "R")) + }) + ' - - name: Convert .ipynb to .Rmd + - name: Zip .R files run: | - for notebook in PM1/*.irnb; do # Update this path to your directory - jupyter nbconvert --to rmarkdown "$notebook" - done + mkdir r_scripts + mv path/to/directory/*.R r_scripts/ + zip -r r_scripts.zip r_scripts + + - name: Upload artifact + uses: actions/upload-artifact@v2 + with: + name: r-scripts + path: r_scripts.zip - - name: Commit and push changes + - name: Commit and push .Rmd files run: | git config --global user.name 'github-actions[bot]' git config --global user.email 'github-actions[bot]@users.noreply.github.com' - git add PM1/*.Rmd # Update this path to your directory + git add path/to/directory/*.Rmd # Update this path to your directory git commit -m 'Convert .ipynb to .Rmd' git push env: From 5b50672590c0622eabb372fd2b64c2e8da0ebac4 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Fri, 12 Jul 2024 23:39:12 -0700 Subject: [PATCH 004/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index f3bc1547..72d22c33 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -1,9 +1,7 @@ name: Convert R Notebooks to Rmd and R Scripts on: - push: - paths: - - 'path/to/directory/*.ipynb' # Update this path to your directory + push jobs: convert-ipynb-to-rmd: From 0a91b81a55640a3e4388e51617b6181e6a845dbc Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Fri, 12 Jul 2024 23:40:34 -0700 Subject: [PATCH 005/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 72d22c33..303cd3e9 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -4,7 +4,7 @@ on: push jobs: - convert-ipynb-to-rmd: + convert-irnb-to-rmd: runs-on: ubuntu-latest steps: @@ -18,10 +18,10 @@ jobs: run: | R -e 'install.packages(c("rmarkdown", "knitr", "xfun"), repos="https://cloud.r-project.org")' - - name: Convert .ipynb to .Rmd and .R + - name: Convert .irnb to .Rmd and .R run: | R -e ' - files <- list.files(path = "path/to/directory", pattern = "\\.ipynb$", full.names = TRUE, recursive = FALSE) + files <- list.files(path = "PM1", pattern = "\\.irnb$", full.names = TRUE, recursive = FALSE) lapply(files, function(input) { rmarkdown::convert_ipynb(input) rmd_file <- xfun::with_ext(input, "Rmd") @@ -32,7 +32,7 @@ jobs: - name: Zip .R files run: | mkdir r_scripts - mv path/to/directory/*.R r_scripts/ + mv PM1/*.R r_scripts/ zip -r r_scripts.zip r_scripts - name: Upload artifact @@ -45,8 +45,8 @@ jobs: run: | git config --global user.name 'github-actions[bot]' git config --global user.email 'github-actions[bot]@users.noreply.github.com' - git add path/to/directory/*.Rmd # Update this path to your directory - git commit -m 'Convert .ipynb to .Rmd' + git add PM1/*.Rmd # Update this path to your directory + git commit -m 'Convert .irnb to .Rmd' git push env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 4a4f9d8302bacbcdc9e5dbe2bba202f515ef366f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 13 Jul 2024 06:44:50 +0000 Subject: [PATCH 006/261] Convert 
.irnb to .Rmd --- PM1/r-linear-model-overfitting.Rmd | 65 +++ ...r-ols-and-lasso-for-wage-gap-inference.Rmd | 389 +++++++++++++++++ PM1/r-ols-and-lasso-for-wage-prediction.Rmd | 391 ++++++++++++++++++ 3 files changed, 845 insertions(+) create mode 100644 PM1/r-linear-model-overfitting.Rmd create mode 100644 PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd create mode 100644 PM1/r-ols-and-lasso-for-wage-prediction.Rmd diff --git a/PM1/r-linear-model-overfitting.Rmd b/PM1/r-linear-model-overfitting.Rmd new file mode 100644 index 00000000..5f1cb3b2 --- /dev/null +++ b/PM1/r-linear-model-overfitting.Rmd @@ -0,0 +1,65 @@ +--- +title: An R Markdown document converted from "PM1/r-linear-model-overfitting.irnb" +output: html_document +--- + +# Simple Exercise on Overfitting + +First set p=n + +```{r} + +set.seed(123) +n = 1000 + +p = n +X<- matrix(rnorm(n*p), n, p) +Y<- rnorm(n) + +print("p/n is") +print(p/n) +print("R2 is") +print(summary(lm(Y~X))$r.squared) +print("Adjusted R2 is") +print(summary(lm(Y~X))$adj.r.squared) +``` + +Second, set p=n/2. + +```{r} + +set.seed(123) +n = 1000 + +p = n/2 +X<- matrix(rnorm(n*p), n, p) +Y<- rnorm(n) + +print("p/n is") +print(p/n) +print("R2 is") +print(summary(lm(Y~X))$r.squared) +print("Adjusted R2 is") +print(summary(lm(Y~X))$adj.r.squared) +``` + +Third, set p/n =.05 + +```{r} + +set.seed(123) +n = 1000 + +p = .05*n +X<- matrix(rnorm(n*p), n, p) +Y<- rnorm(n) + +print("p/n is") +print(p/n) +print("R2 is") +print(summary(lm(Y~X))$r.squared) +print("Adjusted R2 is") +print(summary(lm(Y~X))$adj.r.squared) + +``` + diff --git a/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd b/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd new file mode 100644 index 00000000..574ce287 --- /dev/null +++ b/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd @@ -0,0 +1,389 @@ +--- +title: An R Markdown document converted from "PM1/r-ols-and-lasso-for-wage-gap-inference.irnb" +output: html_document +--- + +# An inferential problem: The Gender Wage Gap + +In the previous lab, we analyzed data from the March Supplement of the U.S. Current Population Survey (2015) and answered the question of how to use job-relevant characteristics, such as education and experience, to best predict wages. Now, we focus on the following inference question: + +What is the difference in predicted wages between men and women with the same job-relevant characteristics? + +Thus, we analyze if there is a difference in the payment of men and women (*gender wage gap*). The gender wage gap may partly reflect *discrimination* against women in the labor market or may partly reflect a *selection effect*, namely that women are relatively more likely to take on occupations that pay somewhat less (for example, school teaching). + +To investigate the gender wage gap, we consider the following log-linear regression model + +\begin{align} +\log(Y) &= \beta'X + \epsilon\\ +&= \beta_1 D + \beta_2' W + \epsilon, +\end{align} + +where $Y$ is hourly wage, $D$ is the indicator of being female ($1$ if female and $0$ otherwise) and the +$W$'s are a vector of worker characteristics explaining variation in wages. Considering transformed wages by the logarithm, we are analyzing the relative difference in the payment of men and women. 
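Because the outcome is in logs, the coefficient on $D$ has a proportional reading (a standard property of log-linear models): holding $W$ fixed,

$$\beta_1 = E[\log(Y)\mid D=1, W] - E[\log(Y)\mid D=0, W],$$

so the implied relative wage difference is $e^{\beta_1}-1$, which is approximately $\beta_1$ when $\beta_1$ is small.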
+ +```{r} +install.packages("xtable") +install.packages("hdm") # a library for high-dimensional metrics +install.packages("sandwich") # a package used to compute robust standard errors + + +library(hdm) +library(xtable) +library(sandwich) +``` + +## Data analysis + +We consider the same subsample of the U.S. Current Population Survey (2015) as in the previous lab. Let us load the data set. + +```{r} +# load("../input/wage2015-inference/wage2015_subsample_inference.Rdata") +# attach(data) +# dim(data) + +file = "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv" +data <- read.csv(file) +dim(data) +``` + +To start our (causal) analysis, we compare the sample means given gender: + +```{r} +Z <- data[which(colnames(data) %in% c("lwage","sex","shs","hsg","scl","clg","ad","ne","mw","so","we","exp1"))] + +data_female <- data[data$sex==1,] +Z_female <- data_female[which(colnames(data) %in% c("lwage","sex","shs","hsg","scl","clg","ad","ne","mw","so","we","exp1"))] + +data_male <- data[data$sex==0,] +Z_male <- data_male[which(colnames(data) %in% c("lwage","sex","shs","hsg","scl","clg","ad","ne","mw","so","we","exp1"))] + +table <- matrix(0, 12, 3) +table[1:12,1] <- as.numeric(lapply(Z,mean)) +table[1:12,2] <- as.numeric(lapply(Z_male,mean)) +table[1:12,3] <- as.numeric(lapply(Z_female,mean)) +rownames(table) <- c("Log Wage","Sex","Less then High School","High School Graduate","Some College","College Graduate","Advanced Degree", "Northeast","Midwest","South","West","Experience") +colnames(table) <- c("All","Men","Women") +tab<- xtable(table, digits = 4) +tab +``` + +```{r} +print(tab,type="html") # set type="latex" for printing table in LaTeX +``` + + + + + + + + + + + + + + + + + +
All Men Women
Log Wage 2.9708 2.9878 2.9495
Sex 0.4445 0.0000 1.0000
Less then High School 0.0233 0.0318 0.0127
High School Graduate 0.2439 0.2943 0.1809
Some College 0.2781 0.2733 0.2840
College Graduate 0.3177 0.2940 0.3473
Advanced Degree 0.1371 0.1066 0.1752
Northeast 0.2596 0.2590 0.2604
Midwest 0.2965 0.2981 0.2945
South 0.2161 0.2209 0.2101
West 0.2278 0.2220 0.2350
Experience 13.7606 13.7840 13.7313
+ +In particular, the table above shows that the difference in average *logwage* between men and women is equal to $0.038$ + +```{r} +mean(data_female$lwage)-mean(data_male$lwage) +``` + +Thus, the unconditional gender wage gap is about $3,8$\% for the group of never married workers (women get paid less on average in our sample). We also observe that never married working women are relatively more educated than working men and have lower working experience. + +This unconditional (predictive) effect of gender equals the coefficient $\beta$ in the univariate ols regression of $Y$ on $D$: + +\begin{align} +\log(Y) &=\beta D + \epsilon. +\end{align} + +We verify this by running an ols regression in R. + +```{r} +nocontrol.fit <- lm(lwage ~ sex, data=data) +nocontrol.est <- summary(nocontrol.fit)$coef["sex",1] +HCV.coefs <- vcovHC(nocontrol.fit, type = 'HC3'); # HC - "heteroskedasticity cosistent" -- HC3 is the SE that remains consistent in high dimensions +nocontrol.se <- sqrt(diag(HCV.coefs))[2] # Estimated std errors + +# print unconditional effect of gender and the corresponding standard error +cat ("The estimated coefficient on the dummy for gender is",nocontrol.est," and the corresponding robust standard error is",nocontrol.se) +``` + +Note that the standard error is computed with the *R* package *sandwich* to be robust to heteroskedasticity. + +Next, we run an ols regression of $Y$ on $(D,W)$ to control for the effect of covariates summarized in $W$: + +\begin{align} +\log(Y) &=\beta_1 D + \beta_2' W + \epsilon. +\end{align} + +Here, we are considering the flexible model from the previous lab. Hence, $W$ controls for experience, education, region, and occupation and industry indicators plus transformations and two-way interactions. + +Let us run the ols regression with controls. + +```{r} +# ols regression with controls + +flex <- lwage ~ sex + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we) + +# Note that ()*() operation in formula objects in R creates a formula of the sort: +# (exp1+exp2+exp3+exp4)+ (shs+hsg+scl+clg+occ2+ind2+mw+so+we) + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we) +# This is not intuitive at all, but that's what it does. + +control.fit <- lm(flex, data=data) +control.est <- summary(control.fit)$coef[2,1] + +summary(control.fit) + +cat("Coefficient for OLS with controls", control.est) + +HCV.coefs <- vcovHC(control.fit, type = 'HC3'); +control.se <- sqrt(diag(HCV.coefs))[2] # Estimated std errors +``` + +The estimated regression coefficient $\beta_1\approx-0.0696$ measures how our linear prediction of wage changes if we set the gender variable $D$ from 0 to 1, holding the controls $W$ fixed. +We can call this the *predictive effect* (PE), as it measures the impact of a variable on the prediction we make. Overall, we see that the unconditional wage gap of size $4$\% for women increases to about $7$\% after controlling for worker characteristics. + +We now show how the conditional gap and the remainder decompose the marginal wage gap into the parts explained and unexplained by the additional controls. (Note that this does *not* explain why there is a difference in the controls to begin with in the two groups.) 
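In sample terms, with $\hat\beta_1$ and $\hat\beta_2$ the estimated coefficients from the regression with controls, and $\bar W_f$, $\bar W_m$ the average controls for women and men, the decomposition computed in the next chunk is

$$\underbrace{\overline{\log Y}_f - \overline{\log Y}_m}_{\text{marginal gap}} \;=\; \underbrace{\hat\beta_1}_{\text{unexplained}} \;+\; \underbrace{\hat\beta_2{}'(\bar W_f - \bar W_m)}_{\text{explained}},$$

an identity that holds exactly for OLS because the residuals are orthogonal to the intercept and to $D$.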
+ +```{r} +XX0 = model.matrix(~(exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we), data = data[data$sex==0,]) +y0 = data[data$sex==0,]$lwage +XX1 = model.matrix(~(exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we), data = data[data$sex==1,]) +y1 = data[data$sex==1,]$lwage +mu1 = colMeans(XX1) +mu0 = colMeans(XX0) +betarest = summary(control.fit)$coef[3:(ncol(XX0)+1),1] # the coefficients excluding intercept and "sex" + +cat("The marginal gap:",mean(data_female$lwage)-mean(data_male$lwage),"\n") +diff.unexplained = control.est +cat("The unexplained difference: ",diff.unexplained,"\n") +diff.explained = sum(betarest*(mu1[2:ncol(XX0)]-mu0[2:ncol(XX0)])) +cat("The explained difference:",diff.explained,"\n") +cat("The sum of these differences:",diff.unexplained + diff.explained,"\n") +``` + +We next consider a Oaxaca-Blinder decomposition that also incorporates an interaction term. + +```{r} +svd0=svd(XX0) +svd1=svd(XX1) +svd0$d[svd0$d<=1e-10]=0 +svd0$d[svd0$d>1e-10]=1/svd0$d[svd0$d>1e-10] +beta0 = (svd0$v %*% (svd0$d*svd0$d*t(svd0$v))) %*% t(XX0) %*% y0 +svd1$d[svd1$d<=1e-10]=0 +svd1$d[svd1$d>1e-10]=1/svd1$d[svd1$d>1e-10] +beta1 = (svd1$v %*% (svd1$d*svd1$d*t(svd1$v))) %*% t(XX1) %*% y1 + +cat("The marginal gap:",mean(data_female$lwage)-mean(data_male$lwage),"\n") +cat("The unexplained difference:",beta1[1]-beta0[1],"\n") +cat("The difference explained by endowment:",sum(beta0[2:ncol(XX0)]*(mu1[2:ncol(XX0)]-mu0[2:ncol(XX0)])),"\n") +cat("The difference explained by coefficient:",sum((beta1[2:ncol(XX0)]-beta0[2:ncol(XX0)])*mu1[2:ncol(XX0)]),"\n") +cat("The sum of these differences:",beta1[1]-beta0[1] + sum(beta0[2:ncol(XX0)]*(mu1[2:ncol(XX0)]-mu0[2:ncol(XX0)])) + sum((beta1[2:ncol(XX0)]-beta0[2:ncol(XX0)])*mu1[2:ncol(XX0)]),"\n") +``` + +Next, we use the Frisch-Waugh-Lovell (FWL) theorem from lecture, partialling-out the linear effect of the controls via ols. + +```{r} +# Partialling-out using ols + +# models +flex.y <- lwage ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we) # model for Y +flex.d <- sex ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we) # model for D + +# partialling-out the linear effect of W from Y +t.Y <- lm(flex.y, data=data)$res +# partialling-out the linear effect of W from D +t.D <- lm(flex.d, data=data)$res + +# regression of Y on D after partialling-out the effect of W +partial.fit <- lm(t.Y~t.D) +partial.est <- summary(partial.fit)$coef[2,1] + +cat("Coefficient for D via partialling-out", partial.est) + +# standard error +HCV.coefs <- vcovHC(partial.fit, type = 'HC3') +partial.se <- sqrt(diag(HCV.coefs))[2] + +# confidence interval +confint(partial.fit)[2,] +``` + +Again, the estimated coefficient measures the linear predictive effect (PE) of $D$ on $Y$ after taking out the linear effect of $W$ on both of these variables. This coefficient is numerically equivalent to the estimated coefficient from the ols regression with controls, confirming the FWL theorem. + +We know that the partialling-out approach works well when the dimension of $W$ is low +in relation to the sample size $n$. When the dimension of $W$ is relatively high, we need to use variable selection +or penalization for regularization purposes. + +In the following, we illustrate the partialling-out approach using lasso instead of ols. 
+ +```{r} +# Partialling-out using lasso + +# models +flex.y <- lwage ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we) # model for Y +flex.d <- sex ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we) # model for D + +# partialling-out the linear effect of W from Y +t.Y <- rlasso(flex.y, data=data)$res +# partialling-out the linear effect of W from D +t.D <- rlasso(flex.d, data=data)$res + +# regression of Y on D after partialling-out the effect of W +partial.lasso.fit <- lm(t.Y~t.D) +partial.lasso.est <- summary(partial.lasso.fit)$coef[2,1] + +cat("Coefficient for D via partialling-out using lasso", partial.lasso.est) + +# standard error +HCV.coefs <- vcovHC(partial.lasso.fit, type = 'HC3') +partial.lasso.se <- sqrt(diag(HCV.coefs))[2] +``` + +Using lasso for partialling-out here provides similar results as using ols. + +Next, we summarize the results. + +```{r} +table<- matrix(0, 4, 2) +table[1,1]<- nocontrol.est +table[1,2]<- nocontrol.se +table[2,1]<- control.est +table[2,2]<- control.se +table[3,1]<- partial.est +table[3,2]<- partial.se +table[4,1]<- partial.lasso.est +table[4,2]<- partial.lasso.se +colnames(table)<- c("Estimate","Std. Error") +rownames(table)<- c("Without controls", "full reg", "partial reg", "partial reg via lasso") +tab<- xtable(table, digits=c(3, 3, 4)) +tab +``` + +```{r} +print(tab, type="html") +``` + + + + + + + + + +
Estimate Std. Error
Without controls -0.038 0.0159
full reg -0.070 0.0150
partial reg -0.070 0.0150
partial reg via lasso -0.072 0.0154
+It is worth noticing that controlling for worker characteristics increases the gender wage gap from less than 4\% to 7\%. The controls we used in our analysis include 5 educational attainment indicators (less than high school graduates, high school graduates, some college, college graduate, and advanced degree), 4 region indicators (midwest, south, west, and northeast); a quartic term (first, second, third, and fourth power) in experience; and 22 occupation and 23 industry indicators.
+
+Keep in mind that the predictive effect (PE) does not only measure discrimination (causal effect of being female), it also may reflect
+selection effects of unobserved differences in covariates between men and women in our sample.
+
+## OLS Overfitting
+
+Next we motivate the usage of lasso. We try an "extra" flexible model, where we take interactions of all controls, giving us about 1000 controls. To highlight the potential impact of overfitting on inference, we subset to the first 1000 observations so that $p \approx n$.
+
+```{r}
+set.seed(2724)
+subset_size <- 1000
+random <- sample(1:nrow(data), subset_size)
+subset <- data[random,]
+```
+
+For a linear model, the covariance matrix of the estimated $\hat{\beta}$ coefficients is given by $$\Sigma_{\hat{\beta}} = (X'X)^{-1} X' \Omega X (X'X)^{-1}$$ Under homoskedasticity, $\Omega = \sigma^2 I$, so $\Sigma_{\hat{\beta}}$ reduces to $\sigma^2(X'X)^{-1}$ with $\sigma^2$ estimated with the mean squared residuals. Under heteroskedasticity, $\Omega \neq \sigma^2 I$, so we must use an approach that yields valid standard errors. Under heteroskedasticity, there exists a variety of consistent "sandwich" estimators proposed for $\Sigma_{\hat{\beta}}$. With $e_i$ denoting the residual of observation $i$:
+
+$ HC0 = (X'X)^{-1} X' \text{diag} [e_i^2] X(X'X)^{-1}$
+
+$ HC1 = \frac{n}{n-p-1} (X'X)^{-1} X' \text{diag} [e_i^2] X(X'X)^{-1}$
+
+$ HC2 = (X'X)^{-1} X' \text{diag} \left[\frac{e_i^2}{1-h_{ii}} \right] X(X'X)^{-1}$
+
+$ HC3 = (X'X)^{-1} X' \text{diag} \left[\frac{e_i^2}{(1-h_{ii})^2} \right] X(X'X)^{-1}$
+
+
+For small sample sizes, the errors from HC0 are biased (usually downward). HC1 is a simple degree-of-freedom adjustment. HC2 is inspired by the insight that HC0's bias in finite samples results from points of high leverage in the design matrix $X$ (intuitively, outliers with respect to the independent variables). Thus, HC2 weights the $i$th squared residual by the reciprocal of $(1-h_{ii})$, with leverage values $h_{ii}$ as the $i$th diagonal element of the "hat" matrix $H = X(X'X)^{-1}X'$ to adjust for the finite-sample bias present in HC0.
+
+HC3 is similar to HC2, weighting by the squared $(1-h_{ii})^2$ in the denominator instead. HC3 is also equivalent to jackknife standard errors. HC3 has been shown to perform well regardless of the absence/presence of homoskedasticity and remains valid, in the sense of being biased upward under regularity conditions, in high dimensional settings.
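As a quick sanity check of the HC3 formula above, one can verify on a small illustrative regression (log wage on sex and experience, chosen only for this check) that the manual sandwich computation agrees with `sandwich::vcovHC`:

```{r}
# Minimal check (illustration only): manual HC3 vs. sandwich::vcovHC
fit_small <- lm(lwage ~ sex + exp1, data = data)
X_small <- model.matrix(fit_small)
e_small <- residuals(fit_small)
h_small <- hatvalues(fit_small)
bread_m <- solve(crossprod(X_small))                       # (X'X)^{-1}
meat_m <- crossprod(X_small * (e_small / (1 - h_small)))   # X' diag(e_i^2/(1-h_ii)^2) X
V_manual <- bread_m %*% meat_m %*% bread_m
max(abs(V_manual - vcovHC(fit_small, type = "HC3")))       # should be ~0
```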
+ +```{r} +# extra flexible model +extraflex <- lwage ~ sex + (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)^2 + +control.fit <- lm(extraflex, data=subset) +#summary(control.fit) +control.est <- summary(control.fit)$coef[2,1] +cat("Number of Extra-Flex Controls", length(control.fit$coef)-1, "\n") +cat("Coefficient for OLS with extra flex controls", control.est) + + +n=subset_size; p=length(control.fit$coef); + +# HC0 SE +HCV.coefs_HC0 <- vcovHC(control.fit, type = 'HC0') +control.se.HC0 <- sqrt(diag(HCV.coefs_HC0))[2] + +# For a more correct approach, we +# would implement the approach of Cattaneo, Jannson, and Newey (2018)'s procedure. + +# Jackknife. Need to trim some leverages or otherwise regularize. Theory shouldn't +# really work here. +# HC3 SE +# estimates +coefs = hatvalues(control.fit) +trim = 0.99999999999 +coefs_trimmed = coefs*(coefs < trim) + trim*(coefs >= trim) +omega = (control.fit$residuals^2)/((1-coefs_trimmed)^2) +HCV.coefs <- vcovHC(control.fit, omega = as.vector(omega), type = 'HC3') +control.se.HC3 <- sqrt(diag(HCV.coefs))[2] +``` + +```{r} +# models +extraflex.y <- lwage ~ (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+C(occ2)+C(ind2))^2 # model for Y +extraflex.d <- sex ~ (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+C(occ2)+C(ind2))^2 # model for D + +# partialling-out the linear effect of W from Y +t.Y <- rlasso(extraflex.y, data=subset)$res +# partialling-out the linear effect of W from D +t.D <- rlasso(extraflex.d, data=subset)$res + +# regression of Y on D after partialling-out the effect of W +partial.lasso.fit <- lm(t.Y~t.D) +partial.lasso.est <- summary(partial.lasso.fit)$coef[2,1] + +cat("Coefficient for D via partialling-out using lasso", partial.lasso.est) + +# standard error +HCV.coefs <- vcovHC(partial.lasso.fit, type = 'HC3') +partial.lasso.se <- sqrt(diag(HCV.coefs))[2] +``` + +```{r} +table<- matrix(0, 3, 2) +table[1,1]<- control.est +table[1,2]<- control.se.HC0 +table[2,1]<- control.est +table[2,2]<- control.se.HC3 +table[3,1]<- partial.lasso.est +table[3,2]<- partial.lasso.se +colnames(table)<- c("Estimate","Std. Error") +rownames(table)<- c("full reg, HC0", "full reg, HC3", "partial reg via lasso") +tab<- xtable(table, digits=c(3, 3, 4)) +tab + +print(tab, type="latex") +``` + +In this case $p/n \approx 1$, that is $p/n$ is no longer small and we start seeing the differences between +unregularized partialling out and regularized partialling out with lasso (double lasso). The results based on +double lasso have rigorous guarantees in this non-small p/n regime under approximate sparsity. The results based on OLS still +have guarantees in p/n< 1 regime under assumptions laid out in Cattaneo, Newey, and Jansson (2018), without approximate +sparsity, although other regularity conditions are needed. + diff --git a/PM1/r-ols-and-lasso-for-wage-prediction.Rmd b/PM1/r-ols-and-lasso-for-wage-prediction.Rmd new file mode 100644 index 00000000..9d197589 --- /dev/null +++ b/PM1/r-ols-and-lasso-for-wage-prediction.Rmd @@ -0,0 +1,391 @@ +--- +title: An R Markdown document converted from "PM1/r-ols-and-lasso-for-wage-prediction.irnb" +output: html_document +--- + +## Introduction + +An important question in labor economics is what determines the wage of workers. This is a causal question, but we can begin to investigate it from a predictive perspective. + +In the following wage example, $Y$ is the (log) hourly wage of a worker and $X$ is a vector of worker's characteristics, e.g., education, experience, gender. 
Two main questions here are: + +* How can we use job-relevant characteristics, such as education and experience, to best predict wages? + +* What is the difference in predicted wages between men and women with the same job-relevant characteristics? + +In this lab, we focus on the prediction question first. + +## Data + +The data set we consider is from the 2015 March Supplement of the U.S. Current Population Survey. We select white non-hispanic individuals, aged 25 to 64 years, and working more than 35 hours per week for at least 50 weeks of the year. We exclude self-employed workers; individuals living in group quarters; individuals in the military, agricultural or private household sectors; individuals with inconsistent reports on earnings and employment status; individuals with allocated or missing information in any of the variables used in the analysis; and individuals with hourly wage below $3$. + +The variable of interest $Y$ is the (log) hourly wage rate constructed as the ratio of the annual earnings to the total number of hours worked, which is constructed in turn as the product of number of weeks worked and the usual number of hours worked per week. In our analysis, we also focus on single (never married) workers. The final sample is of size $n=5150$. + +```{r} +install.packages("xtable") +install.packages("hdm") # a library for high-dimensional metrics +install.packages("glmnet") # for lasso CV + +library(hdm) +library(xtable) +library(glmnet) +``` + +## Data analysis + +We start by loading the data set. + +```{r} +file = "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv" +data <- read.csv(file) +dim(data) +``` + +Let's have a look at the structure of the data. + +```{r} +str(data) +``` + +We construct the output variable $Y$ and the matrix $Z$ which includes the characteristics of workers that are given in the data. + +```{r} +# construct matrices for estimation from the data +Y <- log(data$wage) +n <- length(Y) +Z <- data[-which(colnames(data) %in% c("wage","lwage"))] +p <- dim(Z)[2] + +cat("Number of observations:", n, '\n') +cat( "Number of raw regressors:", p) +``` + +For the outcome variable *wage* and a subset of the raw regressors, we calculate the empirical mean to get familiar with the data. + +```{r} +# generate a table of means of variables +Z_subset <- data[which(colnames(data) %in% c("lwage","sex","shs","hsg","scl","clg","ad","mw","so","we","ne","exp1"))] +table <- matrix(0, 12, 1) +table[1:12,1] <- as.numeric(lapply(Z_subset,mean)) +rownames(table) <- c("Log Wage","Sex","Some High School","High School Graduate","Some College","College Graduate", "Advanced Degree","Midwest","South","West","Northeast","Experience") +colnames(table) <- c("Sample mean") +tab<- xtable(table, digits = 2) +tab +``` + +E.g., the share of female workers in our sample is ~44% ($sex=1$ if female). + +Alternatively, using the xtable package, we can also print the table in LaTeX. + +```{r} +print(tab, type="latex") # type="latex" for printing table in LaTeX +``` + +## Prediction Question + +Now, we will construct a prediction rule for (log) hourly wage $Y$, which depends linearly on job-relevant characteristics $X$: + +\begin{equation}\label{decompose} +Y = \beta'X+ \epsilon. +\end{equation} + +Our goals are + +* Predict wages using various characteristics of workers. + +* Assess the predictive performance of a given model using the (adjusted) sample MSE, the (adjusted) sample $R^2$ and the out-of-sample MSE and $R^2$. 
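Throughout, the adjusted measures penalize the in-sample fit for the number of estimated parameters $p$ (the exact convention for counting the intercept differs slightly across the formulas used below):

$$MSE_{adjusted} = \frac{n}{n-p}\, MSE_{sample}, \qquad R^2_{adjusted} = 1 - (1 - R^2_{sample})\,\frac{n-1}{n-p-1}.$$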
+ + +Toward answering the latter, we measure the prediction quality of the two models via data splitting: + +- Randomly split the data into one training sample and one testing sample. Here we just use a simple method (stratified splitting is a more sophisticated version of splitting that we might consider). +- Use the training sample to estimate the parameters of the Basic Model and the Flexible Model. +- Before using the testing sample, we evaluate in-sample fit. + +```{r} +# splitting the data +set.seed(1) # to make the results replicable (we will generate random numbers) +random <- sample(1:n, floor(n*4/5)) # draw (4/5)*n random numbers from 1 to n without replacing +train <- data[random,] +test <- data[-random,] +``` + + +We employ two different specifications for prediction: + + +1. Basic Model: $X$ consists of a set of raw regressors (e.g. gender, experience, education indicators, occupation and industry indicators and regional indicators). + + +2. Flexible Model: $X$ consists of all raw regressors from the basic model plus a dictionary of transformations (e.g., ${exp}^2$ and ${exp}^3$) and additional two-way interactions of a polynomial in experience with other regressors. An example of a regressor created through a two-way interaction is *experience* times the indicator of having a *college degree*. + +Using the **Flexible Model** enables us to approximate the real relationship by a more complex regression model and therefore to reduce the bias. The **Flexible Model** increases the range of potential shapes of the estimated regression function. In general, flexible models often deliver higher prediction accuracy but are harder to interpret. + +## Data-Splitting: In-sample performance + +Let us fit both models to our data by running ordinary least squares (ols): + +```{r} +# 1. basic model +basic <- lwage~ (sex + exp1 + shs + hsg+ scl + clg + mw + so + we +C(occ2) + C(ind2)) +regbasic <- lm(basic, data=train) # perform ols using the defined model +cat( "Number of regressors in the basic model:",length(regbasic$coef), '\n') # number of regressors in the Basic Model +``` + +##### Note that the basic model consists of $51$ regressors. + +```{r} +# 2. flexible model +flex <- lwage ~ sex + shs+hsg+scl+clg+mw+so+we+C(occ2)+C(ind2) + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we) +regflex <- lm(flex, data=train) +cat( "Number of regressors in the flexible model:",length(regflex$coef)) # number of regressors in the Flexible Model +``` + +##### Note that the flexible model consists of $246$ regressors. + +#### Re-estimating the flexible model using Lasso +We re-estimate the flexible model using Lasso (the least absolute shrinkage and selection operator) rather than ols. Lasso is a penalized regression method that can be used to reduce the complexity of a regression model when the ratio $p/n$ is not small. We will introduce this approach formally later in the course, but for now, we try it out here as a black-box method. 
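For reference, the lasso estimator used below solves a penalized least squares problem of the form (up to the exact scaling convention used by `glmnet`)

$$\hat\beta^{lasso} = \arg\min_{\beta}\; \frac{1}{n}\sum_{i=1}^n \big(Y_i - \beta'X_i\big)^2 + \lambda \sum_{j=1}^{p} |\beta_j|,$$

where the penalty level $\lambda$ is chosen by cross-validation (`lambda.min` in the code).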
+ +```{r} +# Flexible model using Lasso, in-sample fit +train_flex <- model.matrix(flex,train) # all regressors +fit.lasso.cv <- cv.glmnet(as.matrix(train_flex), train$lwage, family="gaussian", alpha=1, nfolds=5) +yhat.lasso.cv <- predict(fit.lasso.cv, newx = as.matrix(train_flex), s = "lambda.min") # in-sample fit right now, not out-of-sample using "test" +``` + +#### Evaluating the predictive performance of the basic and flexible models in-sample +Now, we can evaluate the performance of both models based on the (adjusted) $R^2_{sample}$ and the (adjusted) $MSE_{sample}$: + +```{r} +# Assess predictive performance +sumbasic <- summary(regbasic) +sumflex <- summary(regflex) +# no summary() for lassocv + +ntrain = nrow(train) + +# R-squared and adjusted R-squared +R2.1 <- sumbasic$r.squared +cat("R-squared for the basic model: ", R2.1, "\n") +R2.adj1 <- sumbasic$adj.r.squared +cat("adjusted R-squared for the basic model: ", R2.adj1, "\n") + +R2.2 <- sumflex$r.squared +cat("R-squared for the flexible model: ", R2.2, "\n") +R2.adj2 <- sumflex$adj.r.squared +cat("adjusted R-squared for the flexible model: ", R2.adj2, "\n") + +pL <- fit.lasso.cv$nzero[fit.lasso.cv$index[1]] +R2.L <- 1-(sum((yhat.lasso.cv - train$lwage) ^ 2))/(sum((train$lwage - mean(train$lwage)) ^ 2)) # fit.lasso.cv$glmnet.fit$dev.ratio # +cat("R-squared for the lasso with flexible model: ", R2.L, "\n") +R2.adjL <- 1-(sum((yhat.lasso.cv - train$lwage) ^ 2)/(ntrain-pL-1))/(sum((train$lwage - mean(train$lwage)) ^ 2)/(ntrain-1)) +cat("adjusted R-squared for the flexible model: ", R2.adjL, "\n") + +# MSE and adjusted MSE +MSE1 <- mean(sumbasic$res^2) +cat("MSE for the basic model: ", MSE1, "\n") +p1 <- sumbasic$df[1] # number of regressors +MSE.adj1 <- (ntrain/(ntrain-p1))*MSE1 +cat("adjusted MSE for the basic model: ", MSE.adj1, "\n") + +MSE2 <-mean(sumflex$res^2) +cat("MSE for the flexible model: ", MSE2, "\n") +p2 <- sumflex$df[1] +MSE.adj2 <- (ntrain/(ntrain-p2))*MSE2 +cat("adjusted MSE for the lasso flexible model: ", MSE.adj2, "\n") + +lasso.res <- train$lwage - yhat.lasso.cv +MSEL <-mean(lasso.res^2) +cat("MSE for the lasso flexible model: ", MSEL, "\n") +MSE.adjL <- (ntrain/(ntrain-pL-1))*MSEL +cat("adjusted MSE for the lasso flexible model: ", MSE.adjL, "\n") +``` + +```{r} +# Output the table +table <- matrix(0, 3, 5) +table[1,1:5] <- c(p1,R2.1,MSE1,R2.adj1,MSE.adj1) +table[2,1:5] <- c(p2,R2.2,MSE2,R2.adj2,MSE.adj2) +table[3,1:5] <- c(pL,R2.L,MSEL,R2.adjL,MSE.adjL) +colnames(table)<- c("p","$R^2_{sample}$","$MSE_{sample}$","$R^2_{adjusted}$", "$MSE_{adjusted}$") +rownames(table)<- c("basic reg","flexible reg", "lasso flex") +tab<- xtable(table, digits =c(0,0,2,2,2,2)) +print(tab,type="latex") +tab +``` + +Considering the measures above, the flexible model performs slightly better than the basic model. + +As $p/n$ is not large, the discrepancy between the adjusted and unadjusted measures is not large. However, if it were, we might still like to apply **data splitting** as a more general procedure to deal with potential overfitting if $p/n$. We illustrate the approach in the following. + +## Data Splitting: Out-of-sample performance + +Now that we have seen in-sample fit, we evaluate our models on the out-of-sample performance: +- Use the testing sample for evaluation. Predict the $\mathtt{wage}$ of every observation in the testing sample based on the estimated parameters in the training sample. +- Calculate the Mean Squared Prediction Error $MSE_{test}$ based on the testing sample for both prediction models. 
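Concretely, for a model with coefficients $\hat\beta$ estimated on the training sample, the code below computes

$$MSE_{test} = \frac{1}{n_{test}} \sum_{i \in \text{test}} \big(Y_i - \hat\beta'X_i\big)^2, \qquad R^2_{test} = 1 - \frac{MSE_{test}}{\frac{1}{n_{test}}\sum_{i \in \text{test}} \big(Y_i - \bar Y_{train}\big)^2},$$

where the benchmark in $R^2_{test}$ uses the training-sample mean $\bar Y_{train}$, since only training information is available when forming predictions.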
+ +```{r} +# basic model +options(warn=-1) # ignore warnings +regbasic <- lm(basic, data=train) + +# calculating the out-of-sample MSE +yhat.bas <- predict(regbasic, newdata=test) +y.test <- test$lwage +mean.train = mean(train$lwage) +MSE.test1 <- sum((y.test-yhat.bas)^2)/length(y.test) +R2.test1<- 1- MSE.test1/mean((y.test-mean.train)^2) + +cat("Test MSE for the basic model: ", MSE.test1, " ") +cat("Test R2 for the basic model: ", R2.test1) +``` + +In the basic model, the $MSE_{test}$ is quite close to the $MSE_{sample}$. + +```{r} +# flexible model +options(warn=-1) # ignore warnings +regflex <- lm(flex, data=train) + +# calculating the out-of-sample MSE +yhat.flex<- predict(regflex, newdata=test) +y.test <- test$lwage +mean.train = mean(train$lwage) +MSE.test2 <- sum((y.test-yhat.flex)^2)/length(y.test) +R2.test2<- 1- MSE.test2/mean((y.test-mean.train)^2) + +cat("Test MSE for the flexible model: ", MSE.test2, " ") + +cat("Test R2 for the flexible model: ", R2.test2) +``` + +In the flexible model too, the discrepancy between the $MSE_{test}$ and the $MSE_{sample}$ is not large. + +It is worth noticing that the $MSE_{test}$ varies across different data splits. Hence, it is a good idea to average the out-of-sample MSE over different data splits to get valid results. + +Nevertheless, we observe that, based on the out-of-sample $MSE$, the basic model using ols regression performs **about as well (or slightly better)** than the flexible model. + +Next, let us use lasso regression in the flexible model instead of ols regression. The out-of-sample $MSE$ on the test sample can be computed for any black-box prediction method, so we also compare the performance of lasso regression in the flexible model to ols regression. + +```{r} +# Flexible model using Lasso +# model matrix should be formed before train/test as some levels dropped +flex_data = model.matrix(flex,data) +train_flex <- flex_data[random,] +test_flex <- flex_data[-random,] + +fit.lasso.cv <- cv.glmnet(train_flex, train$lwage, family="gaussian", alpha=1, nfolds=5) +yhat.lasso.cv <- predict(fit.lasso.cv, newx = test_flex, s = "lambda.min") + +# calculating the out-of-sample MSE +MSE.lasso <- sum((y.test-yhat.lasso.cv)^2)/length(y.test) +R2.lasso<- 1- MSE.lasso/mean((y.test-mean(train$lwage))^2) + +cat("Test MSE for the lasso on flexible model: ", MSE.lasso, " ") + +cat("Test R2 for the lasso flexible model: ", R2.lasso) +``` + +Finally, let us summarize the results: + +```{r} +# Output the comparison table +table2 <- matrix(0, 3,2) +table2[1,1] <- MSE.test1 +table2[2,1] <- MSE.test2 +table2[3,1] <- MSE.lasso +table2[1,2] <- R2.test1 +table2[2,2] <- R2.test2 +table2[3,2] <- R2.lasso + +rownames(table2)<- c("basic reg","flexible reg","lasso regression") +colnames(table2)<- c("$MSE_{test}$", "$R^2_{test}$") +tab2 <- xtable(table2, digits =3) +tab2 +``` + +```{r} +print(tab2,type="latex") +``` + +## Extra flexible model and Overfitting +Given the results above, it is not immediately clear why one would choose to use Lasso as results are fairly similar. To motivate, we consider an extra flexible model to show how OLS can overfit significantly to the in-sample train data and perform poorly on the out-of-sample testing data. 
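To see why a high in-sample fit can be misleading when $p$ is close to $n$, a minimal illustration with pure noise (regressors unrelated to the outcome, echoing the overfitting exercise earlier in this repository) already produces an in-sample $R^2$ near one:

```{r}
# Illustration only: with p = n, OLS fits pure noise (almost) perfectly in-sample
set.seed(1)
n_toy <- 100
X_toy <- matrix(rnorm(n_toy * n_toy), n_toy, n_toy)
y_toy <- rnorm(n_toy)
summary(lm(y_toy ~ X_toy))$r.squared # close to 1 despite no true relationship
```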
+ + +```{r} +# extra flexible model +extraflex <- lwage ~ sex + (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)^2 +regextra <- lm(extraflex, data=train) +sumextra <- summary(regextra) +cat("Number of Extra-Flex Controls", length(regextra$coef)-1, "\n") +n= length(data$wage); p =length(regextra$coef); +ntrain = length(train$wage) +``` + +```{r} +## In-sample +# R-squared and adjusted R-squared +R2.extra <- sumextra$r.squared +cat("R-squared for the extra flexible model (in-sample): ", R2.extra, "\n") +R2.adjextra <- sumextra$adj.r.squared +cat("adjusted R-squared for the extra flexible model (in-sample): ", R2.adjextra, "\n") + +# MSE and adjusted MSE +MSE.extra <- mean(sumextra$res^2) +cat("MSE for the extra flexible model (in-sample): ", MSE.extra, "\n") +MSE.adjextra <- (ntrain/(ntrain-p))*MSE.extra +cat("adjusted MSE for the basic model (in-sample): ", MSE.adj1, "\n") +``` + +```{r} +## Out-of-sample +yhat.ex <- predict(regextra, newdata=test) +y.test.ex <- test$lwage +MSE.test.ex <- sum((y.test.ex-yhat.ex)^2)/length(y.test.ex) +R2.test.ex<- 1- MSE.test.ex/mean((y.test.ex-mean(train$lwage))^2) + +cat("Test MSE for the basic model: ", MSE.test.ex, " ") +cat("Test R2 for the basic model: ", R2.test.ex) +``` + +As we can see, a simple OLS overfits when the dimensionality of covariates is high, as the out-of-sample performance suffers dramatically in comparison to the in-sample performance. + +Contrast this with Lasso: + +```{r} +# model matrix should be formed before train/test as some levels dropped +flex_data = model.matrix(extraflex,data) +train_flex <- flex_data[random,] +test_flex <- flex_data[-random,] + +# fit model +fit.lcv <- cv.glmnet(train_flex, train$lwage, family="gaussian", alpha=1, nfolds=5) + +# in-sample +yhat.lcv <- predict(fit.lcv, newx = train_flex, s = "lambda.min") + +R2.L <- 1-(sum((yhat.lcv - train$lwage) ^ 2))/(sum((train$lwage - mean(train$lwage)) ^ 2)) # fit.lasso.cv$glmnet.fit$dev.ratio # +pL <- fit.lcv$nzero[fit.lcv$index[1]] +R2.adjL <- 1-(sum((yhat.lcv - train$lwage) ^ 2)/(ntrain-pL-1))/(sum((train$lwage - mean(train$lwage)) ^ 2)/(ntrain-1)) + +lasso.res <- train$lwage - yhat.lcv +MSEL <-mean(lasso.res^2) +MSE.adjL <- (ntrain/(ntrain-pL-1))*MSEL + +cat("R-squared for the lasso with the extra flexible model (in-sample): ", R2.L, "\n") +cat("adjusted R-squared for the extra flexible model (in-sample): ", R2.adjL, "\n") +cat("MSE for the lasso with the extra flexible model (in-sample): ", MSEL, "\n") +cat("adjusted MSE for the lasso with the extraflexible model (in-sample): ", MSE.adjL, "\n") + +# out-of-sample +yhat.lcv.test <- predict(fit.lcv, newx = test_flex, s = "lambda.min") +MSE.lasso <- sum((test$lwage-yhat.lcv.test)^2)/length(test$lwage) +R2.lasso <- 1- MSE.lasso/mean((test$lwage-mean(train$lwage))^2) + +cat("\n") +cat("Test R2 for the lasso the extra flexible model: ", R2.lasso,"\n") +cat("Test MSE for the lasso on the extra flexible model: ", MSE.lasso) +``` + +As shown above, the overfitting effect is mitigated with the penalized regression model. 
+ From 4f7891ee7cf002679cb6fdc6ec6e598f3c816c48 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Fri, 12 Jul 2024 23:52:49 -0700 Subject: [PATCH 007/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 303cd3e9..c36dbb18 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -11,6 +11,22 @@ jobs: - name: Checkout repository uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.8' # Specify your Python version here + + - name: Install nbstripout + run: | + python -m pip install --upgrade pip + pip install nbstripout + + - name: Strip outputs from notebooks + run: | + for notebook in PM1/*.irnb; do + nbstripout "$notebook" + done + - name: Set up R uses: r-lib/actions/setup-r@v2 @@ -41,12 +57,12 @@ jobs: name: r-scripts path: r_scripts.zip - - name: Commit and push .Rmd files + - name: Commit and push stripped notebooks and .Rmd files run: | git config --global user.name 'github-actions[bot]' git config --global user.email 'github-actions[bot]@users.noreply.github.com' - git add PM1/*.Rmd # Update this path to your directory - git commit -m 'Convert .irnb to .Rmd' + git add PM1/*.irnb PM1/*.Rmd + git commit -m 'Strip outputs from .irnb and convert to .Rmd' git push env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 38e24c859f4e47cf91c84fce690d8e820a121ddb Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 00:14:28 -0700 Subject: [PATCH 008/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 38 ++++++++++++++++-------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index c36dbb18..3898c7c6 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -1,10 +1,10 @@ -name: Convert R Notebooks to Rmd and R Scripts +name: Convert and Lint R Notebooks on: push jobs: - convert-irnb-to-rmd: + convert-lint-notebooks: runs-on: ubuntu-latest steps: @@ -21,18 +21,21 @@ jobs: python -m pip install --upgrade pip pip install nbstripout - - name: Strip outputs from notebooks - run: | - for notebook in PM1/*.irnb; do - nbstripout "$notebook" - done - - name: Set up R uses: r-lib/actions/setup-r@v2 - - name: Install rmarkdown and knitr packages + - name: Install rmarkdown, knitr, and lintr packages run: | - R -e 'install.packages(c("rmarkdown", "knitr", "xfun"), repos="https://cloud.r-project.org")' + R -e 'install.packages(c("rmarkdown", "knitr", "lintr", "xfun"), repos="https://cloud.r-project.org")' + + - name: Strip outputs from .irnb files + run: | + for notebook in PM1/*.irnb; do + ipynb_notebook="${notebook%.irnb}.ipynb" + mv "$notebook" "$ipynb_notebook" + nbstripout "$ipynb_notebook" + mv "$ipynb_notebook" "$notebook" + done - name: Convert .irnb to .Rmd and .R run: | @@ -45,6 +48,17 @@ jobs: }) ' + - name: Lint .Rmd files + run: | + R -e ' + library(lintr) + linters <- with_defaults(line_length_linter = line_length_linter(120)) + rmd_files <- list.files(path = "PM1", pattern = "\\.Rmd$", full.names = TRUE) + lapply(rmd_files, function(file) { + lint(file, linters) + }) + ' + - name: Zip .R files run: | mkdir r_scripts @@ -57,12 +71,12 @@ jobs: name: r-scripts path: r_scripts.zip - - name: Commit and push 
stripped notebooks and .Rmd files + - name: Commit and push stripped .irnb and .Rmd files run: | git config --global user.name 'github-actions[bot]' git config --global user.email 'github-actions[bot]@users.noreply.github.com' git add PM1/*.irnb PM1/*.Rmd - git commit -m 'Strip outputs from .irnb and convert to .Rmd' + git commit -m 'Strip outputs from .irnb, convert to .Rmd, and lint .Rmd files' git push env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From bc339e9e265870cc43e7a3c10269e690605f291b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 13 Jul 2024 07:18:57 +0000 Subject: [PATCH 009/261] Strip outputs from .irnb, convert to .Rmd, and lint .Rmd files --- PM1/r-linear-model-overfitting.irnb | 136 +- ...-ols-and-lasso-for-wage-gap-inference.irnb | 823 +-------- PM1/r-ols-and-lasso-for-wage-prediction.irnb | 1622 ++++++++--------- 3 files changed, 934 insertions(+), 1647 deletions(-) diff --git a/PM1/r-linear-model-overfitting.irnb b/PM1/r-linear-model-overfitting.irnb index 9e117e23..dbf5adc2 100644 --- a/PM1/r-linear-model-overfitting.irnb +++ b/PM1/r-linear-model-overfitting.irnb @@ -1 +1,135 @@ -{"cells":[{"metadata":{"_uuid":"051d70d956493feee0c6d64651c6a088724dca2a","_execution_state":"idle","trusted":false,"id":"UE_OM9YaynNP"},"cell_type":"markdown","source":["# Simple Exercise on Overfitting\n"]},{"metadata":{"id":"AhJbONGfynNR"},"cell_type":"markdown","source":["First set p=n"]},{"metadata":{"trusted":true,"id":"cbgHLwp5ynNS"},"cell_type":"code","source":["\n","set.seed(123)\n","n = 1000\n","\n","p = n\n","X<- matrix(rnorm(n*p), n, p)\n","Y<- rnorm(n)\n","\n","print(\"p/n is\")\n","print(p/n)\n","print(\"R2 is\")\n","print(summary(lm(Y~X))$r.squared)\n","print(\"Adjusted R2 is\")\n","print(summary(lm(Y~X))$adj.r.squared)\n"],"execution_count":null,"outputs":[]},{"metadata":{"id":"3sPgPQ7eynNU"},"cell_type":"markdown","source":["Second, set p=n/2."]},{"metadata":{"trusted":true,"id":"gWbDboRYynNV"},"cell_type":"code","source":["\n","set.seed(123)\n","n = 1000\n","\n","p = n/2\n","X<- matrix(rnorm(n*p), n, p)\n","Y<- rnorm(n)\n","\n","print(\"p/n is\")\n","print(p/n)\n","print(\"R2 is\")\n","print(summary(lm(Y~X))$r.squared)\n","print(\"Adjusted R2 is\")\n","print(summary(lm(Y~X))$adj.r.squared)\n"],"execution_count":null,"outputs":[]},{"metadata":{"id":"m0BfMzhCynNV"},"cell_type":"markdown","source":["Third, set p/n =.05"]},{"metadata":{"trusted":true,"id":"uF5tT-MdynNV"},"cell_type":"code","source":["\n","set.seed(123)\n","n = 1000\n","\n","p = .05*n\n","X<- matrix(rnorm(n*p), n, p)\n","Y<- rnorm(n)\n","\n","print(\"p/n is\")\n","print(p/n)\n","print(\"R2 is\")\n","print(summary(lm(Y~X))$r.squared)\n","print(\"Adjusted R2 is\")\n","print(summary(lm(Y~X))$adj.r.squared)\n","\n"],"execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"name":"ir","display_name":"R","language":"R"},"language_info":{"name":"R","codemirror_mode":"r","pygments_lexer":"r","mimetype":"text/x-r-source","file_extension":".r","version":"3.6.3"},"colab":{"provenance":[]}},"nbformat":4,"nbformat_minor":0} \ No newline at end of file +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "_execution_state": "idle", + "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", + "id": "UE_OM9YaynNP" + }, + "source": [ + "# Simple Exercise on Overfitting\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AhJbONGfynNR" + }, + "source": [ + "First set p=n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cbgHLwp5ynNS" + }, + 
"outputs": [], + "source": [ + "\n", + "set.seed(123)\n", + "n = 1000\n", + "\n", + "p = n\n", + "X<- matrix(rnorm(n*p), n, p)\n", + "Y<- rnorm(n)\n", + "\n", + "print(\"p/n is\")\n", + "print(p/n)\n", + "print(\"R2 is\")\n", + "print(summary(lm(Y~X))$r.squared)\n", + "print(\"Adjusted R2 is\")\n", + "print(summary(lm(Y~X))$adj.r.squared)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3sPgPQ7eynNU" + }, + "source": [ + "Second, set p=n/2." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gWbDboRYynNV" + }, + "outputs": [], + "source": [ + "\n", + "set.seed(123)\n", + "n = 1000\n", + "\n", + "p = n/2\n", + "X<- matrix(rnorm(n*p), n, p)\n", + "Y<- rnorm(n)\n", + "\n", + "print(\"p/n is\")\n", + "print(p/n)\n", + "print(\"R2 is\")\n", + "print(summary(lm(Y~X))$r.squared)\n", + "print(\"Adjusted R2 is\")\n", + "print(summary(lm(Y~X))$adj.r.squared)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m0BfMzhCynNV" + }, + "source": [ + "Third, set p/n =.05" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uF5tT-MdynNV" + }, + "outputs": [], + "source": [ + "\n", + "set.seed(123)\n", + "n = 1000\n", + "\n", + "p = .05*n\n", + "X<- matrix(rnorm(n*p), n, p)\n", + "Y<- rnorm(n)\n", + "\n", + "print(\"p/n is\")\n", + "print(p/n)\n", + "print(\"R2 is\")\n", + "print(summary(lm(Y~X))$r.squared)\n", + "print(\"Adjusted R2 is\")\n", + "print(summary(lm(Y~X))$adj.r.squared)\n", + "\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb b/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb index 167433ee..7084e181 100644 --- a/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb +++ b/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -49,31 +49,7 @@ "id": "XuSVp1TShFKs", "outputId": "6e38357e-eef8-44bf-c11b-d4e4aa4139a8" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "The downloaded binary packages are in\n", - "\t/var/folders/35/c51w3jrd19xb7bn278b2_zy40000gn/T//Rtmpy1P2xH/downloaded_packages\n", - "\n", - "The downloaded binary packages are in\n", - "\t/var/folders/35/c51w3jrd19xb7bn278b2_zy40000gn/T//Rtmpy1P2xH/downloaded_packages\n", - "\n", - "The downloaded binary packages are in\n", - "\t/var/folders/35/c51w3jrd19xb7bn278b2_zy40000gn/T//Rtmpy1P2xH/downloaded_packages\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Warning message:\n", - "“package ‘hdm’ was built under R version 4.3.2”\n" - ] - } - ], + "outputs": [], "source": [ "install.packages(\"xtable\")\n", "install.packages(\"hdm\") # a library for high-dimensional metrics\n", @@ -105,51 +81,16 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, - "execution": { - "iopub.execute_input": "2021-07-12T16:59:32.679042Z", - "iopub.status.busy": "2021-07-12T16:59:32.67653Z", - "iopub.status.idle": "2021-07-12T16:59:32.765537Z" - }, "id": 
"T46lur9zyorw", "outputId": "bad9c980-6655-4027-f9dd-b07a2216ab0a" }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
  1. 5150
  2. 20
\n" - ], - "text/latex": [ - "\\begin{enumerate*}\n", - "\\item 5150\n", - "\\item 20\n", - "\\end{enumerate*}\n" - ], - "text/markdown": [ - "1. 5150\n", - "2. 20\n", - "\n", - "\n" - ], - "text/plain": [ - "[1] 5150 20" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# load(\"../input/wage2015-inference/wage2015_subsample_inference.Rdata\")\n", "# attach(data)\n", @@ -171,106 +112,16 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 474 }, - "execution": { - "iopub.execute_input": "2021-07-12T16:59:32.772745Z", - "iopub.status.busy": "2021-07-12T16:59:32.770047Z", - "iopub.status.idle": "2021-07-12T16:59:32.877413Z" - }, "id": "hsx7vuc2yor3", "outputId": "2f3378b9-4534-40c0-98e4-36a4b135f1e9" }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\n", - "
A xtable: 12 × 3
AllMenWomen
<dbl><dbl><dbl>
Log Wage 2.97078670 2.98782963 2.94948490
Sex 0.44446602 0.00000000 1.00000000
Less then High School 0.02330097 0.03180706 0.01266929
High School Graduate 0.24388350 0.29430269 0.18086501
Some College 0.27805825 0.27333100 0.28396680
College Graduate 0.31766990 0.29395316 0.34731324
Advanced Degree 0.13708738 0.10660608 0.17518567
Northeast 0.25961165 0.25900035 0.26037571
Midwest 0.29650485 0.29814750 0.29445173
South 0.21611650 0.22090178 0.21013543
West 0.22776699 0.22195037 0.23503713
Experience13.7605825213.7839916113.73132372
\n" - ], - "text/latex": [ - "A xtable: 12 × 3\n", - "\\begin{tabular}{r|lll}\n", - " & All & Men & Women\\\\\n", - " & & & \\\\\n", - "\\hline\n", - "\tLog Wage & 2.97078670 & 2.98782963 & 2.94948490\\\\\n", - "\tSex & 0.44446602 & 0.00000000 & 1.00000000\\\\\n", - "\tLess then High School & 0.02330097 & 0.03180706 & 0.01266929\\\\\n", - "\tHigh School Graduate & 0.24388350 & 0.29430269 & 0.18086501\\\\\n", - "\tSome College & 0.27805825 & 0.27333100 & 0.28396680\\\\\n", - "\tCollege Graduate & 0.31766990 & 0.29395316 & 0.34731324\\\\\n", - "\tAdvanced Degree & 0.13708738 & 0.10660608 & 0.17518567\\\\\n", - "\tNortheast & 0.25961165 & 0.25900035 & 0.26037571\\\\\n", - "\tMidwest & 0.29650485 & 0.29814750 & 0.29445173\\\\\n", - "\tSouth & 0.21611650 & 0.22090178 & 0.21013543\\\\\n", - "\tWest & 0.22776699 & 0.22195037 & 0.23503713\\\\\n", - "\tExperience & 13.76058252 & 13.78399161 & 13.73132372\\\\\n", - "\\end{tabular}\n" - ], - "text/markdown": [ - "\n", - "A xtable: 12 × 3\n", - "\n", - "| | All <dbl> | Men <dbl> | Women <dbl> |\n", - "|---|---|---|---|\n", - "| Log Wage | 2.97078670 | 2.98782963 | 2.94948490 |\n", - "| Sex | 0.44446602 | 0.00000000 | 1.00000000 |\n", - "| Less then High School | 0.02330097 | 0.03180706 | 0.01266929 |\n", - "| High School Graduate | 0.24388350 | 0.29430269 | 0.18086501 |\n", - "| Some College | 0.27805825 | 0.27333100 | 0.28396680 |\n", - "| College Graduate | 0.31766990 | 0.29395316 | 0.34731324 |\n", - "| Advanced Degree | 0.13708738 | 0.10660608 | 0.17518567 |\n", - "| Northeast | 0.25961165 | 0.25900035 | 0.26037571 |\n", - "| Midwest | 0.29650485 | 0.29814750 | 0.29445173 |\n", - "| South | 0.21611650 | 0.22090178 | 0.21013543 |\n", - "| West | 0.22776699 | 0.22195037 | 0.23503713 |\n", - "| Experience | 13.76058252 | 13.78399161 | 13.73132372 |\n", - "\n" - ], - "text/plain": [ - " All Men Women \n", - "Log Wage 2.97078670 2.98782963 2.94948490\n", - "Sex 0.44446602 0.00000000 1.00000000\n", - "Less then High School 0.02330097 0.03180706 0.01266929\n", - "High School Graduate 0.24388350 0.29430269 0.18086501\n", - "Some College 0.27805825 0.27333100 0.28396680\n", - "College Graduate 0.31766990 0.29395316 0.34731324\n", - "Advanced Degree 0.13708738 0.10660608 0.17518567\n", - "Northeast 0.25961165 0.25900035 0.26037571\n", - "Midwest 0.29650485 0.29814750 0.29445173\n", - "South 0.21611650 0.22090178 0.21013543\n", - "West 0.22776699 0.22195037 0.23503713\n", - "Experience 13.76058252 13.78399161 13.73132372" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "Z <- data[which(colnames(data) %in% c(\"lwage\",\"sex\",\"shs\",\"hsg\",\"scl\",\"clg\",\"ad\",\"ne\",\"mw\",\"so\",\"we\",\"exp1\"))]\n", "\n", @@ -292,44 +143,15 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "execution": { - "iopub.execute_input": "2021-07-12T16:59:32.8861Z", - "iopub.status.busy": "2021-07-12T16:59:32.882357Z", - "iopub.status.idle": "2021-07-12T16:59:32.915279Z" - }, "id": "X81tdQRFyor4", "outputId": "406ea0b4-4dda-4d81-d2a3-ef94505942c7" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
All Men Women
Log Wage 2.9708 2.9878 2.9495
Sex 0.4445 0.0000 1.0000
Less then High School 0.0233 0.0318 0.0127
High School Graduate 0.2439 0.2943 0.1809
Some College 0.2781 0.2733 0.2840
College Graduate 0.3177 0.2940 0.3473
Advanced Degree 0.1371 0.1066 0.1752
Northeast 0.2596 0.2590 0.2604
Midwest 0.2965 0.2981 0.2945
South 0.2161 0.2209 0.2101
West 0.2278 0.2220 0.2350
Experience 13.7606 13.7840 13.7313
\n" - ] - } - ], + "outputs": [], "source": [ "print(tab,type=\"html\") # set type=\"latex\" for printing table in LaTeX" ] @@ -370,40 +192,16 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, - "execution": { - "iopub.execute_input": "2021-07-12T16:59:32.923496Z", - "iopub.status.busy": "2021-07-12T16:59:32.920163Z", - "iopub.status.idle": "2021-07-12T16:59:32.950737Z" - }, "id": "r8B46bNgyor6", "outputId": "b4d7a26d-fc67-4595-99d8-178ed1a872fe" }, - "outputs": [ - { - "data": { - "text/html": [ - "-0.0383447336744149" - ], - "text/latex": [ - "-0.0383447336744149" - ], - "text/markdown": [ - "-0.0383447336744149" - ], - "text/plain": [ - "[1] -0.03834473" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "mean(data_female$lwage)-mean(data_male$lwage)" ] @@ -441,28 +239,15 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "execution": { - "iopub.execute_input": "2021-07-12T16:59:32.959171Z", - "iopub.status.busy": "2021-07-12T16:59:32.956083Z", - "iopub.status.idle": "2021-07-12T16:59:33.005653Z" - }, "id": "2kGIBjpYyor8", "outputId": "bface8f9-2135-43bb-9e54-7ab2d5641b16" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The estimated coefficient on the dummy for gender is -0.03834473 and the corresponding robust standard error is 0.01590824" - ] - } - ], + "outputs": [], "source": [ "nocontrol.fit <- lm(lwage ~ sex, data=data)\n", "nocontrol.est <- summary(nocontrol.fit)$coef[\"sex\",1]\n", @@ -508,299 +293,16 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, - "execution": { - "iopub.execute_input": "2021-07-12T16:59:33.011754Z", - "iopub.status.busy": "2021-07-12T16:59:33.008621Z", - "iopub.status.idle": "2021-07-12T16:59:33.707831Z" - }, "id": "gemX0ZyTyosC", "outputId": "e5255cb1-5bed-491b-98b9-f95c39aa13e6" }, - "outputs": [ - { - "data": { - "text/plain": [ - "\n", - "Call:\n", - "lm(formula = flex, data = data)\n", - "\n", - "Residuals:\n", - " Min 1Q Median 3Q Max \n", - "-1.9384 -0.2782 -0.0041 0.2733 3.4934 \n", - "\n", - "Coefficients:\n", - " Estimate Std. 
Error t value Pr(>|t|) \n", - "(Intercept) 3.8602606 0.4286188 9.006 < 2e-16 ***\n", - "sex -0.0695532 0.0152180 -4.570 4.99e-06 ***\n", - "exp1 -0.0677247 0.1519756 -0.446 0.655885 \n", - "exp2 1.6362944 1.6909253 0.968 0.333246 \n", - "exp3 -0.9154735 0.6880249 -1.331 0.183388 \n", - "exp4 0.1429357 0.0907569 1.575 0.115337 \n", - "shs -0.1233089 0.9068325 -0.136 0.891845 \n", - "hsg -0.5289024 0.1977559 -2.675 0.007508 ** \n", - "scl -0.2920581 0.1260155 -2.318 0.020510 * \n", - "clg -0.0411641 0.0703862 -0.585 0.558688 \n", - "C(occ2)2 0.1613397 0.1297243 1.244 0.213665 \n", - "C(occ2)3 0.2101514 0.1686774 1.246 0.212869 \n", - "C(occ2)4 0.0708570 0.1837167 0.386 0.699746 \n", - "C(occ2)5 -0.3960076 0.1885398 -2.100 0.035745 * \n", - "C(occ2)6 -0.2310611 0.1869662 -1.236 0.216576 \n", - "C(occ2)7 0.3147249 0.1941519 1.621 0.105077 \n", - "C(occ2)8 -0.1875417 0.1692988 -1.108 0.268022 \n", - "C(occ2)9 -0.3390270 0.1672301 -2.027 0.042685 * \n", - "C(occ2)10 0.0209545 0.1564982 0.134 0.893490 \n", - "C(occ2)11 -0.6424177 0.3090899 -2.078 0.037723 * \n", - "C(occ2)12 -0.0674774 0.2520486 -0.268 0.788929 \n", - "C(occ2)13 -0.2329781 0.2315379 -1.006 0.314359 \n", - "C(occ2)14 0.2562009 0.3226729 0.794 0.427236 \n", - "C(occ2)15 -0.1938585 0.2595082 -0.747 0.455086 \n", - "C(occ2)16 -0.0551256 0.1470658 -0.375 0.707798 \n", - "C(occ2)17 -0.4156093 0.1361144 -3.053 0.002275 ** \n", - "C(occ2)18 -0.4822168 1.0443540 -0.462 0.644290 \n", - "C(occ2)19 -0.2579412 0.3325215 -0.776 0.437956 \n", - "C(occ2)20 -0.3010203 0.2341022 -1.286 0.198556 \n", - "C(occ2)21 -0.4271811 0.2206486 -1.936 0.052922 . \n", - "C(occ2)22 -0.8694527 0.2975222 -2.922 0.003490 ** \n", - "C(ind2)3 -1.2473654 0.6454941 -1.932 0.053365 . \n", - "C(ind2)4 -0.0948281 0.4636021 -0.205 0.837935 \n", - "C(ind2)5 -0.5293860 0.4345990 -1.218 0.223244 \n", - "C(ind2)6 -0.6221688 0.4347226 -1.431 0.152441 \n", - "C(ind2)7 -0.5047497 0.5024770 -1.005 0.315176 \n", - "C(ind2)8 -0.7295442 0.4674008 -1.561 0.118623 \n", - "C(ind2)9 -0.8025334 0.4252462 -1.887 0.059190 . \n", - "C(ind2)10 -0.5805840 0.4808776 -1.207 0.227358 \n", - "C(ind2)11 -0.9852350 0.4481566 -2.198 0.027966 * \n", - "C(ind2)12 -0.7375777 0.4243260 -1.738 0.082232 . \n", - "C(ind2)13 -1.0183283 0.4826544 -2.110 0.034922 * \n", - "C(ind2)14 -0.5860174 0.4159033 -1.409 0.158892 \n", - "C(ind2)15 -0.3801359 0.5908517 -0.643 0.520014 \n", - "C(ind2)16 -0.5703905 0.4386579 -1.300 0.193556 \n", - "C(ind2)17 -0.8201843 0.4259846 -1.925 0.054239 . \n", - "C(ind2)18 -0.7613604 0.4238287 -1.796 0.072495 . \n", - "C(ind2)19 -0.8812815 0.4565671 -1.930 0.053635 . \n", - "C(ind2)20 -0.9099021 0.4484198 -2.029 0.042499 * \n", - "C(ind2)21 -0.7586534 0.4405801 -1.722 0.085143 . \n", - "C(ind2)22 -0.4040775 0.4328735 -0.933 0.350620 \n", - "mw 0.1106834 0.0814463 1.359 0.174218 \n", - "so 0.0224244 0.0743855 0.301 0.763075 \n", - "we -0.0215659 0.0841591 -0.256 0.797767 \n", - "exp1:shs -0.1919981 0.1955408 -0.982 0.326206 \n", - "exp1:hsg -0.0173433 0.0572279 -0.303 0.761859 \n", - "exp1:scl -0.0664505 0.0433730 -1.532 0.125570 \n", - "exp1:clg -0.0550346 0.0310279 -1.774 0.076172 . 
\n", - "exp1:C(occ2)2 -0.0736239 0.0501108 -1.469 0.141837 \n", - "exp1:C(occ2)3 -0.0714859 0.0637688 -1.121 0.262336 \n", - "exp1:C(occ2)4 -0.0723997 0.0747715 -0.968 0.332953 \n", - "exp1:C(occ2)5 0.0946732 0.0794005 1.192 0.233182 \n", - "exp1:C(occ2)6 -0.0348928 0.0712136 -0.490 0.624175 \n", - "exp1:C(occ2)7 -0.2279338 0.0784860 -2.904 0.003699 ** \n", - "exp1:C(occ2)8 -0.0727459 0.0645883 -1.126 0.260094 \n", - "exp1:C(occ2)9 0.0274143 0.0669517 0.409 0.682217 \n", - "exp1:C(occ2)10 0.0075628 0.0581715 0.130 0.896564 \n", - "exp1:C(occ2)11 0.1014221 0.1005094 1.009 0.312986 \n", - "exp1:C(occ2)12 -0.0862744 0.0874768 -0.986 0.324057 \n", - "exp1:C(occ2)13 0.0067149 0.0761825 0.088 0.929768 \n", - "exp1:C(occ2)14 -0.1369153 0.0974458 -1.405 0.160073 \n", - "exp1:C(occ2)15 -0.0400425 0.0898931 -0.445 0.656017 \n", - "exp1:C(occ2)16 -0.0539314 0.0520926 -1.035 0.300580 \n", - "exp1:C(occ2)17 0.0147277 0.0467903 0.315 0.752958 \n", - "exp1:C(occ2)18 0.1074099 0.4718440 0.228 0.819937 \n", - "exp1:C(occ2)19 0.0047165 0.1060745 0.044 0.964536 \n", - "exp1:C(occ2)20 0.0243156 0.0743274 0.327 0.743575 \n", - "exp1:C(occ2)21 0.0791776 0.0696947 1.136 0.255985 \n", - "exp1:C(occ2)22 0.1093246 0.0880828 1.241 0.214607 \n", - "exp1:C(ind2)3 0.4758891 0.2227484 2.136 0.032693 * \n", - "exp1:C(ind2)4 0.0147304 0.1571102 0.094 0.925305 \n", - "exp1:C(ind2)5 0.1256987 0.1531626 0.821 0.411864 \n", - "exp1:C(ind2)6 0.1540275 0.1524289 1.010 0.312312 \n", - "exp1:C(ind2)7 0.1029245 0.1786939 0.576 0.564654 \n", - "exp1:C(ind2)8 0.2357669 0.1689203 1.396 0.162859 \n", - "exp1:C(ind2)9 0.1359079 0.1489486 0.912 0.361578 \n", - "exp1:C(ind2)10 0.1512578 0.1644341 0.920 0.357687 \n", - "exp1:C(ind2)11 0.3174885 0.1590023 1.997 0.045907 * \n", - "exp1:C(ind2)12 0.2591089 0.1510588 1.715 0.086356 . 
\n", - "exp1:C(ind2)13 0.3396094 0.1669241 2.035 0.041954 * \n", - "exp1:C(ind2)14 0.1441411 0.1477994 0.975 0.329485 \n", - "exp1:C(ind2)15 -0.0568181 0.2349853 -0.242 0.808950 \n", - "exp1:C(ind2)16 0.0847295 0.1550425 0.546 0.584753 \n", - "exp1:C(ind2)17 0.1728867 0.1513280 1.142 0.253317 \n", - "exp1:C(ind2)18 0.1565399 0.1494171 1.048 0.294842 \n", - "exp1:C(ind2)19 0.1516103 0.1620851 0.935 0.349641 \n", - "exp1:C(ind2)20 0.1326629 0.1566883 0.847 0.397222 \n", - "exp1:C(ind2)21 0.2190905 0.1555052 1.409 0.158930 \n", - "exp1:C(ind2)22 0.1145814 0.1523427 0.752 0.452010 \n", - "exp1:mw -0.0279931 0.0296572 -0.944 0.345274 \n", - "exp1:so -0.0099678 0.0266868 -0.374 0.708786 \n", - "exp1:we 0.0063077 0.0301417 0.209 0.834248 \n", - "exp2:shs 1.9005060 1.4502480 1.310 0.190098 \n", - "exp2:hsg 0.1171642 0.5509729 0.213 0.831609 \n", - "exp2:scl 0.6217923 0.4629986 1.343 0.179344 \n", - "exp2:clg 0.4096746 0.3802171 1.077 0.281321 \n", - "exp2:C(occ2)2 0.6632173 0.5523220 1.201 0.229895 \n", - "exp2:C(occ2)3 0.6415456 0.7102783 0.903 0.366448 \n", - "exp2:C(occ2)4 0.9748422 0.8655351 1.126 0.260099 \n", - "exp2:C(occ2)5 -0.9778823 0.9737990 -1.004 0.315335 \n", - "exp2:C(occ2)6 0.1050860 0.8002267 0.131 0.895527 \n", - "exp2:C(occ2)7 3.1407119 0.9389423 3.345 0.000829 ***\n", - "exp2:C(occ2)8 0.6710877 0.7192077 0.933 0.350818 \n", - "exp2:C(occ2)9 0.0231977 0.7629142 0.030 0.975744 \n", - "exp2:C(occ2)10 -0.2692292 0.6405270 -0.420 0.674267 \n", - "exp2:C(occ2)11 -1.0816539 1.0057575 -1.075 0.282221 \n", - "exp2:C(occ2)12 0.8323737 0.9341245 0.891 0.372933 \n", - "exp2:C(occ2)13 -0.2209813 0.7728463 -0.286 0.774942 \n", - "exp2:C(occ2)14 0.7511163 0.9272548 0.810 0.417955 \n", - "exp2:C(occ2)15 -0.0326858 0.9409116 -0.035 0.972290 \n", - "exp2:C(occ2)16 0.3635814 0.5509550 0.660 0.509342 \n", - "exp2:C(occ2)17 -0.2659285 0.4861131 -0.547 0.584369 \n", - "exp2:C(occ2)18 -2.5608762 5.1700911 -0.495 0.620393 \n", - "exp2:C(occ2)19 -0.1291756 1.0616901 -0.122 0.903165 \n", - "exp2:C(occ2)20 -0.3323297 0.7229071 -0.460 0.645743 \n", - "exp2:C(occ2)21 -0.9099997 0.6854114 -1.328 0.184349 \n", - "exp2:C(occ2)22 -0.8550536 0.8279414 -1.033 0.301773 \n", - "exp2:C(ind2)3 -5.9368948 2.4067939 -2.467 0.013670 * \n", - "exp2:C(ind2)4 -1.1053411 1.7101982 -0.646 0.518100 \n", - "exp2:C(ind2)5 -2.0149181 1.6919190 -1.191 0.233748 \n", - "exp2:C(ind2)6 -2.2277748 1.6816902 -1.325 0.185325 \n", - "exp2:C(ind2)7 -1.4648099 2.0137888 -0.727 0.467022 \n", - "exp2:C(ind2)8 -2.9479949 1.8595425 -1.585 0.112955 \n", - "exp2:C(ind2)9 -1.7796219 1.6471248 -1.080 0.279999 \n", - "exp2:C(ind2)10 -2.1973300 1.7738638 -1.239 0.215507 \n", - "exp2:C(ind2)11 -3.8776807 1.7637372 -2.199 0.027956 * \n", - "exp2:C(ind2)12 -3.1690425 1.6819362 -1.884 0.059602 . \n", - "exp2:C(ind2)13 -3.9651983 1.8130709 -2.187 0.028789 * \n", - "exp2:C(ind2)14 -2.0783289 1.6490355 -1.260 0.207610 \n", - "exp2:C(ind2)15 0.1911692 2.6075396 0.073 0.941559 \n", - "exp2:C(ind2)16 -1.3265850 1.7185648 -0.772 0.440202 \n", - "exp2:C(ind2)17 -2.2002873 1.6837183 -1.307 0.191341 \n", - "exp2:C(ind2)18 -2.2006232 1.6566630 -1.328 0.184125 \n", - "exp2:C(ind2)19 -1.9308536 1.7876673 -1.080 0.280152 \n", - "exp2:C(ind2)20 -1.9467267 1.7244008 -1.129 0.258983 \n", - "exp2:C(ind2)21 -3.1127363 1.7237908 -1.806 0.071019 . 
\n", - "exp2:C(ind2)22 -1.8578340 1.6849542 -1.103 0.270254 \n", - "exp2:mw 0.2005611 0.3172911 0.632 0.527348 \n", - "exp2:so 0.0544354 0.2815662 0.193 0.846708 \n", - "exp2:we 0.0012717 0.3207873 0.004 0.996837 \n", - "exp3:shs -0.6721239 0.4426627 -1.518 0.128987 \n", - "exp3:hsg -0.0179937 0.2083176 -0.086 0.931171 \n", - "exp3:scl -0.1997877 0.1855189 -1.077 0.281572 \n", - "exp3:clg -0.1025230 0.1643648 -0.624 0.532819 \n", - "exp3:C(occ2)2 -0.2039403 0.2211386 -0.922 0.356455 \n", - "exp3:C(occ2)3 -0.2369620 0.2870372 -0.826 0.409103 \n", - "exp3:C(occ2)4 -0.4366958 0.3520168 -1.241 0.214830 \n", - "exp3:C(occ2)5 0.3885298 0.4118861 0.943 0.345577 \n", - "exp3:C(occ2)6 0.0484737 0.3293525 0.147 0.882997 \n", - "exp3:C(occ2)7 -1.3949288 0.4050109 -3.444 0.000578 ***\n", - "exp3:C(occ2)8 -0.2053899 0.2895727 -0.709 0.478181 \n", - "exp3:C(occ2)9 -0.0909660 0.3143348 -0.289 0.772293 \n", - "exp3:C(occ2)10 0.1854753 0.2575565 0.720 0.471477 \n", - "exp3:C(occ2)11 0.3931553 0.3817758 1.030 0.303152 \n", - "exp3:C(occ2)12 -0.2202559 0.3660206 -0.602 0.547363 \n", - "exp3:C(occ2)13 0.0950356 0.2904370 0.327 0.743519 \n", - "exp3:C(occ2)14 -0.1443933 0.3341622 -0.432 0.665684 \n", - "exp3:C(occ2)15 0.1477077 0.3645191 0.405 0.685339 \n", - "exp3:C(occ2)16 -0.0378548 0.2151288 -0.176 0.860330 \n", - "exp3:C(occ2)17 0.1510497 0.1878081 0.804 0.421276 \n", - "exp3:C(occ2)18 1.4084443 1.8852467 0.747 0.455047 \n", - "exp3:C(occ2)19 0.0923425 0.4042308 0.228 0.819314 \n", - "exp3:C(occ2)20 0.1806994 0.2652079 0.681 0.495682 \n", - "exp3:C(occ2)21 0.3779083 0.2553031 1.480 0.138875 \n", - "exp3:C(occ2)22 0.2855058 0.2984206 0.957 0.338754 \n", - "exp3:C(ind2)3 2.6665808 0.9807497 2.719 0.006573 ** \n", - "exp3:C(ind2)4 0.7298431 0.6879811 1.061 0.288811 \n", - "exp3:C(ind2)5 0.9942250 0.6842435 1.453 0.146280 \n", - "exp3:C(ind2)6 1.0641428 0.6800948 1.565 0.117718 \n", - "exp3:C(ind2)7 0.7089089 0.8337963 0.850 0.395245 \n", - "exp3:C(ind2)8 1.2340948 0.7483474 1.649 0.099193 . 
\n", - "exp3:C(ind2)9 0.8287315 0.6675904 1.241 0.214526 \n", - "exp3:C(ind2)10 1.0448162 0.7066717 1.479 0.139337 \n", - "exp3:C(ind2)11 1.6877578 0.7162155 2.356 0.018487 * \n", - "exp3:C(ind2)12 1.3734455 0.6835570 2.009 0.044564 * \n", - "exp3:C(ind2)13 1.6376669 0.7259301 2.256 0.024117 * \n", - "exp3:C(ind2)14 1.0162910 0.6714525 1.514 0.130199 \n", - "exp3:C(ind2)15 0.1879483 1.0299675 0.182 0.855214 \n", - "exp3:C(ind2)16 0.6889680 0.6968028 0.989 0.322831 \n", - "exp3:C(ind2)17 1.0085540 0.6836992 1.475 0.140238 \n", - "exp3:C(ind2)18 1.0605598 0.6725232 1.577 0.114863 \n", - "exp3:C(ind2)19 0.8959865 0.7225602 1.240 0.215029 \n", - "exp3:C(ind2)20 0.9768944 0.6955822 1.404 0.160255 \n", - "exp3:C(ind2)21 1.4415215 0.6996480 2.060 0.039418 * \n", - "exp3:C(ind2)22 0.9687884 0.6828498 1.419 0.156037 \n", - "exp3:mw -0.0625771 0.1241291 -0.504 0.614194 \n", - "exp3:so -0.0115842 0.1084217 -0.107 0.914917 \n", - "exp3:we -0.0124875 0.1251376 -0.100 0.920515 \n", - "exp4:shs 0.0777418 0.0475427 1.635 0.102071 \n", - "exp4:hsg 0.0004913 0.0265964 0.018 0.985264 \n", - "exp4:scl 0.0210760 0.0245289 0.859 0.390256 \n", - "exp4:clg 0.0078695 0.0227528 0.346 0.729457 \n", - "exp4:C(occ2)2 0.0176389 0.0289257 0.610 0.542021 \n", - "exp4:C(occ2)3 0.0303057 0.0376552 0.805 0.420962 \n", - "exp4:C(occ2)4 0.0584146 0.0457704 1.276 0.201927 \n", - "exp4:C(occ2)5 -0.0515181 0.0549489 -0.938 0.348514 \n", - "exp4:C(occ2)6 -0.0170182 0.0440847 -0.386 0.699488 \n", - "exp4:C(occ2)7 0.1905353 0.0558757 3.410 0.000655 ***\n", - "exp4:C(occ2)8 0.0196522 0.0379084 0.518 0.604195 \n", - "exp4:C(occ2)9 0.0190014 0.0421099 0.451 0.651841 \n", - "exp4:C(occ2)10 -0.0333347 0.0338825 -0.984 0.325246 \n", - "exp4:C(occ2)11 -0.0465914 0.0479018 -0.973 0.330778 \n", - "exp4:C(occ2)12 0.0110212 0.0470536 0.234 0.814820 \n", - "exp4:C(occ2)13 -0.0136895 0.0358988 -0.381 0.702970 \n", - "exp4:C(occ2)14 0.0055582 0.0400331 0.139 0.889581 \n", - "exp4:C(occ2)15 -0.0327444 0.0462379 -0.708 0.478872 \n", - "exp4:C(occ2)16 -0.0089706 0.0275729 -0.325 0.744937 \n", - "exp4:C(occ2)17 -0.0256735 0.0239306 -1.073 0.283400 \n", - "exp4:C(occ2)18 -0.2121372 0.2204003 -0.963 0.335841 \n", - "exp4:C(occ2)19 -0.0169398 0.0513428 -0.330 0.741463 \n", - "exp4:C(occ2)20 -0.0296125 0.0323353 -0.916 0.359819 \n", - "exp4:C(occ2)21 -0.0524577 0.0317251 -1.654 0.098291 . \n", - "exp4:C(occ2)22 -0.0350646 0.0360687 -0.972 0.331018 \n", - "exp4:C(ind2)3 -0.3851791 0.1329065 -2.898 0.003771 ** \n", - "exp4:C(ind2)4 -0.1209478 0.0899580 -1.344 0.178852 \n", - "exp4:C(ind2)5 -0.1441045 0.0897994 -1.605 0.108616 \n", - "exp4:C(ind2)6 -0.1526110 0.0892689 -1.710 0.087410 . \n", - "exp4:C(ind2)7 -0.1001993 0.1119398 -0.895 0.370768 \n", - "exp4:C(ind2)8 -0.1609664 0.0979780 -1.643 0.100471 \n", - "exp4:C(ind2)9 -0.1178080 0.0877821 -1.342 0.179642 \n", - "exp4:C(ind2)10 -0.1482842 0.0918416 -1.615 0.106469 \n", - "exp4:C(ind2)11 -0.2322961 0.0944506 -2.459 0.013949 * \n", - "exp4:C(ind2)12 -0.1872911 0.0899985 -2.081 0.037481 * \n", - "exp4:C(ind2)13 -0.2155617 0.0946011 -2.279 0.022731 * \n", - "exp4:C(ind2)14 -0.1483524 0.0884992 -1.676 0.093740 . \n", - "exp4:C(ind2)15 -0.0532195 0.1313815 -0.405 0.685439 \n", - "exp4:C(ind2)16 -0.1044336 0.0916252 -1.140 0.254429 \n", - "exp4:C(ind2)17 -0.1427349 0.0899315 -1.587 0.112543 \n", - "exp4:C(ind2)18 -0.1546248 0.0885883 -1.745 0.080973 . 
\n", - "exp4:C(ind2)19 -0.1269592 0.0948784 -1.338 0.180918 \n", - "exp4:C(ind2)20 -0.1468554 0.0911188 -1.612 0.107094 \n", - "exp4:C(ind2)21 -0.2032619 0.0920972 -2.207 0.027358 * \n", - "exp4:C(ind2)22 -0.1480951 0.0897937 -1.649 0.099154 . \n", - "exp4:mw 0.0062439 0.0158699 0.393 0.694007 \n", - "exp4:so 0.0003145 0.0136275 0.023 0.981591 \n", - "exp4:we 0.0017685 0.0159602 0.111 0.911776 \n", - "---\n", - "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n", - "\n", - "Residual standard error: 0.4708 on 4904 degrees of freedom\n", - "Multiple R-squared: 0.3511,\tAdjusted R-squared: 0.3187 \n", - "F-statistic: 10.83 on 245 and 4904 DF, p-value: < 2.2e-16\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Coefficient for OLS with controls -0.0695532" - ] - } - ], + "outputs": [], "source": [ "# ols regression with controls\n", "\n", @@ -840,20 +342,9 @@ }, { "cell_type": "code", - "execution_count": 210, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The marginal gap: -0.03834473 \n", - "The unexplained difference: -0.0695532 \n", - "The explained difference: 0.03120847 \n", - "The sum of these differences: -0.03834473 \n" - ] - } - ], + "outputs": [], "source": [ "XX0 = model.matrix(~(exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we), data = data[data$sex==0,])\n", "y0 = data[data$sex==0,]$lwage\n", @@ -880,21 +371,9 @@ }, { "cell_type": "code", - "execution_count": 239, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The marginal gap: -0.03834473 \n", - "The unexplained difference: -2.320208 \n", - "The difference explained by endowment: 0.001595472 \n", - "The difference explained by coefficient: 2.280268 \n", - "The sum of these differences: -0.03834474 \n" - ] - } - ], + "outputs": [], "source": [ "svd0=svd(XX0)\n", "svd1=svd(XX1)\n", @@ -923,59 +402,16 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 52 }, - "execution": { - "iopub.execute_input": "2021-07-12T16:59:33.713947Z", - "iopub.status.busy": "2021-07-12T16:59:33.710612Z", - "iopub.status.idle": "2021-07-12T16:59:34.077195Z" - }, "id": "iYSsI8ZEyosD", "outputId": "a11b3ec7-f1ec-4488-9350-69a28481d342" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Coefficient for D via partialling-out -0.0695532" - ] - }, - { - "data": { - "text/html": [ - "
2.5 %     -0.098671423574863
97.5 %    -0.0404349830188283
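The interval above is the 95% confidence interval for the gender coefficient obtained by partialling-out (Frisch–Waugh–Lovell): residualize both the log wage and the gender indicator on the controls, then regress residual on residual. A minimal sketch of that step, assuming a data frame `data` containing `lwage`, `sex`, and the flexible controls used in this notebook (base R only; the notebook's own cell may differ in details):

```r
# flexible controls, mirroring the specification used above
flex_y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)
flex_d <- sex   ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)

# Frisch-Waugh-Lovell: residualize outcome and treatment on the controls
t_y <- lm(flex_y, data = data)$residuals
t_d <- lm(flex_d, data = data)$residuals

# regression of residualized outcome on residualized treatment
partial_fit <- lm(t_y ~ t_d)
summary(partial_fit)$coefficients["t_d", ]  # point estimate and classical standard error
confint(partial_fit, "t_d")                 # 95% interval comparable to the one shown above
```

By the FWL theorem, the coefficient on `t_d` equals the coefficient on `sex` in the full regression with controls, which is why the "full reg" and "partial reg" rows agree in the summary table that follows.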
\n" - ], - "text/latex": [ - "\\begin{description*}\n", - "\\item[2.5 \\textbackslash{}\\%] -0.098671423574863\n", - "\\item[97.5 \\textbackslash{}\\%] -0.0404349830188283\n", - "\\end{description*}\n" - ], - "text/markdown": [ - "2.5 %\n", - ": -0.09867142357486397.5 %\n", - ": -0.0404349830188283\n", - "\n" - ], - "text/plain": [ - " 2.5 % 97.5 % \n", - "-0.09867142 -0.04043498 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Partialling-out using ols\n", "\n", @@ -1026,28 +462,15 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "execution": { - "iopub.execute_input": "2021-07-12T16:59:34.081773Z", - "iopub.status.busy": "2021-07-12T16:59:34.080233Z", - "iopub.status.idle": "2021-07-12T16:59:37.470399Z" - }, "id": "Pd-5O1U8yosH", "outputId": "ffc5adbe-b367-422c-9421-112485563b6e" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Coefficient for D via partialling-out using lasso -0.07193551" - ] - } - ], + "outputs": [], "source": [ "# Partialling-out using lasso\n", "\n", @@ -1091,74 +514,16 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 224 }, - "execution": { - "iopub.execute_input": "2021-07-12T16:59:37.47687Z", - "iopub.status.busy": "2021-07-12T16:59:37.473312Z", - "iopub.status.idle": "2021-07-12T16:59:37.536069Z" - }, "id": "IpU4gNZayosI", "outputId": "fdb05e51-0596-4804-bad2-50c729ddd0aa" }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\n", - "
A xtable: 4 × 2
                        Estimate      Std. Error
                        &lt;dbl&gt;         &lt;dbl&gt;
Without controls        -0.03834473   0.01590824
full reg                -0.06955320   0.01569920
partial reg             -0.06955320   0.01500873
partial reg via lasso   -0.07193551   0.01539897
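The last row, "partial reg via lasso", replaces the two residualizing OLS fits with Lasso, which is attractive when the number of controls is large. A minimal sketch of that variant using `hdm::rlasso`, under the same assumptions as in the previous sketch (the notebook's exact penalty settings may differ):

```r
library(hdm)  # rlasso: Lasso with a data-driven plug-in penalty

flex_y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)
flex_d <- sex   ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)

# residualize outcome and treatment with Lasso instead of OLS
r_y <- rlasso(flex_y, data = data)$residuals
r_d <- rlasso(flex_d, data = data)$residuals

partial_lasso_fit <- lm(r_y ~ r_d)
summary(partial_lasso_fit)$coefficients["r_d", ]  # compare with the "partial reg via lasso" row above
```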
\n" - ], - "text/latex": [ - "A xtable: 4 × 2\n", - "\\begin{tabular}{r|ll}\n", - " & Estimate & Std. Error\\\\\n", - " & & \\\\\n", - "\\hline\n", - "\tWithout controls & -0.03834473 & 0.01590824\\\\\n", - "\tfull reg & -0.06955320 & 0.01569920\\\\\n", - "\tpartial reg & -0.06955320 & 0.01500873\\\\\n", - "\tpartial reg via lasso & -0.07193551 & 0.01539897\\\\\n", - "\\end{tabular}\n" - ], - "text/markdown": [ - "\n", - "A xtable: 4 × 2\n", - "\n", - "| | Estimate <dbl> | Std. Error <dbl> |\n", - "|---|---|---|\n", - "| Without controls | -0.03834473 | 0.01590824 |\n", - "| full reg | -0.06955320 | 0.01569920 |\n", - "| partial reg | -0.06955320 | 0.01500873 |\n", - "| partial reg via lasso | -0.07193551 | 0.01539897 |\n", - "\n" - ], - "text/plain": [ - " Estimate Std. Error\n", - "Without controls -0.03834473 0.01590824\n", - "full reg -0.06955320 0.01569920\n", - "partial reg -0.06955320 0.01500873\n", - "partial reg via lasso -0.07193551 0.01539897" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "table<- matrix(0, 4, 2)\n", "table[1,1]<- nocontrol.est\n", @@ -1177,36 +542,15 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "execution": { - "iopub.execute_input": "2021-07-12T16:59:37.539348Z", - "iopub.status.busy": "2021-07-12T16:59:37.538094Z", - "iopub.status.idle": "2021-07-12T16:59:37.552624Z" - }, "id": "wGa29D7NyosJ", "outputId": "36b56e8c-3853-48e6-8740-16db8a5eab9e" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "
Estimate Std. Error
Without controls -0.038 0.0159
full reg -0.070 0.0157
partial reg -0.070 0.0150
partial reg via lasso -0.072 0.0154
\n" - ] - } - ], + "outputs": [], "source": [ "print(tab, type=\"html\")" ] @@ -1260,7 +604,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": null, "metadata": { "id": "hjSPZpef1Mfc" }, @@ -1296,29 +640,15 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "execution": { - "iopub.execute_input": "2021-07-12T16:59:37.555574Z", - "iopub.status.busy": "2021-07-12T16:59:37.554654Z", - "iopub.status.idle": "2021-07-12T16:59:43.753096Z" - }, "id": "vZ84pYQVyosL", "outputId": "f73bb150-a229-43b5-e052-7ab8390cc851" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of Extra-Flex Controls 979 \n", - "Coefficient for OLS with extra flex controls -0.06699231" - ] - } - ], + "outputs": [], "source": [ "# extra flexible model\n", "extraflex <- lwage ~ sex + (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)^2\n", @@ -1353,28 +683,15 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "execution": { - "iopub.execute_input": "2021-07-12T16:59:43.758359Z", - "iopub.status.busy": "2021-07-12T16:59:43.756261Z", - "iopub.status.idle": "2021-07-12T17:00:17.223523Z" - }, "id": "SfB4go24yosL", "outputId": "8584501d-8152-4b94-c516-4c901683c7a4" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Coefficient for D via partialling-out using lasso -0.05378994" - ] - } - ], + "outputs": [], "source": [ "# models\n", "extraflex.y <- lwage ~ (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+C(occ2)+C(ind2))^2 # model for Y\n", @@ -1398,90 +715,16 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 442 }, - "execution": { - "iopub.execute_input": "2021-07-12T17:00:17.227972Z", - "iopub.status.busy": "2021-07-12T17:00:17.226397Z", - "iopub.status.idle": "2021-07-12T17:00:17.260738Z" - }, "id": "X9z2zpVtyosL", "outputId": "e68c579d-e447-45fd-ce6f-ee15b04f3055" }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\t\n", - "\n", - "
A xtable: 3 × 2
                        Estimate      Std. Error
                        &lt;dbl&gt;         &lt;dbl&gt;
full reg, HC0           -0.06699231   0.03929531
full reg, HC3           -0.06699231   0.07252316
partial reg via lasso   -0.05378994   0.03436033
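The "HC0" and "HC3" rows report the same point estimate with two heteroskedasticity-robust standard errors: HC0 is the plain plug-in estimator, while HC3 divides each squared residual by (1 - h_i)^2, where h_i is the leverage, and is markedly more conservative when the number of regressors is large relative to n. A minimal sketch of computing both for the extra-flexible OLS fit, assuming the `sandwich` package and the same `data` (illustrative only; the notebook may compute these quantities differently):

```r
library(sandwich)  # vcovHC: heteroskedasticity-consistent covariance estimators

extraflex <- lwage ~ sex + (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)^2
fit_extra <- lm(extraflex, data = data)

se_hc0 <- sqrt(vcovHC(fit_extra, type = "HC0")["sex", "sex"])  # plug-in robust SE
se_hc3 <- sqrt(vcovHC(fit_extra, type = "HC3")["sex", "sex"])  # leverage-adjusted, more conservative
c(HC0 = se_hc0, HC3 = se_hc3)
```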
\n" - ], - "text/latex": [ - "A xtable: 3 × 2\n", - "\\begin{tabular}{r|ll}\n", - " & Estimate & Std. Error\\\\\n", - " & & \\\\\n", - "\\hline\n", - "\tfull reg, HC0 & -0.06699231 & 0.03929531\\\\\n", - "\tfull reg, HC3 & -0.06699231 & 0.07252316\\\\\n", - "\tpartial reg via lasso & -0.05378994 & 0.03436033\\\\\n", - "\\end{tabular}\n" - ], - "text/markdown": [ - "\n", - "A xtable: 3 × 2\n", - "\n", - "| | Estimate <dbl> | Std. Error <dbl> |\n", - "|---|---|---|\n", - "| full reg, HC0 | -0.06699231 | 0.03929531 |\n", - "| full reg, HC3 | -0.06699231 | 0.07252316 |\n", - "| partial reg via lasso | -0.05378994 | 0.03436033 |\n", - "\n" - ], - "text/plain": [ - " Estimate Std. Error\n", - "full reg, HC0 -0.06699231 0.03929531\n", - "full reg, HC3 -0.06699231 0.07252316\n", - "partial reg via lasso -0.05378994 0.03436033" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "% latex table generated in R 4.3.2 by xtable 1.8-4 package\n", - "% Wed Feb 7 19:15:54 2024\n", - "\\begin{table}[ht]\n", - "\\centering\n", - "\\begin{tabular}{rrr}\n", - " \\hline\n", - " & Estimate & Std. Error \\\\ \n", - " \\hline\n", - "full reg, HC0 & -0.067 & 0.0393 \\\\ \n", - " full reg, HC3 & -0.067 & 0.0725 \\\\ \n", - " partial reg via lasso & -0.054 & 0.0344 \\\\ \n", - " \\hline\n", - "\\end{tabular}\n", - "\\end{table}\n" - ] - } - ], + "outputs": [], "source": [ "table<- matrix(0, 3, 2)\n", "table[1,1]<- control.est\n", diff --git a/PM1/r-ols-and-lasso-for-wage-prediction.irnb b/PM1/r-ols-and-lasso-for-wage-prediction.irnb index 070b6f7d..83e9cc86 100644 --- a/PM1/r-ols-and-lasso-for-wage-prediction.irnb +++ b/PM1/r-ols-and-lasso-for-wage-prediction.irnb @@ -1,857 +1,767 @@ { - "metadata": { - "kernelspec": { - "name": "ir", - "display_name": "R", - "language": "R" - }, - "language_info": { - "name": "R", - "codemirror_mode": "r", - "pygments_lexer": "r", - "mimetype": "text/x-r-source", - "file_extension": ".r", - "version": "3.6.3" - }, - "colab": { - "provenance": [] - } - }, - "nbformat_minor": 0, - "nbformat": 4, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "## Introduction" - ], - "metadata": { - "id": "-kWzJajkykal" - } - }, - { - "cell_type": "markdown", - "source": [ - "An important question in labor economics is what determines the wage of workers. This is a causal question, but we can begin to investigate it from a predictive perspective.\n", - "\n", - "In the following wage example, $Y$ is the (log) hourly wage of a worker and $X$ is a vector of worker's characteristics, e.g., education, experience, gender. Two main questions here are:\n", - "\n", - "* How can we use job-relevant characteristics, such as education and experience, to best predict wages?\n", - "\n", - "* What is the difference in predicted wages between men and women with the same job-relevant characteristics?\n", - "\n", - "In this lab, we focus on the prediction question first." - ], - "metadata": { - "id": "bWDpKmCRykam" - } - }, - { - "cell_type": "markdown", - "source": [ - "## Data\n" - ], - "metadata": { - "id": "uPoP-4dXykan" - } - }, - { - "cell_type": "markdown", - "source": [ - "The data set we consider is from the 2015 March Supplement of the U.S. Current Population Survey. We select white non-hispanic individuals, aged 25 to 64 years, and working more than 35 hours per week for at least 50 weeks of the year. 
We exclude self-employed workers; individuals living in group quarters; individuals in the military, agricultural or private household sectors; individuals with inconsistent reports on earnings and employment status; individuals with allocated or missing information in any of the variables used in the analysis; and individuals with hourly wage below $3$.\n", - "\n", - "The variable of interest $Y$ is the (log) hourly wage rate constructed as the ratio of the annual earnings to the total number of hours worked, which is constructed in turn as the product of number of weeks worked and the usual number of hours worked per week. In our analysis, we also focus on single (never married) workers. The final sample is of size $n=5150$." - ], - "metadata": { - "id": "_kNNsU5Kykan" - } - }, - { - "cell_type": "code", - "source": [ - "install.packages(\"xtable\")\n", - "install.packages(\"hdm\") # a library for high-dimensional metrics\n", - "install.packages(\"glmnet\") # for lasso CV\n", - "\n", - "library(hdm)\n", - "library(xtable)\n", - "library(glmnet)" - ], - "metadata": { - "id": "_zAzi7q14V1f" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Data analysis" - ], - "metadata": { - "id": "ecuLVkuhykao" - } - }, - { - "cell_type": "markdown", - "source": [ - "We start by loading the data set." - ], - "metadata": { - "id": "bnjk8mxBykao" - } - }, - { - "cell_type": "code", - "source": [ - "file = \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv\"\n", - "data <- read.csv(file)\n", - "dim(data)" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-07-08T18:28:21.788603Z", - "iopub.execute_input": "2021-07-08T18:28:21.79079Z", - "iopub.status.idle": "2021-07-08T18:28:21.939468Z" - }, - "trusted": true, - "id": "eQ7eG0JTykao" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Let's have a look at the structure of the data." - ], - "metadata": { - "id": "pdVofJa0ykas" - } - }, - { - "cell_type": "code", - "source": [ - "str(data)" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-07-08T18:28:21.942167Z", - "iopub.execute_input": "2021-07-08T18:28:21.974325Z", - "iopub.status.idle": "2021-07-08T18:28:22.011174Z" - }, - "trusted": true, - "id": "T1JH0uaXykat" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "We construct the output variable $Y$ and the matrix $Z$ which includes the characteristics of workers that are given in the data." - ], - "metadata": { - "id": "Hin9wYrpykau" - } - }, - { - "cell_type": "code", - "source": [ - "# construct matrices for estimation from the data\n", - "Y <- log(data$wage)\n", - "n <- length(Y)\n", - "Z <- data[-which(colnames(data) %in% c(\"wage\",\"lwage\"))]\n", - "p <- dim(Z)[2]\n", - "\n", - "cat(\"Number of observations:\", n, '\\n')\n", - "cat( \"Number of raw regressors:\", p)" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-07-08T18:28:22.013426Z", - "iopub.execute_input": "2021-07-08T18:28:22.014776Z", - "iopub.status.idle": "2021-07-08T18:28:22.038842Z" - }, - "trusted": true, - "id": "b8SdPks-ykau" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "For the outcome variable *wage* and a subset of the raw regressors, we calculate the empirical mean to get familiar with the data." 
- ], - "metadata": { - "id": "FvqKO5rYykav" - } - }, - { - "cell_type": "code", - "source": [ - "# generate a table of means of variables\n", - "Z_subset <- data[which(colnames(data) %in% c(\"lwage\",\"sex\",\"shs\",\"hsg\",\"scl\",\"clg\",\"ad\",\"mw\",\"so\",\"we\",\"ne\",\"exp1\"))]\n", - "table <- matrix(0, 12, 1)\n", - "table[1:12,1] <- as.numeric(lapply(Z_subset,mean))\n", - "rownames(table) <- c(\"Log Wage\",\"Sex\",\"Some High School\",\"High School Graduate\",\"Some College\",\"College Graduate\", \"Advanced Degree\",\"Midwest\",\"South\",\"West\",\"Northeast\",\"Experience\")\n", - "colnames(table) <- c(\"Sample mean\")\n", - "tab<- xtable(table, digits = 2)\n", - "tab" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-07-08T18:32:13.181466Z", - "iopub.execute_input": "2021-07-08T18:32:13.188888Z", - "iopub.status.idle": "2021-07-08T18:32:13.25268Z" - }, - "trusted": true, - "id": "mLttnFeKykav" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "E.g., the share of female workers in our sample is ~44% ($sex=1$ if female)." - ], - "metadata": { - "id": "ivClVVsCykaw" - } - }, - { - "cell_type": "markdown", - "source": [ - "Alternatively, using the xtable package, we can also print the table in LaTeX." - ], - "metadata": { - "id": "p7tfUT9gykaw" - } - }, - { - "cell_type": "code", - "source": [ - "print(tab, type=\"latex\") # type=\"latex\" for printing table in LaTeX" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-07-08T18:32:52.154266Z", - "iopub.execute_input": "2021-07-08T18:32:52.156136Z", - "iopub.status.idle": "2021-07-08T18:32:52.177001Z" - }, - "trusted": true, - "id": "wIojwGx4ykaw" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Prediction Question" - ], - "metadata": { - "id": "rTNep10kykaw" - } - }, - { - "cell_type": "markdown", - "source": [ - "Now, we will construct a prediction rule for (log) hourly wage $Y$, which depends linearly on job-relevant characteristics $X$:\n", - "\n", - "\\begin{equation}\\label{decompose}\n", - "Y = \\beta'X+ \\epsilon.\n", - "\\end{equation}" - ], - "metadata": { - "id": "VAMsBil7ykaw" - } - }, - { - "cell_type": "markdown", - "source": [ - "Our goals are\n", - "\n", - "* Predict wages using various characteristics of workers.\n", - "\n", - "* Assess the predictive performance of a given model using the (adjusted) sample MSE, the (adjusted) sample $R^2$ and the out-of-sample MSE and $R^2$.\n", - "\n", - "\n", - "Toward answering the latter, we measure the prediction quality of the two models via data splitting:\n", - "\n", - "- Randomly split the data into one training sample and one testing sample. 
Here we just use a simple method (stratified splitting is a more sophisticated version of splitting that we might consider).\n", - "- Use the training sample to estimate the parameters of the Basic Model and the Flexible Model.\n", - "- Before using the testing sample, we evaluate in-sample fit.\n" - ], - "metadata": { - "id": "IwWMP87Cykax" - } - }, - { - "cell_type": "code", - "source": [ - "# splitting the data\n", - "set.seed(1) # to make the results replicable (we will generate random numbers)\n", - "random <- sample(1:n, floor(n*4/5)) # draw (4/5)*n random numbers from 1 to n without replacing\n", - "train <- data[random,]\n", - "test <- data[-random,]" - ], - "metadata": { - "id": "cArtOhBOWaNs" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "\n", - "We employ two different specifications for prediction:\n", - "\n", - "\n", - "1. Basic Model: $X$ consists of a set of raw regressors (e.g. gender, experience, education indicators, occupation and industry indicators and regional indicators).\n", - "\n", - "\n", - "2. Flexible Model: $X$ consists of all raw regressors from the basic model plus a dictionary of transformations (e.g., ${exp}^2$ and ${exp}^3$) and additional two-way interactions of a polynomial in experience with other regressors. An example of a regressor created through a two-way interaction is *experience* times the indicator of having a *college degree*.\n", - "\n", - "Using the **Flexible Model** enables us to approximate the real relationship by a more complex regression model and therefore to reduce the bias. The **Flexible Model** increases the range of potential shapes of the estimated regression function. In general, flexible models often deliver higher prediction accuracy but are harder to interpret." - ], - "metadata": { - "id": "o65RUVCzSV9d" - } - }, - { - "cell_type": "markdown", - "source": [ - "## Data-Splitting: In-sample performance" - ], - "metadata": { - "id": "o5zcXyBGg6ch" - } - }, - { - "cell_type": "markdown", - "source": [ - "Let us fit both models to our data by running ordinary least squares (ols):" - ], - "metadata": { - "id": "fPxlwhL9ykax" - } - }, - { - "cell_type": "code", - "source": [ - "# 1. basic model\n", - "basic <- lwage~ (sex + exp1 + shs + hsg+ scl + clg + mw + so + we +C(occ2) + C(ind2))\n", - "regbasic <- lm(basic, data=train) # perform ols using the defined model\n", - "cat( \"Number of regressors in the basic model:\",length(regbasic$coef), '\\n') # number of regressors in the Basic Model\n" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-07-08T18:37:37.891982Z", - "iopub.execute_input": "2021-07-08T18:37:37.895598Z", - "iopub.status.idle": "2021-07-08T18:37:37.976362Z" - }, - "trusted": true, - "id": "BhaBGtEBykax" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "##### Note that the basic model consists of $51$ regressors." - ], - "metadata": { - "id": "u0vJhtj8ykax" - } - }, - { - "cell_type": "code", - "source": [ - "# 2. 
flexible model\n", - "flex <- lwage ~ sex + shs+hsg+scl+clg+mw+so+we+C(occ2)+C(ind2) + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)\n", - "regflex <- lm(flex, data=train)\n", - "cat( \"Number of regressors in the flexible model:\",length(regflex$coef)) # number of regressors in the Flexible Model" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-07-08T18:37:57.22358Z", - "iopub.execute_input": "2021-07-08T18:37:57.225554Z", - "iopub.status.idle": "2021-07-08T18:37:57.431288Z" - }, - "trusted": true, - "id": "y74aI4bhykax" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "##### Note that the flexible model consists of $246$ regressors." - ], - "metadata": { - "id": "zAVUFc0Mykax" - } - }, - { - "cell_type": "markdown", - "source": [ - "#### Re-estimating the flexible model using Lasso\n", - "We re-estimate the flexible model using Lasso (the least absolute shrinkage and selection operator) rather than ols. Lasso is a penalized regression method that can be used to reduce the complexity of a regression model when the ratio $p/n$ is not small. We will introduce this approach formally later in the course, but for now, we try it out here as a black-box method. " - ], - "metadata": { - "id": "WxT3rBbzykax" - } - }, - { - "cell_type": "code", - "source": [ - "# Flexible model using Lasso, in-sample fit\n", - "train_flex <- model.matrix(flex,train) # all regressors\n", - "fit.lasso.cv <- cv.glmnet(as.matrix(train_flex), train$lwage, family=\"gaussian\", alpha=1, nfolds=5)\n", - "yhat.lasso.cv <- predict(fit.lasso.cv, newx = as.matrix(train_flex), s = \"lambda.min\") # in-sample fit right now, not out-of-sample using \"test\"" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-07-08T18:52:56.57425Z", - "iopub.execute_input": "2021-07-08T18:52:56.575443Z", - "iopub.status.idle": "2021-07-08T18:53:02.824983Z" - }, - "trusted": true, - "id": "Nx21IQPrykay" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#### Evaluating the predictive performance of the basic and flexible models in-sample\n", - "Now, we can evaluate the performance of both models based on the (adjusted) $R^2_{sample}$ and the (adjusted) $MSE_{sample}$:" - ], - "metadata": { - "id": "oJhnBR2Fykay" - } - }, - { - "cell_type": "code", - "source": [ - "# Assess predictive performance\n", - "sumbasic <- summary(regbasic)\n", - "sumflex <- summary(regflex)\n", - "# no summary() for lassocv\n", - "\n", - "ntrain = nrow(train)\n", - "\n", - "# R-squared and adjusted R-squared\n", - "R2.1 <- sumbasic$r.squared\n", - "cat(\"R-squared for the basic model: \", R2.1, \"\\n\")\n", - "R2.adj1 <- sumbasic$adj.r.squared\n", - "cat(\"adjusted R-squared for the basic model: \", R2.adj1, \"\\n\")\n", - "\n", - "R2.2 <- sumflex$r.squared\n", - "cat(\"R-squared for the flexible model: \", R2.2, \"\\n\")\n", - "R2.adj2 <- sumflex$adj.r.squared\n", - "cat(\"adjusted R-squared for the flexible model: \", R2.adj2, \"\\n\")\n", - "\n", - "pL <- fit.lasso.cv$nzero[fit.lasso.cv$index[1]]\n", - "R2.L <- 1-(sum((yhat.lasso.cv - train$lwage) ^ 2))/(sum((train$lwage - mean(train$lwage)) ^ 2)) # fit.lasso.cv$glmnet.fit$dev.ratio #\n", - "cat(\"R-squared for the lasso with flexible model: \", R2.L, \"\\n\")\n", - "R2.adjL <- 1-(sum((yhat.lasso.cv - train$lwage) ^ 2)/(ntrain-pL-1))/(sum((train$lwage - mean(train$lwage)) ^ 2)/(ntrain-1))\n", - "cat(\"adjusted R-squared for the flexible model: \", R2.adjL, 
\"\\n\")\n", - "\n", - "# MSE and adjusted MSE\n", - "MSE1 <- mean(sumbasic$res^2)\n", - "cat(\"MSE for the basic model: \", MSE1, \"\\n\")\n", - "p1 <- sumbasic$df[1] # number of regressors\n", - "MSE.adj1 <- (ntrain/(ntrain-p1))*MSE1\n", - "cat(\"adjusted MSE for the basic model: \", MSE.adj1, \"\\n\")\n", - "\n", - "MSE2 <-mean(sumflex$res^2)\n", - "cat(\"MSE for the flexible model: \", MSE2, \"\\n\")\n", - "p2 <- sumflex$df[1]\n", - "MSE.adj2 <- (ntrain/(ntrain-p2))*MSE2\n", - "cat(\"adjusted MSE for the lasso flexible model: \", MSE.adj2, \"\\n\")\n", - "\n", - "lasso.res <- train$lwage - yhat.lasso.cv\n", - "MSEL <-mean(lasso.res^2)\n", - "cat(\"MSE for the lasso flexible model: \", MSEL, \"\\n\")\n", - "MSE.adjL <- (ntrain/(ntrain-pL-1))*MSEL\n", - "cat(\"adjusted MSE for the lasso flexible model: \", MSE.adjL, \"\\n\")" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-07-08T18:40:36.165581Z", - "iopub.execute_input": "2021-07-08T18:40:36.167647Z", - "iopub.status.idle": "2021-07-08T18:40:36.313986Z" - }, - "trusted": true, - "id": "cSuifUPiykay" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Output the table\n", - "table <- matrix(0, 3, 5)\n", - "table[1,1:5] <- c(p1,R2.1,MSE1,R2.adj1,MSE.adj1)\n", - "table[2,1:5] <- c(p2,R2.2,MSE2,R2.adj2,MSE.adj2)\n", - "table[3,1:5] <- c(pL,R2.L,MSEL,R2.adjL,MSE.adjL)\n", - "colnames(table)<- c(\"p\",\"$R^2_{sample}$\",\"$MSE_{sample}$\",\"$R^2_{adjusted}$\", \"$MSE_{adjusted}$\")\n", - "rownames(table)<- c(\"basic reg\",\"flexible reg\", \"lasso flex\")\n", - "tab<- xtable(table, digits =c(0,0,2,2,2,2))\n", - "print(tab,type=\"latex\")\n", - "tab" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-07-08T18:45:38.193447Z", - "iopub.execute_input": "2021-07-08T18:45:38.195503Z", - "iopub.status.idle": "2021-07-08T18:45:38.239301Z" - }, - "trusted": true, - "id": "lo8UazUiykay" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Considering the measures above, the flexible model performs slightly better than the basic model.\n", - "\n", - "As $p/n$ is not large, the discrepancy between the adjusted and unadjusted measures is not large. However, if it were, we might still like to apply **data splitting** as a more general procedure to deal with potential overfitting if $p/n$. We illustrate the approach in the following." - ], - "metadata": { - "id": "vHTewaFjykaz" - } - }, - { - "cell_type": "markdown", - "source": [ - "## Data Splitting: Out-of-sample performance\n", - "\n", - "Now that we have seen in-sample fit, we evaluate our models on the out-of-sample performance:\n", - "- Use the testing sample for evaluation. 
Predict the $\\mathtt{wage}$ of every observation in the testing sample based on the estimated parameters in the training sample.\n", - "- Calculate the Mean Squared Prediction Error $MSE_{test}$ based on the testing sample for both prediction models.\n" - ], - "metadata": { - "id": "YL1sJ2Rrykaz" - } - }, - { - "cell_type": "code", - "source": [ - "# basic model\n", - "options(warn=-1) # ignore warnings\n", - "regbasic <- lm(basic, data=train)\n", - "\n", - "# calculating the out-of-sample MSE\n", - "yhat.bas <- predict(regbasic, newdata=test)\n", - "y.test <- test$lwage\n", - "mean.train = mean(train$lwage)\n", - "MSE.test1 <- sum((y.test-yhat.bas)^2)/length(y.test)\n", - "R2.test1<- 1- MSE.test1/mean((y.test-mean.train)^2)\n", - "\n", - "cat(\"Test MSE for the basic model: \", MSE.test1, \" \")\n", - "cat(\"Test R2 for the basic model: \", R2.test1)" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-07-08T19:09:30.936688Z", - "iopub.execute_input": "2021-07-08T19:09:30.937619Z", - "iopub.status.idle": "2021-07-08T19:09:30.994861Z" - }, - "trusted": true, - "id": "kwdlZV-iykaz" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "In the basic model, the $MSE_{test}$ is quite close to the $MSE_{sample}$." - ], - "metadata": { - "id": "IUaPTo3Byka0" - } - }, - { - "cell_type": "code", - "source": [ - "# flexible model\n", - "options(warn=-1) # ignore warnings\n", - "regflex <- lm(flex, data=train)\n", - "\n", - "# calculating the out-of-sample MSE\n", - "yhat.flex<- predict(regflex, newdata=test)\n", - "y.test <- test$lwage\n", - "mean.train = mean(train$lwage)\n", - "MSE.test2 <- sum((y.test-yhat.flex)^2)/length(y.test)\n", - "R2.test2<- 1- MSE.test2/mean((y.test-mean.train)^2)\n", - "\n", - "cat(\"Test MSE for the flexible model: \", MSE.test2, \" \")\n", - "\n", - "cat(\"Test R2 for the flexible model: \", R2.test2)" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-07-08T19:09:33.397254Z", - "iopub.execute_input": "2021-07-08T19:09:33.398926Z", - "iopub.status.idle": "2021-07-08T19:09:33.540486Z" - }, - "trusted": true, - "id": "U2lQCgJeyka0" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "In the flexible model too, the discrepancy between the $MSE_{test}$ and the $MSE_{sample}$ is not large." - ], - "metadata": { - "id": "F-1niaXxyka0" - } - }, - { - "cell_type": "markdown", - "source": [ - "It is worth noticing that the $MSE_{test}$ varies across different data splits. Hence, it is a good idea to average the out-of-sample MSE over different data splits to get valid results.\n", - "\n", - "Nevertheless, we observe that, based on the out-of-sample $MSE$, the basic model using ols regression performs **about as well (or slightly better)** than the flexible model.\n", - "\n", - "Next, let us use lasso regression in the flexible model instead of ols regression. The out-of-sample $MSE$ on the test sample can be computed for any black-box prediction method, so we also compare the performance of lasso regression in the flexible model to ols regression." 
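Because the out-of-sample MSE and R^2 only require a vector of test-set predictions, the same evaluation code works for OLS, Lasso, or any other black-box predictor. A minimal sketch of that evaluation step, assuming the `train`/`test` split and the `basic` formula defined earlier in this notebook (the helper `oos_metrics` is introduced here purely for illustration):

```r
# helper introduced here for illustration (not part of the original notebook)
oos_metrics <- function(yhat, y_test, y_train) {
  mse <- mean((y_test - yhat)^2)                     # out-of-sample MSE
  r2  <- 1 - mse / mean((y_test - mean(y_train))^2)  # out-of-sample R^2, benchmarked against the training mean
  c(MSE = mse, R2 = r2)
}

# usage: evaluate the basic OLS model on the held-out test sample
yhat_basic <- predict(lm(basic, data = train), newdata = test)
oos_metrics(yhat_basic, test$lwage, train$lwage)
```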
- ], - "metadata": { - "id": "J4RRkCzHyka0" - } - }, - { - "cell_type": "code", - "source": [ - "# Flexible model using Lasso\n", - "# model matrix should be formed before train/test as some levels dropped\n", - "flex_data = model.matrix(flex,data)\n", - "train_flex <- flex_data[random,]\n", - "test_flex <- flex_data[-random,]\n", - "\n", - "fit.lasso.cv <- cv.glmnet(train_flex, train$lwage, family=\"gaussian\", alpha=1, nfolds=5)\n", - "yhat.lasso.cv <- predict(fit.lasso.cv, newx = test_flex, s = \"lambda.min\")\n", - "\n", - "# calculating the out-of-sample MSE\n", - "MSE.lasso <- sum((y.test-yhat.lasso.cv)^2)/length(y.test)\n", - "R2.lasso<- 1- MSE.lasso/mean((y.test-mean(train$lwage))^2)\n", - "\n", - "cat(\"Test MSE for the lasso on flexible model: \", MSE.lasso, \" \")\n", - "\n", - "cat(\"Test R2 for the lasso flexible model: \", R2.lasso)" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-07-08T19:09:35.618543Z", - "iopub.execute_input": "2021-07-08T19:09:35.619332Z", - "iopub.status.idle": "2021-07-08T19:09:37.560353Z" - }, - "trusted": true, - "id": "5xFTH78Kyka1" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Finally, let us summarize the results:" - ], - "metadata": { - "id": "c_dnTXxryka1" - } - }, - { - "cell_type": "code", - "source": [ - "# Output the comparison table\n", - "table2 <- matrix(0, 3,2)\n", - "table2[1,1] <- MSE.test1\n", - "table2[2,1] <- MSE.test2\n", - "table2[3,1] <- MSE.lasso\n", - "table2[1,2] <- R2.test1\n", - "table2[2,2] <- R2.test2\n", - "table2[3,2] <- R2.lasso\n", - "\n", - "rownames(table2)<- c(\"basic reg\",\"flexible reg\",\"lasso regression\")\n", - "colnames(table2)<- c(\"$MSE_{test}$\", \"$R^2_{test}$\")\n", - "tab2 <- xtable(table2, digits =3)\n", - "tab2" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-07-08T19:09:39.761011Z", - "iopub.execute_input": "2021-07-08T19:09:39.762739Z", - "iopub.status.idle": "2021-07-08T19:09:39.795945Z" - }, - "trusted": true, - "id": "PS3YeUT_yka2" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "print(tab2,type=\"latex\")" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-07-08T19:09:42.229018Z", - "iopub.execute_input": "2021-07-08T19:09:42.230825Z", - "iopub.status.idle": "2021-07-08T19:09:42.245937Z" - }, - "trusted": true, - "id": "kFuPPNytyka2" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Extra flexible model and Overfitting\n", - "Given the results above, it is not immediately clear why one would choose to use Lasso as results are fairly similar. 
To motivate, we consider an extra flexible model to show how OLS can overfit significantly to the in-sample train data and perform poorly on the out-of-sample testing data.\n", - "\n" - ], - "metadata": { - "id": "vLFGDU2lnHTM" - } - }, - { - "cell_type": "code", - "source": [ - "# extra flexible model\n", - "extraflex <- lwage ~ sex + (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)^2\n", - "regextra <- lm(extraflex, data=train)\n", - "sumextra <- summary(regextra)\n", - "cat(\"Number of Extra-Flex Controls\", length(regextra$coef)-1, \"\\n\")\n", - "n= length(data$wage); p =length(regextra$coef);\n", - "ntrain = length(train$wage)" - ], - "metadata": { - "id": "G_Mm2gG3nwMn" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "## In-sample\n", - "# R-squared and adjusted R-squared\n", - "R2.extra <- sumextra$r.squared\n", - "cat(\"R-squared for the extra flexible model (in-sample): \", R2.extra, \"\\n\")\n", - "R2.adjextra <- sumextra$adj.r.squared\n", - "cat(\"adjusted R-squared for the extra flexible model (in-sample): \", R2.adjextra, \"\\n\")\n", - "\n", - "# MSE and adjusted MSE\n", - "MSE.extra <- mean(sumextra$res^2)\n", - "cat(\"MSE for the extra flexible model (in-sample): \", MSE.extra, \"\\n\")\n", - "MSE.adjextra <- (ntrain/(ntrain-p))*MSE.extra\n", - "cat(\"adjusted MSE for the basic model (in-sample): \", MSE.adj1, \"\\n\")" - ], - "metadata": { - "id": "OXSNAxauoJ3h" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "## Out-of-sample\n", - "yhat.ex <- predict(regextra, newdata=test)\n", - "y.test.ex <- test$lwage\n", - "MSE.test.ex <- sum((y.test.ex-yhat.ex)^2)/length(y.test.ex)\n", - "R2.test.ex<- 1- MSE.test.ex/mean((y.test.ex-mean(train$lwage))^2)\n", - "\n", - "cat(\"Test MSE for the basic model: \", MSE.test.ex, \" \")\n", - "cat(\"Test R2 for the basic model: \", R2.test.ex)" - ], - "metadata": { - "id": "vFrRnnlmo9yG" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "As we can see, a simple OLS overfits when the dimensionality of covariates is high, as the out-of-sample performance suffers dramatically in comparison to the in-sample performance.\n", - "\n", - "Contrast this with Lasso:" - ], - "metadata": { - "id": "Bat-VbZ-pL5R" - } - }, - { - "cell_type": "code", - "source": [ - "# model matrix should be formed before train/test as some levels dropped\n", - "flex_data = model.matrix(extraflex,data)\n", - "train_flex <- flex_data[random,]\n", - "test_flex <- flex_data[-random,]\n", - "\n", - "# fit model\n", - "fit.lcv <- cv.glmnet(train_flex, train$lwage, family=\"gaussian\", alpha=1, nfolds=5)\n", - "\n", - "# in-sample\n", - "yhat.lcv <- predict(fit.lcv, newx = train_flex, s = \"lambda.min\")\n", - "\n", - "R2.L <- 1-(sum((yhat.lcv - train$lwage) ^ 2))/(sum((train$lwage - mean(train$lwage)) ^ 2)) # fit.lasso.cv$glmnet.fit$dev.ratio #\n", - "pL <- fit.lcv$nzero[fit.lcv$index[1]]\n", - "R2.adjL <- 1-(sum((yhat.lcv - train$lwage) ^ 2)/(ntrain-pL-1))/(sum((train$lwage - mean(train$lwage)) ^ 2)/(ntrain-1))\n", - "\n", - "lasso.res <- train$lwage - yhat.lcv\n", - "MSEL <-mean(lasso.res^2)\n", - "MSE.adjL <- (ntrain/(ntrain-pL-1))*MSEL\n", - "\n", - "cat(\"R-squared for the lasso with the extra flexible model (in-sample): \", R2.L, \"\\n\")\n", - "cat(\"adjusted R-squared for the extra flexible model (in-sample): \", R2.adjL, \"\\n\")\n", - "cat(\"MSE for the lasso with the extra flexible model (in-sample): \", MSEL, 
\"\\n\")\n", - "cat(\"adjusted MSE for the lasso with the extraflexible model (in-sample): \", MSE.adjL, \"\\n\")\n", - "\n", - "# out-of-sample\n", - "yhat.lcv.test <- predict(fit.lcv, newx = test_flex, s = \"lambda.min\")\n", - "MSE.lasso <- sum((test$lwage-yhat.lcv.test)^2)/length(test$lwage)\n", - "R2.lasso <- 1- MSE.lasso/mean((test$lwage-mean(train$lwage))^2)\n", - "\n", - "cat(\"\\n\")\n", - "cat(\"Test R2 for the lasso the extra flexible model: \", R2.lasso,\"\\n\")\n", - "cat(\"Test MSE for the lasso on the extra flexible model: \", MSE.lasso)\n" - ], - "metadata": { - "id": "TYvDJ3QepUgl" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "As shown above, the overfitting effect is mitigated with the penalized regression model." - ], - "metadata": { - "id": "oIdTLf8Uq41n" - } - } - ] -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "-kWzJajkykal" + }, + "source": [ + "## Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bWDpKmCRykam" + }, + "source": [ + "An important question in labor economics is what determines the wage of workers. This is a causal question, but we can begin to investigate it from a predictive perspective.\n", + "\n", + "In the following wage example, $Y$ is the (log) hourly wage of a worker and $X$ is a vector of worker's characteristics, e.g., education, experience, gender. Two main questions here are:\n", + "\n", + "* How can we use job-relevant characteristics, such as education and experience, to best predict wages?\n", + "\n", + "* What is the difference in predicted wages between men and women with the same job-relevant characteristics?\n", + "\n", + "In this lab, we focus on the prediction question first." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uPoP-4dXykan" + }, + "source": [ + "## Data\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_kNNsU5Kykan" + }, + "source": [ + "The data set we consider is from the 2015 March Supplement of the U.S. Current Population Survey. We select white non-hispanic individuals, aged 25 to 64 years, and working more than 35 hours per week for at least 50 weeks of the year. We exclude self-employed workers; individuals living in group quarters; individuals in the military, agricultural or private household sectors; individuals with inconsistent reports on earnings and employment status; individuals with allocated or missing information in any of the variables used in the analysis; and individuals with hourly wage below $3$.\n", + "\n", + "The variable of interest $Y$ is the (log) hourly wage rate constructed as the ratio of the annual earnings to the total number of hours worked, which is constructed in turn as the product of number of weeks worked and the usual number of hours worked per week. In our analysis, we also focus on single (never married) workers. The final sample is of size $n=5150$." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_zAzi7q14V1f" + }, + "outputs": [], + "source": [ + "install.packages(\"xtable\")\n", + "install.packages(\"hdm\") # a library for high-dimensional metrics\n", + "install.packages(\"glmnet\") # for lasso CV\n", + "\n", + "library(hdm)\n", + "library(xtable)\n", + "library(glmnet)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ecuLVkuhykao" + }, + "source": [ + "## Data analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bnjk8mxBykao" + }, + "source": [ + "We start by loading the data set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eQ7eG0JTykao" + }, + "outputs": [], + "source": [ + "file = \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv\"\n", + "data <- read.csv(file)\n", + "dim(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pdVofJa0ykas" + }, + "source": [ + "Let's have a look at the structure of the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "T1JH0uaXykat" + }, + "outputs": [], + "source": [ + "str(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Hin9wYrpykau" + }, + "source": [ + "We construct the output variable $Y$ and the matrix $Z$ which includes the characteristics of workers that are given in the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "b8SdPks-ykau" + }, + "outputs": [], + "source": [ + "# construct matrices for estimation from the data\n", + "Y <- log(data$wage)\n", + "n <- length(Y)\n", + "Z <- data[-which(colnames(data) %in% c(\"wage\",\"lwage\"))]\n", + "p <- dim(Z)[2]\n", + "\n", + "cat(\"Number of observations:\", n, '\\n')\n", + "cat( \"Number of raw regressors:\", p)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FvqKO5rYykav" + }, + "source": [ + "For the outcome variable *wage* and a subset of the raw regressors, we calculate the empirical mean to get familiar with the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mLttnFeKykav" + }, + "outputs": [], + "source": [ + "# generate a table of means of variables\n", + "Z_subset <- data[which(colnames(data) %in% c(\"lwage\",\"sex\",\"shs\",\"hsg\",\"scl\",\"clg\",\"ad\",\"mw\",\"so\",\"we\",\"ne\",\"exp1\"))]\n", + "table <- matrix(0, 12, 1)\n", + "table[1:12,1] <- as.numeric(lapply(Z_subset,mean))\n", + "rownames(table) <- c(\"Log Wage\",\"Sex\",\"Some High School\",\"High School Graduate\",\"Some College\",\"College Graduate\", \"Advanced Degree\",\"Midwest\",\"South\",\"West\",\"Northeast\",\"Experience\")\n", + "colnames(table) <- c(\"Sample mean\")\n", + "tab<- xtable(table, digits = 2)\n", + "tab" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ivClVVsCykaw" + }, + "source": [ + "E.g., the share of female workers in our sample is ~44% ($sex=1$ if female)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "p7tfUT9gykaw" + }, + "source": [ + "Alternatively, using the xtable package, we can also print the table in LaTeX." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wIojwGx4ykaw" + }, + "outputs": [], + "source": [ + "print(tab, type=\"latex\") # type=\"latex\" for printing table in LaTeX" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rTNep10kykaw" + }, + "source": [ + "## Prediction Question" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VAMsBil7ykaw" + }, + "source": [ + "Now, we will construct a prediction rule for (log) hourly wage $Y$, which depends linearly on job-relevant characteristics $X$:\n", + "\n", + "\\begin{equation}\\label{decompose}\n", + "Y = \\beta'X+ \\epsilon.\n", + "\\end{equation}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IwWMP87Cykax" + }, + "source": [ + "Our goals are\n", + "\n", + "* Predict wages using various characteristics of workers.\n", + "\n", + "* Assess the predictive performance of a given model using the (adjusted) sample MSE, the (adjusted) sample $R^2$ and the out-of-sample MSE and $R^2$.\n", + "\n", + "\n", + "Toward answering the latter, we measure the prediction quality of the two models via data splitting:\n", + "\n", + "- Randomly split the data into one training sample and one testing sample. Here we just use a simple method (stratified splitting is a more sophisticated version of splitting that we might consider).\n", + "- Use the training sample to estimate the parameters of the Basic Model and the Flexible Model.\n", + "- Before using the testing sample, we evaluate in-sample fit.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cArtOhBOWaNs" + }, + "outputs": [], + "source": [ + "# splitting the data\n", + "set.seed(1) # to make the results replicable (we will generate random numbers)\n", + "random <- sample(1:n, floor(n*4/5)) # draw (4/5)*n random numbers from 1 to n without replacing\n", + "train <- data[random,]\n", + "test <- data[-random,]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o65RUVCzSV9d" + }, + "source": [ + "\n", + "We employ two different specifications for prediction:\n", + "\n", + "\n", + "1. Basic Model: $X$ consists of a set of raw regressors (e.g. gender, experience, education indicators, occupation and industry indicators and regional indicators).\n", + "\n", + "\n", + "2. Flexible Model: $X$ consists of all raw regressors from the basic model plus a dictionary of transformations (e.g., ${exp}^2$ and ${exp}^3$) and additional two-way interactions of a polynomial in experience with other regressors. An example of a regressor created through a two-way interaction is *experience* times the indicator of having a *college degree*.\n", + "\n", + "Using the **Flexible Model** enables us to approximate the real relationship by a more complex regression model and therefore to reduce the bias. The **Flexible Model** increases the range of potential shapes of the estimated regression function. In general, flexible models often deliver higher prediction accuracy but are harder to interpret." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o5zcXyBGg6ch" + }, + "source": [ + "## Data-Splitting: In-sample performance" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fPxlwhL9ykax" + }, + "source": [ + "Let us fit both models to our data by running ordinary least squares (ols):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BhaBGtEBykax" + }, + "outputs": [], + "source": [ + "# 1. 
basic model\n", + "basic <- lwage~ (sex + exp1 + shs + hsg+ scl + clg + mw + so + we +C(occ2) + C(ind2))\n", + "regbasic <- lm(basic, data=train) # perform ols using the defined model\n", + "cat( \"Number of regressors in the basic model:\",length(regbasic$coef), '\\n') # number of regressors in the Basic Model\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u0vJhtj8ykax" + }, + "source": [ + "##### Note that the basic model consists of $51$ regressors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y74aI4bhykax" + }, + "outputs": [], + "source": [ + "# 2. flexible model\n", + "flex <- lwage ~ sex + shs+hsg+scl+clg+mw+so+we+C(occ2)+C(ind2) + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)\n", + "regflex <- lm(flex, data=train)\n", + "cat( \"Number of regressors in the flexible model:\",length(regflex$coef)) # number of regressors in the Flexible Model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zAVUFc0Mykax" + }, + "source": [ + "##### Note that the flexible model consists of $246$ regressors." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WxT3rBbzykax" + }, + "source": [ + "#### Re-estimating the flexible model using Lasso\n", + "We re-estimate the flexible model using Lasso (the least absolute shrinkage and selection operator) rather than ols. Lasso is a penalized regression method that can be used to reduce the complexity of a regression model when the ratio $p/n$ is not small. We will introduce this approach formally later in the course, but for now, we try it out here as a black-box method. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx21IQPrykay" + }, + "outputs": [], + "source": [ + "# Flexible model using Lasso, in-sample fit\n", + "train_flex <- model.matrix(flex,train) # all regressors\n", + "fit.lasso.cv <- cv.glmnet(as.matrix(train_flex), train$lwage, family=\"gaussian\", alpha=1, nfolds=5)\n", + "yhat.lasso.cv <- predict(fit.lasso.cv, newx = as.matrix(train_flex), s = \"lambda.min\") # in-sample fit right now, not out-of-sample using \"test\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oJhnBR2Fykay" + }, + "source": [ + "#### Evaluating the predictive performance of the basic and flexible models in-sample\n", + "Now, we can evaluate the performance of both models based on the (adjusted) $R^2_{sample}$ and the (adjusted) $MSE_{sample}$:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cSuifUPiykay" + }, + "outputs": [], + "source": [ + "# Assess predictive performance\n", + "sumbasic <- summary(regbasic)\n", + "sumflex <- summary(regflex)\n", + "# no summary() for lassocv\n", + "\n", + "ntrain = nrow(train)\n", + "\n", + "# R-squared and adjusted R-squared\n", + "R2.1 <- sumbasic$r.squared\n", + "cat(\"R-squared for the basic model: \", R2.1, \"\\n\")\n", + "R2.adj1 <- sumbasic$adj.r.squared\n", + "cat(\"adjusted R-squared for the basic model: \", R2.adj1, \"\\n\")\n", + "\n", + "R2.2 <- sumflex$r.squared\n", + "cat(\"R-squared for the flexible model: \", R2.2, \"\\n\")\n", + "R2.adj2 <- sumflex$adj.r.squared\n", + "cat(\"adjusted R-squared for the flexible model: \", R2.adj2, \"\\n\")\n", + "\n", + "pL <- fit.lasso.cv$nzero[fit.lasso.cv$index[1]]\n", + "R2.L <- 1-(sum((yhat.lasso.cv - train$lwage) ^ 2))/(sum((train$lwage - mean(train$lwage)) ^ 2)) # fit.lasso.cv$glmnet.fit$dev.ratio #\n", + "cat(\"R-squared for the lasso with flexible model: \", R2.L, 
\"\\n\")\n", + "R2.adjL <- 1-(sum((yhat.lasso.cv - train$lwage) ^ 2)/(ntrain-pL-1))/(sum((train$lwage - mean(train$lwage)) ^ 2)/(ntrain-1))\n", + "cat(\"adjusted R-squared for the lasso with flexible model: \", R2.adjL, \"\\n\")\n", + "\n", + "# MSE and adjusted MSE\n", + "MSE1 <- mean(sumbasic$res^2)\n", + "cat(\"MSE for the basic model: \", MSE1, \"\\n\")\n", + "p1 <- sumbasic$df[1] # number of regressors\n", + "MSE.adj1 <- (ntrain/(ntrain-p1))*MSE1\n", + "cat(\"adjusted MSE for the basic model: \", MSE.adj1, \"\\n\")\n", + "\n", + "MSE2 <-mean(sumflex$res^2)\n", + "cat(\"MSE for the flexible model: \", MSE2, \"\\n\")\n", + "p2 <- sumflex$df[1]\n", + "MSE.adj2 <- (ntrain/(ntrain-p2))*MSE2\n", + "cat(\"adjusted MSE for the flexible model: \", MSE.adj2, \"\\n\")\n", + "\n", + "lasso.res <- train$lwage - yhat.lasso.cv\n", + "MSEL <-mean(lasso.res^2)\n", + "cat(\"MSE for the lasso flexible model: \", MSEL, \"\\n\")\n", + "MSE.adjL <- (ntrain/(ntrain-pL-1))*MSEL\n", + "cat(\"adjusted MSE for the lasso flexible model: \", MSE.adjL, \"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lo8UazUiykay" + }, + "outputs": [], + "source": [ + "# Output the table\n", + "table <- matrix(0, 3, 5)\n", + "table[1,1:5] <- c(p1,R2.1,MSE1,R2.adj1,MSE.adj1)\n", + "table[2,1:5] <- c(p2,R2.2,MSE2,R2.adj2,MSE.adj2)\n", + "table[3,1:5] <- c(pL,R2.L,MSEL,R2.adjL,MSE.adjL)\n", + "colnames(table)<- c(\"p\",\"$R^2_{sample}$\",\"$MSE_{sample}$\",\"$R^2_{adjusted}$\", \"$MSE_{adjusted}$\")\n", + "rownames(table)<- c(\"basic reg\",\"flexible reg\", \"lasso flex\")\n", + "tab<- xtable(table, digits =c(0,0,2,2,2,2))\n", + "print(tab,type=\"latex\")\n", + "tab" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vHTewaFjykaz" + }, + "source": [ + "Considering the measures above, the flexible model performs slightly better than the basic model.\n", + "\n", + "As $p/n$ is not large, the discrepancy between the adjusted and unadjusted measures is not large. However, if it were, we might still like to apply **data splitting** as a more general procedure to deal with potential overfitting if $p/n$ is large. We illustrate the approach in the following." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YL1sJ2Rrykaz" + }, + "source": [ + "## Data Splitting: Out-of-sample performance\n", + "\n", + "Now that we have seen in-sample fit, we evaluate our models on the out-of-sample performance:\n", + "- Use the testing sample for evaluation. 
Predict the $\\mathtt{wage}$ of every observation in the testing sample based on the estimated parameters in the training sample.\n", + "- Calculate the Mean Squared Prediction Error $MSE_{test}$ based on the testing sample for both prediction models.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kwdlZV-iykaz" + }, + "outputs": [], + "source": [ + "# basic model\n", + "options(warn=-1) # ignore warnings\n", + "regbasic <- lm(basic, data=train)\n", + "\n", + "# calculating the out-of-sample MSE\n", + "yhat.bas <- predict(regbasic, newdata=test)\n", + "y.test <- test$lwage\n", + "mean.train = mean(train$lwage)\n", + "MSE.test1 <- sum((y.test-yhat.bas)^2)/length(y.test)\n", + "R2.test1<- 1- MSE.test1/mean((y.test-mean.train)^2)\n", + "\n", + "cat(\"Test MSE for the basic model: \", MSE.test1, \" \")\n", + "cat(\"Test R2 for the basic model: \", R2.test1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IUaPTo3Byka0" + }, + "source": [ + "In the basic model, the $MSE_{test}$ is quite close to the $MSE_{sample}$." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "U2lQCgJeyka0" + }, + "outputs": [], + "source": [ + "# flexible model\n", + "options(warn=-1) # ignore warnings\n", + "regflex <- lm(flex, data=train)\n", + "\n", + "# calculating the out-of-sample MSE\n", + "yhat.flex<- predict(regflex, newdata=test)\n", + "y.test <- test$lwage\n", + "mean.train = mean(train$lwage)\n", + "MSE.test2 <- sum((y.test-yhat.flex)^2)/length(y.test)\n", + "R2.test2<- 1- MSE.test2/mean((y.test-mean.train)^2)\n", + "\n", + "cat(\"Test MSE for the flexible model: \", MSE.test2, \" \")\n", + "\n", + "cat(\"Test R2 for the flexible model: \", R2.test2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F-1niaXxyka0" + }, + "source": [ + "In the flexible model too, the discrepancy between the $MSE_{test}$ and the $MSE_{sample}$ is not large." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "J4RRkCzHyka0" + }, + "source": [ + "It is worth noticing that the $MSE_{test}$ varies across different data splits. Hence, it is a good idea to average the out-of-sample MSE over different data splits to get valid results.\n", + "\n", + "Nevertheless, we observe that, based on the out-of-sample $MSE$, the basic model using ols regression performs **about as well (or slightly better)** than the flexible model.\n", + "\n", + "Next, let us use lasso regression in the flexible model instead of ols regression. The out-of-sample $MSE$ on the test sample can be computed for any black-box prediction method, so we also compare the performance of lasso regression in the flexible model to ols regression." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5xFTH78Kyka1" + }, + "outputs": [], + "source": [ + "# Flexible model using Lasso\n", + "# model matrix should be formed before train/test as some levels dropped\n", + "flex_data = model.matrix(flex,data)\n", + "train_flex <- flex_data[random,]\n", + "test_flex <- flex_data[-random,]\n", + "\n", + "fit.lasso.cv <- cv.glmnet(train_flex, train$lwage, family=\"gaussian\", alpha=1, nfolds=5)\n", + "yhat.lasso.cv <- predict(fit.lasso.cv, newx = test_flex, s = \"lambda.min\")\n", + "\n", + "# calculating the out-of-sample MSE\n", + "MSE.lasso <- sum((y.test-yhat.lasso.cv)^2)/length(y.test)\n", + "R2.lasso<- 1- MSE.lasso/mean((y.test-mean(train$lwage))^2)\n", + "\n", + "cat(\"Test MSE for the lasso on flexible model: \", MSE.lasso, \" \")\n", + "\n", + "cat(\"Test R2 for the lasso flexible model: \", R2.lasso)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c_dnTXxryka1" + }, + "source": [ + "Finally, let us summarize the results:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PS3YeUT_yka2" + }, + "outputs": [], + "source": [ + "# Output the comparison table\n", + "table2 <- matrix(0, 3,2)\n", + "table2[1,1] <- MSE.test1\n", + "table2[2,1] <- MSE.test2\n", + "table2[3,1] <- MSE.lasso\n", + "table2[1,2] <- R2.test1\n", + "table2[2,2] <- R2.test2\n", + "table2[3,2] <- R2.lasso\n", + "\n", + "rownames(table2)<- c(\"basic reg\",\"flexible reg\",\"lasso regression\")\n", + "colnames(table2)<- c(\"$MSE_{test}$\", \"$R^2_{test}$\")\n", + "tab2 <- xtable(table2, digits =3)\n", + "tab2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kFuPPNytyka2" + }, + "outputs": [], + "source": [ + "print(tab2,type=\"latex\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vLFGDU2lnHTM" + }, + "source": [ + "## Extra flexible model and Overfitting\n", + "Given the results above, it is not immediately clear why one would choose to use Lasso as results are fairly similar. 
To motivate, we consider an extra flexible model to show how OLS can overfit significantly to the in-sample train data and perform poorly on the out-of-sample testing data.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "G_Mm2gG3nwMn" + }, + "outputs": [], + "source": [ + "# extra flexible model\n", + "extraflex <- lwage ~ sex + (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)^2\n", + "regextra <- lm(extraflex, data=train)\n", + "sumextra <- summary(regextra)\n", + "cat(\"Number of Extra-Flex Controls\", length(regextra$coef)-1, \"\\n\")\n", + "n= length(data$wage); p =length(regextra$coef);\n", + "ntrain = length(train$wage)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OXSNAxauoJ3h" + }, + "outputs": [], + "source": [ + "## In-sample\n", + "# R-squared and adjusted R-squared\n", + "R2.extra <- sumextra$r.squared\n", + "cat(\"R-squared for the extra flexible model (in-sample): \", R2.extra, \"\\n\")\n", + "R2.adjextra <- sumextra$adj.r.squared\n", + "cat(\"adjusted R-squared for the extra flexible model (in-sample): \", R2.adjextra, \"\\n\")\n", + "\n", + "# MSE and adjusted MSE\n", + "MSE.extra <- mean(sumextra$res^2)\n", + "cat(\"MSE for the extra flexible model (in-sample): \", MSE.extra, \"\\n\")\n", + "MSE.adjextra <- (ntrain/(ntrain-p))*MSE.extra\n", + "cat(\"adjusted MSE for the extra flexible model (in-sample): \", MSE.adjextra, \"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vFrRnnlmo9yG" + }, + "outputs": [], + "source": [ + "## Out-of-sample\n", + "yhat.ex <- predict(regextra, newdata=test)\n", + "y.test.ex <- test$lwage\n", + "MSE.test.ex <- sum((y.test.ex-yhat.ex)^2)/length(y.test.ex)\n", + "R2.test.ex<- 1- MSE.test.ex/mean((y.test.ex-mean(train$lwage))^2)\n", + "\n", + "cat(\"Test MSE for the extra flexible model: \", MSE.test.ex, \" \")\n", + "cat(\"Test R2 for the extra flexible model: \", R2.test.ex)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Bat-VbZ-pL5R" + }, + "source": [ + "As we can see, a simple OLS overfits when the dimensionality of covariates is high, as the out-of-sample performance suffers dramatically in comparison to the in-sample performance.\n", + "\n", + "Contrast this with Lasso:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TYvDJ3QepUgl" + }, + "outputs": [], + "source": [ + "# model matrix should be formed before train/test as some levels dropped\n", + "flex_data = model.matrix(extraflex,data)\n", + "train_flex <- flex_data[random,]\n", + "test_flex <- flex_data[-random,]\n", + "\n", + "# fit model\n", + "fit.lcv <- cv.glmnet(train_flex, train$lwage, family=\"gaussian\", alpha=1, nfolds=5)\n", + "\n", + "# in-sample\n", + "yhat.lcv <- predict(fit.lcv, newx = train_flex, s = \"lambda.min\")\n", + "\n", + "R2.L <- 1-(sum((yhat.lcv - train$lwage) ^ 2))/(sum((train$lwage - mean(train$lwage)) ^ 2)) # fit.lasso.cv$glmnet.fit$dev.ratio #\n", + "pL <- fit.lcv$nzero[fit.lcv$index[1]]\n", + "R2.adjL <- 1-(sum((yhat.lcv - train$lwage) ^ 2)/(ntrain-pL-1))/(sum((train$lwage - mean(train$lwage)) ^ 2)/(ntrain-1))\n", + "\n", + "lasso.res <- train$lwage - yhat.lcv\n", + "MSEL <-mean(lasso.res^2)\n", + "MSE.adjL <- (ntrain/(ntrain-pL-1))*MSEL\n", + "\n", + "cat(\"R-squared for the lasso with the extra flexible model (in-sample): \", R2.L, \"\\n\")\n", + "cat(\"adjusted R-squared for the lasso with the extra flexible model (in-sample): \", R2.adjL, \"\\n\")\n", + "cat(\"MSE for the lasso with the 
extra flexible model (in-sample): \", MSEL, \"\\n\")\n", + "cat(\"adjusted MSE for the lasso with the extraflexible model (in-sample): \", MSE.adjL, \"\\n\")\n", + "\n", + "# out-of-sample\n", + "yhat.lcv.test <- predict(fit.lcv, newx = test_flex, s = \"lambda.min\")\n", + "MSE.lasso <- sum((test$lwage-yhat.lcv.test)^2)/length(test$lwage)\n", + "R2.lasso <- 1- MSE.lasso/mean((test$lwage-mean(train$lwage))^2)\n", + "\n", + "cat(\"\\n\")\n", + "cat(\"Test R2 for the lasso the extra flexible model: \", R2.lasso,\"\\n\")\n", + "cat(\"Test MSE for the lasso on the extra flexible model: \", MSE.lasso)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oIdTLf8Uq41n" + }, + "source": [ + "As shown above, the overfitting effect is mitigated with the penalized regression model." + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 9d03a6245dd3056378953973db3d6cfcc5e075b7 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 00:38:53 -0700 Subject: [PATCH 010/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 3898c7c6..67f8d407 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -49,14 +49,23 @@ jobs: ' - name: Lint .Rmd files + id: lint run: | R -e ' library(lintr) linters <- with_defaults(line_length_linter = line_length_linter(120)) rmd_files <- list.files(path = "PM1", pattern = "\\.Rmd$", full.names = TRUE) - lapply(rmd_files, function(file) { + results <- lapply(rmd_files, function(file) { lint(file, linters) }) + warnings <- unlist(lapply(results, function(res) { + res[grep("^Warning:", res$message)] + })) + if (length(warnings) > 0) { + cat("Warnings found during linting:\n") + cat(warnings, sep="\n") + stop("Linting failed with warnings") + } ' - name: Zip .R files From 3f2dce9fe0c804f5ab6099ed0364bf6f8a7805c4 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 00:56:59 -0700 Subject: [PATCH 011/261] Update r-linear-model-overfitting.irnb --- PM1/r-linear-model-overfitting.irnb | 42 ++++++++++++++--------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/PM1/r-linear-model-overfitting.irnb b/PM1/r-linear-model-overfitting.irnb index dbf5adc2..e5d691a3 100644 --- a/PM1/r-linear-model-overfitting.irnb +++ b/PM1/r-linear-model-overfitting.irnb @@ -30,18 +30,18 @@ "source": [ "\n", "set.seed(123)\n", - "n = 1000\n", + "n <- 1000\n", "\n", - "p = n\n", - "X<- matrix(rnorm(n*p), n, p)\n", - "Y<- rnorm(n)\n", + "p <- n\n", + "x <- matrix(rnorm(n * p), n, p)\n", + "y <- rnorm(n)\n", "\n", "print(\"p/n is\")\n", - "print(p/n)\n", + "print(p / n)\n", "print(\"R2 is\")\n", - "print(summary(lm(Y~X))$r.squared)\n", + "print(summary(lm(y ~ x))$r.squared)\n", "print(\"Adjusted R2 is\")\n", - "print(summary(lm(Y~X))$adj.r.squared)\n" + "print(summary(lm(y ~ x))$adj.r.squared)\n" ] }, { @@ -63,18 +63,18 @@ "source": [ "\n", "set.seed(123)\n", - "n = 1000\n", + "n <- 1000\n", "\n", - "p = n/2\n", - "X<- matrix(rnorm(n*p), n, p)\n", - "Y<- rnorm(n)\n", + "p <- n/2\n", + "x <- matrix(rnorm(n * p), 
n, p)\n", + "y <- rnorm(n)\n", "\n", "print(\"p/n is\")\n", - "print(p/n)\n", + "print(p / n)\n", "print(\"R2 is\")\n", - "print(summary(lm(Y~X))$r.squared)\n", + "print(summary(lm(y ~ x))$r.squared)\n", "print(\"Adjusted R2 is\")\n", - "print(summary(lm(Y~X))$adj.r.squared)\n" + "print(summary(lm(y ~ x))$adj.r.squared)\n" ] }, { @@ -96,18 +96,18 @@ "source": [ "\n", "set.seed(123)\n", - "n = 1000\n", + "n <- 1000\n", "\n", - "p = .05*n\n", - "X<- matrix(rnorm(n*p), n, p)\n", - "Y<- rnorm(n)\n", + "p <- .05*n\n", + "x <- matrix(rnorm(n*p), n, p)\n", + "y <- rnorm(n)\n", "\n", "print(\"p/n is\")\n", - "print(p/n)\n", + "print(p / n)\n", "print(\"R2 is\")\n", - "print(summary(lm(Y~X))$r.squared)\n", + "print(summary(lm(y ~ x))$r.squared)\n", "print(\"Adjusted R2 is\")\n", - "print(summary(lm(Y~X))$adj.r.squared)\n", + "print(summary(lm(y ~ x))$adj.r.squared)\n", "\n" ] } From 6f76400db8701fb3d68e0b92120d5087656caac1 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 00:58:53 -0700 Subject: [PATCH 012/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 67f8d407..2ead5c16 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -56,16 +56,13 @@ jobs: linters <- with_defaults(line_length_linter = line_length_linter(120)) rmd_files <- list.files(path = "PM1", pattern = "\\.Rmd$", full.names = TRUE) results <- lapply(rmd_files, function(file) { - lint(file, linters) + lints <- lint(file, linters) + if (length(lints) > 0) { + cat("Warnings found during linting:\n") + print(lints) + stop("Linting failed with warnings") + } }) - warnings <- unlist(lapply(results, function(res) { - res[grep("^Warning:", res$message)] - })) - if (length(warnings) > 0) { - cat("Warnings found during linting:\n") - cat(warnings, sep="\n") - stop("Linting failed with warnings") - } ' - name: Zip .R files @@ -80,6 +77,11 @@ jobs: name: r-scripts path: r_scripts.zip + - name: Delete .R files and zip + run: | + rm -rf r_scripts + rm r_scripts.zip + - name: Commit and push stripped .irnb and .Rmd files run: | git config --global user.name 'github-actions[bot]' From 2710c31bca206c876af61e7613319e588676fc9b Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 01:00:45 -0700 Subject: [PATCH 013/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 25 +++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 2ead5c16..e16acf97 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -3,6 +3,10 @@ name: Convert and Lint R Notebooks on: push +concurrency: + group: convert-lint-notebooks + cancel-in-progress: true + jobs: convert-lint-notebooks: runs-on: ubuntu-latest @@ -65,6 +69,25 @@ jobs: }) ' + - name: Execute R scripts + id: execute + run: | + R -e ' + files <- list.files(path = "PM1", pattern = "\\.R$", full.names = TRUE, recursive = FALSE) + for (file in files) { + tryCatch( + { + source(file) + }, + error = function(e) { + cat("Error found in file:", file, "\n") + cat("Error message:", e$message, "\n") + stop("Execution failed due to an error in ", file) + } + ) + } + ' + - name: Zip .R files run: | mkdir r_scripts @@ -87,7 +110,7 @@ jobs: git config --global 
user.name 'github-actions[bot]' git config --global user.email 'github-actions[bot]@users.noreply.github.com' git add PM1/*.irnb PM1/*.Rmd - git commit -m 'Strip outputs from .irnb, convert to .Rmd, and lint .Rmd files' + git commit -m 'Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files' git push env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From ecf773239c83a80a5dd39bd33e2f9ad498a061e9 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 01:06:47 -0700 Subject: [PATCH 014/261] Update r-linear-model-overfitting.irnb --- PM1/r-linear-model-overfitting.irnb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PM1/r-linear-model-overfitting.irnb b/PM1/r-linear-model-overfitting.irnb index e5d691a3..6d129e0d 100644 --- a/PM1/r-linear-model-overfitting.irnb +++ b/PM1/r-linear-model-overfitting.irnb @@ -65,7 +65,7 @@ "set.seed(123)\n", "n <- 1000\n", "\n", - "p <- n/2\n", + "p <- n / 2\n", "x <- matrix(rnorm(n * p), n, p)\n", "y <- rnorm(n)\n", "\n", From 8fe17b25ab29ee48b9c3deb2a10142f725c84bf1 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 01:08:28 -0700 Subject: [PATCH 015/261] Update r-linear-model-overfitting.irnb --- PM1/r-linear-model-overfitting.irnb | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/PM1/r-linear-model-overfitting.irnb b/PM1/r-linear-model-overfitting.irnb index 6d129e0d..e02072f0 100644 --- a/PM1/r-linear-model-overfitting.irnb +++ b/PM1/r-linear-model-overfitting.irnb @@ -98,8 +98,8 @@ "set.seed(123)\n", "n <- 1000\n", "\n", - "p <- .05*n\n", - "x <- matrix(rnorm(n*p), n, p)\n", + "p <- .05 * n\n", + "x <- matrix(rnorm(n * p), n, p)\n", "y <- rnorm(n)\n", "\n", "print(\"p/n is\")\n", @@ -107,8 +107,7 @@ "print(\"R2 is\")\n", "print(summary(lm(y ~ x))$r.squared)\n", "print(\"Adjusted R2 is\")\n", - "print(summary(lm(y ~ x))$adj.r.squared)\n", - "\n" + "print(summary(lm(y ~ x))$adj.r.squared)\n" ] } ], From 21f5a3b861414903ce61958017cbc746efa45528 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 01:39:39 -0700 Subject: [PATCH 016/261] Update r-ols-and-lasso-for-wage-gap-inference.irnb --- ...-ols-and-lasso-for-wage-gap-inference.irnb | 188 ++++++++---------- 1 file changed, 88 insertions(+), 100 deletions(-) diff --git a/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb b/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb index 7084e181..91a2b00a 100644 --- a/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb +++ b/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb @@ -92,11 +92,7 @@ }, "outputs": [], "source": [ - "# load(\"../input/wage2015-inference/wage2015_subsample_inference.Rdata\")\n", - "# attach(data)\n", - "# dim(data)\n", - "\n", - "file = \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv\"\n", + "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv\"\n", "data <- read.csv(file)\n", "dim(data)" ] @@ -123,21 +119,21 @@ }, "outputs": [], "source": [ - "Z <- data[which(colnames(data) %in% c(\"lwage\",\"sex\",\"shs\",\"hsg\",\"scl\",\"clg\",\"ad\",\"ne\",\"mw\",\"so\",\"we\",\"exp1\"))]\n", + "z <- data[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", \"hsg\", \"scl\", \"clg\", \"ad\", \"ne\", \"mw\", \"so\", \"we\", \"exp1\"))]\n", "\n", - "data_female <- data[data$sex==1,]\n", - "Z_female <- data_female[which(colnames(data) %in% c(\"lwage\",\"sex\",\"shs\",\"hsg\",\"scl\",\"clg\",\"ad\",\"ne\",\"mw\",\"so\",\"we\",\"exp1\"))]\n", 
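+    "# split the sample by sex to compare female and male sample means\n",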
+ "data_female <- data[data$sex == 1, ]\n", + "z_female <- data_female[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", \"hsg\", \"scl\", \"clg\", \"ad\", \"ne\", \"mw\", \"so\", \"we\", \"exp1\"))]\n", "\n", - "data_male <- data[data$sex==0,]\n", - "Z_male <- data_male[which(colnames(data) %in% c(\"lwage\",\"sex\",\"shs\",\"hsg\",\"scl\",\"clg\",\"ad\",\"ne\",\"mw\",\"so\",\"we\",\"exp1\"))]\n", + "data_male <- data[data$sex == 0, ]\n", + "z_male <- data_male[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", \"hsg\", \"scl\", \"clg\", \"ad\", \"ne\", \"mw\", \"so\", \"we\", \"exp1\"))]\n", "\n", "table <- matrix(0, 12, 3)\n", - "table[1:12,1] <- as.numeric(lapply(Z,mean))\n", - "table[1:12,2] <- as.numeric(lapply(Z_male,mean))\n", - "table[1:12,3] <- as.numeric(lapply(Z_female,mean))\n", - "rownames(table) <- c(\"Log Wage\",\"Sex\",\"Less then High School\",\"High School Graduate\",\"Some College\",\"College Graduate\",\"Advanced Degree\", \"Northeast\",\"Midwest\",\"South\",\"West\",\"Experience\")\n", - "colnames(table) <- c(\"All\",\"Men\",\"Women\")\n", - "tab<- xtable(table, digits = 4)\n", + "table[1:12,1] <- as.numeric(lapply(z, mean))\n", + "table[1:12,2] <- as.numeric(lapply(z_male, mean))\n", + "table[1:12,3] <- as.numeric(lapply(z_female, mean))\n", + "rownames(table) <- c(\"Log Wage\", \"Sex\", \"Less then High School\", \"High School Graduate\", \"Some College\", \"College Graduate\", \"Advanced Degree\", \"Northeast\", \"Midwest\", \"South\", \"West\", \"Experience\")\n", + "colnames(table) <- c(\"All\", \"Men\", \"Women\")\n", + "tab <- xtable(table, digits=4)\n", "tab" ] }, @@ -250,8 +246,8 @@ "outputs": [], "source": [ "nocontrol.fit <- lm(lwage ~ sex, data=data)\n", - "nocontrol.est <- summary(nocontrol.fit)$coef[\"sex\",1]\n", - "HCV.coefs <- vcovHC(nocontrol.fit, type = 'HC3'); # HC - \"heteroskedasticity cosistent\" -- HC3 is the SE that remains consistent in high dimensions\n", + "nocontrol.est <- summary(nocontrol.fit)$coef[\"sex\", 1]\n", + "HCV.coefs <- vcovHC(nocontrol.fit, type='HC3'); # HC - \"heteroskedasticity cosistent\" -- HC3 is the SE that remains consistent in high dimensions\n", "nocontrol.se <- sqrt(diag(HCV.coefs))[2] # Estimated std errors\n", "\n", "# print unconditional effect of gender and the corresponding standard error\n", @@ -306,7 +302,7 @@ "source": [ "# ols regression with controls\n", "\n", - "flex <- lwage ~ sex + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)\n", + "flex <- lwage ~ sex + (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", "\n", "# Note that ()*() operation in formula objects in R creates a formula of the sort:\n", "# (exp1+exp2+exp3+exp4)+ (shs+hsg+scl+clg+occ2+ind2+mw+so+we) + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we)\n", @@ -346,20 +342,20 @@ "metadata": {}, "outputs": [], "source": [ - "XX0 = model.matrix(~(exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we), data = data[data$sex==0,])\n", - "y0 = data[data$sex==0,]$lwage\n", - "XX1 = model.matrix(~(exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we), data = data[data$sex==1,])\n", - "y1 = data[data$sex==1,]$lwage\n", - "mu1 = colMeans(XX1)\n", - "mu0 = colMeans(XX0)\n", - "betarest = summary(control.fit)$coef[3:(ncol(XX0)+1),1] # the coefficients excluding intercept and \"sex\"\n", + "xx0 <- model.matrix(~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we), data=data[data$sex == 0, ])\n", + "y0 <- data[data$sex == 0, 
]$lwage\n", + "xx1 <- model.matrix(~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we), data=data[data$sex == 1, ])\n", + "y1 <- data[data$sex == 1, ]$lwage\n", + "mu1 <- colMeans(xx1)\n", + "mu0 <- colMeans(xx0)\n", + "betarest <- summary(control.fit)$coef[3:(ncol(xx0) + 1), 1] # the coefficients excluding intercept and \"sex\"\n", "\n", - "cat(\"The marginal gap:\",mean(data_female$lwage)-mean(data_male$lwage),\"\\n\")\n", - "diff.unexplained = control.est\n", - "cat(\"The unexplained difference: \",diff.unexplained,\"\\n\")\n", - "diff.explained = sum(betarest*(mu1[2:ncol(XX0)]-mu0[2:ncol(XX0)]))\n", - "cat(\"The explained difference:\",diff.explained,\"\\n\")\n", - "cat(\"The sum of these differences:\",diff.unexplained + diff.explained,\"\\n\")" + "cat(\"The marginal gap:\", mean(data_female$lwage) - mean(data_male$lwage), \"\\n\")\n", + "diff.unexplained <- control.est\n", + "cat(\"The unexplained difference: \", diff.unexplained, \"\\n\")\n", + "diff.explained <- sum(betarest * (mu1[2:ncol(xx0)] - mu0[2:ncol(xx0)]))\n", + "cat(\"The explained difference:\", diff.explained, \"\\n\")\n", + "cat(\"The sum of these differences:\", diff.unexplained + diff.explained, \"\\n\")" ] }, { @@ -375,20 +371,20 @@ "metadata": {}, "outputs": [], "source": [ - "svd0=svd(XX0)\n", - "svd1=svd(XX1)\n", - "svd0$d[svd0$d<=1e-10]=0\n", - "svd0$d[svd0$d>1e-10]=1/svd0$d[svd0$d>1e-10]\n", - "beta0 = (svd0$v %*% (svd0$d*svd0$d*t(svd0$v))) %*% t(XX0) %*% y0\n", - "svd1$d[svd1$d<=1e-10]=0\n", - "svd1$d[svd1$d>1e-10]=1/svd1$d[svd1$d>1e-10]\n", - "beta1 = (svd1$v %*% (svd1$d*svd1$d*t(svd1$v))) %*% t(XX1) %*% y1\n", + "svd0 <- svd(xx0)\n", + "svd1 <- svd(xx1)\n", + "svd0$d[svd0$d <= 1e-10] = 0\n", + "svd0$d[svd0$d > 1e-10] = 1 / svd0$d[svd0$d > 1e-10]\n", + "beta0 <- (svd0$v %*% (svd0$d * svd0$d * t(svd0$v))) %*% t(xx0) %*% y0\n", + "svd1$d[svd1$d <= 1e-10] <- 0\n", + "svd1$d[svd1$d > 1e-10] <- 1 / svd1$d[svd1$d > 1e-10]\n", + "beta1 <- (svd1$v %*% (svd1$d * svd1$d * t(svd1$v))) %*% t(xx1) %*% y1\n", "\n", - "cat(\"The marginal gap:\",mean(data_female$lwage)-mean(data_male$lwage),\"\\n\")\n", - "cat(\"The unexplained difference:\",beta1[1]-beta0[1],\"\\n\")\n", - "cat(\"The difference explained by endowment:\",sum(beta0[2:ncol(XX0)]*(mu1[2:ncol(XX0)]-mu0[2:ncol(XX0)])),\"\\n\")\n", - "cat(\"The difference explained by coefficient:\",sum((beta1[2:ncol(XX0)]-beta0[2:ncol(XX0)])*mu1[2:ncol(XX0)]),\"\\n\")\n", - "cat(\"The sum of these differences:\",beta1[1]-beta0[1] + sum(beta0[2:ncol(XX0)]*(mu1[2:ncol(XX0)]-mu0[2:ncol(XX0)])) + sum((beta1[2:ncol(XX0)]-beta0[2:ncol(XX0)])*mu1[2:ncol(XX0)]),\"\\n\")" + "cat(\"The marginal gap:\", mean(data_female$lwage) - mean(data_male$lwage), \"\\n\")\n", + "cat(\"The unexplained difference:\", beta1[1] - beta0[1], \"\\n\")\n", + "cat(\"The difference explained by endowment:\", sum(beta0[2:ncol(xx0)] * (mu1[2:ncol(xx0)] - mu0[2:ncol(xx0)])), \"\\n\")\n", + "cat(\"The difference explained by coefficient:\", sum((beta1[2:ncol(xx0)] - beta0[2:ncol(xx0)]) * mu1[2:ncol(xx0)]), \"\\n\")\n", + "cat(\"The sum of these differences:\", beta1[1] - beta0[1] + sum(beta0[2:ncol(xx0)] * (mu1[2:ncol(xx0)] - mu0[2:ncol(xx0)])) + sum((beta1[2:ncol(xx0)] - beta0[2:ncol(xx0)]) * mu1[2:ncol(xx0)]), \"\\n\")" ] }, { @@ -416,8 +412,8 @@ "# Partialling-out using ols\n", "\n", "# models\n", - "flex.y <- lwage ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we) # model for Y\n", - "flex.d <- sex ~ 
(exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we) # model for D\n", + "flex.y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) # model for Y\n", + "flex.d <- sex ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) # model for D\n", "\n", "# partialling-out the linear effect of W from Y\n", "t.Y <- lm(flex.y, data=data)$res\n", @@ -425,17 +421,17 @@ "t.D <- lm(flex.d, data=data)$res\n", "\n", "# regression of Y on D after partialling-out the effect of W\n", - "partial.fit <- lm(t.Y~t.D)\n", - "partial.est <- summary(partial.fit)$coef[2,1]\n", + "partial.fit <- lm(t.Y ~ t.D)\n", + "partial.est <- summary(partial.fit)$coef[2, 1]\n", "\n", "cat(\"Coefficient for D via partialling-out\", partial.est)\n", "\n", "# standard error\n", - "HCV.coefs <- vcovHC(partial.fit, type = 'HC3')\n", + "HCV.coefs <- vcovHC(partial.fit, type='HC3')\n", "partial.se <- sqrt(diag(HCV.coefs))[2]\n", "\n", "# confidence interval\n", - "confint(partial.fit)[2,]" + "confint(partial.fit)[2, ]" ] }, { @@ -475,8 +471,8 @@ "# Partialling-out using lasso\n", "\n", "# models\n", - "flex.y <- lwage ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we) # model for Y\n", - "flex.d <- sex ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we) # model for D\n", + "flex.y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) # model for Y\n", + "flex.d <- sex ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) # model for D\n", "\n", "# partialling-out the linear effect of W from Y\n", "t.Y <- rlasso(flex.y, data=data)$res\n", @@ -484,13 +480,13 @@ "t.D <- rlasso(flex.d, data=data)$res\n", "\n", "# regression of Y on D after partialling-out the effect of W\n", - "partial.lasso.fit <- lm(t.Y~t.D)\n", - "partial.lasso.est <- summary(partial.lasso.fit)$coef[2,1]\n", + "partial.lasso.fit <- lm(t.Y ~ t.D)\n", + "partial.lasso.est <- summary(partial.lasso.fit)$coef[2, 1]\n", "\n", "cat(\"Coefficient for D via partialling-out using lasso\", partial.lasso.est)\n", "\n", "# standard error\n", - "HCV.coefs <- vcovHC(partial.lasso.fit, type = 'HC3')\n", + "HCV.coefs <- vcovHC(partial.lasso.fit, type='HC3')\n", "partial.lasso.se <- sqrt(diag(HCV.coefs))[2]" ] }, @@ -525,17 +521,17 @@ }, "outputs": [], "source": [ - "table<- matrix(0, 4, 2)\n", - "table[1,1]<- nocontrol.est\n", - "table[1,2]<- nocontrol.se\n", - "table[2,1]<- control.est\n", - "table[2,2]<- control.se\n", - "table[3,1]<- partial.est\n", - "table[3,2]<- partial.se\n", - "table[4,1]<- partial.lasso.est\n", - "table[4,2]<- partial.lasso.se\n", - "colnames(table)<- c(\"Estimate\",\"Std. Error\")\n", - "rownames(table)<- c(\"Without controls\", \"full reg\", \"partial reg\", \"partial reg via lasso\")\n", + "table <- matrix(0, 4, 2)\n", + "table[1, 1] <- nocontrol.est\n", + "table[1, 2] <- nocontrol.se\n", + "table[2, 1] <- control.est\n", + "table[2, 2] <- control.se\n", + "table[3, 1] <- partial.est\n", + "table[3, 2] <- partial.se\n", + "table[4, 1] <- partial.lasso.est\n", + "table[4, 2] <- partial.lasso.se\n", + "colnames(table) <- c(\"Estimate\", \"Std. 
Error\")\n", + "rownames(table) <- c(\"Without controls\", \"full reg\", \"partial reg\", \"partial reg via lasso\")\n", "tab<- xtable(table, digits=c(3, 3, 4))\n", "tab" ] @@ -651,19 +647,20 @@ "outputs": [], "source": [ "# extra flexible model\n", - "extraflex <- lwage ~ sex + (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)^2\n", + "extraflex <- lwage ~ sex + (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)^2\n", "\n", "control.fit <- lm(extraflex, data=subset)\n", "#summary(control.fit)\n", - "control.est <- summary(control.fit)$coef[2,1]\n", - "cat(\"Number of Extra-Flex Controls\", length(control.fit$coef)-1, \"\\n\")\n", + "control.est <- summary(control.fit)$coef[2, 1]\n", + "cat(\"Number of Extra-Flex Controls\", length(control.fit$coef) - 1, \"\\n\")\n", "cat(\"Coefficient for OLS with extra flex controls\", control.est)\n", "\n", "\n", - "n=subset_size; p=length(control.fit$coef);\n", + "n <- subset_size\n", + "p <- length(control.fit$coef)\n", "\n", "# HC0 SE\n", - "HCV.coefs_HC0 <- vcovHC(control.fit, type = 'HC0')\n", + "HCV.coefs_HC0 <- vcovHC(control.fit, type='HC0')\n", "control.se.HC0 <- sqrt(diag(HCV.coefs_HC0))[2]\n", "\n", "# For a more correct approach, we\n", @@ -673,11 +670,11 @@ "# really work here.\n", "# HC3 SE\n", "# estimates\n", - "coefs = hatvalues(control.fit)\n", - "trim = 0.99999999999\n", - "coefs_trimmed = coefs*(coefs < trim) + trim*(coefs >= trim)\n", - "omega = (control.fit$residuals^2)/((1-coefs_trimmed)^2)\n", - "HCV.coefs <- vcovHC(control.fit, omega = as.vector(omega), type = 'HC3')\n", + "coefs <- hatvalues(control.fit)\n", + "trim <- 0.99999999999\n", + "coefs_trimmed <- coefs*(coefs < trim) + trim * (coefs >= trim)\n", + "omega <- (control.fit$residuals^2) / ((1 - coefs_trimmed)^2)\n", + "HCV.coefs <- vcovHC(control.fit, omega=as.vector(omega), type='HC3')\n", "control.se.HC3 <- sqrt(diag(HCV.coefs))[2]" ] }, @@ -694,8 +691,8 @@ "outputs": [], "source": [ "# models\n", - "extraflex.y <- lwage ~ (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+C(occ2)+C(ind2))^2 # model for Y\n", - "extraflex.d <- sex ~ (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+C(occ2)+C(ind2))^2 # model for D\n", + "extraflex.y <- lwage ~ (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2))^2 # model for Y\n", + "extraflex.d <- sex ~ (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2))^2 # model for D\n", "\n", "# partialling-out the linear effect of W from Y\n", "t.Y <- rlasso(extraflex.y, data=subset)$res\n", @@ -703,13 +700,13 @@ "t.D <- rlasso(extraflex.d, data=subset)$res\n", "\n", "# regression of Y on D after partialling-out the effect of W\n", - "partial.lasso.fit <- lm(t.Y~t.D)\n", - "partial.lasso.est <- summary(partial.lasso.fit)$coef[2,1]\n", + "partial.lasso.fit <- lm(t.Y ~ t.D)\n", + "partial.lasso.est <- summary(partial.lasso.fit)$coef[2, 1]\n", "\n", "cat(\"Coefficient for D via partialling-out using lasso\", partial.lasso.est)\n", "\n", "# standard error\n", - "HCV.coefs <- vcovHC(partial.lasso.fit, type = 'HC3')\n", + "HCV.coefs <- vcovHC(partial.lasso.fit, type='HC3')\n", "partial.lasso.se <- sqrt(diag(HCV.coefs))[2]" ] }, @@ -726,15 +723,15 @@ }, "outputs": [], "source": [ - "table<- matrix(0, 3, 2)\n", - "table[1,1]<- control.est\n", - "table[1,2]<- control.se.HC0\n", - "table[2,1]<- control.est\n", - "table[2,2]<- control.se.HC3\n", - "table[3,1]<- partial.lasso.est\n", - "table[3,2]<- partial.lasso.se\n", - "colnames(table)<- c(\"Estimate\",\"Std. 
Error\")\n", - "rownames(table)<- c(\"full reg, HC0\", \"full reg, HC3\", \"partial reg via lasso\")\n", + "table <- matrix(0, 3, 2)\n", + "table[1, 1] <- control.est\n", + "table[1, 2] <- control.se.HC0\n", + "table[2, 1] <- control.est\n", + "table[2, 2] <- control.se.HC3\n", + "table[3, 1] <- partial.lasso.est\n", + "table[3, 2] <- partial.lasso.se\n", + "colnames(table) <- c(\"Estimate\", \"Std. Error\")\n", + "rownames(table) <- c(\"full reg, HC0\", \"full reg, HC3\", \"partial reg via lasso\")\n", "tab<- xtable(table, digits=c(3, 3, 4))\n", "tab\n", "\n", @@ -753,15 +750,6 @@ "have guarantees in p/n< 1 regime under assumptions laid out in Cattaneo, Newey, and Jansson (2018), without approximate\n", "sparsity, although other regularity conditions are needed.\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ERTT0mdfdPrh" - }, - "outputs": [], - "source": [] } ], "metadata": { From c833e8018122ebe41e2258ffeb54a6d7f65ed30d Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 06:47:00 -0700 Subject: [PATCH 017/261] Update r-ols-and-lasso-for-wage-gap-inference.irnb --- ...-ols-and-lasso-for-wage-gap-inference.irnb | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb b/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb index 91a2b00a..b9ce081a 100644 --- a/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb +++ b/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb @@ -128,12 +128,12 @@ "z_male <- data_male[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", \"hsg\", \"scl\", \"clg\", \"ad\", \"ne\", \"mw\", \"so\", \"we\", \"exp1\"))]\n", "\n", "table <- matrix(0, 12, 3)\n", - "table[1:12,1] <- as.numeric(lapply(z, mean))\n", - "table[1:12,2] <- as.numeric(lapply(z_male, mean))\n", - "table[1:12,3] <- as.numeric(lapply(z_female, mean))\n", + "table[1:12, 1] <- as.numeric(lapply(z, mean))\n", + "table[1:12, 2] <- as.numeric(lapply(z_male, mean))\n", + "table[1:12, 3] <- as.numeric(lapply(z_female, mean))\n", "rownames(table) <- c(\"Log Wage\", \"Sex\", \"Less then High School\", \"High School Graduate\", \"Some College\", \"College Graduate\", \"Advanced Degree\", \"Northeast\", \"Midwest\", \"South\", \"West\", \"Experience\")\n", "colnames(table) <- c(\"All\", \"Men\", \"Women\")\n", - "tab <- xtable(table, digits=4)\n", + "tab <- xtable(table, digits = 4)\n", "tab" ] }, @@ -149,7 +149,7 @@ }, "outputs": [], "source": [ - "print(tab,type=\"html\") # set type=\"latex\" for printing table in LaTeX" + "print(tab, type = \"html\") # set type=\"latex\" for printing table in LaTeX" ] }, { @@ -245,9 +245,9 @@ }, "outputs": [], "source": [ - "nocontrol.fit <- lm(lwage ~ sex, data=data)\n", + "nocontrol.fit <- lm(lwage ~ sex, data = data)\n", "nocontrol.est <- summary(nocontrol.fit)$coef[\"sex\", 1]\n", - "HCV.coefs <- vcovHC(nocontrol.fit, type='HC3'); # HC - \"heteroskedasticity cosistent\" -- HC3 is the SE that remains consistent in high dimensions\n", + "HCV.coefs <- vcovHC(nocontrol.fit, type = 'HC3'); # HC - \"heteroskedasticity cosistent\" -- HC3 is the SE that remains consistent in high dimensions\n", "nocontrol.se <- sqrt(diag(HCV.coefs))[2] # Estimated std errors\n", "\n", "# print unconditional effect of gender and the corresponding standard error\n", @@ -308,7 +308,7 @@ "# (exp1+exp2+exp3+exp4)+ (shs+hsg+scl+clg+occ2+ind2+mw+so+we) + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we)\n", "# This is not intuitive at all, but that's what it does.\n", 
"\n", - "control.fit <- lm(flex, data=data)\n", + "control.fit <- lm(flex, data = data)\n", "control.est <- summary(control.fit)$coef[2,1]\n", "\n", "summary(control.fit)\n", @@ -416,9 +416,9 @@ "flex.d <- sex ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) # model for D\n", "\n", "# partialling-out the linear effect of W from Y\n", - "t.Y <- lm(flex.y, data=data)$res\n", + "t.Y <- lm(flex.y, data = data)$res\n", "# partialling-out the linear effect of W from D\n", - "t.D <- lm(flex.d, data=data)$res\n", + "t.D <- lm(flex.d, data = data)$res\n", "\n", "# regression of Y on D after partialling-out the effect of W\n", "partial.fit <- lm(t.Y ~ t.D)\n", @@ -427,7 +427,7 @@ "cat(\"Coefficient for D via partialling-out\", partial.est)\n", "\n", "# standard error\n", - "HCV.coefs <- vcovHC(partial.fit, type='HC3')\n", + "HCV.coefs <- vcovHC(partial.fit, type = 'HC3')\n", "partial.se <- sqrt(diag(HCV.coefs))[2]\n", "\n", "# confidence interval\n", @@ -486,7 +486,7 @@ "cat(\"Coefficient for D via partialling-out using lasso\", partial.lasso.est)\n", "\n", "# standard error\n", - "HCV.coefs <- vcovHC(partial.lasso.fit, type='HC3')\n", + "HCV.coefs <- vcovHC(partial.lasso.fit, type = 'HC3')\n", "partial.lasso.se <- sqrt(diag(HCV.coefs))[2]" ] }, @@ -532,7 +532,7 @@ "table[4, 2] <- partial.lasso.se\n", "colnames(table) <- c(\"Estimate\", \"Std. Error\")\n", "rownames(table) <- c(\"Without controls\", \"full reg\", \"partial reg\", \"partial reg via lasso\")\n", - "tab<- xtable(table, digits=c(3, 3, 4))\n", + "tab<- xtable(table, digits = c(3, 3, 4))\n", "tab" ] }, @@ -649,7 +649,7 @@ "# extra flexible model\n", "extraflex <- lwage ~ sex + (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)^2\n", "\n", - "control.fit <- lm(extraflex, data=subset)\n", + "control.fit <- lm(extraflex, data = subset)\n", "#summary(control.fit)\n", "control.est <- summary(control.fit)$coef[2, 1]\n", "cat(\"Number of Extra-Flex Controls\", length(control.fit$coef) - 1, \"\\n\")\n", @@ -660,7 +660,7 @@ "p <- length(control.fit$coef)\n", "\n", "# HC0 SE\n", - "HCV.coefs_HC0 <- vcovHC(control.fit, type='HC0')\n", + "HCV.coefs_HC0 <- vcovHC(control.fit, type = 'HC0')\n", "control.se.HC0 <- sqrt(diag(HCV.coefs_HC0))[2]\n", "\n", "# For a more correct approach, we\n", @@ -674,7 +674,7 @@ "trim <- 0.99999999999\n", "coefs_trimmed <- coefs*(coefs < trim) + trim * (coefs >= trim)\n", "omega <- (control.fit$residuals^2) / ((1 - coefs_trimmed)^2)\n", - "HCV.coefs <- vcovHC(control.fit, omega=as.vector(omega), type='HC3')\n", + "HCV.coefs <- vcovHC(control.fit, omega = as.vector(omega), type = 'HC3')\n", "control.se.HC3 <- sqrt(diag(HCV.coefs))[2]" ] }, @@ -706,7 +706,7 @@ "cat(\"Coefficient for D via partialling-out using lasso\", partial.lasso.est)\n", "\n", "# standard error\n", - "HCV.coefs <- vcovHC(partial.lasso.fit, type='HC3')\n", + "HCV.coefs <- vcovHC(partial.lasso.fit, type = 'HC3')\n", "partial.lasso.se <- sqrt(diag(HCV.coefs))[2]" ] }, From db0b8de0a7a622bd9b8ecaef318abab9e385d061 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 10:05:28 -0700 Subject: [PATCH 018/261] Update r-ols-and-lasso-for-wage-gap-inference.irnb --- ...-ols-and-lasso-for-wage-gap-inference.irnb | 168 ++++++++++++------ 1 file changed, 115 insertions(+), 53 deletions(-) diff --git a/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb b/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb index b9ce081a..149bd59d 100644 --- 
a/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb +++ b/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb @@ -47,7 +47,10 @@ "base_uri": "https://localhost:8080/" }, "id": "XuSVp1TShFKs", - "outputId": "6e38357e-eef8-44bf-c11b-d4e4aa4139a8" + "outputId": "6e38357e-eef8-44bf-c11b-d4e4aa4139a8", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -55,10 +58,9 @@ "install.packages(\"hdm\") # a library for high-dimensional metrics\n", "install.packages(\"sandwich\") # a package used to compute robust standard errors\n", "\n", - "\n", "library(hdm)\n", "library(xtable)\n", - "library(sandwich)\n" + "library(sandwich)" ] }, { @@ -88,7 +90,10 @@ "height": 34 }, "id": "T46lur9zyorw", - "outputId": "bad9c980-6655-4027-f9dd-b07a2216ab0a" + "outputId": "bad9c980-6655-4027-f9dd-b07a2216ab0a", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -115,23 +120,29 @@ "height": 474 }, "id": "hsx7vuc2yor3", - "outputId": "2f3378b9-4534-40c0-98e4-36a4b135f1e9" + "outputId": "2f3378b9-4534-40c0-98e4-36a4b135f1e9", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "z <- data[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", \"hsg\", \"scl\", \"clg\", \"ad\", \"ne\", \"mw\", \"so\", \"we\", \"exp1\"))]\n", "\n", "data_female <- data[data$sex == 1, ]\n", - "z_female <- data_female[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", \"hsg\", \"scl\", \"clg\", \"ad\", \"ne\", \"mw\", \"so\", \"we\", \"exp1\"))]\n", + "z_female <- data_female[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", \"hsg\", \"scl\", \n", + " \"clg\", \"ad\", \"ne\", \"mw\", \"so\", \"we\", \"exp1\"))]\n", "\n", "data_male <- data[data$sex == 0, ]\n", - "z_male <- data_male[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", \"hsg\", \"scl\", \"clg\", \"ad\", \"ne\", \"mw\", \"so\", \"we\", \"exp1\"))]\n", + "z_male <- data_male[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", \"hsg\", \"scl\", \"clg\", \"ad\",\n", + " \"ne\", \"mw\", \"so\", \"we\", \"exp1\"))]\n", "\n", "table <- matrix(0, 12, 3)\n", "table[1:12, 1] <- as.numeric(lapply(z, mean))\n", "table[1:12, 2] <- as.numeric(lapply(z_male, mean))\n", "table[1:12, 3] <- as.numeric(lapply(z_female, mean))\n", - "rownames(table) <- c(\"Log Wage\", \"Sex\", \"Less then High School\", \"High School Graduate\", \"Some College\", \"College Graduate\", \"Advanced Degree\", \"Northeast\", \"Midwest\", \"South\", \"West\", \"Experience\")\n", + "rownames(table) <- c(\"Log Wage\", \"Sex\", \"Less then High School\", \"High School Graduate\", \"Some College\", \n", + " \"College Graduate\", \"Advanced Degree\", \"Northeast\", \"Midwest\", \"South\", \"West\", \"Experience\")\n", "colnames(table) <- c(\"All\", \"Men\", \"Women\")\n", "tab <- xtable(table, digits = 4)\n", "tab" @@ -145,7 +156,10 @@ "base_uri": "https://localhost:8080/" }, "id": "X81tdQRFyor4", - "outputId": "406ea0b4-4dda-4d81-d2a3-ef94505942c7" + "outputId": "406ea0b4-4dda-4d81-d2a3-ef94505942c7", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -195,11 +209,14 @@ "height": 34 }, "id": "r8B46bNgyor6", - "outputId": "b4d7a26d-fc67-4595-99d8-178ed1a872fe" + "outputId": "b4d7a26d-fc67-4595-99d8-178ed1a872fe", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "mean(data_female$lwage)-mean(data_male$lwage)" + "mean(data_female$lwage) - mean(data_male$lwage)" ] }, { @@ -241,17 +258,22 @@ "base_uri": "https://localhost:8080/" }, "id": "2kGIBjpYyor8", - "outputId": "bface8f9-2135-43bb-9e54-7ab2d5641b16" + 
"outputId": "bface8f9-2135-43bb-9e54-7ab2d5641b16", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "nocontrol.fit <- lm(lwage ~ sex, data = data)\n", "nocontrol.est <- summary(nocontrol.fit)$coef[\"sex\", 1]\n", - "HCV.coefs <- vcovHC(nocontrol.fit, type = 'HC3'); # HC - \"heteroskedasticity cosistent\" -- HC3 is the SE that remains consistent in high dimensions\n", + "# HC - \"heteroskedasticity cosistent\" -- HC3 is the SE that remains consistent in high dimensions\n", + "HCV.coefs <- vcovHC(nocontrol.fit, type = 'HC3')\n", "nocontrol.se <- sqrt(diag(HCV.coefs))[2] # Estimated std errors\n", "\n", "# print unconditional effect of gender and the corresponding standard error\n", - "cat (\"The estimated coefficient on the dummy for gender is\",nocontrol.est,\" and the corresponding robust standard error is\",nocontrol.se)\n" + "cat (\"The estimated coefficient on the dummy for gender is\", nocontrol.est,\n", + " \" and the corresponding robust standard error is\", nocontrol.se)\n" ] }, { @@ -296,7 +318,10 @@ "height": 1000 }, "id": "gemX0ZyTyosC", - "outputId": "e5255cb1-5bed-491b-98b9-f95c39aa13e6" + "outputId": "e5255cb1-5bed-491b-98b9-f95c39aa13e6", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -304,12 +329,13 @@ "\n", "flex <- lwage ~ sex + (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", "\n", - "# Note that ()*() operation in formula objects in R creates a formula of the sort:\n", - "# (exp1+exp2+exp3+exp4)+ (shs+hsg+scl+clg+occ2+ind2+mw+so+we) + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we)\n", - "# This is not intuitive at all, but that's what it does.\n", + "# Note that ()*() operation in formula objects in R creates a formula of the sort:\n", + "# (exp1+exp2+exp3+exp4) + (shs+hsg+scl+clg+occ2+ind2+mw+so+we) \n", + "# + (exp1+exp2+exp3+exp4) * (shs+hsg+scl+clg+occ2+ind2+mw+so+we)\n", + "# This is not intuitive at all, but that's what it does.\n", "\n", "control.fit <- lm(flex, data = data)\n", - "control.est <- summary(control.fit)$coef[2,1]\n", + "control.est <- summary(control.fit)$coef[2, 1]\n", "\n", "summary(control.fit)\n", "\n", @@ -339,12 +365,18 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ - "xx0 <- model.matrix(~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we), data=data[data$sex == 0, ])\n", + "xx0 <- model.matrix(~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we),\n", + " data=data[data$sex == 0, ])\n", "y0 <- data[data$sex == 0, ]$lwage\n", - "xx1 <- model.matrix(~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we), data=data[data$sex == 1, ])\n", + "xx1 <- model.matrix(~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we),\n", + " data=data[data$sex == 1, ])\n", "y1 <- data[data$sex == 1, ]$lwage\n", "mu1 <- colMeans(xx1)\n", "mu0 <- colMeans(xx0)\n", @@ -368,7 +400,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "svd0 <- svd(xx0)\n", @@ -384,7 +420,9 @@ "cat(\"The unexplained difference:\", beta1[1] - beta0[1], \"\\n\")\n", "cat(\"The difference explained by endowment:\", sum(beta0[2:ncol(xx0)] * (mu1[2:ncol(xx0)] - mu0[2:ncol(xx0)])), \"\\n\")\n", "cat(\"The difference explained by 
coefficient:\", sum((beta1[2:ncol(xx0)] - beta0[2:ncol(xx0)]) * mu1[2:ncol(xx0)]), \"\\n\")\n", - "cat(\"The sum of these differences:\", beta1[1] - beta0[1] + sum(beta0[2:ncol(xx0)] * (mu1[2:ncol(xx0)] - mu0[2:ncol(xx0)])) + sum((beta1[2:ncol(xx0)] - beta0[2:ncol(xx0)]) * mu1[2:ncol(xx0)]), \"\\n\")" + "cat(\"The sum of these differences:\",\n", + " (beta1[1] - beta0[1] + sum(beta0[2:ncol(xx0)] * (mu1[2:ncol(xx0)] - mu0[2:ncol(xx0)]))\n", + " + sum((beta1[2:ncol(xx0)] - beta0[2:ncol(xx0)]) * mu1[2:ncol(xx0)])), \"\\n\")" ] }, { @@ -405,15 +443,19 @@ "height": 52 }, "id": "iYSsI8ZEyosD", - "outputId": "a11b3ec7-f1ec-4488-9350-69a28481d342" + "outputId": "a11b3ec7-f1ec-4488-9350-69a28481d342", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Partialling-out using ols\n", "\n", - "# models\n", - "flex.y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) # model for Y\n", - "flex.d <- sex ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) # model for D\n", + "# model for Y\n", + "flex.y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", + "# model for D\n", + "flex.d <- sex ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", "\n", "# partialling-out the linear effect of W from Y\n", "t.Y <- lm(flex.y, data = data)$res\n", @@ -464,20 +506,24 @@ "base_uri": "https://localhost:8080/" }, "id": "Pd-5O1U8yosH", - "outputId": "ffc5adbe-b367-422c-9421-112485563b6e" + "outputId": "ffc5adbe-b367-422c-9421-112485563b6e", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Partialling-out using lasso\n", "\n", - "# models\n", - "flex.y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) # model for Y\n", - "flex.d <- sex ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) # model for D\n", + "# model for Y\n", + "flex.y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", + "# model for D\n", + "flex.d <- sex ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", "\n", "# partialling-out the linear effect of W from Y\n", - "t.Y <- rlasso(flex.y, data=data)$res\n", + "t.Y <- rlasso(flex.y, data = data)$res\n", "# partialling-out the linear effect of W from D\n", - "t.D <- rlasso(flex.d, data=data)$res\n", + "t.D <- rlasso(flex.d, data = data)$res\n", "\n", "# regression of Y on D after partialling-out the effect of W\n", "partial.lasso.fit <- lm(t.Y ~ t.D)\n", @@ -517,7 +563,10 @@ "height": 224 }, "id": "IpU4gNZayosI", - "outputId": "fdb05e51-0596-4804-bad2-50c729ddd0aa" + "outputId": "fdb05e51-0596-4804-bad2-50c729ddd0aa", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -544,11 +593,14 @@ "base_uri": "https://localhost:8080/" }, "id": "wGa29D7NyosJ", - "outputId": "36b56e8c-3853-48e6-8740-16db8a5eab9e" + "outputId": "36b56e8c-3853-48e6-8740-16db8a5eab9e", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "print(tab, type=\"html\")" + "print(tab, type = \"html\")" ] }, { @@ -602,14 +654,17 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "hjSPZpef1Mfc" + "id": "hjSPZpef1Mfc", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "set.seed(2724)\n", "subset_size <- 1000\n", "random <- sample(1:nrow(data), subset_size)\n", - "subset 
<- data[random,]" + "subset <- data[random, ]" ] }, { @@ -642,7 +697,10 @@ "base_uri": "https://localhost:8080/" }, "id": "vZ84pYQVyosL", - "outputId": "f73bb150-a229-43b5-e052-7ab8390cc851" + "outputId": "f73bb150-a229-43b5-e052-7ab8390cc851", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -650,7 +708,6 @@ "extraflex <- lwage ~ sex + (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)^2\n", "\n", "control.fit <- lm(extraflex, data = subset)\n", - "#summary(control.fit)\n", "control.est <- summary(control.fit)$coef[2, 1]\n", "cat(\"Number of Extra-Flex Controls\", length(control.fit$coef) - 1, \"\\n\")\n", "cat(\"Coefficient for OLS with extra flex controls\", control.est)\n", @@ -668,8 +725,6 @@ "\n", "# Jackknife. Need to trim some leverages or otherwise regularize. Theory shouldn't\n", "# really work here.\n", - "# HC3 SE\n", - "# estimates\n", "coefs <- hatvalues(control.fit)\n", "trim <- 0.99999999999\n", "coefs_trimmed <- coefs*(coefs < trim) + trim * (coefs >= trim)\n", @@ -686,18 +741,22 @@ "base_uri": "https://localhost:8080/" }, "id": "SfB4go24yosL", - "outputId": "8584501d-8152-4b94-c516-4c901683c7a4" + "outputId": "8584501d-8152-4b94-c516-4c901683c7a4", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "# models\n", - "extraflex.y <- lwage ~ (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2))^2 # model for Y\n", - "extraflex.d <- sex ~ (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2))^2 # model for D\n", + "# model for Y\n", + "extraflex.y <- lwage ~ (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2))^2\n", + "# model for D\n", + "extraflex.d <- sex ~ (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2))^2\n", "\n", "# partialling-out the linear effect of W from Y\n", - "t.Y <- rlasso(extraflex.y, data=subset)$res\n", + "t.Y <- rlasso(extraflex.y, data = subset)$res\n", "# partialling-out the linear effect of W from D\n", - "t.D <- rlasso(extraflex.d, data=subset)$res\n", + "t.D <- rlasso(extraflex.d, data = subset)$res\n", "\n", "# regression of Y on D after partialling-out the effect of W\n", "partial.lasso.fit <- lm(t.Y ~ t.D)\n", @@ -719,7 +778,10 @@ "height": 442 }, "id": "X9z2zpVtyosL", - "outputId": "e68c579d-e447-45fd-ce6f-ee15b04f3055" + "outputId": "e68c579d-e447-45fd-ce6f-ee15b04f3055", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -728,14 +790,14 @@ "table[1, 2] <- control.se.HC0\n", "table[2, 1] <- control.est\n", "table[2, 2] <- control.se.HC3\n", - "table[3, 1] <- partial.lasso.est\n", + "table[3, 1] <- partial.lasso.est\n", "table[3, 2] <- partial.lasso.se\n", "colnames(table) <- c(\"Estimate\", \"Std. 
Error\")\n", "rownames(table) <- c(\"full reg, HC0\", \"full reg, HC3\", \"partial reg via lasso\")\n", - "tab<- xtable(table, digits=c(3, 3, 4))\n", + "tab <- xtable(table, digits = c(3, 3, 4))\n", "tab\n", "\n", - "print(tab, type=\"latex\")" + "print(tab, type = \"latex\")" ] }, { From 9ca28d0d9c60ccfe57d72b20d963ba6fdf98b7d9 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 10:22:51 -0700 Subject: [PATCH 019/261] Update r-ols-and-lasso-for-wage-gap-inference.irnb --- ...-ols-and-lasso-for-wage-gap-inference.irnb | 136 +++++++++--------- 1 file changed, 68 insertions(+), 68 deletions(-) diff --git a/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb b/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb index 149bd59d..6ad20300 100644 --- a/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb +++ b/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb @@ -130,7 +130,7 @@ "z <- data[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", \"hsg\", \"scl\", \"clg\", \"ad\", \"ne\", \"mw\", \"so\", \"we\", \"exp1\"))]\n", "\n", "data_female <- data[data$sex == 1, ]\n", - "z_female <- data_female[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", \"hsg\", \"scl\", \n", + "z_female <- data_female[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", \"hsg\", \"scl\",\n", " \"clg\", \"ad\", \"ne\", \"mw\", \"so\", \"we\", \"exp1\"))]\n", "\n", "data_male <- data[data$sex == 0, ]\n", @@ -141,7 +141,7 @@ "table[1:12, 1] <- as.numeric(lapply(z, mean))\n", "table[1:12, 2] <- as.numeric(lapply(z_male, mean))\n", "table[1:12, 3] <- as.numeric(lapply(z_female, mean))\n", - "rownames(table) <- c(\"Log Wage\", \"Sex\", \"Less then High School\", \"High School Graduate\", \"Some College\", \n", + "rownames(table) <- c(\"Log Wage\", \"Sex\", \"Less then High School\", \"High School Graduate\", \"Some College\",\n", " \"College Graduate\", \"Advanced Degree\", \"Northeast\", \"Midwest\", \"South\", \"West\", \"Experience\")\n", "colnames(table) <- c(\"All\", \"Men\", \"Women\")\n", "tab <- xtable(table, digits = 4)\n", @@ -265,15 +265,15 @@ }, "outputs": [], "source": [ - "nocontrol.fit <- lm(lwage ~ sex, data = data)\n", - "nocontrol.est <- summary(nocontrol.fit)$coef[\"sex\", 1]\n", + "nocontrol_fit <- lm(lwage ~ sex, data = data)\n", + "nocontrol_est <- summary(nocontrol_fit)$coef[\"sex\", 1]\n", "# HC - \"heteroskedasticity cosistent\" -- HC3 is the SE that remains consistent in high dimensions\n", - "HCV.coefs <- vcovHC(nocontrol.fit, type = 'HC3')\n", - "nocontrol.se <- sqrt(diag(HCV.coefs))[2] # Estimated std errors\n", + "hcv_coefs <- vcovHC(nocontrol_fit, type = 'HC3')\n", + "nocontrol_se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors\n", "\n", "# print unconditional effect of gender and the corresponding standard error\n", - "cat (\"The estimated coefficient on the dummy for gender is\", nocontrol.est,\n", - " \" and the corresponding robust standard error is\", nocontrol.se)\n" + "cat (\"The estimated coefficient on the dummy for gender is\", nocontrol_est,\n", + " \" and the corresponding robust standard error is\", nocontrol_se)\n" ] }, { @@ -334,15 +334,15 @@ "# + (exp1+exp2+exp3+exp4) * (shs+hsg+scl+clg+occ2+ind2+mw+so+we)\n", "# This is not intuitive at all, but that's what it does.\n", "\n", - "control.fit <- lm(flex, data = data)\n", - "control.est <- summary(control.fit)$coef[2, 1]\n", + "control_fit <- lm(flex, data = data)\n", + "control_est <- summary(control_fit)$coef[2, 1]\n", "\n", - "summary(control.fit)\n", + "summary(control_fit)\n", "\n", - "cat(\"Coefficient for OLS 
with controls\", control.est)\n", + "cat(\"Coefficient for OLS with controls\", control_est)\n", "\n", - "HCV.coefs <- vcovHC(control.fit, type = 'HC3');\n", - "control.se <- sqrt(diag(HCV.coefs))[2] # Estimated std errors" + "hcv_coefs <- vcovHC(control_fit, type = 'HC3');\n", + "control_se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors" ] }, { @@ -380,10 +380,10 @@ "y1 <- data[data$sex == 1, ]$lwage\n", "mu1 <- colMeans(xx1)\n", "mu0 <- colMeans(xx0)\n", - "betarest <- summary(control.fit)$coef[3:(ncol(xx0) + 1), 1] # the coefficients excluding intercept and \"sex\"\n", + "betarest <- summary(control_fit)$coef[3:(ncol(xx0) + 1), 1] # the coefficients excluding intercept and \"sex\"\n", "\n", "cat(\"The marginal gap:\", mean(data_female$lwage) - mean(data_male$lwage), \"\\n\")\n", - "diff.unexplained <- control.est\n", + "diff.unexplained <- control_est\n", "cat(\"The unexplained difference: \", diff.unexplained, \"\\n\")\n", "diff.explained <- sum(betarest * (mu1[2:ncol(xx0)] - mu0[2:ncol(xx0)]))\n", "cat(\"The explained difference:\", diff.explained, \"\\n\")\n", @@ -453,24 +453,24 @@ "# Partialling-out using ols\n", "\n", "# model for Y\n", - "flex.y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", + "flex_y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", "# model for D\n", - "flex.d <- sex ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", + "flex_d <- sex ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", "\n", "# partialling-out the linear effect of W from Y\n", - "t.Y <- lm(flex.y, data = data)$res\n", + "t_y <- lm(flex_y, data = data)$res\n", "# partialling-out the linear effect of W from D\n", - "t.D <- lm(flex.d, data = data)$res\n", + "t_d <- lm(flex_d, data = data)$res\n", "\n", "# regression of Y on D after partialling-out the effect of W\n", - "partial.fit <- lm(t.Y ~ t.D)\n", - "partial.est <- summary(partial.fit)$coef[2, 1]\n", + "partial.fit <- lm(t_y ~ t_d)\n", + "partial_est <- summary(partial.fit)$coef[2, 1]\n", "\n", - "cat(\"Coefficient for D via partialling-out\", partial.est)\n", + "cat(\"Coefficient for D via partialling-out\", partial_est)\n", "\n", "# standard error\n", - "HCV.coefs <- vcovHC(partial.fit, type = 'HC3')\n", - "partial.se <- sqrt(diag(HCV.coefs))[2]\n", + "hcv_coefs <- vcovHC(partial.fit, type = 'HC3')\n", + "partial_se <- sqrt(diag(hcv_coefs))[2]\n", "\n", "# confidence interval\n", "confint(partial.fit)[2, ]" @@ -516,24 +516,24 @@ "# Partialling-out using lasso\n", "\n", "# model for Y\n", - "flex.y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", + "flex_y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", "# model for D\n", - "flex.d <- sex ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", + "flex_d <- sex ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", "\n", "# partialling-out the linear effect of W from Y\n", - "t.Y <- rlasso(flex.y, data = data)$res\n", + "t_y <- rlasso(flex_y, data = data)$res\n", "# partialling-out the linear effect of W from D\n", - "t.D <- rlasso(flex.d, data = data)$res\n", + "t_d <- rlasso(flex_d, data = data)$res\n", "\n", "# regression of Y on D after partialling-out the effect of W\n", - "partial.lasso.fit <- lm(t.Y ~ 
t.D)\n", - "partial.lasso.est <- summary(partial.lasso.fit)$coef[2, 1]\n", + "partial.lasso.fit <- lm(t_y ~ t_d)\n", + "partial_lasso_est <- summary(partial.lasso.fit)$coef[2, 1]\n", "\n", - "cat(\"Coefficient for D via partialling-out using lasso\", partial.lasso.est)\n", + "cat(\"Coefficient for D via partialling-out using lasso\", partial_lasso_est)\n", "\n", "# standard error\n", - "HCV.coefs <- vcovHC(partial.lasso.fit, type = 'HC3')\n", - "partial.lasso.se <- sqrt(diag(HCV.coefs))[2]" + "hcv_coefs <- vcovHC(partial.lasso.fit, type = 'HC3')\n", + "partial_lasso_se <- sqrt(diag(hcv_coefs))[2]" ] }, { @@ -571,14 +571,14 @@ "outputs": [], "source": [ "table <- matrix(0, 4, 2)\n", - "table[1, 1] <- nocontrol.est\n", - "table[1, 2] <- nocontrol.se\n", - "table[2, 1] <- control.est\n", - "table[2, 2] <- control.se\n", - "table[3, 1] <- partial.est\n", - "table[3, 2] <- partial.se\n", - "table[4, 1] <- partial.lasso.est\n", - "table[4, 2] <- partial.lasso.se\n", + "table[1, 1] <- nocontrol_est\n", + "table[1, 2] <- nocontrol_se\n", + "table[2, 1] <- control_est\n", + "table[2, 2] <- control_se\n", + "table[3, 1] <- partial_est\n", + "table[3, 2] <- partial_se\n", + "table[4, 1] <- partial_lasso_est\n", + "table[4, 2] <- partial_lasso_se\n", "colnames(table) <- c(\"Estimate\", \"Std. Error\")\n", "rownames(table) <- c(\"Without controls\", \"full reg\", \"partial reg\", \"partial reg via lasso\")\n", "tab<- xtable(table, digits = c(3, 3, 4))\n", @@ -707,30 +707,30 @@ "# extra flexible model\n", "extraflex <- lwage ~ sex + (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)^2\n", "\n", - "control.fit <- lm(extraflex, data = subset)\n", - "control.est <- summary(control.fit)$coef[2, 1]\n", - "cat(\"Number of Extra-Flex Controls\", length(control.fit$coef) - 1, \"\\n\")\n", - "cat(\"Coefficient for OLS with extra flex controls\", control.est)\n", + "control_fit <- lm(extraflex, data = subset)\n", + "control_est <- summary(control_fit)$coef[2, 1]\n", + "cat(\"Number of Extra-Flex Controls\", length(control_fit$coef) - 1, \"\\n\")\n", + "cat(\"Coefficient for OLS with extra flex controls\", control_est)\n", "\n", "\n", "n <- subset_size\n", - "p <- length(control.fit$coef)\n", + "p <- length(control_fit$coef)\n", "\n", "# HC0 SE\n", - "HCV.coefs_HC0 <- vcovHC(control.fit, type = 'HC0')\n", - "control.se.HC0 <- sqrt(diag(HCV.coefs_HC0))[2]\n", + "hcv_coefs_hc0 <- vcovHC(control_fit, type = 'HC0')\n", + "control_se_hc0 <- sqrt(diag(hcv_coefs_hc0))[2]\n", "\n", "# For a more correct approach, we\n", "# would implement the approach of Cattaneo, Jannson, and Newey (2018)'s procedure.\n", "\n", "# Jackknife. Need to trim some leverages or otherwise regularize. 
Theory shouldn't\n", "# really work here.\n", - "coefs <- hatvalues(control.fit)\n", + "coefs <- hatvalues(control_fit)\n", "trim <- 0.99999999999\n", "coefs_trimmed <- coefs*(coefs < trim) + trim * (coefs >= trim)\n", - "omega <- (control.fit$residuals^2) / ((1 - coefs_trimmed)^2)\n", - "HCV.coefs <- vcovHC(control.fit, omega = as.vector(omega), type = 'HC3')\n", - "control.se.HC3 <- sqrt(diag(HCV.coefs))[2]" + "omega <- (control_fit$residuals^2) / ((1 - coefs_trimmed)^2)\n", + "hcv_coefs <- vcovHC(control_fit, omega = as.vector(omega), type = 'HC3')\n", + "control_se_hc3 <- sqrt(diag(hcv_coefs))[2]" ] }, { @@ -749,24 +749,24 @@ "outputs": [], "source": [ "# model for Y\n", - "extraflex.y <- lwage ~ (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2))^2\n", + "extraflex_y <- lwage ~ (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2))^2\n", "# model for D\n", - "extraflex.d <- sex ~ (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2))^2\n", + "extraflex_d <- sex ~ (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2))^2\n", "\n", "# partialling-out the linear effect of W from Y\n", - "t.Y <- rlasso(extraflex.y, data = subset)$res\n", + "t_y <- rlasso(extraflex_y, data = subset)$res\n", "# partialling-out the linear effect of W from D\n", - "t.D <- rlasso(extraflex.d, data = subset)$res\n", + "t_d <- rlasso(extraflex_d, data = subset)$res\n", "\n", "# regression of Y on D after partialling-out the effect of W\n", - "partial.lasso.fit <- lm(t.Y ~ t.D)\n", - "partial.lasso.est <- summary(partial.lasso.fit)$coef[2, 1]\n", + "partial.lasso.fit <- lm(t_y ~ t_d)\n", + "partial_lasso_est <- summary(partial.lasso.fit)$coef[2, 1]\n", "\n", - "cat(\"Coefficient for D via partialling-out using lasso\", partial.lasso.est)\n", + "cat(\"Coefficient for D via partialling-out using lasso\", partial_lasso_est)\n", "\n", "# standard error\n", - "HCV.coefs <- vcovHC(partial.lasso.fit, type = 'HC3')\n", - "partial.lasso.se <- sqrt(diag(HCV.coefs))[2]" + "hcv_coefs <- vcovHC(partial.lasso.fit, type = 'HC3')\n", + "partial_lasso_se <- sqrt(diag(hcv_coefs))[2]" ] }, { @@ -786,12 +786,12 @@ "outputs": [], "source": [ "table <- matrix(0, 3, 2)\n", - "table[1, 1] <- control.est\n", - "table[1, 2] <- control.se.HC0\n", - "table[2, 1] <- control.est\n", - "table[2, 2] <- control.se.HC3\n", - "table[3, 1] <- partial.lasso.est\n", - "table[3, 2] <- partial.lasso.se\n", + "table[1, 1] <- control_est\n", + "table[1, 2] <- control_se_hc0\n", + "table[2, 1] <- control_est\n", + "table[2, 2] <- control_se_hc3\n", + "table[3, 1] <- partial_lasso_est\n", + "table[3, 2] <- partial_lasso_se\n", "colnames(table) <- c(\"Estimate\", \"Std. 
Error\")\n", "rownames(table) <- c(\"full reg, HC0\", \"full reg, HC3\", \"partial reg via lasso\")\n", "tab <- xtable(table, digits = c(3, 3, 4))\n", From 89135af7e1ef323a5106b42fcfefb6950a4823c7 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 10:24:02 -0700 Subject: [PATCH 020/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index e16acf97..192a5fed 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -64,7 +64,7 @@ jobs: if (length(lints) > 0) { cat("Warnings found during linting:\n") print(lints) - stop("Linting failed with warnings") + # stop("Linting failed with warnings") } }) ' @@ -82,7 +82,7 @@ jobs: error = function(e) { cat("Error found in file:", file, "\n") cat("Error message:", e$message, "\n") - stop("Execution failed due to an error in ", file) + # stop("Execution failed due to an error in ", file) } ) } From 8b505b629edb6e25959bb3edebb1e78b99d271c4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 13 Jul 2024 17:34:14 +0000 Subject: [PATCH 021/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files --- PM1/r-linear-model-overfitting.Rmd | 43 ++- ...r-ols-and-lasso-for-wage-gap-inference.Rmd | 276 +++++++++--------- 2 files changed, 162 insertions(+), 157 deletions(-) diff --git a/PM1/r-linear-model-overfitting.Rmd b/PM1/r-linear-model-overfitting.Rmd index 5f1cb3b2..68ef059c 100644 --- a/PM1/r-linear-model-overfitting.Rmd +++ b/PM1/r-linear-model-overfitting.Rmd @@ -10,18 +10,18 @@ First set p=n ```{r} set.seed(123) -n = 1000 +n <- 1000 -p = n -X<- matrix(rnorm(n*p), n, p) -Y<- rnorm(n) +p <- n +x <- matrix(rnorm(n * p), n, p) +y <- rnorm(n) print("p/n is") -print(p/n) +print(p / n) print("R2 is") -print(summary(lm(Y~X))$r.squared) +print(summary(lm(y ~ x))$r.squared) print("Adjusted R2 is") -print(summary(lm(Y~X))$adj.r.squared) +print(summary(lm(y ~ x))$adj.r.squared) ``` Second, set p=n/2. @@ -29,18 +29,18 @@ Second, set p=n/2. 
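(An illustrative aside, added for this write-up and not part of the original notebook: besides the in-sample fit computed in the next chunk, one can score the same pure-noise regression on an independent draw from the identical design to see that the apparent fit does not generalize. A minimal sketch for the p = n/2 case; the object names `x_train`, `y_train`, `x_test`, `y_test` are placeholders introduced here.)

```{r}
set.seed(123)
n <- 1000
p <- n / 2

# training draw: the outcome is pure noise, unrelated to the regressors
x_train <- matrix(rnorm(n * p), n, p)
y_train <- rnorm(n)

# an independent draw from the same design, used only for evaluation
x_test <- matrix(rnorm(n * p), n, p)
y_test <- rnorm(n)

fit <- lm(y_train ~ x_train)

# predict on the fresh sample by hand: intercept column plus slope coefficients
yhat_test <- cbind(1, x_test) %*% coef(fit)

cat("In-sample R2:", summary(fit)$r.squared, "\n")
cat("Out-of-sample R2:",
    1 - sum((y_test - yhat_test)^2) / sum((y_test - mean(y_train))^2), "\n")
```

The in-sample R2 sits near p/n = 0.5 by construction, while the out-of-sample R2 is negative: the overfitted model predicts worse than the plain training mean.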
```{r} set.seed(123) -n = 1000 +n <- 1000 -p = n/2 -X<- matrix(rnorm(n*p), n, p) -Y<- rnorm(n) +p <- n / 2 +x <- matrix(rnorm(n * p), n, p) +y <- rnorm(n) print("p/n is") -print(p/n) +print(p / n) print("R2 is") -print(summary(lm(Y~X))$r.squared) +print(summary(lm(y ~ x))$r.squared) print("Adjusted R2 is") -print(summary(lm(Y~X))$adj.r.squared) +print(summary(lm(y ~ x))$adj.r.squared) ``` Third, set p/n =.05 @@ -48,18 +48,17 @@ Third, set p/n =.05 ```{r} set.seed(123) -n = 1000 +n <- 1000 -p = .05*n -X<- matrix(rnorm(n*p), n, p) -Y<- rnorm(n) +p <- .05 * n +x <- matrix(rnorm(n * p), n, p) +y <- rnorm(n) print("p/n is") -print(p/n) +print(p / n) print("R2 is") -print(summary(lm(Y~X))$r.squared) +print(summary(lm(y ~ x))$r.squared) print("Adjusted R2 is") -print(summary(lm(Y~X))$adj.r.squared) - +print(summary(lm(y ~ x))$adj.r.squared) ``` diff --git a/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd b/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd index 574ce287..a662a61f 100644 --- a/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd +++ b/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd @@ -26,7 +26,6 @@ install.packages("xtable") install.packages("hdm") # a library for high-dimensional metrics install.packages("sandwich") # a package used to compute robust standard errors - library(hdm) library(xtable) library(sandwich) @@ -37,11 +36,7 @@ library(sandwich) We consider the same subsample of the U.S. Current Population Survey (2015) as in the previous lab. Let us load the data set. ```{r} -# load("../input/wage2015-inference/wage2015_subsample_inference.Rdata") -# attach(data) -# dim(data) - -file = "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv" +file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv" data <- read.csv(file) dim(data) ``` @@ -49,26 +44,29 @@ dim(data) To start our (causal) analysis, we compare the sample means given gender: ```{r} -Z <- data[which(colnames(data) %in% c("lwage","sex","shs","hsg","scl","clg","ad","ne","mw","so","we","exp1"))] +z <- data[which(colnames(data) %in% c("lwage", "sex", "shs", "hsg", "scl", "clg", "ad", "ne", "mw", "so", "we", "exp1"))] -data_female <- data[data$sex==1,] -Z_female <- data_female[which(colnames(data) %in% c("lwage","sex","shs","hsg","scl","clg","ad","ne","mw","so","we","exp1"))] +data_female <- data[data$sex == 1, ] +z_female <- data_female[which(colnames(data) %in% c("lwage", "sex", "shs", "hsg", "scl", + "clg", "ad", "ne", "mw", "so", "we", "exp1"))] -data_male <- data[data$sex==0,] -Z_male <- data_male[which(colnames(data) %in% c("lwage","sex","shs","hsg","scl","clg","ad","ne","mw","so","we","exp1"))] +data_male <- data[data$sex == 0, ] +z_male <- data_male[which(colnames(data) %in% c("lwage", "sex", "shs", "hsg", "scl", "clg", "ad", + "ne", "mw", "so", "we", "exp1"))] table <- matrix(0, 12, 3) -table[1:12,1] <- as.numeric(lapply(Z,mean)) -table[1:12,2] <- as.numeric(lapply(Z_male,mean)) -table[1:12,3] <- as.numeric(lapply(Z_female,mean)) -rownames(table) <- c("Log Wage","Sex","Less then High School","High School Graduate","Some College","College Graduate","Advanced Degree", "Northeast","Midwest","South","West","Experience") -colnames(table) <- c("All","Men","Women") -tab<- xtable(table, digits = 4) +table[1:12, 1] <- as.numeric(lapply(z, mean)) +table[1:12, 2] <- as.numeric(lapply(z_male, mean)) +table[1:12, 3] <- as.numeric(lapply(z_female, mean)) +rownames(table) <- c("Log Wage", "Sex", "Less then High School", 
"High School Graduate", "Some College", + "College Graduate", "Advanced Degree", "Northeast", "Midwest", "South", "West", "Experience") +colnames(table) <- c("All", "Men", "Women") +tab <- xtable(table, digits = 4) tab ``` ```{r} -print(tab,type="html") # set type="latex" for printing table in LaTeX +print(tab, type = "html") # set type="latex" for printing table in LaTeX ``` @@ -92,7 +90,7 @@ print(tab,type="html") # set type="latex" for printing table in LaTeX In particular, the table above shows that the difference in average *logwage* between men and women is equal to $0.038$ ```{r} -mean(data_female$lwage)-mean(data_male$lwage) +mean(data_female$lwage) - mean(data_male$lwage) ``` Thus, the unconditional gender wage gap is about $3,8$\% for the group of never married workers (women get paid less on average in our sample). We also observe that never married working women are relatively more educated than working men and have lower working experience. @@ -106,13 +104,15 @@ This unconditional (predictive) effect of gender equals the coefficient $\beta$ We verify this by running an ols regression in R. ```{r} -nocontrol.fit <- lm(lwage ~ sex, data=data) -nocontrol.est <- summary(nocontrol.fit)$coef["sex",1] -HCV.coefs <- vcovHC(nocontrol.fit, type = 'HC3'); # HC - "heteroskedasticity cosistent" -- HC3 is the SE that remains consistent in high dimensions -nocontrol.se <- sqrt(diag(HCV.coefs))[2] # Estimated std errors +nocontrol_fit <- lm(lwage ~ sex, data = data) +nocontrol_est <- summary(nocontrol_fit)$coef["sex", 1] +# HC - "heteroskedasticity cosistent" -- HC3 is the SE that remains consistent in high dimensions +hcv_coefs <- vcovHC(nocontrol_fit, type = 'HC3') +nocontrol_se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors # print unconditional effect of gender and the corresponding standard error -cat ("The estimated coefficient on the dummy for gender is",nocontrol.est," and the corresponding robust standard error is",nocontrol.se) +cat ("The estimated coefficient on the dummy for gender is", nocontrol_est, + " and the corresponding robust standard error is", nocontrol_se) ``` Note that the standard error is computed with the *R* package *sandwich* to be robust to heteroskedasticity. @@ -130,21 +130,22 @@ Let us run the ols regression with controls. ```{r} # ols regression with controls -flex <- lwage ~ sex + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we) +flex <- lwage ~ sex + (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) -# Note that ()*() operation in formula objects in R creates a formula of the sort: -# (exp1+exp2+exp3+exp4)+ (shs+hsg+scl+clg+occ2+ind2+mw+so+we) + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we) -# This is not intuitive at all, but that's what it does. +# Note that ()*() operation in formula objects in R creates a formula of the sort: +# (exp1+exp2+exp3+exp4) + (shs+hsg+scl+clg+occ2+ind2+mw+so+we) +# + (exp1+exp2+exp3+exp4) * (shs+hsg+scl+clg+occ2+ind2+mw+so+we) +# This is not intuitive at all, but that's what it does. 
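# (Illustrative aside, added for this write-up and not part of the original
# notebook: the expansion performed by ()*() can be checked directly by
# inspecting the term labels that R generates. For a small hypothetical
# formula with the same structure,
#   attr(terms(y ~ (a + b) * (c + d)), "term.labels")
# returns "a" "b" "c" "d" "a:c" "a:d" "b:c" "b:d",
# i.e. the main effects of both groups plus all pairwise interactions;
# y, a, b, c, d are placeholder names, not variables in this data set.)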
-control.fit <- lm(flex, data=data) -control.est <- summary(control.fit)$coef[2,1] +control_fit <- lm(flex, data = data) +control_est <- summary(control_fit)$coef[2, 1] -summary(control.fit) +summary(control_fit) -cat("Coefficient for OLS with controls", control.est) +cat("Coefficient for OLS with controls", control_est) -HCV.coefs <- vcovHC(control.fit, type = 'HC3'); -control.se <- sqrt(diag(HCV.coefs))[2] # Estimated std errors +hcv_coefs <- vcovHC(control_fit, type = 'HC3'); +control_se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors ``` The estimated regression coefficient $\beta_1\approx-0.0696$ measures how our linear prediction of wage changes if we set the gender variable $D$ from 0 to 1, holding the controls $W$ fixed. @@ -153,39 +154,43 @@ We can call this the *predictive effect* (PE), as it measures the impact of a va We now show how the conditional gap and the remainder decompose the marginal wage gap into the parts explained and unexplained by the additional controls. (Note that this does *not* explain why there is a difference in the controls to begin with in the two groups.) ```{r} -XX0 = model.matrix(~(exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we), data = data[data$sex==0,]) -y0 = data[data$sex==0,]$lwage -XX1 = model.matrix(~(exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we), data = data[data$sex==1,]) -y1 = data[data$sex==1,]$lwage -mu1 = colMeans(XX1) -mu0 = colMeans(XX0) -betarest = summary(control.fit)$coef[3:(ncol(XX0)+1),1] # the coefficients excluding intercept and "sex" - -cat("The marginal gap:",mean(data_female$lwage)-mean(data_male$lwage),"\n") -diff.unexplained = control.est -cat("The unexplained difference: ",diff.unexplained,"\n") -diff.explained = sum(betarest*(mu1[2:ncol(XX0)]-mu0[2:ncol(XX0)])) -cat("The explained difference:",diff.explained,"\n") -cat("The sum of these differences:",diff.unexplained + diff.explained,"\n") +xx0 <- model.matrix(~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we), + data=data[data$sex == 0, ]) +y0 <- data[data$sex == 0, ]$lwage +xx1 <- model.matrix(~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we), + data=data[data$sex == 1, ]) +y1 <- data[data$sex == 1, ]$lwage +mu1 <- colMeans(xx1) +mu0 <- colMeans(xx0) +betarest <- summary(control_fit)$coef[3:(ncol(xx0) + 1), 1] # the coefficients excluding intercept and "sex" + +cat("The marginal gap:", mean(data_female$lwage) - mean(data_male$lwage), "\n") +diff.unexplained <- control_est +cat("The unexplained difference: ", diff.unexplained, "\n") +diff.explained <- sum(betarest * (mu1[2:ncol(xx0)] - mu0[2:ncol(xx0)])) +cat("The explained difference:", diff.explained, "\n") +cat("The sum of these differences:", diff.unexplained + diff.explained, "\n") ``` We next consider a Oaxaca-Blinder decomposition that also incorporates an interaction term. 
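Written out (a sketch added here for clarity, using the notation of the chunk below: group 1 is female, group 0 is male, $\bar{w}_{g,j}$ denotes the group-$g$ mean of constructed regressor $j$, i.e. the vectors `mu1` and `mu0` excluding the intercept column, and $\hat{\alpha}_g$, $\hat{\beta}_{g,j}$ are the group-specific OLS intercept and slopes), the identity that the next chunk verifies numerically is

$$\bar{y}_1 - \bar{y}_0 = \underbrace{(\hat{\alpha}_1 - \hat{\alpha}_0)}_{\text{unexplained}} + \underbrace{\sum_{j} \hat{\beta}_{0,j}\,(\bar{w}_{1,j} - \bar{w}_{0,j})}_{\text{explained by endowments}} + \underbrace{\sum_{j} (\hat{\beta}_{1,j} - \hat{\beta}_{0,j})\,\bar{w}_{1,j}}_{\text{explained by coefficients}}.$$

It follows from the fact that, with an intercept, each group's OLS fitted values average to the group mean of $y$, after adding and subtracting $\hat{\alpha}_0 + \sum_j \hat{\beta}_{0,j}\,\bar{w}_{1,j}$. Because the coefficient term is evaluated at the female means $\bar{w}_{1,j}$, it absorbs what is sometimes reported separately as an interaction component. The SVD-based pseudo-inverse in the chunk is simply a numerically stable way of computing the two group-specific least-squares fits.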
```{r} -svd0=svd(XX0) -svd1=svd(XX1) -svd0$d[svd0$d<=1e-10]=0 -svd0$d[svd0$d>1e-10]=1/svd0$d[svd0$d>1e-10] -beta0 = (svd0$v %*% (svd0$d*svd0$d*t(svd0$v))) %*% t(XX0) %*% y0 -svd1$d[svd1$d<=1e-10]=0 -svd1$d[svd1$d>1e-10]=1/svd1$d[svd1$d>1e-10] -beta1 = (svd1$v %*% (svd1$d*svd1$d*t(svd1$v))) %*% t(XX1) %*% y1 - -cat("The marginal gap:",mean(data_female$lwage)-mean(data_male$lwage),"\n") -cat("The unexplained difference:",beta1[1]-beta0[1],"\n") -cat("The difference explained by endowment:",sum(beta0[2:ncol(XX0)]*(mu1[2:ncol(XX0)]-mu0[2:ncol(XX0)])),"\n") -cat("The difference explained by coefficient:",sum((beta1[2:ncol(XX0)]-beta0[2:ncol(XX0)])*mu1[2:ncol(XX0)]),"\n") -cat("The sum of these differences:",beta1[1]-beta0[1] + sum(beta0[2:ncol(XX0)]*(mu1[2:ncol(XX0)]-mu0[2:ncol(XX0)])) + sum((beta1[2:ncol(XX0)]-beta0[2:ncol(XX0)])*mu1[2:ncol(XX0)]),"\n") +svd0 <- svd(xx0) +svd1 <- svd(xx1) +svd0$d[svd0$d <= 1e-10] = 0 +svd0$d[svd0$d > 1e-10] = 1 / svd0$d[svd0$d > 1e-10] +beta0 <- (svd0$v %*% (svd0$d * svd0$d * t(svd0$v))) %*% t(xx0) %*% y0 +svd1$d[svd1$d <= 1e-10] <- 0 +svd1$d[svd1$d > 1e-10] <- 1 / svd1$d[svd1$d > 1e-10] +beta1 <- (svd1$v %*% (svd1$d * svd1$d * t(svd1$v))) %*% t(xx1) %*% y1 + +cat("The marginal gap:", mean(data_female$lwage) - mean(data_male$lwage), "\n") +cat("The unexplained difference:", beta1[1] - beta0[1], "\n") +cat("The difference explained by endowment:", sum(beta0[2:ncol(xx0)] * (mu1[2:ncol(xx0)] - mu0[2:ncol(xx0)])), "\n") +cat("The difference explained by coefficient:", sum((beta1[2:ncol(xx0)] - beta0[2:ncol(xx0)]) * mu1[2:ncol(xx0)]), "\n") +cat("The sum of these differences:", + (beta1[1] - beta0[1] + sum(beta0[2:ncol(xx0)] * (mu1[2:ncol(xx0)] - mu0[2:ncol(xx0)])) + + sum((beta1[2:ncol(xx0)] - beta0[2:ncol(xx0)]) * mu1[2:ncol(xx0)])), "\n") ``` Next, we use the Frisch-Waugh-Lovell (FWL) theorem from lecture, partialling-out the linear effect of the controls via ols. @@ -193,27 +198,28 @@ Next, we use the Frisch-Waugh-Lovell (FWL) theorem from lecture, partialling-out ```{r} # Partialling-out using ols -# models -flex.y <- lwage ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we) # model for Y -flex.d <- sex ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we) # model for D +# model for Y +flex_y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) +# model for D +flex_d <- sex ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) # partialling-out the linear effect of W from Y -t.Y <- lm(flex.y, data=data)$res +t_y <- lm(flex_y, data = data)$res # partialling-out the linear effect of W from D -t.D <- lm(flex.d, data=data)$res +t_d <- lm(flex_d, data = data)$res # regression of Y on D after partialling-out the effect of W -partial.fit <- lm(t.Y~t.D) -partial.est <- summary(partial.fit)$coef[2,1] +partial.fit <- lm(t_y ~ t_d) +partial_est <- summary(partial.fit)$coef[2, 1] -cat("Coefficient for D via partialling-out", partial.est) +cat("Coefficient for D via partialling-out", partial_est) # standard error -HCV.coefs <- vcovHC(partial.fit, type = 'HC3') -partial.se <- sqrt(diag(HCV.coefs))[2] +hcv_coefs <- vcovHC(partial.fit, type = 'HC3') +partial_se <- sqrt(diag(hcv_coefs))[2] # confidence interval -confint(partial.fit)[2,] +confint(partial.fit)[2, ] ``` Again, the estimated coefficient measures the linear predictive effect (PE) of $D$ on $Y$ after taking out the linear effect of $W$ on both of these variables. 
This coefficient is numerically equivalent to the estimated coefficient from the ols regression with controls, confirming the FWL theorem. @@ -227,24 +233,25 @@ In the following, we illustrate the partialling-out approach using lasso instead ```{r} # Partialling-out using lasso -# models -flex.y <- lwage ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we) # model for Y -flex.d <- sex ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we) # model for D +# model for Y +flex_y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) +# model for D +flex_d <- sex ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) # partialling-out the linear effect of W from Y -t.Y <- rlasso(flex.y, data=data)$res +t_y <- rlasso(flex_y, data = data)$res # partialling-out the linear effect of W from D -t.D <- rlasso(flex.d, data=data)$res +t_d <- rlasso(flex_d, data = data)$res # regression of Y on D after partialling-out the effect of W -partial.lasso.fit <- lm(t.Y~t.D) -partial.lasso.est <- summary(partial.lasso.fit)$coef[2,1] +partial.lasso.fit <- lm(t_y ~ t_d) +partial_lasso_est <- summary(partial.lasso.fit)$coef[2, 1] -cat("Coefficient for D via partialling-out using lasso", partial.lasso.est) +cat("Coefficient for D via partialling-out using lasso", partial_lasso_est) # standard error -HCV.coefs <- vcovHC(partial.lasso.fit, type = 'HC3') -partial.lasso.se <- sqrt(diag(HCV.coefs))[2] +hcv_coefs <- vcovHC(partial.lasso.fit, type = 'HC3') +partial_lasso_se <- sqrt(diag(hcv_coefs))[2] ``` Using lasso for partialling-out here provides similar results as using ols. @@ -252,23 +259,23 @@ Using lasso for partialling-out here provides similar results as using ols. Next, we summarize the results. ```{r} -table<- matrix(0, 4, 2) -table[1,1]<- nocontrol.est -table[1,2]<- nocontrol.se -table[2,1]<- control.est -table[2,2]<- control.se -table[3,1]<- partial.est -table[3,2]<- partial.se -table[4,1]<- partial.lasso.est -table[4,2]<- partial.lasso.se -colnames(table)<- c("Estimate","Std. Error") -rownames(table)<- c("Without controls", "full reg", "partial reg", "partial reg via lasso") -tab<- xtable(table, digits=c(3, 3, 4)) +table <- matrix(0, 4, 2) +table[1, 1] <- nocontrol_est +table[1, 2] <- nocontrol_se +table[2, 1] <- control_est +table[2, 2] <- control_se +table[3, 1] <- partial_est +table[3, 2] <- partial_se +table[4, 1] <- partial_lasso_est +table[4, 2] <- partial_lasso_se +colnames(table) <- c("Estimate", "Std. Error") +rownames(table) <- c("Without controls", "full reg", "partial reg", "partial reg via lasso") +tab<- xtable(table, digits = c(3, 3, 4)) tab ``` ```{r} -print(tab, type="html") +print(tab, type = "html") ``` @@ -294,7 +301,7 @@ Next we motivate the usage of lasso. We try an "extra" flexible model, where we set.seed(2724) subset_size <- 1000 random <- sample(1:nrow(data), subset_size) -subset <- data[random,] +subset <- data[random, ] ``` For a linear model, the covariance matrix of the estimated $\hat{\beta}$ coefficients is given by $$\Sigma_{\hat{\beta}} = (X'X)^{-1} X' \Omega X (X'X)^{-1}$$ Under homoskedasticity, $\Omega = \sigma^2 I$ so $\Sigma_{\hat{\beta}}$ reduces to $\sigma^2(X'X)^{-1}$ with $\sigma^2$ estimated with the mean squared residuals. Under heteroskedasticity, $\Omega \neq \sigma^2 I$, so we must use an approach that yields valid standard errors. Under heteroskedasticity, there exists a variety of consistent "sandwich" estimators proposed for $\Sigma_{\hat{\beta}}$. 
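As a reminder (an editorial addition here, not part of the original notebook), all of the common heteroskedasticity-consistent variants keep the sandwich form above and differ only in the diagonal plug-in for $\Omega$:

$$\widehat{\Sigma}_{\hat{\beta}} = (X'X)^{-1} X' \widehat{\Omega} X (X'X)^{-1}, \qquad \widehat{\Omega} = \mathrm{diag}(\omega_1, \ldots, \omega_n),$$

where each weight $\omega_i$ is built from the $i$-th squared residual and, for the refined variants, from the leverage $h_{ii} = x_i'(X'X)^{-1}x_i$.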
With $e_i$ denoting the residual of observation $i: @@ -314,71 +321,70 @@ HC3 is similar to HC2, weighting by the squared $(1-h_{ii})^2$ in the denominato ```{r} # extra flexible model -extraflex <- lwage ~ sex + (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)^2 +extraflex <- lwage ~ sex + (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)^2 -control.fit <- lm(extraflex, data=subset) -#summary(control.fit) -control.est <- summary(control.fit)$coef[2,1] -cat("Number of Extra-Flex Controls", length(control.fit$coef)-1, "\n") -cat("Coefficient for OLS with extra flex controls", control.est) +control_fit <- lm(extraflex, data = subset) +control_est <- summary(control_fit)$coef[2, 1] +cat("Number of Extra-Flex Controls", length(control_fit$coef) - 1, "\n") +cat("Coefficient for OLS with extra flex controls", control_est) -n=subset_size; p=length(control.fit$coef); +n <- subset_size +p <- length(control_fit$coef) # HC0 SE -HCV.coefs_HC0 <- vcovHC(control.fit, type = 'HC0') -control.se.HC0 <- sqrt(diag(HCV.coefs_HC0))[2] +hcv_coefs_hc0 <- vcovHC(control_fit, type = 'HC0') +control_se_hc0 <- sqrt(diag(hcv_coefs_hc0))[2] # For a more correct approach, we # would implement the approach of Cattaneo, Jannson, and Newey (2018)'s procedure. # Jackknife. Need to trim some leverages or otherwise regularize. Theory shouldn't # really work here. -# HC3 SE -# estimates -coefs = hatvalues(control.fit) -trim = 0.99999999999 -coefs_trimmed = coefs*(coefs < trim) + trim*(coefs >= trim) -omega = (control.fit$residuals^2)/((1-coefs_trimmed)^2) -HCV.coefs <- vcovHC(control.fit, omega = as.vector(omega), type = 'HC3') -control.se.HC3 <- sqrt(diag(HCV.coefs))[2] +coefs <- hatvalues(control_fit) +trim <- 0.99999999999 +coefs_trimmed <- coefs*(coefs < trim) + trim * (coefs >= trim) +omega <- (control_fit$residuals^2) / ((1 - coefs_trimmed)^2) +hcv_coefs <- vcovHC(control_fit, omega = as.vector(omega), type = 'HC3') +control_se_hc3 <- sqrt(diag(hcv_coefs))[2] ``` ```{r} -# models -extraflex.y <- lwage ~ (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+C(occ2)+C(ind2))^2 # model for Y -extraflex.d <- sex ~ (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+C(occ2)+C(ind2))^2 # model for D +# model for Y +extraflex_y <- lwage ~ (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2))^2 +# model for D +extraflex_d <- sex ~ (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2))^2 # partialling-out the linear effect of W from Y -t.Y <- rlasso(extraflex.y, data=subset)$res +t_y <- rlasso(extraflex_y, data = subset)$res # partialling-out the linear effect of W from D -t.D <- rlasso(extraflex.d, data=subset)$res +t_d <- rlasso(extraflex_d, data = subset)$res # regression of Y on D after partialling-out the effect of W -partial.lasso.fit <- lm(t.Y~t.D) -partial.lasso.est <- summary(partial.lasso.fit)$coef[2,1] +partial.lasso.fit <- lm(t_y ~ t_d) +partial_lasso_est <- summary(partial.lasso.fit)$coef[2, 1] -cat("Coefficient for D via partialling-out using lasso", partial.lasso.est) +cat("Coefficient for D via partialling-out using lasso", partial_lasso_est) # standard error -HCV.coefs <- vcovHC(partial.lasso.fit, type = 'HC3') -partial.lasso.se <- sqrt(diag(HCV.coefs))[2] +hcv_coefs <- vcovHC(partial.lasso.fit, type = 'HC3') +partial_lasso_se <- sqrt(diag(hcv_coefs))[2] ``` ```{r} -table<- matrix(0, 3, 2) -table[1,1]<- control.est -table[1,2]<- control.se.HC0 -table[2,1]<- control.est -table[2,2]<- control.se.HC3 -table[3,1]<- partial.lasso.est -table[3,2]<- partial.lasso.se 
-colnames(table)<- c("Estimate","Std. Error") -rownames(table)<- c("full reg, HC0", "full reg, HC3", "partial reg via lasso") -tab<- xtable(table, digits=c(3, 3, 4)) +table <- matrix(0, 3, 2) +table[1, 1] <- control_est +table[1, 2] <- control_se_hc0 +table[2, 1] <- control_est +table[2, 2] <- control_se_hc3 +table[3, 1] <- partial_lasso_est +table[3, 2] <- partial_lasso_se +colnames(table) <- c("Estimate", "Std. Error") +rownames(table) <- c("full reg, HC0", "full reg, HC3", "partial reg via lasso") +tab <- xtable(table, digits = c(3, 3, 4)) tab -print(tab, type="latex") +print(tab, type = "latex") ``` In this case $p/n \approx 1$, that is $p/n$ is no longer small and we start seeing the differences between From c5b2f6a27fa20a8d9b16eb1f7db4470e2763860c Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 10:38:02 -0700 Subject: [PATCH 022/261] Update r-ols-and-lasso-for-wage-prediction.irnb --- PM1/r-ols-and-lasso-for-wage-prediction.irnb | 273 ++++++++++++------- 1 file changed, 171 insertions(+), 102 deletions(-) diff --git a/PM1/r-ols-and-lasso-for-wage-prediction.irnb b/PM1/r-ols-and-lasso-for-wage-prediction.irnb index 83e9cc86..957df718 100644 --- a/PM1/r-ols-and-lasso-for-wage-prediction.irnb +++ b/PM1/r-ols-and-lasso-for-wage-prediction.irnb @@ -50,7 +50,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "_zAzi7q14V1f" + "id": "_zAzi7q14V1f", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -85,11 +88,14 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "eQ7eG0JTykao" + "id": "eQ7eG0JTykao", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "file = \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv\"\n", + "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv\"\n", "data <- read.csv(file)\n", "dim(data)" ] @@ -107,7 +113,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "T1JH0uaXykat" + "id": "T1JH0uaXykat", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -127,14 +136,17 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "b8SdPks-ykau" + "id": "b8SdPks-ykau", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# construct matrices for estimation from the data\n", "Y <- log(data$wage)\n", "n <- length(Y)\n", - "Z <- data[-which(colnames(data) %in% c(\"wage\",\"lwage\"))]\n", + "Z <- data[- which(colnames(data) %in% c(\"wage\", \"lwage\"))]\n", "p <- dim(Z)[2]\n", "\n", "cat(\"Number of observations:\", n, '\\n')\n", @@ -154,17 +166,23 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "mLttnFeKykav" + "id": "mLttnFeKykav", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# generate a table of means of variables\n", - "Z_subset <- data[which(colnames(data) %in% c(\"lwage\",\"sex\",\"shs\",\"hsg\",\"scl\",\"clg\",\"ad\",\"mw\",\"so\",\"we\",\"ne\",\"exp1\"))]\n", + "z_subset <- data[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", \"hsg\", \"scl\",\n", + " \"clg\", \"ad\", \"mw\", \"so\", \"we\", \"ne\", \"exp1\"))]\n", "table <- matrix(0, 12, 1)\n", - "table[1:12,1] <- as.numeric(lapply(Z_subset,mean))\n", - "rownames(table) <- c(\"Log Wage\",\"Sex\",\"Some High School\",\"High School Graduate\",\"Some College\",\"College Graduate\", \"Advanced Degree\",\"Midwest\",\"South\",\"West\",\"Northeast\",\"Experience\")\n", + "table[1:12, 
1] <- as.numeric(lapply(z_subset, mean))\n", + "rownames(table) <- c(\"Log Wage\", \"Sex\", \"Some High School\",\n", + " \"High School Graduate\", \"Some College\", \"College Graduate\",\n", + " \"Advanced Degree\", \"Midwest\", \"South\", \"West\", \"Northeast\", \"Experience\")\n", "colnames(table) <- c(\"Sample mean\")\n", - "tab<- xtable(table, digits = 2)\n", + "tab <- xtable(table, digits = 2)\n", "tab" ] }, @@ -190,11 +208,14 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "wIojwGx4ykaw" + "id": "wIojwGx4ykaw", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "print(tab, type=\"latex\") # type=\"latex\" for printing table in LaTeX" + "print(tab, type = \"latex\") # type=\"latex\" for printing table in LaTeX" ] }, { @@ -214,7 +235,7 @@ "source": [ "Now, we will construct a prediction rule for (log) hourly wage $Y$, which depends linearly on job-relevant characteristics $X$:\n", "\n", - "\\begin{equation}\\label{decompose}\n", + "\\begin{equation}\n", "Y = \\beta'X+ \\epsilon.\n", "\\end{equation}" ] @@ -243,15 +264,18 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "cArtOhBOWaNs" + "id": "cArtOhBOWaNs", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# splitting the data\n", "set.seed(1) # to make the results replicable (we will generate random numbers)\n", - "random <- sample(1:n, floor(n*4/5)) # draw (4/5)*n random numbers from 1 to n without replacing\n", - "train <- data[random,]\n", - "test <- data[-random,]" + "random <- sample(1:n, floor(n * 4 / 5)) # draw (4/5)*n random numbers from 1 to n without replacing\n", + "train <- data[random, ]\n", + "test <- data[-random, ]" ] }, { @@ -294,14 +318,17 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "BhaBGtEBykax" + "id": "BhaBGtEBykax", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# 1. basic model\n", - "basic <- lwage~ (sex + exp1 + shs + hsg+ scl + clg + mw + so + we +C(occ2) + C(ind2))\n", - "regbasic <- lm(basic, data=train) # perform ols using the defined model\n", - "cat( \"Number of regressors in the basic model:\",length(regbasic$coef), '\\n') # number of regressors in the Basic Model\n" + "basic <- lwage ~ (sex + exp1 + shs + hsg+ scl + clg + mw + so + we +C(occ2) + C(ind2))\n", + "regbasic <- lm(basic, data = train) # perform ols using the defined model\n", + "cat(\"Number of regressors in the basic model:\", length(regbasic$coef), '\\n') # number of regressors in the Basic Model" ] }, { @@ -317,14 +344,18 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "y74aI4bhykax" + "id": "y74aI4bhykax", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# 2. 
flexible model\n", - "flex <- lwage ~ sex + shs+hsg+scl+clg+mw+so+we+C(occ2)+C(ind2) + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)\n", - "regflex <- lm(flex, data=train)\n", - "cat( \"Number of regressors in the flexible model:\",length(regflex$coef)) # number of regressors in the Flexible Model" + "flex <- lwage ~ sex + shs + hsg + scl + clg + mw + so + we + C(occ2) + C(ind2) +\n", + " (exp1 + exp2 + exp3 + exp4) * (shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)\n", + "regflex <- lm(flex, data = train)\n", + "cat( \"Number of regressors in the flexible model:\", length(regflex$coef)) # number of regressors in the Flexible Model" ] }, { @@ -350,14 +381,18 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "Nx21IQPrykay" + "id": "Nx21IQPrykay", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Flexible model using Lasso, in-sample fit\n", - "train_flex <- model.matrix(flex,train) # all regressors\n", - "fit.lasso.cv <- cv.glmnet(as.matrix(train_flex), train$lwage, family=\"gaussian\", alpha=1, nfolds=5)\n", - "yhat.lasso.cv <- predict(fit.lasso.cv, newx = as.matrix(train_flex), s = \"lambda.min\") # in-sample fit right now, not out-of-sample using \"test\"" + "train_flex <- model.matrix(flex, train) # all regressors\n", + "fit.lasso.cv <- cv.glmnet(as.matrix(train_flex), train$lwage, family = \"gaussian\", alpha = 1, nfolds = 5)\n", + "# in-sample fit right now, not out-of-sample using \"test\"\n", + "yhat.lasso.cv <- predict(fit.lasso.cv, newx = as.matrix(train_flex), s = \"lambda.min\")" ] }, { @@ -374,7 +409,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "cSuifUPiykay" + "id": "cSuifUPiykay", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -397,28 +435,28 @@ "cat(\"adjusted R-squared for the flexible model: \", R2.adj2, \"\\n\")\n", "\n", "pL <- fit.lasso.cv$nzero[fit.lasso.cv$index[1]]\n", - "R2.L <- 1-(sum((yhat.lasso.cv - train$lwage) ^ 2))/(sum((train$lwage - mean(train$lwage)) ^ 2)) # fit.lasso.cv$glmnet.fit$dev.ratio #\n", + "R2.L <- 1 - sum((yhat.lasso.cv - train$lwage)^2) / sum((train$lwage - mean(train$lwage))^2)\n", "cat(\"R-squared for the lasso with flexible model: \", R2.L, \"\\n\")\n", - "R2.adjL <- 1-(sum((yhat.lasso.cv - train$lwage) ^ 2)/(ntrain-pL-1))/(sum((train$lwage - mean(train$lwage)) ^ 2)/(ntrain-1))\n", + "R2.adjL <- 1 - (sum((yhat.lasso.cv - train$lwage)^2) / (ntrain - pL - 1)) / (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1))\n", "cat(\"adjusted R-squared for the flexible model: \", R2.adjL, \"\\n\")\n", "\n", "# MSE and adjusted MSE\n", "MSE1 <- mean(sumbasic$res^2)\n", "cat(\"MSE for the basic model: \", MSE1, \"\\n\")\n", "p1 <- sumbasic$df[1] # number of regressors\n", - "MSE.adj1 <- (ntrain/(ntrain-p1))*MSE1\n", + "MSE.adj1 <- (ntrain / (ntrain - p1)) * MSE1\n", "cat(\"adjusted MSE for the basic model: \", MSE.adj1, \"\\n\")\n", "\n", "MSE2 <-mean(sumflex$res^2)\n", "cat(\"MSE for the flexible model: \", MSE2, \"\\n\")\n", "p2 <- sumflex$df[1]\n", - "MSE.adj2 <- (ntrain/(ntrain-p2))*MSE2\n", + "MSE.adj2 <- (ntrain / (ntrain - p2)) * MSE2\n", "cat(\"adjusted MSE for the lasso flexible model: \", MSE.adj2, \"\\n\")\n", "\n", "lasso.res <- train$lwage - yhat.lasso.cv\n", "MSEL <-mean(lasso.res^2)\n", "cat(\"MSE for the lasso flexible model: \", MSEL, \"\\n\")\n", - "MSE.adjL <- (ntrain/(ntrain-pL-1))*MSEL\n", + "MSE.adjL <- (ntrain / (ntrain - pL - 1)) * MSEL\n", "cat(\"adjusted MSE for the lasso flexible model: \", MSE.adjL, \"\\n\")" ] }, @@ 
-426,19 +464,22 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "lo8UazUiykay" + "id": "lo8UazUiykay", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Output the table\n", "table <- matrix(0, 3, 5)\n", - "table[1,1:5] <- c(p1,R2.1,MSE1,R2.adj1,MSE.adj1)\n", - "table[2,1:5] <- c(p2,R2.2,MSE2,R2.adj2,MSE.adj2)\n", - "table[3,1:5] <- c(pL,R2.L,MSEL,R2.adjL,MSE.adjL)\n", - "colnames(table)<- c(\"p\",\"$R^2_{sample}$\",\"$MSE_{sample}$\",\"$R^2_{adjusted}$\", \"$MSE_{adjusted}$\")\n", - "rownames(table)<- c(\"basic reg\",\"flexible reg\", \"lasso flex\")\n", - "tab<- xtable(table, digits =c(0,0,2,2,2,2))\n", - "print(tab,type=\"latex\")\n", + "table[1, 1:5] <- c(p1, R2.1, MSE1, R2.adj1, MSE.adj1)\n", + "table[2, 1:5] <- c(p2, R2.2, MSE2, R2.adj2, MSE.adj2)\n", + "table[3, 1:5] <- c(pL, R2.L, MSEL, R2.adjL, MSE.adjL)\n", + "colnames(table) <- c(\"p\", \"$R^2_{sample}$\", \"$MSE_{sample}$\", \"$R^2_{adjusted}$\", \"$MSE_{adjusted}$\")\n", + "rownames(table) <- c(\"basic reg\", \"flexible reg\", \"lasso flex\")\n", + "tab <- xtable(table, digits = c(0, 0, 2, 2, 2, 2))\n", + "print(tab, type = \"latex\")\n", "tab" ] }, @@ -470,20 +511,23 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "kwdlZV-iykaz" + "id": "kwdlZV-iykaz", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# basic model\n", - "options(warn=-1) # ignore warnings\n", - "regbasic <- lm(basic, data=train)\n", + "options(warn = -1) # ignore warnings\n", + "regbasic <- lm(basic, data = train)\n", "\n", "# calculating the out-of-sample MSE\n", - "yhat.bas <- predict(regbasic, newdata=test)\n", + "yhat.bas <- predict(regbasic, newdata = test)\n", "y.test <- test$lwage\n", - "mean.train = mean(train$lwage)\n", - "MSE.test1 <- sum((y.test-yhat.bas)^2)/length(y.test)\n", - "R2.test1<- 1- MSE.test1/mean((y.test-mean.train)^2)\n", + "mean.train <- mean(train$lwage)\n", + "MSE.test1 <- sum((y.test - yhat.bas)^2) / length(y.test)\n", + "R2.test1 <- 1 - MSE.test1 / mean((y.test - mean.train)^2)\n", "\n", "cat(\"Test MSE for the basic model: \", MSE.test1, \" \")\n", "cat(\"Test R2 for the basic model: \", R2.test1)" @@ -502,20 +546,23 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "U2lQCgJeyka0" + "id": "U2lQCgJeyka0", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# flexible model\n", - "options(warn=-1) # ignore warnings\n", - "regflex <- lm(flex, data=train)\n", + "options(warn = -1) # ignore warnings\n", + "regflex <- lm(flex, data = train)\n", "\n", "# calculating the out-of-sample MSE\n", - "yhat.flex<- predict(regflex, newdata=test)\n", + "yhat.flex<- predict(regflex, newdata = test)\n", "y.test <- test$lwage\n", - "mean.train = mean(train$lwage)\n", - "MSE.test2 <- sum((y.test-yhat.flex)^2)/length(y.test)\n", - "R2.test2<- 1- MSE.test2/mean((y.test-mean.train)^2)\n", + "mean.train <- mean(train$lwage)\n", + "MSE.test2 <- sum((y.test - yhat.flex)^2) / length(y.test)\n", + "R2.test2 <- 1 - MSE.test2 / mean((y.test - mean.train)^2)\n", "\n", "cat(\"Test MSE for the flexible model: \", MSE.test2, \" \")\n", "\n", @@ -548,22 +595,25 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "5xFTH78Kyka1" + "id": "5xFTH78Kyka1", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Flexible model using Lasso\n", "# model matrix should be formed before train/test as some levels dropped\n", - "flex_data = model.matrix(flex,data)\n", - "train_flex <- flex_data[random,]\n", - "test_flex <- 
flex_data[-random,]\n", + "flex_data <- model.matrix(flex, data)\n", + "train_flex <- flex_data[random, ]\n", + "test_flex <- flex_data[-random, ]\n", "\n", - "fit.lasso.cv <- cv.glmnet(train_flex, train$lwage, family=\"gaussian\", alpha=1, nfolds=5)\n", - "yhat.lasso.cv <- predict(fit.lasso.cv, newx = test_flex, s = \"lambda.min\")\n", + "fit.lasso.cv <- cv.glmnet(train_flex, train$lwage, family = \"gaussian\", alpha = 1, nfolds = 5)\n", + "yhat.lasso.cv <- predict(fit.lasso.cv, newx = test_flex, s = \"lambda.min\")\n", "\n", "# calculating the out-of-sample MSE\n", - "MSE.lasso <- sum((y.test-yhat.lasso.cv)^2)/length(y.test)\n", - "R2.lasso<- 1- MSE.lasso/mean((y.test-mean(train$lwage))^2)\n", + "MSE.lasso <- sum((y.test - yhat.lasso.cv)^2) / length(y.test)\n", + "R2.lasso <- 1 - MSE.lasso / mean((y.test - mean(train$lwage))^2)\n", "\n", "cat(\"Test MSE for the lasso on flexible model: \", MSE.lasso, \" \")\n", "\n", @@ -583,22 +633,25 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "PS3YeUT_yka2" + "id": "PS3YeUT_yka2", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Output the comparison table\n", - "table2 <- matrix(0, 3,2)\n", - "table2[1,1] <- MSE.test1\n", - "table2[2,1] <- MSE.test2\n", - "table2[3,1] <- MSE.lasso\n", - "table2[1,2] <- R2.test1\n", - "table2[2,2] <- R2.test2\n", - "table2[3,2] <- R2.lasso\n", - "\n", - "rownames(table2)<- c(\"basic reg\",\"flexible reg\",\"lasso regression\")\n", - "colnames(table2)<- c(\"$MSE_{test}$\", \"$R^2_{test}$\")\n", - "tab2 <- xtable(table2, digits =3)\n", + "table2 <- matrix(0, 3, 2)\n", + "table2[1, 1] <- MSE.test1\n", + "table2[2, 1] <- MSE.test2\n", + "table2[3, 1] <- MSE.lasso\n", + "table2[1, 2] <- R2.test1\n", + "table2[2, 2] <- R2.test2\n", + "table2[3, 2] <- R2.lasso\n", + "\n", + "rownames(table2) <- c(\"basic reg\", \"flexible reg\", \"lasso regression\")\n", + "colnames(table2) <- c(\"$MSE_{test}$\", \"$R^2_{test}$\")\n", + "tab2 <- xtable(table2, digits = 3)\n", "tab2" ] }, @@ -606,11 +659,14 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "kFuPPNytyka2" + "id": "kFuPPNytyka2", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "print(tab2,type=\"latex\")" + "print(tab2, type = \"latex\")" ] }, { @@ -628,24 +684,31 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "G_Mm2gG3nwMn" + "id": "G_Mm2gG3nwMn", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# extra flexible model\n", - "extraflex <- lwage ~ sex + (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)^2\n", - "regextra <- lm(extraflex, data=train)\n", + "extraflex <- lwage ~ sex + (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)^2\n", + "regextra <- lm(extraflex, data = train)\n", "sumextra <- summary(regextra)\n", - "cat(\"Number of Extra-Flex Controls\", length(regextra$coef)-1, \"\\n\")\n", - "n= length(data$wage); p =length(regextra$coef);\n", - "ntrain = length(train$wage)" + "cat(\"Number of Extra-Flex Controls\", length(regextra$coef) - 1, \"\\n\")\n", + "n <- length(data$wage)\n", + "p <- length(regextra$coef);\n", + "ntrain <- length(train$wage)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "OXSNAxauoJ3h" + "id": "OXSNAxauoJ3h", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -659,7 +722,7 @@ "# MSE and adjusted MSE\n", "MSE.extra <- mean(sumextra$res^2)\n", "cat(\"MSE for the extra flexible model (in-sample): \", MSE.extra, 
\"\\n\")\n", - "MSE.adjextra <- (ntrain/(ntrain-p))*MSE.extra\n", + "MSE.adjextra <- (ntrain / (ntrain - p)) * MSE.extra\n", "cat(\"adjusted MSE for the basic model (in-sample): \", MSE.adj1, \"\\n\")" ] }, @@ -667,15 +730,18 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "vFrRnnlmo9yG" + "id": "vFrRnnlmo9yG", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "## Out-of-sample\n", "yhat.ex <- predict(regextra, newdata=test)\n", "y.test.ex <- test$lwage\n", - "MSE.test.ex <- sum((y.test.ex-yhat.ex)^2)/length(y.test.ex)\n", - "R2.test.ex<- 1- MSE.test.ex/mean((y.test.ex-mean(train$lwage))^2)\n", + "MSE.test.ex <- sum((y.test.ex - yhat.ex)^2) / length(y.test.ex)\n", + "R2.test.ex <- 1 - MSE.test.ex / mean((y.test.ex - mean(train$lwage))^2)\n", "\n", "cat(\"Test MSE for the basic model: \", MSE.test.ex, \" \")\n", "cat(\"Test R2 for the basic model: \", R2.test.ex)" @@ -696,28 +762,31 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "TYvDJ3QepUgl" + "id": "TYvDJ3QepUgl", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# model matrix should be formed before train/test as some levels dropped\n", - "flex_data = model.matrix(extraflex,data)\n", - "train_flex <- flex_data[random,]\n", - "test_flex <- flex_data[-random,]\n", + "flex_data <- model.matrix(extraflex, data)\n", + "train_flex <- flex_data[random, ]\n", + "test_flex <- flex_data[-random, ]\n", "\n", "# fit model\n", - "fit.lcv <- cv.glmnet(train_flex, train$lwage, family=\"gaussian\", alpha=1, nfolds=5)\n", + "fit.lcv <- cv.glmnet(train_flex, train$lwage, family = \"gaussian\", alpha = 1, nfolds = 5)\n", "\n", "# in-sample\n", "yhat.lcv <- predict(fit.lcv, newx = train_flex, s = \"lambda.min\")\n", "\n", - "R2.L <- 1-(sum((yhat.lcv - train$lwage) ^ 2))/(sum((train$lwage - mean(train$lwage)) ^ 2)) # fit.lasso.cv$glmnet.fit$dev.ratio #\n", + "R2.L <- 1 - sum((yhat.lcv - train$lwage)^2) / sum((train$lwage - mean(train$lwage))^2)\n", "pL <- fit.lcv$nzero[fit.lcv$index[1]]\n", - "R2.adjL <- 1-(sum((yhat.lcv - train$lwage) ^ 2)/(ntrain-pL-1))/(sum((train$lwage - mean(train$lwage)) ^ 2)/(ntrain-1))\n", + "R2.adjL <- 1 - (sum((yhat.lcv - train$lwage) ^ 2) / (ntrain - pL - 1)) / (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1))\n", "\n", "lasso.res <- train$lwage - yhat.lcv\n", - "MSEL <-mean(lasso.res^2)\n", - "MSE.adjL <- (ntrain/(ntrain-pL-1))*MSEL\n", + "MSEL <- mean(lasso.res^2)\n", + "MSE.adjL <- (ntrain / (ntrain - pL - 1)) * MSEL\n", "\n", "cat(\"R-squared for the lasso with the extra flexible model (in-sample): \", R2.L, \"\\n\")\n", "cat(\"adjusted R-squared for the extra flexible model (in-sample): \", R2.adjL, \"\\n\")\n", @@ -726,8 +795,8 @@ "\n", "# out-of-sample\n", "yhat.lcv.test <- predict(fit.lcv, newx = test_flex, s = \"lambda.min\")\n", - "MSE.lasso <- sum((test$lwage-yhat.lcv.test)^2)/length(test$lwage)\n", - "R2.lasso <- 1- MSE.lasso/mean((test$lwage-mean(train$lwage))^2)\n", + "MSE.lasso <- sum((test$lwage - yhat.lcv.test)^2) / length(test$lwage)\n", + "R2.lasso <- 1 - MSE.lasso / mean((test$lwage - mean(train$lwage))^2)\n", "\n", "cat(\"\\n\")\n", "cat(\"Test R2 for the lasso the extra flexible model: \", R2.lasso,\"\\n\")\n", From f26da76624a04914e97296834bc906b5319e9a92 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 13 Jul 2024 17:48:49 +0000 Subject: [PATCH 023/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files --- PM1/r-ols-and-lasso-for-wage-prediction.Rmd | 168 
++++++++++---------- 1 file changed, 87 insertions(+), 81 deletions(-) diff --git a/PM1/r-ols-and-lasso-for-wage-prediction.Rmd b/PM1/r-ols-and-lasso-for-wage-prediction.Rmd index 9d197589..d53fffcf 100644 --- a/PM1/r-ols-and-lasso-for-wage-prediction.Rmd +++ b/PM1/r-ols-and-lasso-for-wage-prediction.Rmd @@ -36,7 +36,7 @@ library(glmnet) We start by loading the data set. ```{r} -file = "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv" +file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv" data <- read.csv(file) dim(data) ``` @@ -53,7 +53,7 @@ We construct the output variable $Y$ and the matrix $Z$ which includes the chara # construct matrices for estimation from the data Y <- log(data$wage) n <- length(Y) -Z <- data[-which(colnames(data) %in% c("wage","lwage"))] +Z <- data[- which(colnames(data) %in% c("wage", "lwage"))] p <- dim(Z)[2] cat("Number of observations:", n, '\n') @@ -64,12 +64,15 @@ For the outcome variable *wage* and a subset of the raw regressors, we calculate ```{r} # generate a table of means of variables -Z_subset <- data[which(colnames(data) %in% c("lwage","sex","shs","hsg","scl","clg","ad","mw","so","we","ne","exp1"))] +z_subset <- data[which(colnames(data) %in% c("lwage", "sex", "shs", "hsg", "scl", + "clg", "ad", "mw", "so", "we", "ne", "exp1"))] table <- matrix(0, 12, 1) -table[1:12,1] <- as.numeric(lapply(Z_subset,mean)) -rownames(table) <- c("Log Wage","Sex","Some High School","High School Graduate","Some College","College Graduate", "Advanced Degree","Midwest","South","West","Northeast","Experience") +table[1:12, 1] <- as.numeric(lapply(z_subset, mean)) +rownames(table) <- c("Log Wage", "Sex", "Some High School", + "High School Graduate", "Some College", "College Graduate", + "Advanced Degree", "Midwest", "South", "West", "Northeast", "Experience") colnames(table) <- c("Sample mean") -tab<- xtable(table, digits = 2) +tab <- xtable(table, digits = 2) tab ``` @@ -78,14 +81,14 @@ E.g., the share of female workers in our sample is ~44% ($sex=1$ if female). Alternatively, using the xtable package, we can also print the table in LaTeX. ```{r} -print(tab, type="latex") # type="latex" for printing table in LaTeX +print(tab, type = "latex") # type="latex" for printing table in LaTeX ``` ## Prediction Question Now, we will construct a prediction rule for (log) hourly wage $Y$, which depends linearly on job-relevant characteristics $X$: -\begin{equation}\label{decompose} +\begin{equation} Y = \beta'X+ \epsilon. \end{equation} @@ -105,9 +108,9 @@ Toward answering the latter, we measure the prediction quality of the two models ```{r} # splitting the data set.seed(1) # to make the results replicable (we will generate random numbers) -random <- sample(1:n, floor(n*4/5)) # draw (4/5)*n random numbers from 1 to n without replacing -train <- data[random,] -test <- data[-random,] +random <- sample(1:n, floor(n * 4 / 5)) # draw (4/5)*n random numbers from 1 to n without replacing +train <- data[random, ] +test <- data[-random, ] ``` @@ -127,18 +130,19 @@ Let us fit both models to our data by running ordinary least squares (ols): ```{r} # 1. 
basic model -basic <- lwage~ (sex + exp1 + shs + hsg+ scl + clg + mw + so + we +C(occ2) + C(ind2)) -regbasic <- lm(basic, data=train) # perform ols using the defined model -cat( "Number of regressors in the basic model:",length(regbasic$coef), '\n') # number of regressors in the Basic Model +basic <- lwage ~ (sex + exp1 + shs + hsg+ scl + clg + mw + so + we +C(occ2) + C(ind2)) +regbasic <- lm(basic, data = train) # perform ols using the defined model +cat("Number of regressors in the basic model:", length(regbasic$coef), '\n') # number of regressors in the Basic Model ``` ##### Note that the basic model consists of $51$ regressors. ```{r} # 2. flexible model -flex <- lwage ~ sex + shs+hsg+scl+clg+mw+so+we+C(occ2)+C(ind2) + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we) -regflex <- lm(flex, data=train) -cat( "Number of regressors in the flexible model:",length(regflex$coef)) # number of regressors in the Flexible Model +flex <- lwage ~ sex + shs + hsg + scl + clg + mw + so + we + C(occ2) + C(ind2) + + (exp1 + exp2 + exp3 + exp4) * (shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we) +regflex <- lm(flex, data = train) +cat( "Number of regressors in the flexible model:", length(regflex$coef)) # number of regressors in the Flexible Model ``` ##### Note that the flexible model consists of $246$ regressors. @@ -148,9 +152,10 @@ We re-estimate the flexible model using Lasso (the least absolute shrinkage and ```{r} # Flexible model using Lasso, in-sample fit -train_flex <- model.matrix(flex,train) # all regressors -fit.lasso.cv <- cv.glmnet(as.matrix(train_flex), train$lwage, family="gaussian", alpha=1, nfolds=5) -yhat.lasso.cv <- predict(fit.lasso.cv, newx = as.matrix(train_flex), s = "lambda.min") # in-sample fit right now, not out-of-sample using "test" +train_flex <- model.matrix(flex, train) # all regressors +fit.lasso.cv <- cv.glmnet(as.matrix(train_flex), train$lwage, family = "gaussian", alpha = 1, nfolds = 5) +# in-sample fit right now, not out-of-sample using "test" +yhat.lasso.cv <- predict(fit.lasso.cv, newx = as.matrix(train_flex), s = "lambda.min") ``` #### Evaluating the predictive performance of the basic and flexible models in-sample @@ -176,41 +181,41 @@ R2.adj2 <- sumflex$adj.r.squared cat("adjusted R-squared for the flexible model: ", R2.adj2, "\n") pL <- fit.lasso.cv$nzero[fit.lasso.cv$index[1]] -R2.L <- 1-(sum((yhat.lasso.cv - train$lwage) ^ 2))/(sum((train$lwage - mean(train$lwage)) ^ 2)) # fit.lasso.cv$glmnet.fit$dev.ratio # +R2.L <- 1 - sum((yhat.lasso.cv - train$lwage)^2) / sum((train$lwage - mean(train$lwage))^2) cat("R-squared for the lasso with flexible model: ", R2.L, "\n") -R2.adjL <- 1-(sum((yhat.lasso.cv - train$lwage) ^ 2)/(ntrain-pL-1))/(sum((train$lwage - mean(train$lwage)) ^ 2)/(ntrain-1)) +R2.adjL <- 1 - (sum((yhat.lasso.cv - train$lwage)^2) / (ntrain - pL - 1)) / (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1)) cat("adjusted R-squared for the flexible model: ", R2.adjL, "\n") # MSE and adjusted MSE MSE1 <- mean(sumbasic$res^2) cat("MSE for the basic model: ", MSE1, "\n") p1 <- sumbasic$df[1] # number of regressors -MSE.adj1 <- (ntrain/(ntrain-p1))*MSE1 +MSE.adj1 <- (ntrain / (ntrain - p1)) * MSE1 cat("adjusted MSE for the basic model: ", MSE.adj1, "\n") MSE2 <-mean(sumflex$res^2) cat("MSE for the flexible model: ", MSE2, "\n") p2 <- sumflex$df[1] -MSE.adj2 <- (ntrain/(ntrain-p2))*MSE2 +MSE.adj2 <- (ntrain / (ntrain - p2)) * MSE2 cat("adjusted MSE for the lasso flexible model: ", MSE.adj2, "\n") lasso.res <- train$lwage - yhat.lasso.cv MSEL 
<-mean(lasso.res^2) cat("MSE for the lasso flexible model: ", MSEL, "\n") -MSE.adjL <- (ntrain/(ntrain-pL-1))*MSEL +MSE.adjL <- (ntrain / (ntrain - pL - 1)) * MSEL cat("adjusted MSE for the lasso flexible model: ", MSE.adjL, "\n") ``` ```{r} # Output the table table <- matrix(0, 3, 5) -table[1,1:5] <- c(p1,R2.1,MSE1,R2.adj1,MSE.adj1) -table[2,1:5] <- c(p2,R2.2,MSE2,R2.adj2,MSE.adj2) -table[3,1:5] <- c(pL,R2.L,MSEL,R2.adjL,MSE.adjL) -colnames(table)<- c("p","$R^2_{sample}$","$MSE_{sample}$","$R^2_{adjusted}$", "$MSE_{adjusted}$") -rownames(table)<- c("basic reg","flexible reg", "lasso flex") -tab<- xtable(table, digits =c(0,0,2,2,2,2)) -print(tab,type="latex") +table[1, 1:5] <- c(p1, R2.1, MSE1, R2.adj1, MSE.adj1) +table[2, 1:5] <- c(p2, R2.2, MSE2, R2.adj2, MSE.adj2) +table[3, 1:5] <- c(pL, R2.L, MSEL, R2.adjL, MSE.adjL) +colnames(table) <- c("p", "$R^2_{sample}$", "$MSE_{sample}$", "$R^2_{adjusted}$", "$MSE_{adjusted}$") +rownames(table) <- c("basic reg", "flexible reg", "lasso flex") +tab <- xtable(table, digits = c(0, 0, 2, 2, 2, 2)) +print(tab, type = "latex") tab ``` @@ -226,15 +231,15 @@ Now that we have seen in-sample fit, we evaluate our models on the out-of-sample ```{r} # basic model -options(warn=-1) # ignore warnings -regbasic <- lm(basic, data=train) +options(warn = -1) # ignore warnings +regbasic <- lm(basic, data = train) # calculating the out-of-sample MSE -yhat.bas <- predict(regbasic, newdata=test) +yhat.bas <- predict(regbasic, newdata = test) y.test <- test$lwage -mean.train = mean(train$lwage) -MSE.test1 <- sum((y.test-yhat.bas)^2)/length(y.test) -R2.test1<- 1- MSE.test1/mean((y.test-mean.train)^2) +mean.train <- mean(train$lwage) +MSE.test1 <- sum((y.test - yhat.bas)^2) / length(y.test) +R2.test1 <- 1 - MSE.test1 / mean((y.test - mean.train)^2) cat("Test MSE for the basic model: ", MSE.test1, " ") cat("Test R2 for the basic model: ", R2.test1) @@ -244,15 +249,15 @@ In the basic model, the $MSE_{test}$ is quite close to the $MSE_{sample}$. 
```{r} # flexible model -options(warn=-1) # ignore warnings -regflex <- lm(flex, data=train) +options(warn = -1) # ignore warnings +regflex <- lm(flex, data = train) # calculating the out-of-sample MSE -yhat.flex<- predict(regflex, newdata=test) +yhat.flex<- predict(regflex, newdata = test) y.test <- test$lwage -mean.train = mean(train$lwage) -MSE.test2 <- sum((y.test-yhat.flex)^2)/length(y.test) -R2.test2<- 1- MSE.test2/mean((y.test-mean.train)^2) +mean.train <- mean(train$lwage) +MSE.test2 <- sum((y.test - yhat.flex)^2) / length(y.test) +R2.test2 <- 1 - MSE.test2 / mean((y.test - mean.train)^2) cat("Test MSE for the flexible model: ", MSE.test2, " ") @@ -270,16 +275,16 @@ Next, let us use lasso regression in the flexible model instead of ols regressio ```{r} # Flexible model using Lasso # model matrix should be formed before train/test as some levels dropped -flex_data = model.matrix(flex,data) -train_flex <- flex_data[random,] -test_flex <- flex_data[-random,] +flex_data <- model.matrix(flex, data) +train_flex <- flex_data[random, ] +test_flex <- flex_data[-random, ] -fit.lasso.cv <- cv.glmnet(train_flex, train$lwage, family="gaussian", alpha=1, nfolds=5) -yhat.lasso.cv <- predict(fit.lasso.cv, newx = test_flex, s = "lambda.min") +fit.lasso.cv <- cv.glmnet(train_flex, train$lwage, family = "gaussian", alpha = 1, nfolds = 5) +yhat.lasso.cv <- predict(fit.lasso.cv, newx = test_flex, s = "lambda.min") # calculating the out-of-sample MSE -MSE.lasso <- sum((y.test-yhat.lasso.cv)^2)/length(y.test) -R2.lasso<- 1- MSE.lasso/mean((y.test-mean(train$lwage))^2) +MSE.lasso <- sum((y.test - yhat.lasso.cv)^2) / length(y.test) +R2.lasso <- 1 - MSE.lasso / mean((y.test - mean(train$lwage))^2) cat("Test MSE for the lasso on flexible model: ", MSE.lasso, " ") @@ -290,22 +295,22 @@ Finally, let us summarize the results: ```{r} # Output the comparison table -table2 <- matrix(0, 3,2) -table2[1,1] <- MSE.test1 -table2[2,1] <- MSE.test2 -table2[3,1] <- MSE.lasso -table2[1,2] <- R2.test1 -table2[2,2] <- R2.test2 -table2[3,2] <- R2.lasso - -rownames(table2)<- c("basic reg","flexible reg","lasso regression") -colnames(table2)<- c("$MSE_{test}$", "$R^2_{test}$") -tab2 <- xtable(table2, digits =3) +table2 <- matrix(0, 3, 2) +table2[1, 1] <- MSE.test1 +table2[2, 1] <- MSE.test2 +table2[3, 1] <- MSE.lasso +table2[1, 2] <- R2.test1 +table2[2, 2] <- R2.test2 +table2[3, 2] <- R2.lasso + +rownames(table2) <- c("basic reg", "flexible reg", "lasso regression") +colnames(table2) <- c("$MSE_{test}$", "$R^2_{test}$") +tab2 <- xtable(table2, digits = 3) tab2 ``` ```{r} -print(tab2,type="latex") +print(tab2, type = "latex") ``` ## Extra flexible model and Overfitting @@ -314,12 +319,13 @@ Given the results above, it is not immediately clear why one would choose to use ```{r} # extra flexible model -extraflex <- lwage ~ sex + (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)^2 -regextra <- lm(extraflex, data=train) +extraflex <- lwage ~ sex + (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)^2 +regextra <- lm(extraflex, data = train) sumextra <- summary(regextra) -cat("Number of Extra-Flex Controls", length(regextra$coef)-1, "\n") -n= length(data$wage); p =length(regextra$coef); -ntrain = length(train$wage) +cat("Number of Extra-Flex Controls", length(regextra$coef) - 1, "\n") +n <- length(data$wage) +p <- length(regextra$coef); +ntrain <- length(train$wage) ``` ```{r} @@ -333,7 +339,7 @@ cat("adjusted R-squared for the extra flexible model (in-sample): ", R2.adjextra # MSE and 
adjusted MSE MSE.extra <- mean(sumextra$res^2) cat("MSE for the extra flexible model (in-sample): ", MSE.extra, "\n") -MSE.adjextra <- (ntrain/(ntrain-p))*MSE.extra +MSE.adjextra <- (ntrain / (ntrain - p)) * MSE.extra cat("adjusted MSE for the basic model (in-sample): ", MSE.adj1, "\n") ``` @@ -341,8 +347,8 @@ cat("adjusted MSE for the basic model (in-sample): ", MSE.adj1, "\n") ## Out-of-sample yhat.ex <- predict(regextra, newdata=test) y.test.ex <- test$lwage -MSE.test.ex <- sum((y.test.ex-yhat.ex)^2)/length(y.test.ex) -R2.test.ex<- 1- MSE.test.ex/mean((y.test.ex-mean(train$lwage))^2) +MSE.test.ex <- sum((y.test.ex - yhat.ex)^2) / length(y.test.ex) +R2.test.ex <- 1 - MSE.test.ex / mean((y.test.ex - mean(train$lwage))^2) cat("Test MSE for the basic model: ", MSE.test.ex, " ") cat("Test R2 for the basic model: ", R2.test.ex) @@ -354,23 +360,23 @@ Contrast this with Lasso: ```{r} # model matrix should be formed before train/test as some levels dropped -flex_data = model.matrix(extraflex,data) -train_flex <- flex_data[random,] -test_flex <- flex_data[-random,] +flex_data <- model.matrix(extraflex, data) +train_flex <- flex_data[random, ] +test_flex <- flex_data[-random, ] # fit model -fit.lcv <- cv.glmnet(train_flex, train$lwage, family="gaussian", alpha=1, nfolds=5) +fit.lcv <- cv.glmnet(train_flex, train$lwage, family = "gaussian", alpha = 1, nfolds = 5) # in-sample yhat.lcv <- predict(fit.lcv, newx = train_flex, s = "lambda.min") -R2.L <- 1-(sum((yhat.lcv - train$lwage) ^ 2))/(sum((train$lwage - mean(train$lwage)) ^ 2)) # fit.lasso.cv$glmnet.fit$dev.ratio # +R2.L <- 1 - sum((yhat.lcv - train$lwage)^2) / sum((train$lwage - mean(train$lwage))^2) pL <- fit.lcv$nzero[fit.lcv$index[1]] -R2.adjL <- 1-(sum((yhat.lcv - train$lwage) ^ 2)/(ntrain-pL-1))/(sum((train$lwage - mean(train$lwage)) ^ 2)/(ntrain-1)) +R2.adjL <- 1 - (sum((yhat.lcv - train$lwage) ^ 2) / (ntrain - pL - 1)) / (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1)) lasso.res <- train$lwage - yhat.lcv -MSEL <-mean(lasso.res^2) -MSE.adjL <- (ntrain/(ntrain-pL-1))*MSEL +MSEL <- mean(lasso.res^2) +MSE.adjL <- (ntrain / (ntrain - pL - 1)) * MSEL cat("R-squared for the lasso with the extra flexible model (in-sample): ", R2.L, "\n") cat("adjusted R-squared for the extra flexible model (in-sample): ", R2.adjL, "\n") @@ -379,8 +385,8 @@ cat("adjusted MSE for the lasso with the extraflexible model (in-sample): ", MSE # out-of-sample yhat.lcv.test <- predict(fit.lcv, newx = test_flex, s = "lambda.min") -MSE.lasso <- sum((test$lwage-yhat.lcv.test)^2)/length(test$lwage) -R2.lasso <- 1- MSE.lasso/mean((test$lwage-mean(train$lwage))^2) +MSE.lasso <- sum((test$lwage - yhat.lcv.test)^2) / length(test$lwage) +R2.lasso <- 1 - MSE.lasso / mean((test$lwage - mean(train$lwage))^2) cat("\n") cat("Test R2 for the lasso the extra flexible model: ", R2.lasso,"\n") From eea3fbd93c8a05dab05a727dda993d628fd79c7e Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 10:53:48 -0700 Subject: [PATCH 024/261] Update r-ols-and-lasso-for-wage-gap-inference.irnb --- ...-ols-and-lasso-for-wage-gap-inference.irnb | 53 ++++++++++--------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb b/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb index 6ad20300..5ab2bea2 100644 --- a/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb +++ b/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb @@ -127,7 +127,8 @@ }, "outputs": [], "source": [ - "z <- data[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", 
\"hsg\", \"scl\", \"clg\", \"ad\", \"ne\", \"mw\", \"so\", \"we\", \"exp1\"))]\n", + "z <- data[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", \"hsg\", \"scl\", \"clg\",\n", + " \"ad\", \"ne\", \"mw\", \"so\", \"we\", \"exp1\"))]\n", "\n", "data_female <- data[data$sex == 1, ]\n", "z_female <- data_female[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", \"hsg\", \"scl\",\n", @@ -268,12 +269,12 @@ "nocontrol_fit <- lm(lwage ~ sex, data = data)\n", "nocontrol_est <- summary(nocontrol_fit)$coef[\"sex\", 1]\n", "# HC - \"heteroskedasticity cosistent\" -- HC3 is the SE that remains consistent in high dimensions\n", - "hcv_coefs <- vcovHC(nocontrol_fit, type = 'HC3')\n", + "hcv_coefs <- vcovHC(nocontrol_fit, type = \"HC3\")\n", "nocontrol_se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors\n", "\n", "# print unconditional effect of gender and the corresponding standard error\n", - "cat (\"The estimated coefficient on the dummy for gender is\", nocontrol_est,\n", - " \" and the corresponding robust standard error is\", nocontrol_se)\n" + "cat(\"The estimated coefficient on the dummy for gender is\", nocontrol_est,\n", + " \" and the corresponding robust standard error is\", nocontrol_se)\n" ] }, { @@ -330,8 +331,8 @@ "flex <- lwage ~ sex + (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", "\n", "# Note that ()*() operation in formula objects in R creates a formula of the sort:\n", - "# (exp1+exp2+exp3+exp4) + (shs+hsg+scl+clg+occ2+ind2+mw+so+we) \n", - "# + (exp1+exp2+exp3+exp4) * (shs+hsg+scl+clg+occ2+ind2+mw+so+we)\n", + "# '(exp1+exp2+exp3+exp4) + (shs+hsg+scl+clg+occ2+ind2+mw+so+we)\n", + "# + (exp1+exp2+exp3+exp4) * (shs+hsg+scl+clg+occ2+ind2+mw+so+we)'\n", "# This is not intuitive at all, but that's what it does.\n", "\n", "control_fit <- lm(flex, data = data)\n", @@ -341,7 +342,7 @@ "\n", "cat(\"Coefficient for OLS with controls\", control_est)\n", "\n", - "hcv_coefs <- vcovHC(control_fit, type = 'HC3');\n", + "hcv_coefs <- vcovHC(control_fit, type = \"HC3\")\n", "control_se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors" ] }, @@ -373,10 +374,10 @@ "outputs": [], "source": [ "xx0 <- model.matrix(~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we),\n", - " data=data[data$sex == 0, ])\n", + " data = data[data$sex == 0, ])\n", "y0 <- data[data$sex == 0, ]$lwage\n", "xx1 <- model.matrix(~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we),\n", - " data=data[data$sex == 1, ])\n", + " data = data[data$sex == 1, ])\n", "y1 <- data[data$sex == 1, ]$lwage\n", "mu1 <- colMeans(xx1)\n", "mu0 <- colMeans(xx0)\n", @@ -409,8 +410,8 @@ "source": [ "svd0 <- svd(xx0)\n", "svd1 <- svd(xx1)\n", - "svd0$d[svd0$d <= 1e-10] = 0\n", - "svd0$d[svd0$d > 1e-10] = 1 / svd0$d[svd0$d > 1e-10]\n", + "svd0$d[svd0$d <= 1e-10] <- 0\n", + "svd0$d[svd0$d > 1e-10] <- 1 / svd0$d[svd0$d > 1e-10]\n", "beta0 <- (svd0$v %*% (svd0$d * svd0$d * t(svd0$v))) %*% t(xx0) %*% y0\n", "svd1$d[svd1$d <= 1e-10] <- 0\n", "svd1$d[svd1$d > 1e-10] <- 1 / svd1$d[svd1$d > 1e-10]\n", @@ -463,17 +464,17 @@ "t_d <- lm(flex_d, data = data)$res\n", "\n", "# regression of Y on D after partialling-out the effect of W\n", - "partial.fit <- lm(t_y ~ t_d)\n", - "partial_est <- summary(partial.fit)$coef[2, 1]\n", + "partial_fit <- lm(t_y ~ t_d)\n", + "partial_est <- summary(partial_fit)$coef[2, 1]\n", "\n", "cat(\"Coefficient for D via partialling-out\", partial_est)\n", "\n", "# standard error\n", - "hcv_coefs <- 
vcovHC(partial.fit, type = 'HC3')\n", + "hcv_coefs <- vcovHC(partial_fit, type = \"HC3\")\n", "partial_se <- sqrt(diag(hcv_coefs))[2]\n", "\n", "# confidence interval\n", - "confint(partial.fit)[2, ]" + "confint(partial_fit)[2, ]" ] }, { @@ -526,13 +527,13 @@ "t_d <- rlasso(flex_d, data = data)$res\n", "\n", "# regression of Y on D after partialling-out the effect of W\n", - "partial.lasso.fit <- lm(t_y ~ t_d)\n", - "partial_lasso_est <- summary(partial.lasso.fit)$coef[2, 1]\n", + "partial_lasso_fit <- lm(t_y ~ t_d)\n", + "partial_lasso_est <- summary(partial_lasso_fit)$coef[2, 1]\n", "\n", "cat(\"Coefficient for D via partialling-out using lasso\", partial_lasso_est)\n", "\n", "# standard error\n", - "hcv_coefs <- vcovHC(partial.lasso.fit, type = 'HC3')\n", + "hcv_coefs <- vcovHC(partial_lasso_fit, type = \"HC3\")\n", "partial_lasso_se <- sqrt(diag(hcv_coefs))[2]" ] }, @@ -581,7 +582,7 @@ "table[4, 2] <- partial_lasso_se\n", "colnames(table) <- c(\"Estimate\", \"Std. Error\")\n", "rownames(table) <- c(\"Without controls\", \"full reg\", \"partial reg\", \"partial reg via lasso\")\n", - "tab<- xtable(table, digits = c(3, 3, 4))\n", + "tab <- xtable(table, digits = c(3, 3, 4))\n", "tab" ] }, @@ -663,7 +664,7 @@ "source": [ "set.seed(2724)\n", "subset_size <- 1000\n", - "random <- sample(1:nrow(data), subset_size)\n", + "random <- sample(1:seq_len(nrow(data)), subset_size)\n", "subset <- data[random, ]" ] }, @@ -717,7 +718,7 @@ "p <- length(control_fit$coef)\n", "\n", "# HC0 SE\n", - "hcv_coefs_hc0 <- vcovHC(control_fit, type = 'HC0')\n", + "hcv_coefs_hc0 <- vcovHC(control_fit, type = \"HC0\")\n", "control_se_hc0 <- sqrt(diag(hcv_coefs_hc0))[2]\n", "\n", "# For a more correct approach, we\n", @@ -727,9 +728,9 @@ "# really work here.\n", "coefs <- hatvalues(control_fit)\n", "trim <- 0.99999999999\n", - "coefs_trimmed <- coefs*(coefs < trim) + trim * (coefs >= trim)\n", + "coefs_trimmed <- coefs * (coefs < trim) + trim * (coefs >= trim)\n", "omega <- (control_fit$residuals^2) / ((1 - coefs_trimmed)^2)\n", - "hcv_coefs <- vcovHC(control_fit, omega = as.vector(omega), type = 'HC3')\n", + "hcv_coefs <- vcovHC(control_fit, omega = as.vector(omega), type = \"HC3\")\n", "control_se_hc3 <- sqrt(diag(hcv_coefs))[2]" ] }, @@ -759,13 +760,13 @@ "t_d <- rlasso(extraflex_d, data = subset)$res\n", "\n", "# regression of Y on D after partialling-out the effect of W\n", - "partial.lasso.fit <- lm(t_y ~ t_d)\n", - "partial_lasso_est <- summary(partial.lasso.fit)$coef[2, 1]\n", + "partial_lasso_fit <- lm(t_y ~ t_d)\n", + "partial_lasso_est <- summary(partial_lasso_fit)$coef[2, 1]\n", "\n", "cat(\"Coefficient for D via partialling-out using lasso\", partial_lasso_est)\n", "\n", "# standard error\n", - "hcv_coefs <- vcovHC(partial.lasso.fit, type = 'HC3')\n", + "hcv_coefs <- vcovHC(partial_lasso_fit, type = \"HC3\")\n", "partial_lasso_se <- sqrt(diag(hcv_coefs))[2]" ] }, From a383eb37c782e42f9a2a54df5139fa6a3d41b950 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 13 Jul 2024 18:04:13 +0000 Subject: [PATCH 025/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files --- ...r-ols-and-lasso-for-wage-gap-inference.Rmd | 53 ++++++++++--------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd b/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd index a662a61f..6e732620 100644 --- a/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd +++ b/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd @@ -44,7 +44,8 @@ dim(data) To start 
our (causal) analysis, we compare the sample means given gender: ```{r} -z <- data[which(colnames(data) %in% c("lwage", "sex", "shs", "hsg", "scl", "clg", "ad", "ne", "mw", "so", "we", "exp1"))] +z <- data[which(colnames(data) %in% c("lwage", "sex", "shs", "hsg", "scl", "clg", + "ad", "ne", "mw", "so", "we", "exp1"))] data_female <- data[data$sex == 1, ] z_female <- data_female[which(colnames(data) %in% c("lwage", "sex", "shs", "hsg", "scl", @@ -107,12 +108,12 @@ We verify this by running an ols regression in R. nocontrol_fit <- lm(lwage ~ sex, data = data) nocontrol_est <- summary(nocontrol_fit)$coef["sex", 1] # HC - "heteroskedasticity cosistent" -- HC3 is the SE that remains consistent in high dimensions -hcv_coefs <- vcovHC(nocontrol_fit, type = 'HC3') +hcv_coefs <- vcovHC(nocontrol_fit, type = "HC3") nocontrol_se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors # print unconditional effect of gender and the corresponding standard error -cat ("The estimated coefficient on the dummy for gender is", nocontrol_est, - " and the corresponding robust standard error is", nocontrol_se) +cat("The estimated coefficient on the dummy for gender is", nocontrol_est, + " and the corresponding robust standard error is", nocontrol_se) ``` Note that the standard error is computed with the *R* package *sandwich* to be robust to heteroskedasticity. @@ -133,8 +134,8 @@ Let us run the ols regression with controls. flex <- lwage ~ sex + (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) # Note that ()*() operation in formula objects in R creates a formula of the sort: -# (exp1+exp2+exp3+exp4) + (shs+hsg+scl+clg+occ2+ind2+mw+so+we) -# + (exp1+exp2+exp3+exp4) * (shs+hsg+scl+clg+occ2+ind2+mw+so+we) +# '(exp1+exp2+exp3+exp4) + (shs+hsg+scl+clg+occ2+ind2+mw+so+we) +# + (exp1+exp2+exp3+exp4) * (shs+hsg+scl+clg+occ2+ind2+mw+so+we)' # This is not intuitive at all, but that's what it does. 
control_fit <- lm(flex, data = data) @@ -144,7 +145,7 @@ summary(control_fit) cat("Coefficient for OLS with controls", control_est) -hcv_coefs <- vcovHC(control_fit, type = 'HC3'); +hcv_coefs <- vcovHC(control_fit, type = "HC3") control_se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors ``` @@ -155,10 +156,10 @@ We now show how the conditional gap and the remainder decompose the marginal wag ```{r} xx0 <- model.matrix(~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we), - data=data[data$sex == 0, ]) + data = data[data$sex == 0, ]) y0 <- data[data$sex == 0, ]$lwage xx1 <- model.matrix(~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we), - data=data[data$sex == 1, ]) + data = data[data$sex == 1, ]) y1 <- data[data$sex == 1, ]$lwage mu1 <- colMeans(xx1) mu0 <- colMeans(xx0) @@ -177,8 +178,8 @@ We next consider a Oaxaca-Blinder decomposition that also incorporates an intera ```{r} svd0 <- svd(xx0) svd1 <- svd(xx1) -svd0$d[svd0$d <= 1e-10] = 0 -svd0$d[svd0$d > 1e-10] = 1 / svd0$d[svd0$d > 1e-10] +svd0$d[svd0$d <= 1e-10] <- 0 +svd0$d[svd0$d > 1e-10] <- 1 / svd0$d[svd0$d > 1e-10] beta0 <- (svd0$v %*% (svd0$d * svd0$d * t(svd0$v))) %*% t(xx0) %*% y0 svd1$d[svd1$d <= 1e-10] <- 0 svd1$d[svd1$d > 1e-10] <- 1 / svd1$d[svd1$d > 1e-10] @@ -209,17 +210,17 @@ t_y <- lm(flex_y, data = data)$res t_d <- lm(flex_d, data = data)$res # regression of Y on D after partialling-out the effect of W -partial.fit <- lm(t_y ~ t_d) -partial_est <- summary(partial.fit)$coef[2, 1] +partial_fit <- lm(t_y ~ t_d) +partial_est <- summary(partial_fit)$coef[2, 1] cat("Coefficient for D via partialling-out", partial_est) # standard error -hcv_coefs <- vcovHC(partial.fit, type = 'HC3') +hcv_coefs <- vcovHC(partial_fit, type = "HC3") partial_se <- sqrt(diag(hcv_coefs))[2] # confidence interval -confint(partial.fit)[2, ] +confint(partial_fit)[2, ] ``` Again, the estimated coefficient measures the linear predictive effect (PE) of $D$ on $Y$ after taking out the linear effect of $W$ on both of these variables. This coefficient is numerically equivalent to the estimated coefficient from the ols regression with controls, confirming the FWL theorem. @@ -244,13 +245,13 @@ t_y <- rlasso(flex_y, data = data)$res t_d <- rlasso(flex_d, data = data)$res # regression of Y on D after partialling-out the effect of W -partial.lasso.fit <- lm(t_y ~ t_d) -partial_lasso_est <- summary(partial.lasso.fit)$coef[2, 1] +partial_lasso_fit <- lm(t_y ~ t_d) +partial_lasso_est <- summary(partial_lasso_fit)$coef[2, 1] cat("Coefficient for D via partialling-out using lasso", partial_lasso_est) # standard error -hcv_coefs <- vcovHC(partial.lasso.fit, type = 'HC3') +hcv_coefs <- vcovHC(partial_lasso_fit, type = "HC3") partial_lasso_se <- sqrt(diag(hcv_coefs))[2] ``` @@ -270,7 +271,7 @@ table[4, 1] <- partial_lasso_est table[4, 2] <- partial_lasso_se colnames(table) <- c("Estimate", "Std. Error") rownames(table) <- c("Without controls", "full reg", "partial reg", "partial reg via lasso") -tab<- xtable(table, digits = c(3, 3, 4)) +tab <- xtable(table, digits = c(3, 3, 4)) tab ``` @@ -300,7 +301,7 @@ Next we motivate the usage of lasso. 
We try an "extra" flexible model, where we ```{r} set.seed(2724) subset_size <- 1000 -random <- sample(1:nrow(data), subset_size) +random <- sample(1:seq_len(nrow(data)), subset_size) subset <- data[random, ] ``` @@ -333,7 +334,7 @@ n <- subset_size p <- length(control_fit$coef) # HC0 SE -hcv_coefs_hc0 <- vcovHC(control_fit, type = 'HC0') +hcv_coefs_hc0 <- vcovHC(control_fit, type = "HC0") control_se_hc0 <- sqrt(diag(hcv_coefs_hc0))[2] # For a more correct approach, we @@ -343,9 +344,9 @@ control_se_hc0 <- sqrt(diag(hcv_coefs_hc0))[2] # really work here. coefs <- hatvalues(control_fit) trim <- 0.99999999999 -coefs_trimmed <- coefs*(coefs < trim) + trim * (coefs >= trim) +coefs_trimmed <- coefs * (coefs < trim) + trim * (coefs >= trim) omega <- (control_fit$residuals^2) / ((1 - coefs_trimmed)^2) -hcv_coefs <- vcovHC(control_fit, omega = as.vector(omega), type = 'HC3') +hcv_coefs <- vcovHC(control_fit, omega = as.vector(omega), type = "HC3") control_se_hc3 <- sqrt(diag(hcv_coefs))[2] ``` @@ -361,13 +362,13 @@ t_y <- rlasso(extraflex_y, data = subset)$res t_d <- rlasso(extraflex_d, data = subset)$res # regression of Y on D after partialling-out the effect of W -partial.lasso.fit <- lm(t_y ~ t_d) -partial_lasso_est <- summary(partial.lasso.fit)$coef[2, 1] +partial_lasso_fit <- lm(t_y ~ t_d) +partial_lasso_est <- summary(partial_lasso_fit)$coef[2, 1] cat("Coefficient for D via partialling-out using lasso", partial_lasso_est) # standard error -hcv_coefs <- vcovHC(partial.lasso.fit, type = 'HC3') +hcv_coefs <- vcovHC(partial_lasso_fit, type = "HC3") partial_lasso_se <- sqrt(diag(hcv_coefs))[2] ``` From 12dab4847c76a8a597785df80a487b5bcf5348a8 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 12:32:32 -0700 Subject: [PATCH 026/261] Update r-ols-and-lasso-for-wage-gap-inference.irnb --- PM1/r-ols-and-lasso-for-wage-gap-inference.irnb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb b/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb index 5ab2bea2..eb87484b 100644 --- a/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb +++ b/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb @@ -664,7 +664,7 @@ "source": [ "set.seed(2724)\n", "subset_size <- 1000\n", - "random <- sample(1:seq_len(nrow(data)), subset_size)\n", + "random <- sample(seq_len(nrow(data)), subset_size)\n", "subset <- data[random, ]" ] }, From d2cc9330a390e538445e42a63e0a0a8e68809b52 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 13 Jul 2024 19:42:36 +0000 Subject: [PATCH 027/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files --- PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd b/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd index 6e732620..79fd5a70 100644 --- a/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd +++ b/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd @@ -301,7 +301,7 @@ Next we motivate the usage of lasso. 
We try an "extra" flexible model, where we ```{r} set.seed(2724) subset_size <- 1000 -random <- sample(1:seq_len(nrow(data)), subset_size) +random <- sample(seq_len(nrow(data)), subset_size) subset <- data[random, ] ``` From 9eab4fb3398b5fcd80d667d7f840fe23b9f247ad Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 12:56:15 -0700 Subject: [PATCH 028/261] Update r-ols-and-lasso-for-wage-prediction.irnb --- PM1/r-ols-and-lasso-for-wage-prediction.irnb | 208 ++++++++++--------- 1 file changed, 106 insertions(+), 102 deletions(-) diff --git a/PM1/r-ols-and-lasso-for-wage-prediction.irnb b/PM1/r-ols-and-lasso-for-wage-prediction.irnb index 957df718..51721b72 100644 --- a/PM1/r-ols-and-lasso-for-wage-prediction.irnb +++ b/PM1/r-ols-and-lasso-for-wage-prediction.irnb @@ -144,13 +144,13 @@ "outputs": [], "source": [ "# construct matrices for estimation from the data\n", - "Y <- log(data$wage)\n", - "n <- length(Y)\n", - "Z <- data[- which(colnames(data) %in% c(\"wage\", \"lwage\"))]\n", - "p <- dim(Z)[2]\n", + "y <- log(data$wage)\n", + "n <- length(y)\n", + "z <- data[- which(colnames(data) %in% c(\"wage\", \"lwage\"))]\n", + "p <- dim(z)[2]\n", "\n", - "cat(\"Number of observations:\", n, '\\n')\n", - "cat( \"Number of raw regressors:\", p)" + "cat(\"Number of observations:\", n, \"\\n\")\n", + "cat(\"Number of raw regressors:\", p)" ] }, { @@ -326,9 +326,10 @@ "outputs": [], "source": [ "# 1. basic model\n", - "basic <- lwage ~ (sex + exp1 + shs + hsg+ scl + clg + mw + so + we +C(occ2) + C(ind2))\n", + "basic <- lwage ~ (sex + exp1 + shs + hsg + scl + clg + mw + so + we + C(occ2) + C(ind2))\n", "regbasic <- lm(basic, data = train) # perform ols using the defined model\n", - "cat(\"Number of regressors in the basic model:\", length(regbasic$coef), '\\n') # number of regressors in the Basic Model" + "# number of regressors in the Basic Model\n", + "cat(\"Number of regressors in the basic model:\", length(regbasic$coef), \"\\n\")" ] }, { @@ -353,9 +354,10 @@ "source": [ "# 2. 
flexible model\n", "flex <- lwage ~ sex + shs + hsg + scl + clg + mw + so + we + C(occ2) + C(ind2) +\n", - " (exp1 + exp2 + exp3 + exp4) * (shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)\n", + " (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", "regflex <- lm(flex, data = train)\n", - "cat( \"Number of regressors in the flexible model:\", length(regflex$coef)) # number of regressors in the Flexible Model" + "# number of regressors in the Flexible Model\n", + "cat(\"Number of regressors in the flexible model:\", length(regflex$coef))" ] }, { @@ -390,9 +392,9 @@ "source": [ "# Flexible model using Lasso, in-sample fit\n", "train_flex <- model.matrix(flex, train) # all regressors\n", - "fit.lasso.cv <- cv.glmnet(as.matrix(train_flex), train$lwage, family = \"gaussian\", alpha = 1, nfolds = 5)\n", + "fit_lasso_cv <- cv.glmnet(as.matrix(train_flex), train$lwage, family = \"gaussian\", alpha = 1, nfolds = 5)\n", "# in-sample fit right now, not out-of-sample using \"test\"\n", - "yhat.lasso.cv <- predict(fit.lasso.cv, newx = as.matrix(train_flex), s = \"lambda.min\")" + "yhat_lasso_cv <- predict(fit_lasso_cv, newx = as.matrix(train_flex), s = \"lambda.min\")" ] }, { @@ -421,43 +423,44 @@ "sumflex <- summary(regflex)\n", "# no summary() for lassocv\n", "\n", - "ntrain = nrow(train)\n", + "ntrain <- nrow(train)\n", "\n", "# R-squared and adjusted R-squared\n", - "R2.1 <- sumbasic$r.squared\n", - "cat(\"R-squared for the basic model: \", R2.1, \"\\n\")\n", - "R2.adj1 <- sumbasic$adj.r.squared\n", - "cat(\"adjusted R-squared for the basic model: \", R2.adj1, \"\\n\")\n", - "\n", - "R2.2 <- sumflex$r.squared\n", - "cat(\"R-squared for the flexible model: \", R2.2, \"\\n\")\n", - "R2.adj2 <- sumflex$adj.r.squared\n", - "cat(\"adjusted R-squared for the flexible model: \", R2.adj2, \"\\n\")\n", - "\n", - "pL <- fit.lasso.cv$nzero[fit.lasso.cv$index[1]]\n", - "R2.L <- 1 - sum((yhat.lasso.cv - train$lwage)^2) / sum((train$lwage - mean(train$lwage))^2)\n", - "cat(\"R-squared for the lasso with flexible model: \", R2.L, \"\\n\")\n", - "R2.adjL <- 1 - (sum((yhat.lasso.cv - train$lwage)^2) / (ntrain - pL - 1)) / (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1))\n", - "cat(\"adjusted R-squared for the flexible model: \", R2.adjL, \"\\n\")\n", + "r2_1 <- sumbasic$r.squared\n", + "cat(\"R-squared for the basic model: \", r2_1, \"\\n\")\n", + "r2_adj1 <- sumbasic$adj.r.squared\n", + "cat(\"adjusted R-squared for the basic model: \", r2_adj1, \"\\n\")\n", + "\n", + "r2_2 <- sumflex$r.squared\n", + "cat(\"R-squared for the flexible model: \", r2_2, \"\\n\")\n", + "r2_adj2 <- sumflex$adj.r.squared\n", + "cat(\"adjusted R-squared for the flexible model: \", r2_adj2, \"\\n\")\n", + "\n", + "p_l <- fit_lasso_cv$nzero[fit_lasso_cv$index[1]]\n", + "r2_l <- 1 - sum((yhat_lasso_cv - train$lwage)^2) / sum((train$lwage - mean(train$lwage))^2)\n", + "cat(\"R-squared for the lasso with flexible model: \", r2_l, \"\\n\")\n", + "r2_adj_l <- 1 -\n", + " (sum((yhat_lasso_cv - train$lwage)^2) / (ntrain - p_l - 1)) / (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1))\n", + "cat(\"adjusted R-squared for the flexible model: \", r2_adj_l, \"\\n\")\n", "\n", "# MSE and adjusted MSE\n", - "MSE1 <- mean(sumbasic$res^2)\n", - "cat(\"MSE for the basic model: \", MSE1, \"\\n\")\n", + "mse1 <- mean(sumbasic$res^2)\n", + "cat(\"MSE for the basic model: \", mse1, \"\\n\")\n", "p1 <- sumbasic$df[1] # number of regressors\n", - "MSE.adj1 <- (ntrain / (ntrain - p1)) * MSE1\n", - "cat(\"adjusted 
MSE for the basic model: \", MSE.adj1, \"\\n\")\n", + "mse_adj1 <- (ntrain / (ntrain - p1)) * mse1\n", + "cat(\"adjusted MSE for the basic model: \", mse_adj1, \"\\n\")\n", "\n", - "MSE2 <-mean(sumflex$res^2)\n", - "cat(\"MSE for the flexible model: \", MSE2, \"\\n\")\n", + "mse2 <- mean(sumflex$res^2)\n", + "cat(\"MSE for the flexible model: \", mse2, \"\\n\")\n", "p2 <- sumflex$df[1]\n", - "MSE.adj2 <- (ntrain / (ntrain - p2)) * MSE2\n", - "cat(\"adjusted MSE for the lasso flexible model: \", MSE.adj2, \"\\n\")\n", + "mse_adj2 <- (ntrain / (ntrain - p2)) * mse2\n", + "cat(\"adjusted MSE for the lasso flexible model: \", mse_adj2, \"\\n\")\n", "\n", - "lasso.res <- train$lwage - yhat.lasso.cv\n", - "MSEL <-mean(lasso.res^2)\n", - "cat(\"MSE for the lasso flexible model: \", MSEL, \"\\n\")\n", - "MSE.adjL <- (ntrain / (ntrain - pL - 1)) * MSEL\n", - "cat(\"adjusted MSE for the lasso flexible model: \", MSE.adjL, \"\\n\")" + "lasso_res <- train$lwage - yhat_lasso_cv\n", + "mse_l <- mean(lasso_res^2)\n", + "cat(\"MSE for the lasso flexible model: \", mse_l, \"\\n\")\n", + "mse_adj_l <- (ntrain / (ntrain - p_l - 1)) * mse_l\n", + "cat(\"adjusted MSE for the lasso flexible model: \", mse_adj_l, \"\\n\")" ] }, { @@ -473,9 +476,9 @@ "source": [ "# Output the table\n", "table <- matrix(0, 3, 5)\n", - "table[1, 1:5] <- c(p1, R2.1, MSE1, R2.adj1, MSE.adj1)\n", - "table[2, 1:5] <- c(p2, R2.2, MSE2, R2.adj2, MSE.adj2)\n", - "table[3, 1:5] <- c(pL, R2.L, MSEL, R2.adjL, MSE.adjL)\n", + "table[1, 1:5] <- c(p1, r2_1, mse1, r2_adj1, mse_adj1)\n", + "table[2, 1:5] <- c(p2, r2_2, mse2, r2_adj2, mse_adj2)\n", + "table[3, 1:5] <- c(pl, r2_l, mse_l, r2_adj_l, mse_adj_l)\n", "colnames(table) <- c(\"p\", \"$R^2_{sample}$\", \"$MSE_{sample}$\", \"$R^2_{adjusted}$\", \"$MSE_{adjusted}$\")\n", "rownames(table) <- c(\"basic reg\", \"flexible reg\", \"lasso flex\")\n", "tab <- xtable(table, digits = c(0, 0, 2, 2, 2, 2))\n", @@ -523,14 +526,14 @@ "regbasic <- lm(basic, data = train)\n", "\n", "# calculating the out-of-sample MSE\n", - "yhat.bas <- predict(regbasic, newdata = test)\n", - "y.test <- test$lwage\n", - "mean.train <- mean(train$lwage)\n", - "MSE.test1 <- sum((y.test - yhat.bas)^2) / length(y.test)\n", - "R2.test1 <- 1 - MSE.test1 / mean((y.test - mean.train)^2)\n", + "yhat_bas <- predict(regbasic, newdata = test)\n", + "y_test <- test$lwage\n", + "mean_train <- mean(train$lwage)\n", + "mse_test1 <- sum((y_test - yhat_bas)^2) / length(y_test)\n", + "r2_test1 <- 1 - mse_test1 / mean((y_test - mean_train)^2)\n", "\n", - "cat(\"Test MSE for the basic model: \", MSE.test1, \" \")\n", - "cat(\"Test R2 for the basic model: \", R2.test1)" + "cat(\"Test MSE for the basic model: \", mse_test1, \" \")\n", + "cat(\"Test R2 for the basic model: \", r2_test1)" ] }, { @@ -558,15 +561,15 @@ "regflex <- lm(flex, data = train)\n", "\n", "# calculating the out-of-sample MSE\n", - "yhat.flex<- predict(regflex, newdata = test)\n", - "y.test <- test$lwage\n", - "mean.train <- mean(train$lwage)\n", - "MSE.test2 <- sum((y.test - yhat.flex)^2) / length(y.test)\n", - "R2.test2 <- 1 - MSE.test2 / mean((y.test - mean.train)^2)\n", + "yhat_flex<- predict(regflex, newdata = test)\n", + "y_test <- test$lwage\n", + "mean_train <- mean(train$lwage)\n", + "mse_test2 <- sum((y_test - yhat_flex)^2) / length(y_test)\n", + "r2_test2 <- 1 - mse_test2 / mean((y_test - mean_train)^2)\n", "\n", - "cat(\"Test MSE for the flexible model: \", MSE.test2, \" \")\n", + "cat(\"Test MSE for the flexible model: \", mse_test2, \" \")\n", "\n", - "cat(\"Test 
R2 for the flexible model: \", R2.test2)" + "cat(\"Test R2 for the flexible model: \", r2_test2)" ] }, { @@ -608,16 +611,16 @@ "train_flex <- flex_data[random, ]\n", "test_flex <- flex_data[-random, ]\n", "\n", - "fit.lasso.cv <- cv.glmnet(train_flex, train$lwage, family = \"gaussian\", alpha = 1, nfolds = 5)\n", - "yhat.lasso.cv <- predict(fit.lasso.cv, newx = test_flex, s = \"lambda.min\")\n", + "fit_lasso_cv <- cv.glmnet(train_flex, train$lwage, family = \"gaussian\", alpha = 1, nfolds = 5)\n", + "yhat_lasso_cv <- predict(fit_lasso_cv, newx = test_flex, s = \"lambda.min\")\n", "\n", "# calculating the out-of-sample MSE\n", - "MSE.lasso <- sum((y.test - yhat.lasso.cv)^2) / length(y.test)\n", - "R2.lasso <- 1 - MSE.lasso / mean((y.test - mean(train$lwage))^2)\n", + "mse_lasso <- sum((y_test - yhat_lasso_cv)^2) / length(y_test)\n", + "r2_lasso <- 1 - mse_lasso / mean((y_test - mean(train$lwage))^2)\n", "\n", - "cat(\"Test MSE for the lasso on flexible model: \", MSE.lasso, \" \")\n", + "cat(\"Test MSE for the lasso on flexible model: \", mse_lasso, \" \")\n", "\n", - "cat(\"Test R2 for the lasso flexible model: \", R2.lasso)" + "cat(\"Test R2 for the lasso flexible model: \", r2_lasso)" ] }, { @@ -642,12 +645,12 @@ "source": [ "# Output the comparison table\n", "table2 <- matrix(0, 3, 2)\n", - "table2[1, 1] <- MSE.test1\n", - "table2[2, 1] <- MSE.test2\n", - "table2[3, 1] <- MSE.lasso\n", - "table2[1, 2] <- R2.test1\n", - "table2[2, 2] <- R2.test2\n", - "table2[3, 2] <- R2.lasso\n", + "table2[1, 1] <- mse_test1\n", + "table2[2, 1] <- mse_test2\n", + "table2[3, 1] <- mse_lasso\n", + "table2[1, 2] <- r2_test1\n", + "table2[2, 2] <- r2_test2\n", + "table2[3, 2] <- r2_lasso\n", "\n", "rownames(table2) <- c(\"basic reg\", \"flexible reg\", \"lasso regression\")\n", "colnames(table2) <- c(\"$MSE_{test}$\", \"$R^2_{test}$\")\n", @@ -697,7 +700,7 @@ "sumextra <- summary(regextra)\n", "cat(\"Number of Extra-Flex Controls\", length(regextra$coef) - 1, \"\\n\")\n", "n <- length(data$wage)\n", - "p <- length(regextra$coef);\n", + "p <- length(regextra$coef)\n", "ntrain <- length(train$wage)" ] }, @@ -714,16 +717,16 @@ "source": [ "## In-sample\n", "# R-squared and adjusted R-squared\n", - "R2.extra <- sumextra$r.squared\n", - "cat(\"R-squared for the extra flexible model (in-sample): \", R2.extra, \"\\n\")\n", - "R2.adjextra <- sumextra$adj.r.squared\n", - "cat(\"adjusted R-squared for the extra flexible model (in-sample): \", R2.adjextra, \"\\n\")\n", + "r2_extra <- sumextra$r.squared\n", + "cat(\"R-squared for the extra flexible model (in-sample): \", r2_extra, \"\\n\")\n", + "r2_adjextra <- sumextra$adj.r.squared\n", + "cat(\"adjusted R-squared for the extra flexible model (in-sample): \", r2_adjextra, \"\\n\")\n", "\n", "# MSE and adjusted MSE\n", - "MSE.extra <- mean(sumextra$res^2)\n", - "cat(\"MSE for the extra flexible model (in-sample): \", MSE.extra, \"\\n\")\n", - "MSE.adjextra <- (ntrain / (ntrain - p)) * MSE.extra\n", - "cat(\"adjusted MSE for the basic model (in-sample): \", MSE.adj1, \"\\n\")" + "mse_extra <- mean(sumextra$res^2)\n", + "cat(\"MSE for the extra flexible model (in-sample): \", mse_extra, \"\\n\")\n", + "mse_adjextra <- (ntrain / (ntrain - p)) * mse_extra\n", + "cat(\"adjusted MSE for the basic model (in-sample): \", mse_adj1, \"\\n\")" ] }, { @@ -738,13 +741,13 @@ "outputs": [], "source": [ "## Out-of-sample\n", - "yhat.ex <- predict(regextra, newdata=test)\n", - "y.test.ex <- test$lwage\n", - "MSE.test.ex <- sum((y.test.ex - yhat.ex)^2) / length(y.test.ex)\n", - 
"R2.test.ex <- 1 - MSE.test.ex / mean((y.test.ex - mean(train$lwage))^2)\n", + "yhat_ex <- predict(regextra, newdata = test)\n", + "y_test_ex <- test$lwage\n", + "mse_test_ex <- sum((y_test_ex - yhat_ex)^2) / length(y_test_ex)\n", + "r2_test_ex <- 1 - mse_test_ex / mean((y_test_ex - mean(train$lwage))^2)\n", "\n", - "cat(\"Test MSE for the basic model: \", MSE.test.ex, \" \")\n", - "cat(\"Test R2 for the basic model: \", R2.test.ex)" + "cat(\"Test MSE for the basic model: \", mse_test_ex, \" \")\n", + "cat(\"Test R2 for the basic model: \", r2_test_ex)" ] }, { @@ -775,32 +778,33 @@ "test_flex <- flex_data[-random, ]\n", "\n", "# fit model\n", - "fit.lcv <- cv.glmnet(train_flex, train$lwage, family = \"gaussian\", alpha = 1, nfolds = 5)\n", + "fit_lcv <- cv.glmnet(train_flex, train$lwage, family = \"gaussian\", alpha = 1, nfolds = 5)\n", "\n", "# in-sample\n", - "yhat.lcv <- predict(fit.lcv, newx = train_flex, s = \"lambda.min\")\n", + "yhat_lcv <- predict(fit_lcv, newx = train_flex, s = \"lambda.min\")\n", "\n", - "R2.L <- 1 - sum((yhat.lcv - train$lwage)^2) / sum((train$lwage - mean(train$lwage))^2)\n", - "pL <- fit.lcv$nzero[fit.lcv$index[1]]\n", - "R2.adjL <- 1 - (sum((yhat.lcv - train$lwage) ^ 2) / (ntrain - pL - 1)) / (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1))\n", + "r2_l <- 1 - sum((yhat_lcv - train$lwage)^2) / sum((train$lwage - mean(train$lwage))^2)\n", + "p_l <- fit_lcv$nzero[fit_lcv$index[1]]\n", + "r2_adj_l <- 1 -\n", + " (sum((yhat_lcv - train$lwage) ^ 2) / (ntrain - p_l - 1)) / (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1))\n", "\n", - "lasso.res <- train$lwage - yhat.lcv\n", - "MSEL <- mean(lasso.res^2)\n", - "MSE.adjL <- (ntrain / (ntrain - pL - 1)) * MSEL\n", + "lasso_res <- train$lwage - yhat_lcv\n", + "mse_l <- mean(lasso_res^2)\n", + "mse_adj_l <- (ntrain / (ntrain - p_l - 1)) * mse_l\n", "\n", - "cat(\"R-squared for the lasso with the extra flexible model (in-sample): \", R2.L, \"\\n\")\n", - "cat(\"adjusted R-squared for the extra flexible model (in-sample): \", R2.adjL, \"\\n\")\n", - "cat(\"MSE for the lasso with the extra flexible model (in-sample): \", MSEL, \"\\n\")\n", - "cat(\"adjusted MSE for the lasso with the extraflexible model (in-sample): \", MSE.adjL, \"\\n\")\n", + "cat(\"R-squared for the lasso with the extra flexible model (in-sample): \", r2_l, \"\\n\")\n", + "cat(\"adjusted R-squared for the extra flexible model (in-sample): \", r2_adj_l, \"\\n\")\n", + "cat(\"MSE for the lasso with the extra flexible model (in-sample): \", mse_l, \"\\n\")\n", + "cat(\"adjusted MSE for the lasso with the extraflexible model (in-sample): \", mse_adj_l, \"\\n\")\n", "\n", "# out-of-sample\n", - "yhat.lcv.test <- predict(fit.lcv, newx = test_flex, s = \"lambda.min\")\n", - "MSE.lasso <- sum((test$lwage - yhat.lcv.test)^2) / length(test$lwage)\n", - "R2.lasso <- 1 - MSE.lasso / mean((test$lwage - mean(train$lwage))^2)\n", + "yhat_lcv_test <- predict(fit_lcv, newx = test_flex, s = \"lambda.min\")\n", + "mse_lasso <- sum((test$lwage - yhat_lcv_test)^2) / length(test$lwage)\n", + "r2_lasso <- 1 - mse_lasso / mean((test$lwage - mean(train$lwage))^2)\n", "\n", "cat(\"\\n\")\n", - "cat(\"Test R2 for the lasso the extra flexible model: \", R2.lasso,\"\\n\")\n", - "cat(\"Test MSE for the lasso on the extra flexible model: \", MSE.lasso)\n" + "cat(\"Test R2 for the lasso the extra flexible model: \", r2_lasso, \"\\n\")\n", + "cat(\"Test MSE for the lasso on the extra flexible model: \", mse_lasso)\n" ] }, { From f6806c8669a53e273228a718ce8c37b842420b66 
Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 13:08:54 -0700 Subject: [PATCH 029/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 192a5fed..be273c92 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -64,30 +64,48 @@ jobs: if (length(lints) > 0) { cat("Warnings found during linting:\n") print(lints) - # stop("Linting failed with warnings") + stop("Linting failed with warnings") } }) ' - - name: Execute R scripts + - name: Execute R scripts and log output id: execute run: | + log_file="r_script_execution.log" R -e ' files <- list.files(path = "PM1", pattern = "\\.R$", full.names = TRUE, recursive = FALSE) + log_con <- file("'$log_file'", open = "wt") + sink(log_con, type = "output") + sink(log_con, type = "message") for (file in files) { tryCatch( { source(file) }, error = function(e) { + sink(type = "output") + sink(type = "message") cat("Error found in file:", file, "\n") cat("Error message:", e$message, "\n") - # stop("Execution failed due to an error in ", file) + stop("Execution failed due to an error in ", file) } ) } + sink(type = "output") + sink(type = "message") + close(log_con) ' + - name: Upload execution log + uses: actions/upload-artifact@v2 + with: + name: r-script-execution-log + path: r_script_execution.log + + - name: Delete execution log + run: rm r_script_execution.log + - name: Zip .R files run: | mkdir r_scripts From 0950b83bfa0c49ad82c8ba77cc60c542ceb77d4d Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 13:19:09 -0700 Subject: [PATCH 030/261] Update r-ols-and-lasso-for-wage-prediction.irnb --- PM1/r-ols-and-lasso-for-wage-prediction.irnb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/PM1/r-ols-and-lasso-for-wage-prediction.irnb b/PM1/r-ols-and-lasso-for-wage-prediction.irnb index 51721b72..8867821c 100644 --- a/PM1/r-ols-and-lasso-for-wage-prediction.irnb +++ b/PM1/r-ols-and-lasso-for-wage-prediction.irnb @@ -440,7 +440,8 @@ "r2_l <- 1 - sum((yhat_lasso_cv - train$lwage)^2) / sum((train$lwage - mean(train$lwage))^2)\n", "cat(\"R-squared for the lasso with flexible model: \", r2_l, \"\\n\")\n", "r2_adj_l <- 1 -\n", - " (sum((yhat_lasso_cv - train$lwage)^2) / (ntrain - p_l - 1)) / (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1))\n", + " (sum((yhat_lasso_cv - train$lwage)^2) / (ntrain - p_l - 1)) /\n", + " (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1))\n", "cat(\"adjusted R-squared for the flexible model: \", r2_adj_l, \"\\n\")\n", "\n", "# MSE and adjusted MSE\n", @@ -561,7 +562,7 @@ "regflex <- lm(flex, data = train)\n", "\n", "# calculating the out-of-sample MSE\n", - "yhat_flex<- predict(regflex, newdata = test)\n", + "yhat_flex <- predict(regflex, newdata = test)\n", "y_test <- test$lwage\n", "mean_train <- mean(train$lwage)\n", "mse_test2 <- sum((y_test - yhat_flex)^2) / length(y_test)\n", From c72f45b2605c5503318a81c30724a4605e71bd55 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 13:23:47 -0700 Subject: [PATCH 031/261] Update r-ols-and-lasso-for-wage-gap-inference.irnb --- PM1/r-ols-and-lasso-for-wage-gap-inference.irnb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb b/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb index eb87484b..766fe825 
100644 --- a/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb +++ b/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb @@ -274,7 +274,7 @@ "\n", "# print unconditional effect of gender and the corresponding standard error\n", "cat(\"The estimated coefficient on the dummy for gender is\", nocontrol_est,\n", - " \" and the corresponding robust standard error is\", nocontrol_se)\n" + " \" and the corresponding robust standard error is\", nocontrol_se)" ] }, { From 50be018a0636d1f2e341c719a7e723d139aa407b Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 13 Jul 2024 13:25:00 -0700 Subject: [PATCH 032/261] Update r-ols-and-lasso-for-wage-gap-inference.irnb --- PM1/r-ols-and-lasso-for-wage-gap-inference.irnb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb b/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb index 766fe825..dbff75b4 100644 --- a/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb +++ b/PM1/r-ols-and-lasso-for-wage-gap-inference.irnb @@ -454,7 +454,7 @@ "# Partialling-out using ols\n", "\n", "# model for Y\n", - "flex_y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", + "flex_y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", "# model for D\n", "flex_d <- sex ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\n", "\n", From 11a2bab78e896a8676c9368276ebf5bb99b50cdf Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 12:32:10 -0700 Subject: [PATCH 033/261] Update r-ols-and-lasso-for-wage-prediction.irnb --- PM1/r-ols-and-lasso-for-wage-prediction.irnb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/PM1/r-ols-and-lasso-for-wage-prediction.irnb b/PM1/r-ols-and-lasso-for-wage-prediction.irnb index 8867821c..8dbf6724 100644 --- a/PM1/r-ols-and-lasso-for-wage-prediction.irnb +++ b/PM1/r-ols-and-lasso-for-wage-prediction.irnb @@ -441,7 +441,7 @@ "cat(\"R-squared for the lasso with flexible model: \", r2_l, \"\\n\")\n", "r2_adj_l <- 1 -\n", " (sum((yhat_lasso_cv - train$lwage)^2) / (ntrain - p_l - 1)) /\n", - " (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1))\n", + " (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1))\n", "cat(\"adjusted R-squared for the flexible model: \", r2_adj_l, \"\\n\")\n", "\n", "# MSE and adjusted MSE\n", @@ -787,7 +787,8 @@ "r2_l <- 1 - sum((yhat_lcv - train$lwage)^2) / sum((train$lwage - mean(train$lwage))^2)\n", "p_l <- fit_lcv$nzero[fit_lcv$index[1]]\n", "r2_adj_l <- 1 -\n", - " (sum((yhat_lcv - train$lwage) ^ 2) / (ntrain - p_l - 1)) / (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1))\n", + " (sum((yhat_lcv - train$lwage) ^ 2) / (ntrain - p_l - 1)) /\n", + " (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1))\n", "\n", "lasso_res <- train$lwage - yhat_lcv\n", "mse_l <- mean(lasso_res^2)\n", From 7c4858942075fd47e30bf875c8399a5ddcbff21e Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 12:40:17 -0700 Subject: [PATCH 034/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 87 +++++++++++++++++++----- 1 file changed, 71 insertions(+), 16 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index be273c92..32bed9df 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -1,15 +1,70 @@ name: Convert and Lint R Notebooks on: - push + push: + paths: + 
- 'PM1/*.irnb' + - 'PM2/*.irnb' + - 'PM3/*.irnb' + - 'PM4/*.irnb' + - 'PM5/*.irnb' + schedule: + - cron: '0 12 * * 0' # Runs every Sunday at 12 PM UTC concurrency: group: convert-lint-notebooks cancel-in-progress: true jobs: + set-vars: + runs-on: ubuntu-latest + if: github.event_name == 'push' + steps: + - name: Check if PM1 files changed + id: check_PM1 + uses: dorny/paths-filter@v2 + with: + filters: | + PM1: + - 'PM1/*.irnb' + - name: Check if PM2 files changed + id: check_PM2 + uses: dorny/paths-filter@v2 + with: + filters: | + PM2: + - 'PM2/*.irnb' + - name: Check if PM3 files changed + id: check_PM3 + uses: dorny/paths-filter@v2 + with: + filters: | + PM3: + - 'PM3/*.irnb' + - name: Check if PM4 files changed + id: check_PM4 + uses: dorny/paths-filter@v2 + with: + filters: | + PM4: + - 'PM4/*.irnb' + - name: Check if PM5 files changed + id: check_PM5 + uses: dorny/paths-filter@v2 + with: + filters: | + PM5: + - 'PM5/*.irnb' + convert-lint-notebooks: + needs: [set-vars] runs-on: ubuntu-latest + strategy: + matrix: + directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5'] + if: | + (github.event_name == 'push' && needs.set-vars.outputs.PM${{ matrix.directory#[-1] }}.matched == 'true') || + (github.event_name == 'schedule') steps: - name: Checkout repository @@ -34,7 +89,7 @@ jobs: - name: Strip outputs from .irnb files run: | - for notebook in PM1/*.irnb; do + for notebook in ${{ matrix.directory }}/*.irnb; do ipynb_notebook="${notebook%.irnb}.ipynb" mv "$notebook" "$ipynb_notebook" nbstripout "$ipynb_notebook" @@ -44,7 +99,7 @@ jobs: - name: Convert .irnb to .Rmd and .R run: | R -e ' - files <- list.files(path = "PM1", pattern = "\\.irnb$", full.names = TRUE, recursive = FALSE) + files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.irnb$", full.names = TRUE, recursive = FALSE) lapply(files, function(input) { rmarkdown::convert_ipynb(input) rmd_file <- xfun::with_ext(input, "Rmd") @@ -58,7 +113,7 @@ jobs: R -e ' library(lintr) linters <- with_defaults(line_length_linter = line_length_linter(120)) - rmd_files <- list.files(path = "PM1", pattern = "\\.Rmd$", full.names = TRUE) + rmd_files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.Rmd$", full.names = TRUE) results <- lapply(rmd_files, function(file) { lints <- lint(file, linters) if (length(lints) > 0) { @@ -72,9 +127,9 @@ jobs: - name: Execute R scripts and log output id: execute run: | - log_file="r_script_execution.log" + log_file="${{ matrix.directory }}_r_script_execution.log" R -e ' - files <- list.files(path = "PM1", pattern = "\\.R$", full.names = TRUE, recursive = FALSE) + files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.R$", full.names = TRUE, recursive = FALSE) log_con <- file("'$log_file'", open = "wt") sink(log_con, type = "output") sink(log_con, type = "message") @@ -100,35 +155,35 @@ jobs: - name: Upload execution log uses: actions/upload-artifact@v2 with: - name: r-script-execution-log - path: r_script_execution.log + name: ${{ matrix.directory }}-r-script-execution-log + path: ${{ matrix.directory }}_r_script_execution.log - name: Delete execution log - run: rm r_script_execution.log + run: rm ${{ matrix.directory }}_r_script_execution.log - name: Zip .R files run: | mkdir r_scripts - mv PM1/*.R r_scripts/ - zip -r r_scripts.zip r_scripts + mv ${{ matrix.directory }}/*.R r_scripts/ + zip -r ${{ matrix.directory }}_r_scripts.zip r_scripts - name: Upload artifact uses: actions/upload-artifact@v2 with: - name: r-scripts - path: r_scripts.zip + name: ${{ matrix.directory 
}}-r-scripts + path: ${{ matrix.directory }}_r_scripts.zip - name: Delete .R files and zip run: | rm -rf r_scripts - rm r_scripts.zip + rm ${{ matrix.directory }}_r_scripts.zip - name: Commit and push stripped .irnb and .Rmd files run: | git config --global user.name 'github-actions[bot]' git config --global user.email 'github-actions[bot]@users.noreply.github.com' - git add PM1/*.irnb PM1/*.Rmd - git commit -m 'Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files' + git add ${{ matrix.directory }}/*.irnb ${{ matrix.directory }}/*.Rmd + git commit -m 'Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in ${{ matrix.directory }}' git push env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 632fe7662416767d9b12ec9e5a417e5441866224 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 12:56:40 -0700 Subject: [PATCH 035/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 27 ++++++++++++------------ 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 32bed9df..7dc6a4cd 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -2,12 +2,6 @@ name: Convert and Lint R Notebooks on: push: - paths: - - 'PM1/*.irnb' - - 'PM2/*.irnb' - - 'PM3/*.irnb' - - 'PM4/*.irnb' - - 'PM5/*.irnb' schedule: - cron: '0 12 * * 0' # Runs every Sunday at 12 PM UTC @@ -61,7 +55,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5'] + directory: ['PM1'] if: | (github.event_name == 'push' && needs.set-vars.outputs.PM${{ matrix.directory#[-1] }}.matched == 'true') || (github.event_name == 'schedule') @@ -126,6 +120,7 @@ jobs: - name: Execute R scripts and log output id: execute + continue-on-error: true run: | log_file="${{ matrix.directory }}_r_script_execution.log" R -e ' @@ -133,31 +128,37 @@ jobs: log_con <- file("'$log_file'", open = "wt") sink(log_con, type = "output") sink(log_con, type = "message") + errors <- list() for (file in files) { tryCatch( { source(file) }, error = function(e) { - sink(type = "output") - sink(type = "message") - cat("Error found in file:", file, "\n") - cat("Error message:", e$message, "\n") - stop("Execution failed due to an error in ", file) + errors[[length(errors) + 1]] <<- list(file = file, message = e$message) } ) } sink(type = "output") sink(type = "message") close(log_con) + if (length(errors) > 0) { + for (error in errors) { + cat("Error found in file:", error$file, "\n") + cat("Error message:", error$message, "\n") + } + } ' - - name: Upload execution log uses: actions/upload-artifact@v2 with: name: ${{ matrix.directory }}-r-script-execution-log path: ${{ matrix.directory }}_r_script_execution.log + - name: Fail if errors found + if: failure() + run: exit 1 + - name: Delete execution log run: rm ${{ matrix.directory }}_r_script_execution.log From b0186b1a64d44bf0148f27bba54b53c2d82ae904 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 13:05:43 -0700 Subject: [PATCH 036/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 7dc6a4cd..f9594a78 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -55,9 +55,9 @@ jobs: runs-on: ubuntu-latest strategy: 
matrix: - directory: ['PM1'] + directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5'] if: | - (github.event_name == 'push' && needs.set-vars.outputs.PM${{ matrix.directory#[-1] }}.matched == 'true') || + (github.event_name == 'push' && (needs.set-vars.outputs['check_' + matrix.directory].matched == 'true')) || (github.event_name == 'schedule') steps: @@ -188,3 +188,4 @@ jobs: git push env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + From 8cb5b70d9bd719f88af1b65b974993ef0227a284 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 13:08:02 -0700 Subject: [PATCH 037/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index f9594a78..604c217c 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -57,7 +57,7 @@ jobs: matrix: directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5'] if: | - (github.event_name == 'push' && (needs.set-vars.outputs['check_' + matrix.directory].matched == 'true')) || + (github.event_name == 'push' && fromJSON(needs.set-vars.outputs['check_' + matrix.directory + '_matched']).matched == 'true') || (github.event_name == 'schedule') steps: @@ -183,9 +183,4 @@ jobs: run: | git config --global user.name 'github-actions[bot]' git config --global user.email 'github-actions[bot]@users.noreply.github.com' - git add ${{ matrix.directory }}/*.irnb ${{ matrix.directory }}/*.Rmd - git commit -m 'Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in ${{ matrix.directory }}' - git push - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - + git add ${{ matrix.directory }}/*.irnb ${{ matrix.directory }}/*.R From 26e56c222d181bc742aa2eebc7c928f9e642ce59 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 13:10:11 -0700 Subject: [PATCH 038/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 604c217c..f598ec37 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -13,6 +13,12 @@ jobs: set-vars: runs-on: ubuntu-latest if: github.event_name == 'push' + outputs: + PM1_changed: ${{ steps.check_PM1.outputs.matched }} + PM2_changed: ${{ steps.check_PM2.outputs.matched }} + PM3_changed: ${{ steps.check_PM3.outputs.matched }} + PM4_changed: ${{ steps.check_PM4.outputs.matched }} + PM5_changed: ${{ steps.check_PM5.outputs.matched }} steps: - name: Check if PM1 files changed id: check_PM1 @@ -57,10 +63,13 @@ jobs: matrix: directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5'] if: | - (github.event_name == 'push' && fromJSON(needs.set-vars.outputs['check_' + matrix.directory + '_matched']).matched == 'true') || + (github.event_name == 'push' && (needs.set-vars.outputs[env.MATRIX_DIRECTORY + '_changed'] == 'true')) || (github.event_name == 'schedule') steps: + - name: Set matrix directory environment variable + run: echo "MATRIX_DIRECTORY=${{ matrix.directory }}" >> $GITHUB_ENV + - name: Checkout repository uses: actions/checkout@v2 @@ -183,4 +192,8 @@ jobs: run: | git config --global user.name 'github-actions[bot]' git config --global user.email 'github-actions[bot]@users.noreply.github.com' - git add ${{ matrix.directory }}/*.irnb ${{ matrix.directory }}/*.R + git add ${{ 
matrix.directory }}/*.irnb ${{ matrix.directory }}/*.Rmd + git commit -m 'Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in ${{ matrix.directory }}' + git push + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From b3c0b7157c95f349b8e53a03736469dbcf801ad8 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 13:19:43 -0700 Subject: [PATCH 039/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 287 ++++++++++++----------- 1 file changed, 154 insertions(+), 133 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index f598ec37..51034cf8 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -4,6 +4,7 @@ on: push: schedule: - cron: '0 12 * * 0' # Runs every Sunday at 12 PM UTC +name: Convert and Lint R Notebooks concurrency: group: convert-lint-notebooks @@ -56,144 +57,164 @@ jobs: PM5: - 'PM5/*.irnb' - convert-lint-notebooks: + determine-dirs: needs: [set-vars] runs-on: ubuntu-latest + steps: + - name: Determine directories to process + id: determine + run: | + directories=$(jq -n \ + --arg pm1 ${{ needs.set-vars.outputs.PM1_changed }} \ + --arg pm2 ${{ needs.set-vars.outputs.PM2_changed }} \ + --arg pm3 ${{ needs.set-vars.outputs.PM3_changed }} \ + --arg pm4 ${{ needs.set-vars.outputs.PM4_changed }} \ + --arg pm5 ${{ needs.set-vars.outputs.PM5_changed }} \ + '{ + PM1: $pm1, + PM2: $pm2, + PM3: $pm3, + PM4: $pm4, + PM5: $pm5 + } | to_entries | map(select(.value == "true")) | map(.key)') + echo "$directories" > directories.json + - name: Upload directories json + uses: actions/upload-artifact@v2 + with: + name: directories-json + path: directories.json + + convert-lint-notebooks: + needs: [determine-dirs] + runs-on: ubuntu-latest strategy: matrix: - directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5'] - if: | - (github.event_name == 'push' && (needs.set-vars.outputs[env.MATRIX_DIRECTORY + '_changed'] == 'true')) || - (github.event_name == 'schedule') - + directory: ${{ fromJson(needs.determine-dirs.outputs.directories) }} steps: - - name: Set matrix directory environment variable - run: echo "MATRIX_DIRECTORY=${{ matrix.directory }}" >> $GITHUB_ENV - - - name: Checkout repository - uses: actions/checkout@v2 - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: '3.8' # Specify your Python version here - - - name: Install nbstripout - run: | - python -m pip install --upgrade pip - pip install nbstripout - - - name: Set up R - uses: r-lib/actions/setup-r@v2 - - - name: Install rmarkdown, knitr, and lintr packages - run: | - R -e 'install.packages(c("rmarkdown", "knitr", "lintr", "xfun"), repos="https://cloud.r-project.org")' - - - name: Strip outputs from .irnb files - run: | - for notebook in ${{ matrix.directory }}/*.irnb; do - ipynb_notebook="${notebook%.irnb}.ipynb" - mv "$notebook" "$ipynb_notebook" - nbstripout "$ipynb_notebook" - mv "$ipynb_notebook" "$notebook" - done - - - name: Convert .irnb to .Rmd and .R - run: | - R -e ' - files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.irnb$", full.names = TRUE, recursive = FALSE) - lapply(files, function(input) { - rmarkdown::convert_ipynb(input) - rmd_file <- xfun::with_ext(input, "Rmd") - knitr::purl(rmd_file, output = xfun::with_ext(input, "R")) - }) - ' - - - name: Lint .Rmd files - id: lint - run: | - R -e ' - library(lintr) - linters <- with_defaults(line_length_linter = line_length_linter(120)) - rmd_files <- 
list.files(path = "${{ matrix.directory }}", pattern = "\\.Rmd$", full.names = TRUE) - results <- lapply(rmd_files, function(file) { - lints <- lint(file, linters) - if (length(lints) > 0) { - cat("Warnings found during linting:\n") - print(lints) - stop("Linting failed with warnings") + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.8' # Specify your Python version here + + - name: Install nbstripout + run: | + python -m pip install --upgrade pip + pip install nbstripout + + - name: Set up R + uses: r-lib/actions/setup-r@v2 + + - name: Install rmarkdown, knitr, and lintr packages + run: | + R -e 'install.packages(c("rmarkdown", "knitr", "lintr", "xfun"), repos="https://cloud.r-project.org")' + + - name: Strip outputs from .irnb files + run: | + for notebook in ${{ matrix.directory }}/*.irnb; do + ipynb_notebook="${notebook%.irnb}.ipynb" + mv "$notebook" "$ipynb_notebook" + nbstripout "$ipynb_notebook" + mv "$ipynb_notebook" "$notebook" + done + + - name: Convert .irnb to .Rmd and .R + run: | + R -e ' + files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.irnb$", full.names = TRUE, recursive = FALSE) + lapply(files, function(input) { + rmarkdown::convert_ipynb(input) + rmd_file <- xfun::with_ext(input, "Rmd") + knitr::purl(rmd_file, output = xfun::with_ext(input, "R")) + }) + ' + + - name: Lint .Rmd files + id: lint + run: | + R -e ' + library(lintr) + linters <- with_defaults(line_length_linter = line_length_linter(120)) + rmd_files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.Rmd$", full.names = TRUE) + results <- lapply(rmd_files, function(file) { + lints <- lint(file, linters) + if (length(lints) > 0) { + cat("Warnings found during linting:\n") + print(lints) + stop("Linting failed with warnings") + } + }) + ' + + - name: Execute R scripts and log output + id: execute + continue-on-error: true + run: | + log_file="${{ matrix.directory }}_r_script_execution.log" + R -e ' + files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.R$", full.names = TRUE, recursive = FALSE) + log_con <- file("'$log_file'", open = "wt") + sink(log_con, type = "output") + sink(log_con, type = "message") + errors <- list() + for (file in files) { + tryCatch( + { + source(file) + }, + error = function(e) { + errors[[length(errors) + 1]] <<- list(file = file, message = e$message) + } + ) } - }) - ' - - - name: Execute R scripts and log output - id: execute - continue-on-error: true - run: | - log_file="${{ matrix.directory }}_r_script_execution.log" - R -e ' - files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.R$", full.names = TRUE, recursive = FALSE) - log_con <- file("'$log_file'", open = "wt") - sink(log_con, type = "output") - sink(log_con, type = "message") - errors <- list() - for (file in files) { - tryCatch( - { - source(file) - }, - error = function(e) { - errors[[length(errors) + 1]] <<- list(file = file, message = e$message) + sink(type = "output") + sink(type = "message") + close(log_con) + if (length(errors) > 0) { + for (error in errors) { + cat("Error found in file:", error$file, "\n") + cat("Error message:", error$message, "\n") } - ) - } - sink(type = "output") - sink(type = "message") - close(log_con) - if (length(errors) > 0) { - for (error in errors) { - cat("Error found in file:", error$file, "\n") - cat("Error message:", error$message, "\n") } - } - ' - - name: Upload execution log - uses: actions/upload-artifact@v2 - with: - name: ${{ 
matrix.directory }}-r-script-execution-log - path: ${{ matrix.directory }}_r_script_execution.log - - - name: Fail if errors found - if: failure() - run: exit 1 - - - name: Delete execution log - run: rm ${{ matrix.directory }}_r_script_execution.log - - - name: Zip .R files - run: | - mkdir r_scripts - mv ${{ matrix.directory }}/*.R r_scripts/ - zip -r ${{ matrix.directory }}_r_scripts.zip r_scripts - - - name: Upload artifact - uses: actions/upload-artifact@v2 - with: - name: ${{ matrix.directory }}-r-scripts - path: ${{ matrix.directory }}_r_scripts.zip - - - name: Delete .R files and zip - run: | - rm -rf r_scripts - rm ${{ matrix.directory }}_r_scripts.zip - - - name: Commit and push stripped .irnb and .Rmd files - run: | - git config --global user.name 'github-actions[bot]' - git config --global user.email 'github-actions[bot]@users.noreply.github.com' - git add ${{ matrix.directory }}/*.irnb ${{ matrix.directory }}/*.Rmd - git commit -m 'Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in ${{ matrix.directory }}' - git push - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ' + - name: Upload execution log + uses: actions/upload-artifact@v2 + with: + name: ${{ matrix.directory }}-r-script-execution-log + path: ${{ matrix.directory }}_r_script_execution.log + + - name: Fail if errors found + if: failure() + run: exit 1 + + - name: Delete execution log + run: rm ${{ matrix.directory }}_r_script_execution.log + + - name: Zip .R files + run: | + mkdir r_scripts + mv ${{ matrix.directory }}/*.R r_scripts/ + zip -r ${{ matrix.directory }}_r_scripts.zip r_scripts + + - name: Upload artifact + uses: actions/upload-artifact@v2 + with: + name: ${{ matrix.directory }}-r-scripts + path: ${{ matrix.directory }}_r_scripts.zip + + - name: Delete .R files and zip + run: | + rm -rf r_scripts + rm ${{ matrix.directory }}_r_scripts.zip + + - name: Commit and push stripped .irnb and .Rmd files + run: | + git config --global user.name 'github-actions[bot]' + git config --global user.email 'github-actions[bot]@users.noreply.github.com' + git add ${{ matrix.directory }}/*.irnb ${{ matrix.directory }}/*.Rmd + git commit -m 'Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in ${{ matrix.directory }}' + git push + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 4977144318ad1fccaf93213e90cde0dd5776c331 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 13:21:29 -0700 Subject: [PATCH 040/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 51034cf8..99777630 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -4,7 +4,6 @@ on: push: schedule: - cron: '0 12 * * 0' # Runs every Sunday at 12 PM UTC -name: Convert and Lint R Notebooks concurrency: group: convert-lint-notebooks From 73c819b47e47ca13fd52b7e2594fd8557c069ef1 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 13:23:46 -0700 Subject: [PATCH 041/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 213 +++++++++-------------- 1 file changed, 80 insertions(+), 133 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 99777630..3bf3f191 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -20,6 +20,9 @@ 
jobs: PM4_changed: ${{ steps.check_PM4.outputs.matched }} PM5_changed: ${{ steps.check_PM5.outputs.matched }} steps: + - name: Checkout repository + uses: actions/checkout@v2 + - name: Check if PM1 files changed id: check_PM1 uses: dorny/paths-filter@v2 @@ -60,6 +63,9 @@ jobs: needs: [set-vars] runs-on: ubuntu-latest steps: + - name: Checkout repository + uses: actions/checkout@v2 + - name: Determine directories to process id: determine run: | @@ -69,14 +75,15 @@ jobs: --arg pm3 ${{ needs.set-vars.outputs.PM3_changed }} \ --arg pm4 ${{ needs.set-vars.outputs.PM4_changed }} \ --arg pm5 ${{ needs.set-vars.outputs.PM5_changed }} \ - '{ - PM1: $pm1, - PM2: $pm2, - PM3: $pm3, - PM4: $pm4, - PM5: $pm5 - } | to_entries | map(select(.value == "true")) | map(.key)') + '[ + { "name": "PM1", "changed": $pm1 }, + { "name": "PM2", "changed": $pm2 }, + { "name": "PM3", "changed": $pm3 }, + { "name": "PM4", "changed": $pm4 }, + { "name": "PM5", "changed": $pm5 } + ] | map(select(.changed == "true")) | map(.name)') echo "$directories" > directories.json + - name: Upload directories json uses: actions/upload-artifact@v2 with: @@ -86,134 +93,74 @@ jobs: convert-lint-notebooks: needs: [determine-dirs] runs-on: ubuntu-latest - strategy: - matrix: - directory: ${{ fromJson(needs.determine-dirs.outputs.directories) }} steps: - - name: Checkout repository - uses: actions/checkout@v2 - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: '3.8' # Specify your Python version here - - - name: Install nbstripout - run: | - python -m pip install --upgrade pip - pip install nbstripout - - - name: Set up R - uses: r-lib/actions/setup-r@v2 - - - name: Install rmarkdown, knitr, and lintr packages - run: | - R -e 'install.packages(c("rmarkdown", "knitr", "lintr", "xfun"), repos="https://cloud.r-project.org")' - - - name: Strip outputs from .irnb files - run: | - for notebook in ${{ matrix.directory }}/*.irnb; do - ipynb_notebook="${notebook%.irnb}.ipynb" - mv "$notebook" "$ipynb_notebook" - nbstripout "$ipynb_notebook" - mv "$ipynb_notebook" "$notebook" - done - - - name: Convert .irnb to .Rmd and .R - run: | - R -e ' - files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.irnb$", full.names = TRUE, recursive = FALSE) - lapply(files, function(input) { - rmarkdown::convert_ipynb(input) - rmd_file <- xfun::with_ext(input, "Rmd") - knitr::purl(rmd_file, output = xfun::with_ext(input, "R")) - }) - ' - - - name: Lint .Rmd files - id: lint - run: | - R -e ' - library(lintr) - linters <- with_defaults(line_length_linter = line_length_linter(120)) - rmd_files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.Rmd$", full.names = TRUE) - results <- lapply(rmd_files, function(file) { - lints <- lint(file, linters) - if (length(lints) > 0) { - cat("Warnings found during linting:\n") - print(lints) - stop("Linting failed with warnings") - } - }) - ' - - - name: Execute R scripts and log output - id: execute - continue-on-error: true - run: | - log_file="${{ matrix.directory }}_r_script_execution.log" - R -e ' - files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.R$", full.names = TRUE, recursive = FALSE) - log_con <- file("'$log_file'", open = "wt") - sink(log_con, type = "output") - sink(log_con, type = "message") - errors <- list() - for (file in files) { - tryCatch( - { - source(file) - }, - error = function(e) { - errors[[length(errors) + 1]] <<- list(file = file, message = e$message) - } - ) - } - sink(type = "output") - sink(type = "message") - close(log_con) 
- if (length(errors) > 0) { - for (error in errors) { - cat("Error found in file:", error$file, "\n") - cat("Error message:", error$message, "\n") - } - } - ' - - name: Upload execution log - uses: actions/upload-artifact@v2 - with: - name: ${{ matrix.directory }}-r-script-execution-log - path: ${{ matrix.directory }}_r_script_execution.log - - - name: Fail if errors found - if: failure() - run: exit 1 - - - name: Delete execution log - run: rm ${{ matrix.directory }}_r_script_execution.log - - - name: Zip .R files - run: | - mkdir r_scripts - mv ${{ matrix.directory }}/*.R r_scripts/ - zip -r ${{ matrix.directory }}_r_scripts.zip r_scripts - - - name: Upload artifact - uses: actions/upload-artifact@v2 + - name: Download directories json + uses: actions/download-artifact@v2 with: - name: ${{ matrix.directory }}-r-scripts - path: ${{ matrix.directory }}_r_scripts.zip + name: directories-json + path: directories.json - - name: Delete .R files and zip + - name: Load directories from json + id: load-dirs run: | - rm -rf r_scripts - rm ${{ matrix.directory }}_r_scripts.zip + dirs=$(cat directories.json) + echo "::set-output name=directories::$dirs" + + - name: Set up matrix + id: matrix + run: echo "::set-output name=matrix::{\"include\": $(cat directories.json)}" + + - name: Convert and lint notebooks + runs-on: ubuntu-latest + strategy: + matrix: ${{ fromJson(steps.matrix.outputs.matrix) }} + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.8' # Specify your Python version here + + - name: Install nbstripout + run: | + python -m pip install --upgrade pip + pip install nbstripout + + - name: Set up R + uses: r-lib/actions/setup-r@v2 + + - name: Install rmarkdown, knitr, and lintr packages + run: | + R -e 'install.packages(c("rmarkdown", "knitr", "lintr", "xfun"), repos="https://cloud.r-project.org")' + + - name: Strip outputs from .irnb files + run: | + for notebook in ${{ matrix.name }}/*.irnb; do + ipynb_notebook="${notebook%.irnb}.ipynb" + mv "$notebook" "$ipynb_notebook" + nbstripout "$ipynb_notebook" + mv "$ipynb_notebook" "$notebook" + done + + - name: Convert .irnb to .Rmd and .R + run: | + R -e ' + files <- list.files(path = "${{ matrix.name }}", pattern = "\\.irnb$", full.names = TRUE, recursive = FALSE) + lapply(files, function(input) { + rmarkdown::convert_ipynb(input) + rmd_file <- xfun::with_ext(input, "Rmd") + knitr::purl(rmd_file, output = xfun::with_ext(input, "R")) + }) + ' + + - name: Lint .Rmd files + id: lint + run: | + R -e ' + library(lintr) + linters <- with_defaults(line_length_linter = line_length_linter(120)) + rmd_files <- list.files(path = "${{ matrix.name }}", pattern = "\\.Rmd$", full.names = TRUE) + resul - - name: Commit and push stripped .irnb and .Rmd files - run: | - git config --global user.name 'github-actions[bot]' - git config --global user.email 'github-actions[bot]@users.noreply.github.com' - git add ${{ matrix.directory }}/*.irnb ${{ matrix.directory }}/*.Rmd - git commit -m 'Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in ${{ matrix.directory }}' - git push - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From e497ea218df4bf645d6194fbfcab885ad19bf14d Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 13:31:53 -0700 Subject: [PATCH 042/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 210 ++++++++++++++--------- 1 file changed, 133 insertions(+), 77 deletions(-) diff 
--git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 3bf3f191..37c3a9e6 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -63,9 +63,6 @@ jobs: needs: [set-vars] runs-on: ubuntu-latest steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Determine directories to process id: determine run: | @@ -75,15 +72,14 @@ jobs: --arg pm3 ${{ needs.set-vars.outputs.PM3_changed }} \ --arg pm4 ${{ needs.set-vars.outputs.PM4_changed }} \ --arg pm5 ${{ needs.set-vars.outputs.PM5_changed }} \ - '[ - { "name": "PM1", "changed": $pm1 }, - { "name": "PM2", "changed": $pm2 }, - { "name": "PM3", "changed": $pm3 }, - { "name": "PM4", "changed": $pm4 }, - { "name": "PM5", "changed": $pm5 } - ] | map(select(.changed == "true")) | map(.name)') + '{ + PM1: $pm1, + PM2: $pm2, + PM3: $pm3, + PM4: $pm4, + PM5: $pm5 + } | to_entries | map(select(.value == "true")) | map(.key)') echo "$directories" > directories.json - - name: Upload directories json uses: actions/upload-artifact@v2 with: @@ -93,74 +89,134 @@ jobs: convert-lint-notebooks: needs: [determine-dirs] runs-on: ubuntu-latest + strategy: + matrix: + directory: ${{ fromJson(needs.determine-dirs.outputs.directories) }} steps: - - name: Download directories json - uses: actions/download-artifact@v2 + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 with: - name: directories-json - path: directories.json + python-version: '3.8' # Specify your Python version here + + - name: Install nbstripout + run: | + python -m pip install --upgrade pip + pip install nbstripout + + - name: Set up R + uses: r-lib/actions/setup-r@v2 - - name: Load directories from json - id: load-dirs + - name: Install rmarkdown, knitr, and lintr packages run: | - dirs=$(cat directories.json) - echo "::set-output name=directories::$dirs" - - - name: Set up matrix - id: matrix - run: echo "::set-output name=matrix::{\"include\": $(cat directories.json)}" - - - name: Convert and lint notebooks - runs-on: ubuntu-latest - strategy: - matrix: ${{ fromJson(steps.matrix.outputs.matrix) }} - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: '3.8' # Specify your Python version here - - - name: Install nbstripout - run: | - python -m pip install --upgrade pip - pip install nbstripout - - - name: Set up R - uses: r-lib/actions/setup-r@v2 - - - name: Install rmarkdown, knitr, and lintr packages - run: | - R -e 'install.packages(c("rmarkdown", "knitr", "lintr", "xfun"), repos="https://cloud.r-project.org")' - - - name: Strip outputs from .irnb files - run: | - for notebook in ${{ matrix.name }}/*.irnb; do - ipynb_notebook="${notebook%.irnb}.ipynb" - mv "$notebook" "$ipynb_notebook" - nbstripout "$ipynb_notebook" - mv "$ipynb_notebook" "$notebook" - done - - - name: Convert .irnb to .Rmd and .R - run: | - R -e ' - files <- list.files(path = "${{ matrix.name }}", pattern = "\\.irnb$", full.names = TRUE, recursive = FALSE) - lapply(files, function(input) { - rmarkdown::convert_ipynb(input) - rmd_file <- xfun::with_ext(input, "Rmd") - knitr::purl(rmd_file, output = xfun::with_ext(input, "R")) - }) - ' - - - name: Lint .Rmd files - id: lint - run: | - R -e ' - library(lintr) - linters <- with_defaults(line_length_linter = line_length_linter(120)) - rmd_files <- list.files(path = "${{ matrix.name }}", pattern = "\\.Rmd$", full.names = TRUE) 
- resul + R -e 'install.packages(c("rmarkdown", "knitr", "lintr", "xfun"), repos="https://cloud.r-project.org")' + - name: Strip outputs from .irnb files + run: | + for notebook in ${{ matrix.directory }}/*.irnb; do + ipynb_notebook="${notebook%.irnb}.ipynb" + mv "$notebook" "$ipynb_notebook" + nbstripout "$ipynb_notebook" + mv "$ipynb_notebook" "$notebook" + done + + - name: Convert .irnb to .Rmd and .R + run: | + R -e ' + files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.irnb$", full.names = TRUE, recursive = FALSE) + lapply(files, function(input) { + rmarkdown::convert_ipynb(input) + rmd_file <- xfun::with_ext(input, "Rmd") + knitr::purl(rmd_file, output = xfun::with_ext(input, "R")) + }) + ' + + - name: Lint .Rmd files + id: lint + run: | + R -e ' + library(lintr) + linters <- with_defaults(line_length_linter = line_length_linter(120)) + rmd_files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.Rmd$", full.names = TRUE) + results <- lapply(rmd_files, function(file) { + lints <- lint(file, linters) + if (length(lints) > 0) { + cat("Warnings found during linting:\n") + print(lints) + stop("Linting failed with warnings") + } + }) + ' + + - name: Execute R scripts and log output + id: execute + continue-on-error: true + run: | + log_file="${{ matrix.directory }}_r_script_execution.log" + R -e ' + files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.R$", full.names = TRUE, recursive = FALSE) + log_con <- file("'$log_file'", open = "wt") + sink(log_con, type = "output") + sink(log_con, type = "message") + errors <- list() + for (file in files) { + tryCatch( + { + source(file) + }, + error = function(e) { + errors[[length(errors) + 1]] <<- list(file = file, message = e$message) + } + ) + } + sink(type = "output") + sink(type = "message") + close(log_con) + if (length(errors) > 0) { + for (error in errors) { + cat("Error found in file:", error$file, "\n") + cat("Error message:", error$message, "\n") + } + } + ' + - name: Upload execution log + uses: actions/upload-artifact@v2 + with: + name: ${{ matrix.directory }}-r-script-execution-log + path: ${{ matrix.directory }}_r_script_execution.log + + - name: Fail if errors found + if: failure() + run: exit 1 + + - name: Delete execution log + run: rm ${{ matrix.directory }}_r_script_execution.log + + - name: Zip .R files + run: | + mkdir r_scripts + mv ${{ matrix.directory }}/*.R r_scripts/ + zip -r ${{ matrix.directory }}_r_scripts.zip r_scripts + + - name: Upload artifact + uses: actions/upload-artifact@v2 + with: + name: ${{ matrix.directory }}-r-scripts + path: ${{ matrix.directory }}_r_scripts.zip + + - name: Delete .R files and zip + run: | + rm -rf r_scripts + rm ${{ matrix.directory }}_r_scripts.zip + + - name: Commit and push stripped .irnb and .Rmd files + run: | + git config --global user.name 'github-actions[bot]' + git config --global user.email 'github-actions[bot]@users.noreply.github.com' + git add ${{ matrix.directory }}/*.irnb ${{ matrix.directory }}/*.Rmd + git commit -m 'Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in ${{ matrix.directory }}' + git push + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 1e283f12c39817775ca4f30ba0ddd3e21e134b3f Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 13:36:27 -0700 Subject: [PATCH 043/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/transform-R-to-Rmd.yml 
b/.github/workflows/transform-R-to-Rmd.yml index 37c3a9e6..3d755cc9 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -58,6 +58,16 @@ jobs: filters: | PM5: - 'PM5/*.irnb' + + - name: Set output variables + id: set_output + run: | + echo "::set-output name=PM1_changed::${{ steps.check_PM1.outputs.matched }}" + echo "::set-output name=PM2_changed::${{ steps.check_PM2.outputs.matched }}" + echo "::set-output name=PM3_changed::${{ steps.check_PM3.outputs.matched }}" + echo "::set-output name=PM4_changed::${{ steps.check_PM4.outputs.matched }}" + echo "::set-output name=PM5_changed::${{ steps.check_PM5.outputs.matched }}" + determine-dirs: needs: [set-vars] From 5f7dbf6da31af58b20c542ed7abd3f673ba69a67 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 13:41:49 -0700 Subject: [PATCH 044/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 3d755cc9..f9b211f0 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -76,20 +76,16 @@ jobs: - name: Determine directories to process id: determine run: | - directories=$(jq -n \ - --arg pm1 ${{ needs.set-vars.outputs.PM1_changed }} \ - --arg pm2 ${{ needs.set-vars.outputs.PM2_changed }} \ - --arg pm3 ${{ needs.set-vars.outputs.PM3_changed }} \ - --arg pm4 ${{ needs.set-vars.outputs.PM4_changed }} \ - --arg pm5 ${{ needs.set-vars.outputs.PM5_changed }} \ - '{ - PM1: $pm1, - PM2: $pm2, - PM3: $pm3, - PM4: $pm4, - PM5: $pm5 - } | to_entries | map(select(.value == "true")) | map(.key)') - echo "$directories" > directories.json + directories="" + [ "${{ needs.set-vars.outputs.PM1_changed }}" == "true" ] && directories="$directories PM1" + [ "${{ needs.set-vars.outputs.PM2_changed }}" == "true" ] && directories="$directories PM2" + [ "${{ needs.set-vars.outputs.PM3_changed }}" == "true" ] && directories="$directories PM3" + [ "${{ needs.set-vars.outputs.PM4_changed }}" == "true" ] && directories="$directories PM4" + [ "${{ needs.set-vars.outputs.PM5_changed }}" == "true" ] && directories="$directories PM5" + echo "directories=$(echo $directories | jq -R -s -c 'split(" ")')" > directories.json + cat directories.json + echo "::set-output name=directories::$(cat directories.json)" + - name: Upload directories json uses: actions/upload-artifact@v2 with: From c36775e4b540d24cc242b12758b96928c5eed48a Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 13:46:53 -0700 Subject: [PATCH 045/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 41 ++++++------------------ 1 file changed, 9 insertions(+), 32 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index f9b211f0..bf3f60eb 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -13,12 +13,6 @@ jobs: set-vars: runs-on: ubuntu-latest if: github.event_name == 'push' - outputs: - PM1_changed: ${{ steps.check_PM1.outputs.matched }} - PM2_changed: ${{ steps.check_PM2.outputs.matched }} - PM3_changed: ${{ steps.check_PM3.outputs.matched }} - PM4_changed: ${{ steps.check_PM4.outputs.matched }} - PM5_changed: ${{ steps.check_PM5.outputs.matched }} steps: - name: Checkout repository uses: actions/checkout@v2 @@ -58,16 +52,6 @@ jobs: filters: | PM5: - 
'PM5/*.irnb' - - - name: Set output variables - id: set_output - run: | - echo "::set-output name=PM1_changed::${{ steps.check_PM1.outputs.matched }}" - echo "::set-output name=PM2_changed::${{ steps.check_PM2.outputs.matched }}" - echo "::set-output name=PM3_changed::${{ steps.check_PM3.outputs.matched }}" - echo "::set-output name=PM4_changed::${{ steps.check_PM4.outputs.matched }}" - echo "::set-output name=PM5_changed::${{ steps.check_PM5.outputs.matched }}" - determine-dirs: needs: [set-vars] @@ -76,28 +60,21 @@ jobs: - name: Determine directories to process id: determine run: | - directories="" - [ "${{ needs.set-vars.outputs.PM1_changed }}" == "true" ] && directories="$directories PM1" - [ "${{ needs.set-vars.outputs.PM2_changed }}" == "true" ] && directories="$directories PM2" - [ "${{ needs.set-vars.outputs.PM3_changed }}" == "true" ] && directories="$directories PM3" - [ "${{ needs.set-vars.outputs.PM4_changed }}" == "true" ] && directories="$directories PM4" - [ "${{ needs.set-vars.outputs.PM5_changed }}" == "true" ] && directories="$directories PM5" - echo "directories=$(echo $directories | jq -R -s -c 'split(" ")')" > directories.json - cat directories.json - echo "::set-output name=directories::$(cat directories.json)" + directories="[" + [ "${{ needs.set-vars.outputs.PM1_changed }}" == "true" ] && directories="$directories, PM1" + [ "${{ needs.set-vars.outputs.PM2_changed }}" == "true" ] && directories="$directories, PM2" + [ "${{ needs.set-vars.outputs.PM3_changed }}" == "true" ] && directories="$directories, PM3" + [ "${{ needs.set-vars.outputs.PM4_changed }}" == "true" ] && directories="$directories, PM4" + [ "${{ needs.set-vars.outputs.PM5_changed }}" == "true" ] && directories="$directories, PM5" + directories="$directories ]" + echo "directories=$directories" - - name: Upload directories json - uses: actions/upload-artifact@v2 - with: - name: directories-json - path: directories.json - convert-lint-notebooks: needs: [determine-dirs] runs-on: ubuntu-latest strategy: matrix: - directory: ${{ fromJson(needs.determine-dirs.outputs.directories) }} + directory: ${{ needs.determine-dirs.outputs.directories) }} steps: - name: Checkout repository uses: actions/checkout@v2 From 7251e2b54769ddf0e574d213b184ff5b8f9b8591 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 13:47:38 -0700 Subject: [PATCH 046/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index bf3f60eb..18fdbcce 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -74,7 +74,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - directory: ${{ needs.determine-dirs.outputs.directories) }} + directory: ${{ needs.determine-dirs.outputs.directories }} steps: - name: Checkout repository uses: actions/checkout@v2 From 4a17dc7d6c341ed6f75dcb22d64c799f87f32188 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 13:51:50 -0700 Subject: [PATCH 047/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 62 +----------------------- 1 file changed, 1 insertion(+), 61 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 18fdbcce..db829e38 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -10,71 +10,11 @@ concurrency: cancel-in-progress: 
true jobs: - set-vars: - runs-on: ubuntu-latest - if: github.event_name == 'push' - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - - name: Check if PM1 files changed - id: check_PM1 - uses: dorny/paths-filter@v2 - with: - filters: | - PM1: - - 'PM1/*.irnb' - - name: Check if PM2 files changed - id: check_PM2 - uses: dorny/paths-filter@v2 - with: - filters: | - PM2: - - 'PM2/*.irnb' - - name: Check if PM3 files changed - id: check_PM3 - uses: dorny/paths-filter@v2 - with: - filters: | - PM3: - - 'PM3/*.irnb' - - name: Check if PM4 files changed - id: check_PM4 - uses: dorny/paths-filter@v2 - with: - filters: | - PM4: - - 'PM4/*.irnb' - - name: Check if PM5 files changed - id: check_PM5 - uses: dorny/paths-filter@v2 - with: - filters: | - PM5: - - 'PM5/*.irnb' - - determine-dirs: - needs: [set-vars] - runs-on: ubuntu-latest - steps: - - name: Determine directories to process - id: determine - run: | - directories="[" - [ "${{ needs.set-vars.outputs.PM1_changed }}" == "true" ] && directories="$directories, PM1" - [ "${{ needs.set-vars.outputs.PM2_changed }}" == "true" ] && directories="$directories, PM2" - [ "${{ needs.set-vars.outputs.PM3_changed }}" == "true" ] && directories="$directories, PM3" - [ "${{ needs.set-vars.outputs.PM4_changed }}" == "true" ] && directories="$directories, PM4" - [ "${{ needs.set-vars.outputs.PM5_changed }}" == "true" ] && directories="$directories, PM5" - directories="$directories ]" - echo "directories=$directories" - convert-lint-notebooks: - needs: [determine-dirs] runs-on: ubuntu-latest strategy: matrix: - directory: ${{ needs.determine-dirs.outputs.directories }} + directory: ['PM1'] steps: - name: Checkout repository uses: actions/checkout@v2 From 1dfd1683be28e102da8c24cfec4db7a9be164939 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 13:56:57 -0700 Subject: [PATCH 048/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index db829e38..ef4276f0 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -84,13 +84,13 @@ jobs: sink(log_con, type = "output") sink(log_con, type = "message") errors <- list() - for (file in files) { + for (gitrfile in files) { tryCatch( { - source(file) + source(gitrfile) }, error = function(e) { - errors[[length(errors) + 1]] <<- list(file = file, message = e$message) + errors[[length(errors) + 1]] <<- list(file = gitrfile, message = e$message) } ) } @@ -99,7 +99,7 @@ jobs: close(log_con) if (length(errors) > 0) { for (error in errors) { - cat("Error found in file:", error$file, "\n") + cat("Error found in file:", error$gitrfile, "\n") cat("Error message:", error$message, "\n") } } From 105c74631613ed45e0bcc1bae6fc74768c2da193 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 13:59:38 -0700 Subject: [PATCH 049/261] Update r-ols-and-lasso-for-wage-prediction.irnb --- PM1/r-ols-and-lasso-for-wage-prediction.irnb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PM1/r-ols-and-lasso-for-wage-prediction.irnb b/PM1/r-ols-and-lasso-for-wage-prediction.irnb index 8dbf6724..1282968d 100644 --- a/PM1/r-ols-and-lasso-for-wage-prediction.irnb +++ b/PM1/r-ols-and-lasso-for-wage-prediction.irnb @@ -479,7 +479,7 @@ "table <- matrix(0, 3, 5)\n", "table[1, 1:5] <- c(p1, r2_1, mse1, r2_adj1, mse_adj1)\n", "table[2, 1:5] <- c(p2, r2_2, mse2, r2_adj2, 
mse_adj2)\n", - "table[3, 1:5] <- c(pl, r2_l, mse_l, r2_adj_l, mse_adj_l)\n", + "table[3, 1:5] <- c(p_l, r2_l, mse_l, r2_adj_l, mse_adj_l)\n", "colnames(table) <- c(\"p\", \"$R^2_{sample}$\", \"$MSE_{sample}$\", \"$R^2_{adjusted}$\", \"$MSE_{adjusted}$\")\n", "rownames(table) <- c(\"basic reg\", \"flexible reg\", \"lasso flex\")\n", "tab <- xtable(table, digits = c(0, 0, 2, 2, 2, 2))\n", From 2ccc83cf200d3a986083bd4e35d836d456e00922 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 14 Jul 2024 21:10:11 +0000 Subject: [PATCH 050/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in PM1 --- ...r-ols-and-lasso-for-wage-gap-inference.Rmd | 2 +- PM1/r-ols-and-lasso-for-wage-prediction.Rmd | 214 +++++++++--------- 2 files changed, 111 insertions(+), 105 deletions(-) diff --git a/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd b/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd index 79fd5a70..573705f7 100644 --- a/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd +++ b/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd @@ -200,7 +200,7 @@ Next, we use the Frisch-Waugh-Lovell (FWL) theorem from lecture, partialling-out # Partialling-out using ols # model for Y -flex_y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) +flex_y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) # model for D flex_d <- sex ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) diff --git a/PM1/r-ols-and-lasso-for-wage-prediction.Rmd b/PM1/r-ols-and-lasso-for-wage-prediction.Rmd index d53fffcf..627cfb37 100644 --- a/PM1/r-ols-and-lasso-for-wage-prediction.Rmd +++ b/PM1/r-ols-and-lasso-for-wage-prediction.Rmd @@ -51,13 +51,13 @@ We construct the output variable $Y$ and the matrix $Z$ which includes the chara ```{r} # construct matrices for estimation from the data -Y <- log(data$wage) -n <- length(Y) -Z <- data[- which(colnames(data) %in% c("wage", "lwage"))] -p <- dim(Z)[2] +y <- log(data$wage) +n <- length(y) +z <- data[- which(colnames(data) %in% c("wage", "lwage"))] +p <- dim(z)[2] -cat("Number of observations:", n, '\n') -cat( "Number of raw regressors:", p) +cat("Number of observations:", n, "\n") +cat("Number of raw regressors:", p) ``` For the outcome variable *wage* and a subset of the raw regressors, we calculate the empirical mean to get familiar with the data. @@ -130,9 +130,10 @@ Let us fit both models to our data by running ordinary least squares (ols): ```{r} # 1. basic model -basic <- lwage ~ (sex + exp1 + shs + hsg+ scl + clg + mw + so + we +C(occ2) + C(ind2)) +basic <- lwage ~ (sex + exp1 + shs + hsg + scl + clg + mw + so + we + C(occ2) + C(ind2)) regbasic <- lm(basic, data = train) # perform ols using the defined model -cat("Number of regressors in the basic model:", length(regbasic$coef), '\n') # number of regressors in the Basic Model +# number of regressors in the Basic Model +cat("Number of regressors in the basic model:", length(regbasic$coef), "\n") ``` ##### Note that the basic model consists of $51$ regressors. @@ -140,9 +141,10 @@ cat("Number of regressors in the basic model:", length(regbasic$coef), '\n') # n ```{r} # 2. 
flexible model flex <- lwage ~ sex + shs + hsg + scl + clg + mw + so + we + C(occ2) + C(ind2) + - (exp1 + exp2 + exp3 + exp4) * (shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we) + (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) regflex <- lm(flex, data = train) -cat( "Number of regressors in the flexible model:", length(regflex$coef)) # number of regressors in the Flexible Model +# number of regressors in the Flexible Model +cat("Number of regressors in the flexible model:", length(regflex$coef)) ``` ##### Note that the flexible model consists of $246$ regressors. @@ -153,9 +155,9 @@ We re-estimate the flexible model using Lasso (the least absolute shrinkage and ```{r} # Flexible model using Lasso, in-sample fit train_flex <- model.matrix(flex, train) # all regressors -fit.lasso.cv <- cv.glmnet(as.matrix(train_flex), train$lwage, family = "gaussian", alpha = 1, nfolds = 5) +fit_lasso_cv <- cv.glmnet(as.matrix(train_flex), train$lwage, family = "gaussian", alpha = 1, nfolds = 5) # in-sample fit right now, not out-of-sample using "test" -yhat.lasso.cv <- predict(fit.lasso.cv, newx = as.matrix(train_flex), s = "lambda.min") +yhat_lasso_cv <- predict(fit_lasso_cv, newx = as.matrix(train_flex), s = "lambda.min") ``` #### Evaluating the predictive performance of the basic and flexible models in-sample @@ -167,51 +169,53 @@ sumbasic <- summary(regbasic) sumflex <- summary(regflex) # no summary() for lassocv -ntrain = nrow(train) +ntrain <- nrow(train) # R-squared and adjusted R-squared -R2.1 <- sumbasic$r.squared -cat("R-squared for the basic model: ", R2.1, "\n") -R2.adj1 <- sumbasic$adj.r.squared -cat("adjusted R-squared for the basic model: ", R2.adj1, "\n") - -R2.2 <- sumflex$r.squared -cat("R-squared for the flexible model: ", R2.2, "\n") -R2.adj2 <- sumflex$adj.r.squared -cat("adjusted R-squared for the flexible model: ", R2.adj2, "\n") - -pL <- fit.lasso.cv$nzero[fit.lasso.cv$index[1]] -R2.L <- 1 - sum((yhat.lasso.cv - train$lwage)^2) / sum((train$lwage - mean(train$lwage))^2) -cat("R-squared for the lasso with flexible model: ", R2.L, "\n") -R2.adjL <- 1 - (sum((yhat.lasso.cv - train$lwage)^2) / (ntrain - pL - 1)) / (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1)) -cat("adjusted R-squared for the flexible model: ", R2.adjL, "\n") +r2_1 <- sumbasic$r.squared +cat("R-squared for the basic model: ", r2_1, "\n") +r2_adj1 <- sumbasic$adj.r.squared +cat("adjusted R-squared for the basic model: ", r2_adj1, "\n") + +r2_2 <- sumflex$r.squared +cat("R-squared for the flexible model: ", r2_2, "\n") +r2_adj2 <- sumflex$adj.r.squared +cat("adjusted R-squared for the flexible model: ", r2_adj2, "\n") + +p_l <- fit_lasso_cv$nzero[fit_lasso_cv$index[1]] +r2_l <- 1 - sum((yhat_lasso_cv - train$lwage)^2) / sum((train$lwage - mean(train$lwage))^2) +cat("R-squared for the lasso with flexible model: ", r2_l, "\n") +r2_adj_l <- 1 - + (sum((yhat_lasso_cv - train$lwage)^2) / (ntrain - p_l - 1)) / + (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1)) +cat("adjusted R-squared for the flexible model: ", r2_adj_l, "\n") # MSE and adjusted MSE -MSE1 <- mean(sumbasic$res^2) -cat("MSE for the basic model: ", MSE1, "\n") +mse1 <- mean(sumbasic$res^2) +cat("MSE for the basic model: ", mse1, "\n") p1 <- sumbasic$df[1] # number of regressors -MSE.adj1 <- (ntrain / (ntrain - p1)) * MSE1 -cat("adjusted MSE for the basic model: ", MSE.adj1, "\n") +mse_adj1 <- (ntrain / (ntrain - p1)) * mse1 +cat("adjusted MSE for the basic model: ", mse_adj1, "\n") -MSE2 <-mean(sumflex$res^2) 
-cat("MSE for the flexible model: ", MSE2, "\n") +mse2 <- mean(sumflex$res^2) +cat("MSE for the flexible model: ", mse2, "\n") p2 <- sumflex$df[1] -MSE.adj2 <- (ntrain / (ntrain - p2)) * MSE2 -cat("adjusted MSE for the lasso flexible model: ", MSE.adj2, "\n") - -lasso.res <- train$lwage - yhat.lasso.cv -MSEL <-mean(lasso.res^2) -cat("MSE for the lasso flexible model: ", MSEL, "\n") -MSE.adjL <- (ntrain / (ntrain - pL - 1)) * MSEL -cat("adjusted MSE for the lasso flexible model: ", MSE.adjL, "\n") +mse_adj2 <- (ntrain / (ntrain - p2)) * mse2 +cat("adjusted MSE for the lasso flexible model: ", mse_adj2, "\n") + +lasso_res <- train$lwage - yhat_lasso_cv +mse_l <- mean(lasso_res^2) +cat("MSE for the lasso flexible model: ", mse_l, "\n") +mse_adj_l <- (ntrain / (ntrain - p_l - 1)) * mse_l +cat("adjusted MSE for the lasso flexible model: ", mse_adj_l, "\n") ``` ```{r} # Output the table table <- matrix(0, 3, 5) -table[1, 1:5] <- c(p1, R2.1, MSE1, R2.adj1, MSE.adj1) -table[2, 1:5] <- c(p2, R2.2, MSE2, R2.adj2, MSE.adj2) -table[3, 1:5] <- c(pL, R2.L, MSEL, R2.adjL, MSE.adjL) +table[1, 1:5] <- c(p1, r2_1, mse1, r2_adj1, mse_adj1) +table[2, 1:5] <- c(p2, r2_2, mse2, r2_adj2, mse_adj2) +table[3, 1:5] <- c(p_l, r2_l, mse_l, r2_adj_l, mse_adj_l) colnames(table) <- c("p", "$R^2_{sample}$", "$MSE_{sample}$", "$R^2_{adjusted}$", "$MSE_{adjusted}$") rownames(table) <- c("basic reg", "flexible reg", "lasso flex") tab <- xtable(table, digits = c(0, 0, 2, 2, 2, 2)) @@ -235,14 +239,14 @@ options(warn = -1) # ignore warnings regbasic <- lm(basic, data = train) # calculating the out-of-sample MSE -yhat.bas <- predict(regbasic, newdata = test) -y.test <- test$lwage -mean.train <- mean(train$lwage) -MSE.test1 <- sum((y.test - yhat.bas)^2) / length(y.test) -R2.test1 <- 1 - MSE.test1 / mean((y.test - mean.train)^2) - -cat("Test MSE for the basic model: ", MSE.test1, " ") -cat("Test R2 for the basic model: ", R2.test1) +yhat_bas <- predict(regbasic, newdata = test) +y_test <- test$lwage +mean_train <- mean(train$lwage) +mse_test1 <- sum((y_test - yhat_bas)^2) / length(y_test) +r2_test1 <- 1 - mse_test1 / mean((y_test - mean_train)^2) + +cat("Test MSE for the basic model: ", mse_test1, " ") +cat("Test R2 for the basic model: ", r2_test1) ``` In the basic model, the $MSE_{test}$ is quite close to the $MSE_{sample}$. @@ -253,15 +257,15 @@ options(warn = -1) # ignore warnings regflex <- lm(flex, data = train) # calculating the out-of-sample MSE -yhat.flex<- predict(regflex, newdata = test) -y.test <- test$lwage -mean.train <- mean(train$lwage) -MSE.test2 <- sum((y.test - yhat.flex)^2) / length(y.test) -R2.test2 <- 1 - MSE.test2 / mean((y.test - mean.train)^2) +yhat_flex <- predict(regflex, newdata = test) +y_test <- test$lwage +mean_train <- mean(train$lwage) +mse_test2 <- sum((y_test - yhat_flex)^2) / length(y_test) +r2_test2 <- 1 - mse_test2 / mean((y_test - mean_train)^2) -cat("Test MSE for the flexible model: ", MSE.test2, " ") +cat("Test MSE for the flexible model: ", mse_test2, " ") -cat("Test R2 for the flexible model: ", R2.test2) +cat("Test R2 for the flexible model: ", r2_test2) ``` In the flexible model too, the discrepancy between the $MSE_{test}$ and the $MSE_{sample}$ is not large. 
@@ -279,16 +283,16 @@ flex_data <- model.matrix(flex, data) train_flex <- flex_data[random, ] test_flex <- flex_data[-random, ] -fit.lasso.cv <- cv.glmnet(train_flex, train$lwage, family = "gaussian", alpha = 1, nfolds = 5) -yhat.lasso.cv <- predict(fit.lasso.cv, newx = test_flex, s = "lambda.min") +fit_lasso_cv <- cv.glmnet(train_flex, train$lwage, family = "gaussian", alpha = 1, nfolds = 5) +yhat_lasso_cv <- predict(fit_lasso_cv, newx = test_flex, s = "lambda.min") # calculating the out-of-sample MSE -MSE.lasso <- sum((y.test - yhat.lasso.cv)^2) / length(y.test) -R2.lasso <- 1 - MSE.lasso / mean((y.test - mean(train$lwage))^2) +mse_lasso <- sum((y_test - yhat_lasso_cv)^2) / length(y_test) +r2_lasso <- 1 - mse_lasso / mean((y_test - mean(train$lwage))^2) -cat("Test MSE for the lasso on flexible model: ", MSE.lasso, " ") +cat("Test MSE for the lasso on flexible model: ", mse_lasso, " ") -cat("Test R2 for the lasso flexible model: ", R2.lasso) +cat("Test R2 for the lasso flexible model: ", r2_lasso) ``` Finally, let us summarize the results: @@ -296,12 +300,12 @@ Finally, let us summarize the results: ```{r} # Output the comparison table table2 <- matrix(0, 3, 2) -table2[1, 1] <- MSE.test1 -table2[2, 1] <- MSE.test2 -table2[3, 1] <- MSE.lasso -table2[1, 2] <- R2.test1 -table2[2, 2] <- R2.test2 -table2[3, 2] <- R2.lasso +table2[1, 1] <- mse_test1 +table2[2, 1] <- mse_test2 +table2[3, 1] <- mse_lasso +table2[1, 2] <- r2_test1 +table2[2, 2] <- r2_test2 +table2[3, 2] <- r2_lasso rownames(table2) <- c("basic reg", "flexible reg", "lasso regression") colnames(table2) <- c("$MSE_{test}$", "$R^2_{test}$") @@ -324,34 +328,34 @@ regextra <- lm(extraflex, data = train) sumextra <- summary(regextra) cat("Number of Extra-Flex Controls", length(regextra$coef) - 1, "\n") n <- length(data$wage) -p <- length(regextra$coef); +p <- length(regextra$coef) ntrain <- length(train$wage) ``` ```{r} ## In-sample # R-squared and adjusted R-squared -R2.extra <- sumextra$r.squared -cat("R-squared for the extra flexible model (in-sample): ", R2.extra, "\n") -R2.adjextra <- sumextra$adj.r.squared -cat("adjusted R-squared for the extra flexible model (in-sample): ", R2.adjextra, "\n") +r2_extra <- sumextra$r.squared +cat("R-squared for the extra flexible model (in-sample): ", r2_extra, "\n") +r2_adjextra <- sumextra$adj.r.squared +cat("adjusted R-squared for the extra flexible model (in-sample): ", r2_adjextra, "\n") # MSE and adjusted MSE -MSE.extra <- mean(sumextra$res^2) -cat("MSE for the extra flexible model (in-sample): ", MSE.extra, "\n") -MSE.adjextra <- (ntrain / (ntrain - p)) * MSE.extra -cat("adjusted MSE for the basic model (in-sample): ", MSE.adj1, "\n") +mse_extra <- mean(sumextra$res^2) +cat("MSE for the extra flexible model (in-sample): ", mse_extra, "\n") +mse_adjextra <- (ntrain / (ntrain - p)) * mse_extra +cat("adjusted MSE for the basic model (in-sample): ", mse_adj1, "\n") ``` ```{r} ## Out-of-sample -yhat.ex <- predict(regextra, newdata=test) -y.test.ex <- test$lwage -MSE.test.ex <- sum((y.test.ex - yhat.ex)^2) / length(y.test.ex) -R2.test.ex <- 1 - MSE.test.ex / mean((y.test.ex - mean(train$lwage))^2) +yhat_ex <- predict(regextra, newdata = test) +y_test_ex <- test$lwage +mse_test_ex <- sum((y_test_ex - yhat_ex)^2) / length(y_test_ex) +r2_test_ex <- 1 - mse_test_ex / mean((y_test_ex - mean(train$lwage))^2) -cat("Test MSE for the basic model: ", MSE.test.ex, " ") -cat("Test R2 for the basic model: ", R2.test.ex) +cat("Test MSE for the basic model: ", mse_test_ex, " ") +cat("Test R2 for the basic 
model: ", r2_test_ex) ``` As we can see, a simple OLS overfits when the dimensionality of covariates is high, as the out-of-sample performance suffers dramatically in comparison to the in-sample performance. @@ -365,32 +369,34 @@ train_flex <- flex_data[random, ] test_flex <- flex_data[-random, ] # fit model -fit.lcv <- cv.glmnet(train_flex, train$lwage, family = "gaussian", alpha = 1, nfolds = 5) +fit_lcv <- cv.glmnet(train_flex, train$lwage, family = "gaussian", alpha = 1, nfolds = 5) # in-sample -yhat.lcv <- predict(fit.lcv, newx = train_flex, s = "lambda.min") +yhat_lcv <- predict(fit_lcv, newx = train_flex, s = "lambda.min") -R2.L <- 1 - sum((yhat.lcv - train$lwage)^2) / sum((train$lwage - mean(train$lwage))^2) -pL <- fit.lcv$nzero[fit.lcv$index[1]] -R2.adjL <- 1 - (sum((yhat.lcv - train$lwage) ^ 2) / (ntrain - pL - 1)) / (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1)) +r2_l <- 1 - sum((yhat_lcv - train$lwage)^2) / sum((train$lwage - mean(train$lwage))^2) +p_l <- fit_lcv$nzero[fit_lcv$index[1]] +r2_adj_l <- 1 - + (sum((yhat_lcv - train$lwage) ^ 2) / (ntrain - p_l - 1)) / + (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1)) -lasso.res <- train$lwage - yhat.lcv -MSEL <- mean(lasso.res^2) -MSE.adjL <- (ntrain / (ntrain - pL - 1)) * MSEL +lasso_res <- train$lwage - yhat_lcv +mse_l <- mean(lasso_res^2) +mse_adj_l <- (ntrain / (ntrain - p_l - 1)) * mse_l -cat("R-squared for the lasso with the extra flexible model (in-sample): ", R2.L, "\n") -cat("adjusted R-squared for the extra flexible model (in-sample): ", R2.adjL, "\n") -cat("MSE for the lasso with the extra flexible model (in-sample): ", MSEL, "\n") -cat("adjusted MSE for the lasso with the extraflexible model (in-sample): ", MSE.adjL, "\n") +cat("R-squared for the lasso with the extra flexible model (in-sample): ", r2_l, "\n") +cat("adjusted R-squared for the extra flexible model (in-sample): ", r2_adj_l, "\n") +cat("MSE for the lasso with the extra flexible model (in-sample): ", mse_l, "\n") +cat("adjusted MSE for the lasso with the extraflexible model (in-sample): ", mse_adj_l, "\n") # out-of-sample -yhat.lcv.test <- predict(fit.lcv, newx = test_flex, s = "lambda.min") -MSE.lasso <- sum((test$lwage - yhat.lcv.test)^2) / length(test$lwage) -R2.lasso <- 1 - MSE.lasso / mean((test$lwage - mean(train$lwage))^2) +yhat_lcv_test <- predict(fit_lcv, newx = test_flex, s = "lambda.min") +mse_lasso <- sum((test$lwage - yhat_lcv_test)^2) / length(test$lwage) +r2_lasso <- 1 - mse_lasso / mean((test$lwage - mean(train$lwage))^2) cat("\n") -cat("Test R2 for the lasso the extra flexible model: ", R2.lasso,"\n") -cat("Test MSE for the lasso on the extra flexible model: ", MSE.lasso) +cat("Test R2 for the lasso the extra flexible model: ", r2_lasso, "\n") +cat("Test MSE for the lasso on the extra flexible model: ", mse_lasso) ``` As shown above, the overfitting effect is mitigated with the penalized regression model. 
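The quantities computed in the hunks above follow one pattern for every model: the in-sample MSE is inflated by n / (n - p) to account for the p estimated parameters, and the out-of-sample R^2 is measured against the training-sample mean. A minimal R sketch of that bookkeeping is below; `evaluate_fit`, `train_y`, `test_df`, and `test_y` are hypothetical names introduced here for illustration, not objects defined in the patches.

```r
# Minimal sketch (hypothetical names): degrees-of-freedom-adjusted in-sample MSE
# and out-of-sample MSE / R^2 for a fitted lm() model, mirroring the quantities above.
evaluate_fit <- function(fit, train_y, test_df, test_y) {
  n <- length(train_y)
  p <- fit$rank                                # number of estimated parameters, as in summary(fit)$df[1]
  res <- train_y - predict(fit)                # in-sample residuals
  mse_in <- mean(res^2)
  mse_in_adj <- (n / (n - p)) * mse_in         # same n / (n - p) inflation used above
  yhat <- predict(fit, newdata = test_df)      # out-of-sample predictions
  mse_out <- mean((test_y - yhat)^2)
  r2_out <- 1 - mse_out / mean((test_y - mean(train_y))^2)
  c(mse_in = mse_in, mse_in_adj = mse_in_adj, mse_out = mse_out, r2_out = r2_out)
}

# Example call, assuming objects named as in the notebook chunks above:
# evaluate_fit(regbasic, train$lwage, test, test$lwage)
```

Wrapping the computation this way would replace the per-model `mse_adj` / `r2_test` blocks repeated above with a single helper, without changing any of the reported numbers.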
From 9b885d472bdf9a4349103821e1fa5d71fa1ace35 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 14:16:45 -0700 Subject: [PATCH 051/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index ef4276f0..2771d8a0 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - directory: ['PM1'] + directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5'] steps: - name: Checkout repository uses: actions/checkout@v2 From 757af80e080901cb32be3614028db9b09f940219 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 14 Jul 2024 14:17:28 -0700 Subject: [PATCH 052/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 2771d8a0..3faea168 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5'] + directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5', 'CM1', 'CM2', 'CM3', 'AC1', 'AC2', 'T'] steps: - name: Checkout repository uses: actions/checkout@v2 From 3c8a41aa46dc8e1a3c3f4303148773c01eab7f18 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 04:57:01 -0700 Subject: [PATCH 053/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 3faea168..4e83dbb1 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -68,7 +68,7 @@ jobs: if (length(lints) > 0) { cat("Warnings found during linting:\n") print(lints) - stop("Linting failed with warnings") + # stop("Linting failed with warnings") } }) ' From 189b5d6c2b6b5b14985b9bccfec1dc14565d3f18 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 15 Jul 2024 12:02:07 +0000 Subject: [PATCH 054/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in CM2 --- CM2/r-colliderbias-hollywood.Rmd | 53 +++++++++++++ CM2/r-colliderbias-hollywood.irnb | 126 +++++++++++++++++++++++++++++- 2 files changed, 178 insertions(+), 1 deletion(-) create mode 100644 CM2/r-colliderbias-hollywood.Rmd diff --git a/CM2/r-colliderbias-hollywood.Rmd b/CM2/r-colliderbias-hollywood.Rmd new file mode 100644 index 00000000..5bf2e323 --- /dev/null +++ b/CM2/r-colliderbias-hollywood.Rmd @@ -0,0 +1,53 @@ +--- +title: An R Markdown document converted from "CM2/r-colliderbias-hollywood.irnb" +output: html_document +--- + +# Collider Bias + +Here is a simple mnemonic example to illustate the collider or M-bias. + +Here the idea is that people who get to Hollywood have to have high congenility = talent + beauty. Funnily enough this induces a negative correlation between talents and looks, when we condition on the set of actors or celebrities. This simple example explains an anecdotal observation that "talent and beaty are negatively correlated" for celebrities. 
+ +```{r} +install.packages("dagitty") +library(dagitty) +``` + +```{r} +g <- dagitty( "dag{ T -> C <- B }" ) +plot(g) +``` + +```{r} +#collider bias +n=1000000 +T = rnorm(n) #talent +B = rnorm(n) #beaty +C = T+B + rnorm(n) #congeniality +T.H= subset(T, C>0) # condition on C>0 +B.H= subset(B, C>0) # condition on C>0 + +summary(lm(T~ B)) #regression of T on B +summary(lm(T~ B +C)) #regression of T on B and C +summary(lm(T.H~ B.H)) #regression of T on B, conditional on C>0. +``` + +We can also use package Dagitty to illustrate collider bias, also known as M-bias. + +```{r} +## If we want to infer causal effec of B on T, +## we can apply the command to figure out +## variables we should condition on: + +adjustmentSets( g, "T", "B" ) + +## empty set -- we should not condition on the additional +## variable C. + +## Generate data where C = .5T + .5B +set.seed( 123); d <- simulateSEM( g, .5 ) +confint( lm( T ~ B, d ) )["B",] # includes 0 +confint( lm( T ~ B + C, d ) )["B",] # does not include 0 +``` + diff --git a/CM2/r-colliderbias-hollywood.irnb b/CM2/r-colliderbias-hollywood.irnb index b82d32dd..f1648997 100644 --- a/CM2/r-colliderbias-hollywood.irnb +++ b/CM2/r-colliderbias-hollywood.irnb @@ -1 +1,125 @@ -{"cells":[{"metadata":{"id":"UqKC2AValFyz"},"cell_type":"markdown","source":["# Collider Bias"]},{"metadata":{"id":"52GxYF5-lFy1"},"cell_type":"markdown","source":["Here is a simple mnemonic example to illustate the collider or M-bias.\n","\n","Here the idea is that people who get to Hollywood have to have high congenility = talent + beauty. Funnily enough this induces a negative correlation between talents and looks, when we condition on the set of actors or celebrities. This simple example explains an anecdotal observation that \"talent and beaty are negatively correlated\" for celebrities. 
"]},{"metadata":{"trusted":true,"id":"HqSBY2MwlFy2"},"cell_type":"code","source":["install.packages(\"dagitty\")\n","library(dagitty)"],"execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"id":"eITRXDGJlFy5"},"cell_type":"code","source":["g <- dagitty( \"dag{ T -> C <- B }\" )\n","plot(g)"],"execution_count":null,"outputs":[]},{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true,"id":"XrV8UFAOlFy5"},"cell_type":"code","source":["#collider bias\n","n=1000000\n","T = rnorm(n) #talent\n","B = rnorm(n) #beaty\n","C = T+B + rnorm(n) #congeniality\n","T.H= subset(T, C>0) # condition on C>0\n","B.H= subset(B, C>0) # condition on C>0\n","\n","summary(lm(T~ B)) #regression of T on B\n","summary(lm(T~ B +C)) #regression of T on B and C\n","summary(lm(T.H~ B.H)) #regression of T on B, conditional on C>0."],"execution_count":null,"outputs":[]},{"metadata":{"id":"Y08MPHCClFy5"},"cell_type":"markdown","source":["We can also use package Dagitty to illustrate collider bias, also known as M-bias."]},{"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true,"id":"B4vPm1JRlFy6"},"cell_type":"code","source":["## If we want to infer causal effec of B on T,\n","## we can apply the command to figure out\n","## variables we should condition on:\n","\n","adjustmentSets( g, \"T\", \"B\" )\n","\n","## empty set -- we should not condition on the additional\n","## variable C.\n","\n","## Generate data where C = .5T + .5B\n","set.seed( 123); d <- simulateSEM( g, .5 )\n","confint( lm( T ~ B, d ) )[\"B\",] # includes 0\n","confint( lm( T ~ B + C, d ) )[\"B\",] # does not include 0\n"],"execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"name":"ir","display_name":"R","language":"R"},"language_info":{"name":"R","codemirror_mode":"r","pygments_lexer":"r","mimetype":"text/x-r-source","file_extension":".r","version":"3.6.3"},"colab":{"provenance":[]}},"nbformat":4,"nbformat_minor":0} \ No newline at end of file +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "UqKC2AValFyz" + }, + "source": [ + "# Collider Bias" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "52GxYF5-lFy1" + }, + "source": [ + "Here is a simple mnemonic example to illustate the collider or M-bias.\n", + "\n", + "Here the idea is that people who get to Hollywood have to have high congenility = talent + beauty. Funnily enough this induces a negative correlation between talents and looks, when we condition on the set of actors or celebrities. This simple example explains an anecdotal observation that \"talent and beaty are negatively correlated\" for celebrities. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HqSBY2MwlFy2" + }, + "outputs": [], + "source": [ + "install.packages(\"dagitty\")\n", + "library(dagitty)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eITRXDGJlFy5" + }, + "outputs": [], + "source": [ + "g <- dagitty( \"dag{ T -> C <- B }\" )\n", + "plot(g)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", + "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", + "id": "XrV8UFAOlFy5" + }, + "outputs": [], + "source": [ + "#collider bias\n", + "n=1000000\n", + "T = rnorm(n) #talent\n", + "B = rnorm(n) #beaty\n", + "C = T+B + rnorm(n) #congeniality\n", + "T.H= subset(T, C>0) # condition on C>0\n", + "B.H= subset(B, C>0) # condition on C>0\n", + "\n", + "summary(lm(T~ B)) #regression of T on B\n", + "summary(lm(T~ B +C)) #regression of T on B and C\n", + "summary(lm(T.H~ B.H)) #regression of T on B, conditional on C>0." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y08MPHCClFy5" + }, + "source": [ + "We can also use package Dagitty to illustrate collider bias, also known as M-bias." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", + "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a", + "id": "B4vPm1JRlFy6" + }, + "outputs": [], + "source": [ + "## If we want to infer causal effec of B on T,\n", + "## we can apply the command to figure out\n", + "## variables we should condition on:\n", + "\n", + "adjustmentSets( g, \"T\", \"B\" )\n", + "\n", + "## empty set -- we should not condition on the additional\n", + "## variable C.\n", + "\n", + "## Generate data where C = .5T + .5B\n", + "set.seed( 123); d <- simulateSEM( g, .5 )\n", + "confint( lm( T ~ B, d ) )[\"B\",] # includes 0\n", + "confint( lm( T ~ B + C, d ) )[\"B\",] # does not include 0\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 26bcf6f7eff23b61476ea15a3eebe3a069e11afb Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 05:09:15 -0700 Subject: [PATCH 055/261] Update r_convergence_hypothesis_double_lasso.irnb --- ...r_convergence_hypothesis_double_lasso.irnb | 755 +++++++++++------- 1 file changed, 470 insertions(+), 285 deletions(-) diff --git a/PM2/r_convergence_hypothesis_double_lasso.irnb b/PM2/r_convergence_hypothesis_double_lasso.irnb index 0dbb7183..ee0bdb8a 100644 --- a/PM2/r_convergence_hypothesis_double_lasso.irnb +++ b/PM2/r_convergence_hypothesis_double_lasso.irnb @@ -1,42 +1,31 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "ir", - "display_name": "R" - }, - "language_info": { - "name": "R" - } - }, "cells": [ { "cell_type": "markdown", - "source": [ - "# Testing the Convergence Hypothesis" - ], "metadata": { "id": "79U65py1grzb" - } + }, + "source": [ + "# Testing the Convergence Hypothesis" + ] }, { "cell_type": "code", "execution_count": 1, "metadata": { - "id": "GK-MMvLseA2Q", - "outputId": "f429014a-9f26-4030-cdb8-6d925704172d", "colab": { "base_uri": "https://localhost:8080/" + }, + "id": 
"GK-MMvLseA2Q", + "outputId": "f429014a-9f26-4030-cdb8-6d925704172d", + "vscode": { + "languageId": "r" } }, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "Installing package into ‘/usr/local/lib/R/site-library’\n", "(as ‘lib’ is unspecified)\n", @@ -97,15 +86,18 @@ }, { "cell_type": "markdown", - "source": [ - "## Introduction" - ], "metadata": { "id": "nlpSLLV6g1pc" - } + }, + "source": [ + "## Introduction" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "xXkzGJWag02O" + }, "source": [ "We provide an additional empirical example of partialling-out with Lasso to estimate the regression coefficient $\\beta_1$ in the high-dimensional linear regression model:\n", " $$\n", @@ -115,74 +107,73 @@ "Specifically, we are interested in how the rates at which economies of different countries grow ($Y$) are related to the initial wealth levels in each country ($D$) controlling for country's institutional, educational, and other similar characteristics ($W$).\n", " \n", "The relationship is captured by $\\beta_1$, the *speed of convergence/divergence*, which measures the speed at which poor countries catch up $(\\beta_1< 0)$ or fall behind $(\\beta_1> 0)$ rich countries, after controlling for $W$. Our inference question here is: do poor countries grow faster than rich countries, controlling for educational and other characteristics? In other words, is the speed of convergence negative: $ \\beta_1 <0?$ This is the Convergence Hypothesis predicted by the Solow Growth Model. This is a structural economic model. Under some strong assumptions, that we won't state here, the predictive exercise we are doing here can be given causal interpretation.\n" - ], - "metadata": { - "id": "xXkzGJWag02O" - } + ] }, { "cell_type": "markdown", - "source": [ - "## Data Analysis" - ], "metadata": { "id": "a5Ul2ppLfUBQ" - } + }, + "source": [ + "## Data Analysis" + ] }, { "cell_type": "markdown", - "source": [ - "We consider the data set GrowthData which is included in the package *hdm*. First, let us load the data set to get familiar with the data." - ], "metadata": { "id": "9GgPNICafYuK" - } + }, + "source": [ + "We consider the data set GrowthData which is included in the package *hdm*. First, let us load the data set to get familiar with the data." + ] }, { "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "_B9DWuS6fcVW", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "getdata <- function(...)\n", "{\n", - " e <- new.env()\n", - " name <- data(..., envir = e)[1]\n", - " e[[name]]\n", + " e <- new.env()\n", + " name <- data(..., envir = e)[1]\n", + " e[[name]]\n", "}\n", "\n", "# now load your data calling getdata()\n", "growth <- getdata(GrowthData)" - ], - "metadata": { - "id": "_B9DWuS6fcVW" - }, - "execution_count": 2, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "The sample contains $90$ countries and $63$ controls." - ], "metadata": { "id": "smYhqwpbffVh" - } + }, + "source": [ + "The sample contains $90$ countries and $63$ controls." 
+ ] }, { "cell_type": "code", - "source": [ - "growth" - ], + "execution_count": 3, "metadata": { - "id": "1dsF7_R4j-Qv", - "outputId": "c77d3a1a-35e5-482f-d414-75304fc218c3", "colab": { "base_uri": "https://localhost:8080/", "height": 1000 + }, + "id": "1dsF7_R4j-Qv", + "outputId": "c77d3a1a-35e5-482f-d414-75304fc218c3", + "vscode": { + "languageId": "r" } }, - "execution_count": 3, "outputs": [ { - "output_type": "display_data", "data": { "text/html": [ "\n", @@ -256,8 +247,144 @@ "\n", "
\n" ], - "text/markdown": "\nA data.frame: 90 × 63\n\n| Outcome <dbl> | intercept <int> | gdpsh465 <dbl> | bmp1l <dbl> | freeop <dbl> | freetar <dbl> | h65 <dbl> | hm65 <dbl> | hf65 <dbl> | p65 <dbl> | ⋯ ⋯ | seccf65 <dbl> | syr65 <dbl> | syrm65 <dbl> | syrf65 <dbl> | teapri65 <dbl> | teasec65 <dbl> | ex1 <dbl> | im1 <dbl> | xr65 <dbl> | tot1 <dbl> |\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n| -0.02433575 | 1 | 6.591674 | 0.2837 | 0.153491 | 0.043888 | 0.007 | 0.013 | 0.001 | 0.29 | ⋯ | 0.04 | 0.033 | 0.057 | 0.010 | 47.6 | 17.3 | 0.0729 | 0.0667 | 0.348 | -0.014727 |\n| 0.10047257 | 1 | 6.829794 | 0.6141 | 0.313509 | 0.061827 | 0.019 | 0.032 | 0.007 | 0.91 | ⋯ | 0.64 | 0.173 | 0.274 | 0.067 | 57.1 | 18.0 | 0.0940 | 0.1438 | 0.525 | 0.005750 |\n| 0.06705148 | 1 | 8.895082 | 0.0000 | 0.204244 | 0.009186 | 0.260 | 0.325 | 0.201 | 1.00 | ⋯ | 18.14 | 2.573 | 2.478 | 2.667 | 26.5 | 20.7 | 0.1741 | 0.1750 | 1.082 | -0.010040 |\n| 0.06408917 | 1 | 7.565275 | 0.1997 | 0.248714 | 0.036270 | 0.061 | 0.070 | 0.051 | 1.00 | ⋯ | 2.63 | 0.438 | 0.453 | 0.424 | 27.8 | 22.7 | 0.1265 | 0.1496 | 6.625 | -0.002195 |\n| 0.02792955 | 1 | 7.162397 | 0.1740 | 0.299252 | 0.037367 | 0.017 | 0.027 | 0.007 | 0.82 | ⋯ | 2.11 | 0.257 | 0.287 | 0.229 | 34.5 | 17.6 | 0.1211 | 0.1308 | 2.500 | 0.003283 |\n| 0.04640744 | 1 | 7.218910 | 0.0000 | 0.258865 | 0.020880 | 0.023 | 0.038 | 0.006 | 0.50 | ⋯ | 1.46 | 0.160 | 0.174 | 0.146 | 34.3 | 8.1 | 0.0634 | 0.0762 | 1.000 | -0.001747 |\n| 0.06733234 | 1 | 7.853605 | 0.0000 | 0.182525 | 0.014385 | 0.039 | 0.063 | 0.014 | 0.92 | ⋯ | 1.59 | 0.342 | 0.484 | 0.207 | 46.6 | 14.7 | 0.0342 | 0.0428 | 12.499 | 0.009092 |\n| 0.02097768 | 1 | 7.703910 | 0.2776 | 0.215275 | 0.029713 | 0.024 | 0.035 | 0.013 | 0.69 | ⋯ | 1.63 | 0.184 | 0.219 | 0.152 | 34.0 | 16.1 | 0.0864 | 0.0931 | 7.000 | 0.011630 |\n| 0.03355124 | 1 | 9.063463 | 0.0000 | 0.109614 | 0.002171 | 0.402 | 0.488 | 0.314 | 1.00 | ⋯ | 24.72 | 3.206 | 3.154 | 3.253 | 28.2 | 20.6 | 0.0594 | 0.0460 | 1.000 | 0.008169 |\n| 0.03914652 | 1 | 8.151910 | 0.1484 | 0.110885 | 0.028579 | 0.145 | 0.173 | 0.114 | 1.00 | ⋯ | 6.76 | 0.703 | 0.785 | 0.620 | 20.3 | 7.2 | 0.0524 | 0.0523 | 2.119 | 0.007584 |\n| 0.07612651 | 1 | 6.929517 | 0.0296 | 0.165784 | 0.020115 | 0.046 | 0.066 | 0.025 | 0.73 | ⋯ | 6.21 | 1.316 | 1.683 | 0.969 | 27.8 | 17.2 | 0.0560 | 0.0826 | 11.879 | 0.086032 |\n| 0.12795121 | 1 | 7.237778 | 0.2151 | 0.078488 | 0.011581 | 0.022 | 0.031 | 0.014 | 1.00 | ⋯ | 3.96 | 0.594 | 0.674 | 0.515 | 28.2 | 14.8 | 0.0270 | 0.0275 | 1.938 | 0.007666 |\n| -0.02432609 | 1 | 8.115820 | 0.4318 | 0.137482 | 0.026547 | 0.059 | 0.073 | 0.045 | 1.00 | ⋯ | 11.36 | 1.132 | 1.126 | 1.138 | 52.1 | 18.8 | 0.0804 | 0.0930 | 0.003 | 0.016968 |\n| 0.07829342 | 1 | 7.271704 | 0.1689 | 0.164598 | 0.044446 | 0.029 | 0.045 | 0.013 | 0.84 | ⋯ | 3.10 | 0.568 | 0.695 | 0.450 | 35.9 | 13.1 | 0.0617 | 0.0678 | 10.479 | 0.004573 |\n| 0.11291155 | 1 | 7.121252 | 0.1832 | 0.188016 | 0.045678 | 0.033 | 0.051 | 0.015 | 0.91 | ⋯ | 3.16 | 0.440 | 0.512 | 0.369 | 37.4 | 12.7 | 0.0775 | 0.0780 | 18.476 | -0.020322 |\n| 0.05230819 | 1 | 6.977281 | 0.0962 | 0.204611 | 0.077852 | 0.037 | 0.043 | 0.030 | 1.00 | ⋯ | 2.40 | 0.419 | 0.548 | 0.299 | 30.3 | 7.9 | 0.0668 | 0.0787 | 125.990 | 0.028916 |\n| 0.03639089 | 1 | 7.649693 | 0.0227 | 0.136287 | 0.046730 | 0.081 | 0.105 | 0.056 | 0.99 | ⋯ | 3.51 | 0.562 | 0.699 | 0.427 | 35.7 | 14.7 | 0.0872 | 0.0938 | 26.800 | 0.020228 |\n| 0.02973823 | 1 | 8.056744 | 0.0208 | 0.197853 | 
0.037224 | 0.083 | 0.097 | 0.069 | 1.00 | ⋯ | 3.30 | 0.722 | 0.765 | 0.680 | 36.6 | 12.6 | 0.0557 | 0.0624 | 0.052 | 0.013407 |\n| -0.05664358 | 1 | 8.780941 | 0.2654 | 0.189867 | 0.031747 | 0.068 | 0.089 | 0.046 | 0.94 | ⋯ | 2.99 | 0.372 | 0.462 | 0.281 | 34.0 | 20.3 | 0.3178 | 0.1583 | 4.500 | -0.024761 |\n| 0.01920480 | 1 | 6.287859 | 0.4207 | 0.130682 | 0.109921 | 0.053 | 0.039 | 0.011 | 0.74 | ⋯ | 0.34 | 0.142 | 0.223 | 0.055 | 35.5 | 19.1 | 0.0201 | 0.0341 | 4.762 | -0.021656 |\n| 0.08520600 | 1 | 6.137727 | 0.1371 | 0.123818 | 0.015897 | 0.028 | 0.025 | 0.007 | 0.72 | ⋯ | 0.56 | 0.148 | 0.232 | 0.065 | 41.3 | 21.3 | 0.0298 | 0.0297 | 4.125 | -0.054872 |\n| 0.13398221 | 1 | 8.128880 | 0.0000 | 0.167210 | 0.003311 | 0.129 | 0.196 | 0.063 | 1.00 | ⋯ | 13.16 | 1.727 | 1.910 | 1.560 | 28.1 | 23.2 | 0.0570 | 0.0609 | 360.000 | -0.054874 |\n| 0.17302474 | 1 | 6.680855 | 0.4713 | 0.228424 | 0.029328 | 0.062 | 0.090 | 0.032 | 1.00 | ⋯ | 3.95 | 0.974 | 1.526 | 0.470 | 62.4 | 34.9 | 0.0206 | 0.0618 | 265.690 | 0.018194 |\n| 0.10969915 | 1 | 7.177019 | 0.0178 | 0.185240 | 0.015453 | 0.020 | 0.026 | 0.013 | 0.90 | ⋯ | 1.89 | 0.571 | 0.843 | 0.286 | 26.9 | 24.1 | 0.2295 | 0.1990 | 3.061 | -0.034733 |\n| 0.01598990 | 1 | 6.648985 | 0.4762 | 0.171181 | 0.058937 | 0.018 | 0.028 | 0.007 | 0.40 | ⋯ | 0.76 | 0.357 | 0.512 | 0.185 | 39.9 | 26.9 | 0.0178 | 0.0634 | 4.762 | -0.000222 |\n| 0.06224977 | 1 | 6.879356 | 0.2927 | 0.179508 | 0.035842 | 0.188 | 0.169 | 0.208 | 1.00 | ⋯ | 3.69 | 0.651 | 0.759 | 0.547 | 31.4 | 31.2 | 0.0695 | 0.0728 | 4.017 | 0.033636 |\n| 0.10987069 | 1 | 7.347300 | 0.1017 | 0.247626 | 0.037392 | 0.080 | 0.133 | 0.027 | 0.78 | ⋯ | 0.72 | 0.195 | 0.303 | 0.085 | 36.2 | 21.6 | 0.0860 | 0.0898 | 3.177 | 0.010162 |\n| 0.09210628 | 1 | 6.725034 | 0.0266 | 0.179933 | 0.046376 | 0.015 | 0.020 | 0.010 | 0.78 | ⋯ | 0.86 | 0.258 | 0.382 | 0.137 | 34.6 | 16.5 | 0.0558 | 0.0613 | 20.800 | -0.018514 |\n| 0.08337604 | 1 | 8.451053 | 0.0000 | 0.358556 | 0.016468 | 0.090 | 0.133 | 0.044 | 1.00 | ⋯ | 2.91 | 0.766 | 1.087 | 0.510 | 21.9 | 15.3 | 0.1687 | 0.1635 | 26.000 | 0.010943 |\n| 0.07623345 | 1 | 8.602453 | 0.0000 | 0.416234 | 0.014721 | 0.148 | 0.194 | 0.100 | 1.00 | ⋯ | 12.17 | 1.554 | 1.724 | 1.398 | 20.6 | 7.2 | 0.2629 | 0.2698 | 50.000 | -0.001521 |\n| ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋱ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |\n| -0.03404539 | 1 | 8.346168 | 0.3199 | 0.110885 | 0.028579 | 0.272 | 0.289 | 0.272 | 1.00 | ⋯ | 9.58 | 0.919 | 0.936 | 0.902 | 18.2 | 7.7 | 0.0625 | 0.0578 | 36.603 | 0.014286 |\n| -0.03380635 | 1 | 7.303170 | 0.3133 | 0.165784 | 0.020115 | 0.112 | 0.132 | 0.065 | 0.85 | ⋯ | 5.60 | 1.158 | 1.473 | 0.862 | 22.7 | 18.2 | 0.1071 | 0.1028 | 20.000 | 0.111198 |\n| 0.06991488 | 1 | 7.859027 | 0.1222 | 0.078488 | 0.011581 | 0.107 | 0.103 | 0.092 | 0.88 | ⋯ | 2.80 | 0.596 | 0.645 | 0.548 | 21.7 | 14.4 | 0.0357 | 0.0466 | 8.127 | 0.006002 |\n| -0.08172560 | 1 | 7.998335 | 1.6378 | 0.137482 | 0.026547 | 0.156 | 0.181 | 0.150 | 1.00 | ⋯ | 13.74 | 1.339 | 1.222 | 1.445 | 34.8 | 15.2 | 0.0783 | 0.0847 | 4.911 | -0.127025 |\n| 0.04601005 | 1 | 7.655864 | 0.1345 | 0.164598 | 0.044446 | 0.080 | 0.097 | 0.058 | 1.00 | ⋯ | 8.25 | 1.076 | 1.143 | 1.013 | 32.1 | 19.4 | 0.0525 | 0.0572 | 30.929 | -0.004592 |\n| 0.06659809 | 1 | 7.675082 | 0.0898 | 0.188016 | 0.045678 | 0.269 | 0.338 | 0.200 | 1.00 | ⋯ | 5.80 | 0.687 | 0.745 | 0.630 | 37.5 | 16.4 | 0.0906 | 0.0959 | 25.000 | 0.191066 |\n| -0.01138424 | 1 | 7.830028 | 0.4880 | 0.136287 | 0.046730 | 0.146 | 0.193 | 0.094 | 1.00 
| ⋯ | 6.42 | 0.950 | 1.129 | 0.772 | 39.1 | 23.8 | 0.0764 | 0.0866 | 40.500 | -0.007018 |\n| -0.10098990 | 1 | 8.498622 | 0.0010 | 0.189867 | 0.031747 | 0.181 | 0.190 | 0.159 | 0.97 | ⋯ | 7.63 | 0.801 | 0.850 | 0.752 | 30.2 | 16.8 | 0.2131 | 0.1437 | 4.285 | 0.168536 |\n| 0.05475087 | 1 | 6.216606 | 0.7557 | 0.214345 | 0.073495 | 0.023 | 0.051 | 0.006 | 0.73 | ⋯ | 0.44 | 0.282 | 0.488 | 0.051 | 50.6 | 21.8 | 0.0232 | 0.0407 | 8.876 | -0.084064 |\n| 0.09461817 | 1 | 8.414496 | 0.0000 | 0.374328 | 0.000000 | 0.101 | 0.147 | 0.053 | 1.00 | ⋯ | 11.80 | 1.846 | 2.369 | 1.301 | 31.1 | 24.3 | 0.5958 | 0.5819 | 4.935 | 0.021808 |\n| 0.04571529 | 1 | 6.383507 | 0.3556 | 0.130682 | 0.109921 | 0.086 | 0.130 | 0.042 | 0.79 | ⋯ | 1.00 | 0.446 | 0.713 | 0.163 | 42.3 | 19.8 | 0.0188 | 0.0222 | 8.653 | -0.012443 |\n| 0.06549111 | 1 | 8.782323 | 0.0000 | 0.167210 | 0.003311 | 0.246 | 0.331 | 0.160 | 0.99 | ⋯ | 15.52 | 1.969 | 2.121 | 1.828 | 25.3 | 17.5 | 0.1032 | 0.0958 | 296.800 | -0.057094 |\n| 0.02124651 | 1 | 7.251345 | 0.0516 | 0.263813 | 0.045225 | 0.090 | 0.053 | 0.030 | 0.88 | ⋯ | 4.00 | 0.817 | 1.205 | 0.413 | 34.7 | 21.1 | 0.0730 | 0.2227 | 0.320 | 0.128443 |\n| 0.14144548 | 1 | 7.511525 | 0.1053 | 0.228424 | 0.029328 | 0.103 | 0.139 | 0.054 | 1.00 | ⋯ | 9.33 | 1.700 | 2.369 | 1.060 | 51.7 | 37.1 | 0.0903 | 0.1229 | 484.000 | 0.007257 |\n| 0.09681623 | 1 | 7.713785 | 0.0050 | 0.185240 | 0.015453 | 0.031 | 0.042 | 0.016 | 0.91 | ⋯ | 4.35 | 0.891 | 1.255 | 0.517 | 31.8 | 27.3 | 0.1922 | 0.1821 | 2.402 | 0.030424 |\n| 0.04053420 | 1 | 6.728629 | 0.6190 | 0.171181 | 0.058937 | 0.019 | 0.027 | 0.009 | 0.46 | ⋯ | 1.01 | 0.670 | 1.039 | 0.271 | 40.1 | 18.0 | 0.0281 | 0.0459 | 9.900 | -0.012137 |\n| 0.01058841 | 1 | 7.186144 | 0.0760 | 0.179508 | 0.035842 | 0.184 | 0.173 | 0.217 | 1.00 | ⋯ | 5.34 | 0.943 | 1.049 | 0.837 | 29.0 | 31.5 | 0.0703 | 0.0716 | 7.248 | 0.009640 |\n| 0.18552649 | 1 | 8.326033 | 0.0050 | 0.321658 | 0.005106 | 0.090 | 0.109 | 0.075 | 1.00 | ⋯ | 4.64 | 1.127 | 1.427 | 0.817 | 30.5 | 23.1 | 0.7470 | 0.8489 | 2.371 | 0.051395 |\n| 0.09310491 | 1 | 7.894691 | 0.1062 | 0.247626 | 0.037392 | 0.121 | 0.175 | 0.063 | 0.96 | ⋯ | 1.47 | 0.481 | 0.761 | 0.200 | 33.8 | 19.6 | 0.0797 | 0.1018 | 3.017 | 0.207492 |\n| 0.06522856 | 1 | 7.175490 | 0.0000 | 0.179933 | 0.046376 | 0.035 | 0.040 | 0.027 | 0.83 | ⋯ | 1.23 | 0.332 | 0.451 | 0.219 | 27.9 | 27.2 | 0.0636 | 0.0721 | 20.379 | 0.018019 |\n| 0.03809502 | 1 | 9.030974 | 0.0000 | 0.293138 | 0.005517 | 0.245 | 0.251 | 0.238 | 1.00 | ⋯ | 7.50 | 1.167 | 1.210 | 1.128 | 22.5 | 15.5 | 0.1662 | 0.1617 | 4.286 | -0.006642 |\n| 0.03421300 | 1 | 8.995537 | 0.0000 | 0.304720 | 0.011658 | 0.246 | 0.260 | 0.190 | 1.00 | ⋯ | 6.75 | 0.667 | 0.776 | 0.575 | 23.5 | 15.0 | 0.2597 | 0.2288 | 2.460 | -0.003241 |\n| 0.05275914 | 1 | 8.234830 | 0.0363 | 0.288405 | 0.011589 | 0.183 | 0.222 | 0.142 | 1.00 | ⋯ | 8.18 | 1.010 | 1.220 | 0.821 | 30.2 | 28.3 | 0.1044 | 0.1796 | 32.051 | -0.034352 |\n| 0.03841564 | 1 | 8.332549 | 0.0000 | 0.345485 | 0.006503 | 0.188 | 0.248 | 0.136 | 1.00 | ⋯ | 13.12 | 1.576 | 1.567 | 1.585 | 31.0 | 14.3 | 0.2866 | 0.3500 | 0.452 | -0.001660 |\n| 0.03189479 | 1 | 8.645586 | 0.0000 | 0.288440 | 0.005995 | 0.256 | 0.301 | 0.199 | 1.00 | ⋯ | 6.91 | 1.307 | 1.579 | 1.062 | 18.9 | 11.3 | 0.1296 | 0.1458 | 652.850 | -0.046278 |\n| 0.03119598 | 1 | 8.991064 | 0.0000 | 0.371898 | 0.014586 | 0.255 | 0.336 | 0.170 | 0.98 | ⋯ | 11.41 | 2.226 | 2.494 | 1.971 | 27.5 | 15.9 | 0.4407 | 0.4257 | 2.529 | -0.011883 |\n| 0.03409566 | 1 | 8.025189 | 0.0050 | 
0.296437 | 0.013615 | 0.108 | 0.117 | 0.093 | 1.00 | ⋯ | 1.95 | 0.510 | 0.694 | 0.362 | 20.2 | 15.7 | 0.1669 | 0.2201 | 25.553 | -0.039080 |\n| 0.04690046 | 1 | 9.030137 | 0.0000 | 0.265778 | 0.008629 | 0.288 | 0.337 | 0.237 | 1.00 | ⋯ | 25.64 | 2.727 | 2.664 | 2.788 | 20.4 | 9.4 | 0.3238 | 0.3134 | 4.152 | 0.005175 |\n| 0.03977337 | 1 | 8.865312 | 0.0000 | 0.282939 | 0.005048 | 0.188 | 0.236 | 0.139 | 1.00 | ⋯ | 10.76 | 1.888 | 1.920 | 1.860 | 20.0 | 16.0 | 0.1845 | 0.1940 | 0.452 | -0.029551 |\n| 0.04064154 | 1 | 8.912339 | 0.0000 | 0.150366 | 0.024377 | 0.257 | 0.338 | 0.215 | 1.00 | ⋯ | 24.40 | 3.051 | 3.235 | 2.875 | 18.5 | 29.1 | 0.1876 | 0.2007 | 0.886 | -0.036482 |\n\n", - "text/latex": "A data.frame: 90 × 63\n\\begin{tabular}{lllllllllllllllllllll}\n Outcome & intercept & gdpsh465 & bmp1l & freeop & freetar & h65 & hm65 & hf65 & p65 & ⋯ & seccf65 & syr65 & syrm65 & syrf65 & teapri65 & teasec65 & ex1 & im1 & xr65 & tot1\\\\\n & & & & & & & & & & ⋯ & & & & & & & & & & \\\\\n\\hline\n\t -0.02433575 & 1 & 6.591674 & 0.2837 & 0.153491 & 0.043888 & 0.007 & 0.013 & 0.001 & 0.29 & ⋯ & 0.04 & 0.033 & 0.057 & 0.010 & 47.6 & 17.3 & 0.0729 & 0.0667 & 0.348 & -0.014727\\\\\n\t 0.10047257 & 1 & 6.829794 & 0.6141 & 0.313509 & 0.061827 & 0.019 & 0.032 & 0.007 & 0.91 & ⋯ & 0.64 & 0.173 & 0.274 & 0.067 & 57.1 & 18.0 & 0.0940 & 0.1438 & 0.525 & 0.005750\\\\\n\t 0.06705148 & 1 & 8.895082 & 0.0000 & 0.204244 & 0.009186 & 0.260 & 0.325 & 0.201 & 1.00 & ⋯ & 18.14 & 2.573 & 2.478 & 2.667 & 26.5 & 20.7 & 0.1741 & 0.1750 & 1.082 & -0.010040\\\\\n\t 0.06408917 & 1 & 7.565275 & 0.1997 & 0.248714 & 0.036270 & 0.061 & 0.070 & 0.051 & 1.00 & ⋯ & 2.63 & 0.438 & 0.453 & 0.424 & 27.8 & 22.7 & 0.1265 & 0.1496 & 6.625 & -0.002195\\\\\n\t 0.02792955 & 1 & 7.162397 & 0.1740 & 0.299252 & 0.037367 & 0.017 & 0.027 & 0.007 & 0.82 & ⋯ & 2.11 & 0.257 & 0.287 & 0.229 & 34.5 & 17.6 & 0.1211 & 0.1308 & 2.500 & 0.003283\\\\\n\t 0.04640744 & 1 & 7.218910 & 0.0000 & 0.258865 & 0.020880 & 0.023 & 0.038 & 0.006 & 0.50 & ⋯ & 1.46 & 0.160 & 0.174 & 0.146 & 34.3 & 8.1 & 0.0634 & 0.0762 & 1.000 & -0.001747\\\\\n\t 0.06733234 & 1 & 7.853605 & 0.0000 & 0.182525 & 0.014385 & 0.039 & 0.063 & 0.014 & 0.92 & ⋯ & 1.59 & 0.342 & 0.484 & 0.207 & 46.6 & 14.7 & 0.0342 & 0.0428 & 12.499 & 0.009092\\\\\n\t 0.02097768 & 1 & 7.703910 & 0.2776 & 0.215275 & 0.029713 & 0.024 & 0.035 & 0.013 & 0.69 & ⋯ & 1.63 & 0.184 & 0.219 & 0.152 & 34.0 & 16.1 & 0.0864 & 0.0931 & 7.000 & 0.011630\\\\\n\t 0.03355124 & 1 & 9.063463 & 0.0000 & 0.109614 & 0.002171 & 0.402 & 0.488 & 0.314 & 1.00 & ⋯ & 24.72 & 3.206 & 3.154 & 3.253 & 28.2 & 20.6 & 0.0594 & 0.0460 & 1.000 & 0.008169\\\\\n\t 0.03914652 & 1 & 8.151910 & 0.1484 & 0.110885 & 0.028579 & 0.145 & 0.173 & 0.114 & 1.00 & ⋯ & 6.76 & 0.703 & 0.785 & 0.620 & 20.3 & 7.2 & 0.0524 & 0.0523 & 2.119 & 0.007584\\\\\n\t 0.07612651 & 1 & 6.929517 & 0.0296 & 0.165784 & 0.020115 & 0.046 & 0.066 & 0.025 & 0.73 & ⋯ & 6.21 & 1.316 & 1.683 & 0.969 & 27.8 & 17.2 & 0.0560 & 0.0826 & 11.879 & 0.086032\\\\\n\t 0.12795121 & 1 & 7.237778 & 0.2151 & 0.078488 & 0.011581 & 0.022 & 0.031 & 0.014 & 1.00 & ⋯ & 3.96 & 0.594 & 0.674 & 0.515 & 28.2 & 14.8 & 0.0270 & 0.0275 & 1.938 & 0.007666\\\\\n\t -0.02432609 & 1 & 8.115820 & 0.4318 & 0.137482 & 0.026547 & 0.059 & 0.073 & 0.045 & 1.00 & ⋯ & 11.36 & 1.132 & 1.126 & 1.138 & 52.1 & 18.8 & 0.0804 & 0.0930 & 0.003 & 0.016968\\\\\n\t 0.07829342 & 1 & 7.271704 & 0.1689 & 0.164598 & 0.044446 & 0.029 & 0.045 & 0.013 & 0.84 & ⋯ & 3.10 & 0.568 & 0.695 & 0.450 & 35.9 & 13.1 & 0.0617 & 0.0678 & 10.479 & 
0.004573\\\\\n\t 0.11291155 & 1 & 7.121252 & 0.1832 & 0.188016 & 0.045678 & 0.033 & 0.051 & 0.015 & 0.91 & ⋯ & 3.16 & 0.440 & 0.512 & 0.369 & 37.4 & 12.7 & 0.0775 & 0.0780 & 18.476 & -0.020322\\\\\n\t 0.05230819 & 1 & 6.977281 & 0.0962 & 0.204611 & 0.077852 & 0.037 & 0.043 & 0.030 & 1.00 & ⋯ & 2.40 & 0.419 & 0.548 & 0.299 & 30.3 & 7.9 & 0.0668 & 0.0787 & 125.990 & 0.028916\\\\\n\t 0.03639089 & 1 & 7.649693 & 0.0227 & 0.136287 & 0.046730 & 0.081 & 0.105 & 0.056 & 0.99 & ⋯ & 3.51 & 0.562 & 0.699 & 0.427 & 35.7 & 14.7 & 0.0872 & 0.0938 & 26.800 & 0.020228\\\\\n\t 0.02973823 & 1 & 8.056744 & 0.0208 & 0.197853 & 0.037224 & 0.083 & 0.097 & 0.069 & 1.00 & ⋯ & 3.30 & 0.722 & 0.765 & 0.680 & 36.6 & 12.6 & 0.0557 & 0.0624 & 0.052 & 0.013407\\\\\n\t -0.05664358 & 1 & 8.780941 & 0.2654 & 0.189867 & 0.031747 & 0.068 & 0.089 & 0.046 & 0.94 & ⋯ & 2.99 & 0.372 & 0.462 & 0.281 & 34.0 & 20.3 & 0.3178 & 0.1583 & 4.500 & -0.024761\\\\\n\t 0.01920480 & 1 & 6.287859 & 0.4207 & 0.130682 & 0.109921 & 0.053 & 0.039 & 0.011 & 0.74 & ⋯ & 0.34 & 0.142 & 0.223 & 0.055 & 35.5 & 19.1 & 0.0201 & 0.0341 & 4.762 & -0.021656\\\\\n\t 0.08520600 & 1 & 6.137727 & 0.1371 & 0.123818 & 0.015897 & 0.028 & 0.025 & 0.007 & 0.72 & ⋯ & 0.56 & 0.148 & 0.232 & 0.065 & 41.3 & 21.3 & 0.0298 & 0.0297 & 4.125 & -0.054872\\\\\n\t 0.13398221 & 1 & 8.128880 & 0.0000 & 0.167210 & 0.003311 & 0.129 & 0.196 & 0.063 & 1.00 & ⋯ & 13.16 & 1.727 & 1.910 & 1.560 & 28.1 & 23.2 & 0.0570 & 0.0609 & 360.000 & -0.054874\\\\\n\t 0.17302474 & 1 & 6.680855 & 0.4713 & 0.228424 & 0.029328 & 0.062 & 0.090 & 0.032 & 1.00 & ⋯ & 3.95 & 0.974 & 1.526 & 0.470 & 62.4 & 34.9 & 0.0206 & 0.0618 & 265.690 & 0.018194\\\\\n\t 0.10969915 & 1 & 7.177019 & 0.0178 & 0.185240 & 0.015453 & 0.020 & 0.026 & 0.013 & 0.90 & ⋯ & 1.89 & 0.571 & 0.843 & 0.286 & 26.9 & 24.1 & 0.2295 & 0.1990 & 3.061 & -0.034733\\\\\n\t 0.01598990 & 1 & 6.648985 & 0.4762 & 0.171181 & 0.058937 & 0.018 & 0.028 & 0.007 & 0.40 & ⋯ & 0.76 & 0.357 & 0.512 & 0.185 & 39.9 & 26.9 & 0.0178 & 0.0634 & 4.762 & -0.000222\\\\\n\t 0.06224977 & 1 & 6.879356 & 0.2927 & 0.179508 & 0.035842 & 0.188 & 0.169 & 0.208 & 1.00 & ⋯ & 3.69 & 0.651 & 0.759 & 0.547 & 31.4 & 31.2 & 0.0695 & 0.0728 & 4.017 & 0.033636\\\\\n\t 0.10987069 & 1 & 7.347300 & 0.1017 & 0.247626 & 0.037392 & 0.080 & 0.133 & 0.027 & 0.78 & ⋯ & 0.72 & 0.195 & 0.303 & 0.085 & 36.2 & 21.6 & 0.0860 & 0.0898 & 3.177 & 0.010162\\\\\n\t 0.09210628 & 1 & 6.725034 & 0.0266 & 0.179933 & 0.046376 & 0.015 & 0.020 & 0.010 & 0.78 & ⋯ & 0.86 & 0.258 & 0.382 & 0.137 & 34.6 & 16.5 & 0.0558 & 0.0613 & 20.800 & -0.018514\\\\\n\t 0.08337604 & 1 & 8.451053 & 0.0000 & 0.358556 & 0.016468 & 0.090 & 0.133 & 0.044 & 1.00 & ⋯ & 2.91 & 0.766 & 1.087 & 0.510 & 21.9 & 15.3 & 0.1687 & 0.1635 & 26.000 & 0.010943\\\\\n\t 0.07623345 & 1 & 8.602453 & 0.0000 & 0.416234 & 0.014721 & 0.148 & 0.194 & 0.100 & 1.00 & ⋯ & 12.17 & 1.554 & 1.724 & 1.398 & 20.6 & 7.2 & 0.2629 & 0.2698 & 50.000 & -0.001521\\\\\n\t ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋱ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮\\\\\n\t -0.03404539 & 1 & 8.346168 & 0.3199 & 0.110885 & 0.028579 & 0.272 & 0.289 & 0.272 & 1.00 & ⋯ & 9.58 & 0.919 & 0.936 & 0.902 & 18.2 & 7.7 & 0.0625 & 0.0578 & 36.603 & 0.014286\\\\\n\t -0.03380635 & 1 & 7.303170 & 0.3133 & 0.165784 & 0.020115 & 0.112 & 0.132 & 0.065 & 0.85 & ⋯ & 5.60 & 1.158 & 1.473 & 0.862 & 22.7 & 18.2 & 0.1071 & 0.1028 & 20.000 & 0.111198\\\\\n\t 0.06991488 & 1 & 7.859027 & 0.1222 & 0.078488 & 0.011581 & 0.107 & 0.103 & 0.092 & 0.88 & ⋯ & 2.80 & 0.596 & 0.645 & 0.548 & 21.7 & 14.4 & 0.0357 & 
0.0466 & 8.127 & 0.006002\\\\\n\t -0.08172560 & 1 & 7.998335 & 1.6378 & 0.137482 & 0.026547 & 0.156 & 0.181 & 0.150 & 1.00 & ⋯ & 13.74 & 1.339 & 1.222 & 1.445 & 34.8 & 15.2 & 0.0783 & 0.0847 & 4.911 & -0.127025\\\\\n\t 0.04601005 & 1 & 7.655864 & 0.1345 & 0.164598 & 0.044446 & 0.080 & 0.097 & 0.058 & 1.00 & ⋯ & 8.25 & 1.076 & 1.143 & 1.013 & 32.1 & 19.4 & 0.0525 & 0.0572 & 30.929 & -0.004592\\\\\n\t 0.06659809 & 1 & 7.675082 & 0.0898 & 0.188016 & 0.045678 & 0.269 & 0.338 & 0.200 & 1.00 & ⋯ & 5.80 & 0.687 & 0.745 & 0.630 & 37.5 & 16.4 & 0.0906 & 0.0959 & 25.000 & 0.191066\\\\\n\t -0.01138424 & 1 & 7.830028 & 0.4880 & 0.136287 & 0.046730 & 0.146 & 0.193 & 0.094 & 1.00 & ⋯ & 6.42 & 0.950 & 1.129 & 0.772 & 39.1 & 23.8 & 0.0764 & 0.0866 & 40.500 & -0.007018\\\\\n\t -0.10098990 & 1 & 8.498622 & 0.0010 & 0.189867 & 0.031747 & 0.181 & 0.190 & 0.159 & 0.97 & ⋯ & 7.63 & 0.801 & 0.850 & 0.752 & 30.2 & 16.8 & 0.2131 & 0.1437 & 4.285 & 0.168536\\\\\n\t 0.05475087 & 1 & 6.216606 & 0.7557 & 0.214345 & 0.073495 & 0.023 & 0.051 & 0.006 & 0.73 & ⋯ & 0.44 & 0.282 & 0.488 & 0.051 & 50.6 & 21.8 & 0.0232 & 0.0407 & 8.876 & -0.084064\\\\\n\t 0.09461817 & 1 & 8.414496 & 0.0000 & 0.374328 & 0.000000 & 0.101 & 0.147 & 0.053 & 1.00 & ⋯ & 11.80 & 1.846 & 2.369 & 1.301 & 31.1 & 24.3 & 0.5958 & 0.5819 & 4.935 & 0.021808\\\\\n\t 0.04571529 & 1 & 6.383507 & 0.3556 & 0.130682 & 0.109921 & 0.086 & 0.130 & 0.042 & 0.79 & ⋯ & 1.00 & 0.446 & 0.713 & 0.163 & 42.3 & 19.8 & 0.0188 & 0.0222 & 8.653 & -0.012443\\\\\n\t 0.06549111 & 1 & 8.782323 & 0.0000 & 0.167210 & 0.003311 & 0.246 & 0.331 & 0.160 & 0.99 & ⋯ & 15.52 & 1.969 & 2.121 & 1.828 & 25.3 & 17.5 & 0.1032 & 0.0958 & 296.800 & -0.057094\\\\\n\t 0.02124651 & 1 & 7.251345 & 0.0516 & 0.263813 & 0.045225 & 0.090 & 0.053 & 0.030 & 0.88 & ⋯ & 4.00 & 0.817 & 1.205 & 0.413 & 34.7 & 21.1 & 0.0730 & 0.2227 & 0.320 & 0.128443\\\\\n\t 0.14144548 & 1 & 7.511525 & 0.1053 & 0.228424 & 0.029328 & 0.103 & 0.139 & 0.054 & 1.00 & ⋯ & 9.33 & 1.700 & 2.369 & 1.060 & 51.7 & 37.1 & 0.0903 & 0.1229 & 484.000 & 0.007257\\\\\n\t 0.09681623 & 1 & 7.713785 & 0.0050 & 0.185240 & 0.015453 & 0.031 & 0.042 & 0.016 & 0.91 & ⋯ & 4.35 & 0.891 & 1.255 & 0.517 & 31.8 & 27.3 & 0.1922 & 0.1821 & 2.402 & 0.030424\\\\\n\t 0.04053420 & 1 & 6.728629 & 0.6190 & 0.171181 & 0.058937 & 0.019 & 0.027 & 0.009 & 0.46 & ⋯ & 1.01 & 0.670 & 1.039 & 0.271 & 40.1 & 18.0 & 0.0281 & 0.0459 & 9.900 & -0.012137\\\\\n\t 0.01058841 & 1 & 7.186144 & 0.0760 & 0.179508 & 0.035842 & 0.184 & 0.173 & 0.217 & 1.00 & ⋯ & 5.34 & 0.943 & 1.049 & 0.837 & 29.0 & 31.5 & 0.0703 & 0.0716 & 7.248 & 0.009640\\\\\n\t 0.18552649 & 1 & 8.326033 & 0.0050 & 0.321658 & 0.005106 & 0.090 & 0.109 & 0.075 & 1.00 & ⋯ & 4.64 & 1.127 & 1.427 & 0.817 & 30.5 & 23.1 & 0.7470 & 0.8489 & 2.371 & 0.051395\\\\\n\t 0.09310491 & 1 & 7.894691 & 0.1062 & 0.247626 & 0.037392 & 0.121 & 0.175 & 0.063 & 0.96 & ⋯ & 1.47 & 0.481 & 0.761 & 0.200 & 33.8 & 19.6 & 0.0797 & 0.1018 & 3.017 & 0.207492\\\\\n\t 0.06522856 & 1 & 7.175490 & 0.0000 & 0.179933 & 0.046376 & 0.035 & 0.040 & 0.027 & 0.83 & ⋯ & 1.23 & 0.332 & 0.451 & 0.219 & 27.9 & 27.2 & 0.0636 & 0.0721 & 20.379 & 0.018019\\\\\n\t 0.03809502 & 1 & 9.030974 & 0.0000 & 0.293138 & 0.005517 & 0.245 & 0.251 & 0.238 & 1.00 & ⋯ & 7.50 & 1.167 & 1.210 & 1.128 & 22.5 & 15.5 & 0.1662 & 0.1617 & 4.286 & -0.006642\\\\\n\t 0.03421300 & 1 & 8.995537 & 0.0000 & 0.304720 & 0.011658 & 0.246 & 0.260 & 0.190 & 1.00 & ⋯ & 6.75 & 0.667 & 0.776 & 0.575 & 23.5 & 15.0 & 0.2597 & 0.2288 & 2.460 & -0.003241\\\\\n\t 0.05275914 & 1 & 8.234830 & 0.0363 & 
0.288405 & 0.011589 & 0.183 & 0.222 & 0.142 & 1.00 & ⋯ & 8.18 & 1.010 & 1.220 & 0.821 & 30.2 & 28.3 & 0.1044 & 0.1796 & 32.051 & -0.034352\\\\\n\t 0.03841564 & 1 & 8.332549 & 0.0000 & 0.345485 & 0.006503 & 0.188 & 0.248 & 0.136 & 1.00 & ⋯ & 13.12 & 1.576 & 1.567 & 1.585 & 31.0 & 14.3 & 0.2866 & 0.3500 & 0.452 & -0.001660\\\\\n\t 0.03189479 & 1 & 8.645586 & 0.0000 & 0.288440 & 0.005995 & 0.256 & 0.301 & 0.199 & 1.00 & ⋯ & 6.91 & 1.307 & 1.579 & 1.062 & 18.9 & 11.3 & 0.1296 & 0.1458 & 652.850 & -0.046278\\\\\n\t 0.03119598 & 1 & 8.991064 & 0.0000 & 0.371898 & 0.014586 & 0.255 & 0.336 & 0.170 & 0.98 & ⋯ & 11.41 & 2.226 & 2.494 & 1.971 & 27.5 & 15.9 & 0.4407 & 0.4257 & 2.529 & -0.011883\\\\\n\t 0.03409566 & 1 & 8.025189 & 0.0050 & 0.296437 & 0.013615 & 0.108 & 0.117 & 0.093 & 1.00 & ⋯ & 1.95 & 0.510 & 0.694 & 0.362 & 20.2 & 15.7 & 0.1669 & 0.2201 & 25.553 & -0.039080\\\\\n\t 0.04690046 & 1 & 9.030137 & 0.0000 & 0.265778 & 0.008629 & 0.288 & 0.337 & 0.237 & 1.00 & ⋯ & 25.64 & 2.727 & 2.664 & 2.788 & 20.4 & 9.4 & 0.3238 & 0.3134 & 4.152 & 0.005175\\\\\n\t 0.03977337 & 1 & 8.865312 & 0.0000 & 0.282939 & 0.005048 & 0.188 & 0.236 & 0.139 & 1.00 & ⋯ & 10.76 & 1.888 & 1.920 & 1.860 & 20.0 & 16.0 & 0.1845 & 0.1940 & 0.452 & -0.029551\\\\\n\t 0.04064154 & 1 & 8.912339 & 0.0000 & 0.150366 & 0.024377 & 0.257 & 0.338 & 0.215 & 1.00 & ⋯ & 24.40 & 3.051 & 3.235 & 2.875 & 18.5 & 29.1 & 0.1876 & 0.2007 & 0.886 & -0.036482\\\\\n\\end{tabular}\n", + "text/latex": [ + "A data.frame: 90 × 63\n", + "\\begin{tabular}{lllllllllllllllllllll}\n", + " Outcome & intercept & gdpsh465 & bmp1l & freeop & freetar & h65 & hm65 & hf65 & p65 & ⋯ & seccf65 & syr65 & syrm65 & syrf65 & teapri65 & teasec65 & ex1 & im1 & xr65 & tot1\\\\\n", + " & & & & & & & & & & ⋯ & & & & & & & & & & \\\\\n", + "\\hline\n", + "\t -0.02433575 & 1 & 6.591674 & 0.2837 & 0.153491 & 0.043888 & 0.007 & 0.013 & 0.001 & 0.29 & ⋯ & 0.04 & 0.033 & 0.057 & 0.010 & 47.6 & 17.3 & 0.0729 & 0.0667 & 0.348 & -0.014727\\\\\n", + "\t 0.10047257 & 1 & 6.829794 & 0.6141 & 0.313509 & 0.061827 & 0.019 & 0.032 & 0.007 & 0.91 & ⋯ & 0.64 & 0.173 & 0.274 & 0.067 & 57.1 & 18.0 & 0.0940 & 0.1438 & 0.525 & 0.005750\\\\\n", + "\t 0.06705148 & 1 & 8.895082 & 0.0000 & 0.204244 & 0.009186 & 0.260 & 0.325 & 0.201 & 1.00 & ⋯ & 18.14 & 2.573 & 2.478 & 2.667 & 26.5 & 20.7 & 0.1741 & 0.1750 & 1.082 & -0.010040\\\\\n", + "\t 0.06408917 & 1 & 7.565275 & 0.1997 & 0.248714 & 0.036270 & 0.061 & 0.070 & 0.051 & 1.00 & ⋯ & 2.63 & 0.438 & 0.453 & 0.424 & 27.8 & 22.7 & 0.1265 & 0.1496 & 6.625 & -0.002195\\\\\n", + "\t 0.02792955 & 1 & 7.162397 & 0.1740 & 0.299252 & 0.037367 & 0.017 & 0.027 & 0.007 & 0.82 & ⋯ & 2.11 & 0.257 & 0.287 & 0.229 & 34.5 & 17.6 & 0.1211 & 0.1308 & 2.500 & 0.003283\\\\\n", + "\t 0.04640744 & 1 & 7.218910 & 0.0000 & 0.258865 & 0.020880 & 0.023 & 0.038 & 0.006 & 0.50 & ⋯ & 1.46 & 0.160 & 0.174 & 0.146 & 34.3 & 8.1 & 0.0634 & 0.0762 & 1.000 & -0.001747\\\\\n", + "\t 0.06733234 & 1 & 7.853605 & 0.0000 & 0.182525 & 0.014385 & 0.039 & 0.063 & 0.014 & 0.92 & ⋯ & 1.59 & 0.342 & 0.484 & 0.207 & 46.6 & 14.7 & 0.0342 & 0.0428 & 12.499 & 0.009092\\\\\n", + "\t 0.02097768 & 1 & 7.703910 & 0.2776 & 0.215275 & 0.029713 & 0.024 & 0.035 & 0.013 & 0.69 & ⋯ & 1.63 & 0.184 & 0.219 & 0.152 & 34.0 & 16.1 & 0.0864 & 0.0931 & 7.000 & 0.011630\\\\\n", + "\t 0.03355124 & 1 & 9.063463 & 0.0000 & 0.109614 & 0.002171 & 0.402 & 0.488 & 0.314 & 1.00 & ⋯ & 24.72 & 3.206 & 3.154 & 3.253 & 28.2 & 20.6 & 0.0594 & 0.0460 & 1.000 & 0.008169\\\\\n", + "\t 0.03914652 & 1 & 8.151910 & 0.1484 & 0.110885 & 
0.028579 & 0.145 & 0.173 & 0.114 & 1.00 & ⋯ & 6.76 & 0.703 & 0.785 & 0.620 & 20.3 & 7.2 & 0.0524 & 0.0523 & 2.119 & 0.007584\\\\\n", + "\t 0.07612651 & 1 & 6.929517 & 0.0296 & 0.165784 & 0.020115 & 0.046 & 0.066 & 0.025 & 0.73 & ⋯ & 6.21 & 1.316 & 1.683 & 0.969 & 27.8 & 17.2 & 0.0560 & 0.0826 & 11.879 & 0.086032\\\\\n", + "\t 0.12795121 & 1 & 7.237778 & 0.2151 & 0.078488 & 0.011581 & 0.022 & 0.031 & 0.014 & 1.00 & ⋯ & 3.96 & 0.594 & 0.674 & 0.515 & 28.2 & 14.8 & 0.0270 & 0.0275 & 1.938 & 0.007666\\\\\n", + "\t -0.02432609 & 1 & 8.115820 & 0.4318 & 0.137482 & 0.026547 & 0.059 & 0.073 & 0.045 & 1.00 & ⋯ & 11.36 & 1.132 & 1.126 & 1.138 & 52.1 & 18.8 & 0.0804 & 0.0930 & 0.003 & 0.016968\\\\\n", + "\t 0.07829342 & 1 & 7.271704 & 0.1689 & 0.164598 & 0.044446 & 0.029 & 0.045 & 0.013 & 0.84 & ⋯ & 3.10 & 0.568 & 0.695 & 0.450 & 35.9 & 13.1 & 0.0617 & 0.0678 & 10.479 & 0.004573\\\\\n", + "\t 0.11291155 & 1 & 7.121252 & 0.1832 & 0.188016 & 0.045678 & 0.033 & 0.051 & 0.015 & 0.91 & ⋯ & 3.16 & 0.440 & 0.512 & 0.369 & 37.4 & 12.7 & 0.0775 & 0.0780 & 18.476 & -0.020322\\\\\n", + "\t 0.05230819 & 1 & 6.977281 & 0.0962 & 0.204611 & 0.077852 & 0.037 & 0.043 & 0.030 & 1.00 & ⋯ & 2.40 & 0.419 & 0.548 & 0.299 & 30.3 & 7.9 & 0.0668 & 0.0787 & 125.990 & 0.028916\\\\\n", + "\t 0.03639089 & 1 & 7.649693 & 0.0227 & 0.136287 & 0.046730 & 0.081 & 0.105 & 0.056 & 0.99 & ⋯ & 3.51 & 0.562 & 0.699 & 0.427 & 35.7 & 14.7 & 0.0872 & 0.0938 & 26.800 & 0.020228\\\\\n", + "\t 0.02973823 & 1 & 8.056744 & 0.0208 & 0.197853 & 0.037224 & 0.083 & 0.097 & 0.069 & 1.00 & ⋯ & 3.30 & 0.722 & 0.765 & 0.680 & 36.6 & 12.6 & 0.0557 & 0.0624 & 0.052 & 0.013407\\\\\n", + "\t -0.05664358 & 1 & 8.780941 & 0.2654 & 0.189867 & 0.031747 & 0.068 & 0.089 & 0.046 & 0.94 & ⋯ & 2.99 & 0.372 & 0.462 & 0.281 & 34.0 & 20.3 & 0.3178 & 0.1583 & 4.500 & -0.024761\\\\\n", + "\t 0.01920480 & 1 & 6.287859 & 0.4207 & 0.130682 & 0.109921 & 0.053 & 0.039 & 0.011 & 0.74 & ⋯ & 0.34 & 0.142 & 0.223 & 0.055 & 35.5 & 19.1 & 0.0201 & 0.0341 & 4.762 & -0.021656\\\\\n", + "\t 0.08520600 & 1 & 6.137727 & 0.1371 & 0.123818 & 0.015897 & 0.028 & 0.025 & 0.007 & 0.72 & ⋯ & 0.56 & 0.148 & 0.232 & 0.065 & 41.3 & 21.3 & 0.0298 & 0.0297 & 4.125 & -0.054872\\\\\n", + "\t 0.13398221 & 1 & 8.128880 & 0.0000 & 0.167210 & 0.003311 & 0.129 & 0.196 & 0.063 & 1.00 & ⋯ & 13.16 & 1.727 & 1.910 & 1.560 & 28.1 & 23.2 & 0.0570 & 0.0609 & 360.000 & -0.054874\\\\\n", + "\t 0.17302474 & 1 & 6.680855 & 0.4713 & 0.228424 & 0.029328 & 0.062 & 0.090 & 0.032 & 1.00 & ⋯ & 3.95 & 0.974 & 1.526 & 0.470 & 62.4 & 34.9 & 0.0206 & 0.0618 & 265.690 & 0.018194\\\\\n", + "\t 0.10969915 & 1 & 7.177019 & 0.0178 & 0.185240 & 0.015453 & 0.020 & 0.026 & 0.013 & 0.90 & ⋯ & 1.89 & 0.571 & 0.843 & 0.286 & 26.9 & 24.1 & 0.2295 & 0.1990 & 3.061 & -0.034733\\\\\n", + "\t 0.01598990 & 1 & 6.648985 & 0.4762 & 0.171181 & 0.058937 & 0.018 & 0.028 & 0.007 & 0.40 & ⋯ & 0.76 & 0.357 & 0.512 & 0.185 & 39.9 & 26.9 & 0.0178 & 0.0634 & 4.762 & -0.000222\\\\\n", + "\t 0.06224977 & 1 & 6.879356 & 0.2927 & 0.179508 & 0.035842 & 0.188 & 0.169 & 0.208 & 1.00 & ⋯ & 3.69 & 0.651 & 0.759 & 0.547 & 31.4 & 31.2 & 0.0695 & 0.0728 & 4.017 & 0.033636\\\\\n", + "\t 0.10987069 & 1 & 7.347300 & 0.1017 & 0.247626 & 0.037392 & 0.080 & 0.133 & 0.027 & 0.78 & ⋯ & 0.72 & 0.195 & 0.303 & 0.085 & 36.2 & 21.6 & 0.0860 & 0.0898 & 3.177 & 0.010162\\\\\n", + "\t 0.09210628 & 1 & 6.725034 & 0.0266 & 0.179933 & 0.046376 & 0.015 & 0.020 & 0.010 & 0.78 & ⋯ & 0.86 & 0.258 & 0.382 & 0.137 & 34.6 & 16.5 & 0.0558 & 0.0613 & 20.800 & -0.018514\\\\\n", + "\t 
0.08337604 & 1 & 8.451053 & 0.0000 & 0.358556 & 0.016468 & 0.090 & 0.133 & 0.044 & 1.00 & ⋯ & 2.91 & 0.766 & 1.087 & 0.510 & 21.9 & 15.3 & 0.1687 & 0.1635 & 26.000 & 0.010943\\\\\n", + "\t 0.07623345 & 1 & 8.602453 & 0.0000 & 0.416234 & 0.014721 & 0.148 & 0.194 & 0.100 & 1.00 & ⋯ & 12.17 & 1.554 & 1.724 & 1.398 & 20.6 & 7.2 & 0.2629 & 0.2698 & 50.000 & -0.001521\\\\\n", + "\t ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋱ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮\\\\\n", + "\t -0.03404539 & 1 & 8.346168 & 0.3199 & 0.110885 & 0.028579 & 0.272 & 0.289 & 0.272 & 1.00 & ⋯ & 9.58 & 0.919 & 0.936 & 0.902 & 18.2 & 7.7 & 0.0625 & 0.0578 & 36.603 & 0.014286\\\\\n", + "\t -0.03380635 & 1 & 7.303170 & 0.3133 & 0.165784 & 0.020115 & 0.112 & 0.132 & 0.065 & 0.85 & ⋯ & 5.60 & 1.158 & 1.473 & 0.862 & 22.7 & 18.2 & 0.1071 & 0.1028 & 20.000 & 0.111198\\\\\n", + "\t 0.06991488 & 1 & 7.859027 & 0.1222 & 0.078488 & 0.011581 & 0.107 & 0.103 & 0.092 & 0.88 & ⋯ & 2.80 & 0.596 & 0.645 & 0.548 & 21.7 & 14.4 & 0.0357 & 0.0466 & 8.127 & 0.006002\\\\\n", + "\t -0.08172560 & 1 & 7.998335 & 1.6378 & 0.137482 & 0.026547 & 0.156 & 0.181 & 0.150 & 1.00 & ⋯ & 13.74 & 1.339 & 1.222 & 1.445 & 34.8 & 15.2 & 0.0783 & 0.0847 & 4.911 & -0.127025\\\\\n", + "\t 0.04601005 & 1 & 7.655864 & 0.1345 & 0.164598 & 0.044446 & 0.080 & 0.097 & 0.058 & 1.00 & ⋯ & 8.25 & 1.076 & 1.143 & 1.013 & 32.1 & 19.4 & 0.0525 & 0.0572 & 30.929 & -0.004592\\\\\n", + "\t 0.06659809 & 1 & 7.675082 & 0.0898 & 0.188016 & 0.045678 & 0.269 & 0.338 & 0.200 & 1.00 & ⋯ & 5.80 & 0.687 & 0.745 & 0.630 & 37.5 & 16.4 & 0.0906 & 0.0959 & 25.000 & 0.191066\\\\\n", + "\t -0.01138424 & 1 & 7.830028 & 0.4880 & 0.136287 & 0.046730 & 0.146 & 0.193 & 0.094 & 1.00 & ⋯ & 6.42 & 0.950 & 1.129 & 0.772 & 39.1 & 23.8 & 0.0764 & 0.0866 & 40.500 & -0.007018\\\\\n", + "\t -0.10098990 & 1 & 8.498622 & 0.0010 & 0.189867 & 0.031747 & 0.181 & 0.190 & 0.159 & 0.97 & ⋯ & 7.63 & 0.801 & 0.850 & 0.752 & 30.2 & 16.8 & 0.2131 & 0.1437 & 4.285 & 0.168536\\\\\n", + "\t 0.05475087 & 1 & 6.216606 & 0.7557 & 0.214345 & 0.073495 & 0.023 & 0.051 & 0.006 & 0.73 & ⋯ & 0.44 & 0.282 & 0.488 & 0.051 & 50.6 & 21.8 & 0.0232 & 0.0407 & 8.876 & -0.084064\\\\\n", + "\t 0.09461817 & 1 & 8.414496 & 0.0000 & 0.374328 & 0.000000 & 0.101 & 0.147 & 0.053 & 1.00 & ⋯ & 11.80 & 1.846 & 2.369 & 1.301 & 31.1 & 24.3 & 0.5958 & 0.5819 & 4.935 & 0.021808\\\\\n", + "\t 0.04571529 & 1 & 6.383507 & 0.3556 & 0.130682 & 0.109921 & 0.086 & 0.130 & 0.042 & 0.79 & ⋯ & 1.00 & 0.446 & 0.713 & 0.163 & 42.3 & 19.8 & 0.0188 & 0.0222 & 8.653 & -0.012443\\\\\n", + "\t 0.06549111 & 1 & 8.782323 & 0.0000 & 0.167210 & 0.003311 & 0.246 & 0.331 & 0.160 & 0.99 & ⋯ & 15.52 & 1.969 & 2.121 & 1.828 & 25.3 & 17.5 & 0.1032 & 0.0958 & 296.800 & -0.057094\\\\\n", + "\t 0.02124651 & 1 & 7.251345 & 0.0516 & 0.263813 & 0.045225 & 0.090 & 0.053 & 0.030 & 0.88 & ⋯ & 4.00 & 0.817 & 1.205 & 0.413 & 34.7 & 21.1 & 0.0730 & 0.2227 & 0.320 & 0.128443\\\\\n", + "\t 0.14144548 & 1 & 7.511525 & 0.1053 & 0.228424 & 0.029328 & 0.103 & 0.139 & 0.054 & 1.00 & ⋯ & 9.33 & 1.700 & 2.369 & 1.060 & 51.7 & 37.1 & 0.0903 & 0.1229 & 484.000 & 0.007257\\\\\n", + "\t 0.09681623 & 1 & 7.713785 & 0.0050 & 0.185240 & 0.015453 & 0.031 & 0.042 & 0.016 & 0.91 & ⋯ & 4.35 & 0.891 & 1.255 & 0.517 & 31.8 & 27.3 & 0.1922 & 0.1821 & 2.402 & 0.030424\\\\\n", + "\t 0.04053420 & 1 & 6.728629 & 0.6190 & 0.171181 & 0.058937 & 0.019 & 0.027 & 0.009 & 0.46 & ⋯ & 1.01 & 0.670 & 1.039 & 0.271 & 40.1 & 18.0 & 0.0281 & 0.0459 & 9.900 & -0.012137\\\\\n", + "\t 0.01058841 & 1 & 7.186144 & 0.0760 & 0.179508 & 
0.035842 & 0.184 & 0.173 & 0.217 & 1.00 & ⋯ & 5.34 & 0.943 & 1.049 & 0.837 & 29.0 & 31.5 & 0.0703 & 0.0716 & 7.248 & 0.009640\\\\\n", + "\t 0.18552649 & 1 & 8.326033 & 0.0050 & 0.321658 & 0.005106 & 0.090 & 0.109 & 0.075 & 1.00 & ⋯ & 4.64 & 1.127 & 1.427 & 0.817 & 30.5 & 23.1 & 0.7470 & 0.8489 & 2.371 & 0.051395\\\\\n", + "\t 0.09310491 & 1 & 7.894691 & 0.1062 & 0.247626 & 0.037392 & 0.121 & 0.175 & 0.063 & 0.96 & ⋯ & 1.47 & 0.481 & 0.761 & 0.200 & 33.8 & 19.6 & 0.0797 & 0.1018 & 3.017 & 0.207492\\\\\n", + "\t 0.06522856 & 1 & 7.175490 & 0.0000 & 0.179933 & 0.046376 & 0.035 & 0.040 & 0.027 & 0.83 & ⋯ & 1.23 & 0.332 & 0.451 & 0.219 & 27.9 & 27.2 & 0.0636 & 0.0721 & 20.379 & 0.018019\\\\\n", + "\t 0.03809502 & 1 & 9.030974 & 0.0000 & 0.293138 & 0.005517 & 0.245 & 0.251 & 0.238 & 1.00 & ⋯ & 7.50 & 1.167 & 1.210 & 1.128 & 22.5 & 15.5 & 0.1662 & 0.1617 & 4.286 & -0.006642\\\\\n", + "\t 0.03421300 & 1 & 8.995537 & 0.0000 & 0.304720 & 0.011658 & 0.246 & 0.260 & 0.190 & 1.00 & ⋯ & 6.75 & 0.667 & 0.776 & 0.575 & 23.5 & 15.0 & 0.2597 & 0.2288 & 2.460 & -0.003241\\\\\n", + "\t 0.05275914 & 1 & 8.234830 & 0.0363 & 0.288405 & 0.011589 & 0.183 & 0.222 & 0.142 & 1.00 & ⋯ & 8.18 & 1.010 & 1.220 & 0.821 & 30.2 & 28.3 & 0.1044 & 0.1796 & 32.051 & -0.034352\\\\\n", + "\t 0.03841564 & 1 & 8.332549 & 0.0000 & 0.345485 & 0.006503 & 0.188 & 0.248 & 0.136 & 1.00 & ⋯ & 13.12 & 1.576 & 1.567 & 1.585 & 31.0 & 14.3 & 0.2866 & 0.3500 & 0.452 & -0.001660\\\\\n", + "\t 0.03189479 & 1 & 8.645586 & 0.0000 & 0.288440 & 0.005995 & 0.256 & 0.301 & 0.199 & 1.00 & ⋯ & 6.91 & 1.307 & 1.579 & 1.062 & 18.9 & 11.3 & 0.1296 & 0.1458 & 652.850 & -0.046278\\\\\n", + "\t 0.03119598 & 1 & 8.991064 & 0.0000 & 0.371898 & 0.014586 & 0.255 & 0.336 & 0.170 & 0.98 & ⋯ & 11.41 & 2.226 & 2.494 & 1.971 & 27.5 & 15.9 & 0.4407 & 0.4257 & 2.529 & -0.011883\\\\\n", + "\t 0.03409566 & 1 & 8.025189 & 0.0050 & 0.296437 & 0.013615 & 0.108 & 0.117 & 0.093 & 1.00 & ⋯ & 1.95 & 0.510 & 0.694 & 0.362 & 20.2 & 15.7 & 0.1669 & 0.2201 & 25.553 & -0.039080\\\\\n", + "\t 0.04690046 & 1 & 9.030137 & 0.0000 & 0.265778 & 0.008629 & 0.288 & 0.337 & 0.237 & 1.00 & ⋯ & 25.64 & 2.727 & 2.664 & 2.788 & 20.4 & 9.4 & 0.3238 & 0.3134 & 4.152 & 0.005175\\\\\n", + "\t 0.03977337 & 1 & 8.865312 & 0.0000 & 0.282939 & 0.005048 & 0.188 & 0.236 & 0.139 & 1.00 & ⋯ & 10.76 & 1.888 & 1.920 & 1.860 & 20.0 & 16.0 & 0.1845 & 0.1940 & 0.452 & -0.029551\\\\\n", + "\t 0.04064154 & 1 & 8.912339 & 0.0000 & 0.150366 & 0.024377 & 0.257 & 0.338 & 0.215 & 1.00 & ⋯ & 24.40 & 3.051 & 3.235 & 2.875 & 18.5 & 29.1 & 0.1876 & 0.2007 & 0.886 & -0.036482\\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A data.frame: 90 × 63\n", + "\n", + "| Outcome <dbl> | intercept <int> | gdpsh465 <dbl> | bmp1l <dbl> | freeop <dbl> | freetar <dbl> | h65 <dbl> | hm65 <dbl> | hf65 <dbl> | p65 <dbl> | ⋯ ⋯ | seccf65 <dbl> | syr65 <dbl> | syrm65 <dbl> | syrf65 <dbl> | teapri65 <dbl> | teasec65 <dbl> | ex1 <dbl> | im1 <dbl> | xr65 <dbl> | tot1 <dbl> |\n", + "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", + "| -0.02433575 | 1 | 6.591674 | 0.2837 | 0.153491 | 0.043888 | 0.007 | 0.013 | 0.001 | 0.29 | ⋯ | 0.04 | 0.033 | 0.057 | 0.010 | 47.6 | 17.3 | 0.0729 | 0.0667 | 0.348 | -0.014727 |\n", + "| 0.10047257 | 1 | 6.829794 | 0.6141 | 0.313509 | 0.061827 | 0.019 | 0.032 | 0.007 | 0.91 | ⋯ | 0.64 | 0.173 | 0.274 | 0.067 | 57.1 | 18.0 | 0.0940 | 0.1438 | 0.525 | 0.005750 |\n", + "| 0.06705148 | 1 | 8.895082 | 0.0000 | 0.204244 | 0.009186 | 0.260 | 0.325 | 0.201 | 1.00 | ⋯ 
| 18.14 | 2.573 | 2.478 | 2.667 | 26.5 | 20.7 | 0.1741 | 0.1750 | 1.082 | -0.010040 |\n", + "| 0.06408917 | 1 | 7.565275 | 0.1997 | 0.248714 | 0.036270 | 0.061 | 0.070 | 0.051 | 1.00 | ⋯ | 2.63 | 0.438 | 0.453 | 0.424 | 27.8 | 22.7 | 0.1265 | 0.1496 | 6.625 | -0.002195 |\n", + "| 0.02792955 | 1 | 7.162397 | 0.1740 | 0.299252 | 0.037367 | 0.017 | 0.027 | 0.007 | 0.82 | ⋯ | 2.11 | 0.257 | 0.287 | 0.229 | 34.5 | 17.6 | 0.1211 | 0.1308 | 2.500 | 0.003283 |\n", + "| 0.04640744 | 1 | 7.218910 | 0.0000 | 0.258865 | 0.020880 | 0.023 | 0.038 | 0.006 | 0.50 | ⋯ | 1.46 | 0.160 | 0.174 | 0.146 | 34.3 | 8.1 | 0.0634 | 0.0762 | 1.000 | -0.001747 |\n", + "| 0.06733234 | 1 | 7.853605 | 0.0000 | 0.182525 | 0.014385 | 0.039 | 0.063 | 0.014 | 0.92 | ⋯ | 1.59 | 0.342 | 0.484 | 0.207 | 46.6 | 14.7 | 0.0342 | 0.0428 | 12.499 | 0.009092 |\n", + "| 0.02097768 | 1 | 7.703910 | 0.2776 | 0.215275 | 0.029713 | 0.024 | 0.035 | 0.013 | 0.69 | ⋯ | 1.63 | 0.184 | 0.219 | 0.152 | 34.0 | 16.1 | 0.0864 | 0.0931 | 7.000 | 0.011630 |\n", + "| 0.03355124 | 1 | 9.063463 | 0.0000 | 0.109614 | 0.002171 | 0.402 | 0.488 | 0.314 | 1.00 | ⋯ | 24.72 | 3.206 | 3.154 | 3.253 | 28.2 | 20.6 | 0.0594 | 0.0460 | 1.000 | 0.008169 |\n", + "| 0.03914652 | 1 | 8.151910 | 0.1484 | 0.110885 | 0.028579 | 0.145 | 0.173 | 0.114 | 1.00 | ⋯ | 6.76 | 0.703 | 0.785 | 0.620 | 20.3 | 7.2 | 0.0524 | 0.0523 | 2.119 | 0.007584 |\n", + "| 0.07612651 | 1 | 6.929517 | 0.0296 | 0.165784 | 0.020115 | 0.046 | 0.066 | 0.025 | 0.73 | ⋯ | 6.21 | 1.316 | 1.683 | 0.969 | 27.8 | 17.2 | 0.0560 | 0.0826 | 11.879 | 0.086032 |\n", + "| 0.12795121 | 1 | 7.237778 | 0.2151 | 0.078488 | 0.011581 | 0.022 | 0.031 | 0.014 | 1.00 | ⋯ | 3.96 | 0.594 | 0.674 | 0.515 | 28.2 | 14.8 | 0.0270 | 0.0275 | 1.938 | 0.007666 |\n", + "| -0.02432609 | 1 | 8.115820 | 0.4318 | 0.137482 | 0.026547 | 0.059 | 0.073 | 0.045 | 1.00 | ⋯ | 11.36 | 1.132 | 1.126 | 1.138 | 52.1 | 18.8 | 0.0804 | 0.0930 | 0.003 | 0.016968 |\n", + "| 0.07829342 | 1 | 7.271704 | 0.1689 | 0.164598 | 0.044446 | 0.029 | 0.045 | 0.013 | 0.84 | ⋯ | 3.10 | 0.568 | 0.695 | 0.450 | 35.9 | 13.1 | 0.0617 | 0.0678 | 10.479 | 0.004573 |\n", + "| 0.11291155 | 1 | 7.121252 | 0.1832 | 0.188016 | 0.045678 | 0.033 | 0.051 | 0.015 | 0.91 | ⋯ | 3.16 | 0.440 | 0.512 | 0.369 | 37.4 | 12.7 | 0.0775 | 0.0780 | 18.476 | -0.020322 |\n", + "| 0.05230819 | 1 | 6.977281 | 0.0962 | 0.204611 | 0.077852 | 0.037 | 0.043 | 0.030 | 1.00 | ⋯ | 2.40 | 0.419 | 0.548 | 0.299 | 30.3 | 7.9 | 0.0668 | 0.0787 | 125.990 | 0.028916 |\n", + "| 0.03639089 | 1 | 7.649693 | 0.0227 | 0.136287 | 0.046730 | 0.081 | 0.105 | 0.056 | 0.99 | ⋯ | 3.51 | 0.562 | 0.699 | 0.427 | 35.7 | 14.7 | 0.0872 | 0.0938 | 26.800 | 0.020228 |\n", + "| 0.02973823 | 1 | 8.056744 | 0.0208 | 0.197853 | 0.037224 | 0.083 | 0.097 | 0.069 | 1.00 | ⋯ | 3.30 | 0.722 | 0.765 | 0.680 | 36.6 | 12.6 | 0.0557 | 0.0624 | 0.052 | 0.013407 |\n", + "| -0.05664358 | 1 | 8.780941 | 0.2654 | 0.189867 | 0.031747 | 0.068 | 0.089 | 0.046 | 0.94 | ⋯ | 2.99 | 0.372 | 0.462 | 0.281 | 34.0 | 20.3 | 0.3178 | 0.1583 | 4.500 | -0.024761 |\n", + "| 0.01920480 | 1 | 6.287859 | 0.4207 | 0.130682 | 0.109921 | 0.053 | 0.039 | 0.011 | 0.74 | ⋯ | 0.34 | 0.142 | 0.223 | 0.055 | 35.5 | 19.1 | 0.0201 | 0.0341 | 4.762 | -0.021656 |\n", + "| 0.08520600 | 1 | 6.137727 | 0.1371 | 0.123818 | 0.015897 | 0.028 | 0.025 | 0.007 | 0.72 | ⋯ | 0.56 | 0.148 | 0.232 | 0.065 | 41.3 | 21.3 | 0.0298 | 0.0297 | 4.125 | -0.054872 |\n", + "| 0.13398221 | 1 | 8.128880 | 0.0000 | 0.167210 | 0.003311 | 0.129 | 0.196 | 0.063 | 1.00 | ⋯ | 13.16 | 1.727 | 1.910 
| 1.560 | 28.1 | 23.2 | 0.0570 | 0.0609 | 360.000 | -0.054874 |\n", + "| 0.17302474 | 1 | 6.680855 | 0.4713 | 0.228424 | 0.029328 | 0.062 | 0.090 | 0.032 | 1.00 | ⋯ | 3.95 | 0.974 | 1.526 | 0.470 | 62.4 | 34.9 | 0.0206 | 0.0618 | 265.690 | 0.018194 |\n", + "| 0.10969915 | 1 | 7.177019 | 0.0178 | 0.185240 | 0.015453 | 0.020 | 0.026 | 0.013 | 0.90 | ⋯ | 1.89 | 0.571 | 0.843 | 0.286 | 26.9 | 24.1 | 0.2295 | 0.1990 | 3.061 | -0.034733 |\n", + "| 0.01598990 | 1 | 6.648985 | 0.4762 | 0.171181 | 0.058937 | 0.018 | 0.028 | 0.007 | 0.40 | ⋯ | 0.76 | 0.357 | 0.512 | 0.185 | 39.9 | 26.9 | 0.0178 | 0.0634 | 4.762 | -0.000222 |\n", + "| 0.06224977 | 1 | 6.879356 | 0.2927 | 0.179508 | 0.035842 | 0.188 | 0.169 | 0.208 | 1.00 | ⋯ | 3.69 | 0.651 | 0.759 | 0.547 | 31.4 | 31.2 | 0.0695 | 0.0728 | 4.017 | 0.033636 |\n", + "| 0.10987069 | 1 | 7.347300 | 0.1017 | 0.247626 | 0.037392 | 0.080 | 0.133 | 0.027 | 0.78 | ⋯ | 0.72 | 0.195 | 0.303 | 0.085 | 36.2 | 21.6 | 0.0860 | 0.0898 | 3.177 | 0.010162 |\n", + "| 0.09210628 | 1 | 6.725034 | 0.0266 | 0.179933 | 0.046376 | 0.015 | 0.020 | 0.010 | 0.78 | ⋯ | 0.86 | 0.258 | 0.382 | 0.137 | 34.6 | 16.5 | 0.0558 | 0.0613 | 20.800 | -0.018514 |\n", + "| 0.08337604 | 1 | 8.451053 | 0.0000 | 0.358556 | 0.016468 | 0.090 | 0.133 | 0.044 | 1.00 | ⋯ | 2.91 | 0.766 | 1.087 | 0.510 | 21.9 | 15.3 | 0.1687 | 0.1635 | 26.000 | 0.010943 |\n", + "| 0.07623345 | 1 | 8.602453 | 0.0000 | 0.416234 | 0.014721 | 0.148 | 0.194 | 0.100 | 1.00 | ⋯ | 12.17 | 1.554 | 1.724 | 1.398 | 20.6 | 7.2 | 0.2629 | 0.2698 | 50.000 | -0.001521 |\n", + "| ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋱ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |\n", + "| -0.03404539 | 1 | 8.346168 | 0.3199 | 0.110885 | 0.028579 | 0.272 | 0.289 | 0.272 | 1.00 | ⋯ | 9.58 | 0.919 | 0.936 | 0.902 | 18.2 | 7.7 | 0.0625 | 0.0578 | 36.603 | 0.014286 |\n", + "| -0.03380635 | 1 | 7.303170 | 0.3133 | 0.165784 | 0.020115 | 0.112 | 0.132 | 0.065 | 0.85 | ⋯ | 5.60 | 1.158 | 1.473 | 0.862 | 22.7 | 18.2 | 0.1071 | 0.1028 | 20.000 | 0.111198 |\n", + "| 0.06991488 | 1 | 7.859027 | 0.1222 | 0.078488 | 0.011581 | 0.107 | 0.103 | 0.092 | 0.88 | ⋯ | 2.80 | 0.596 | 0.645 | 0.548 | 21.7 | 14.4 | 0.0357 | 0.0466 | 8.127 | 0.006002 |\n", + "| -0.08172560 | 1 | 7.998335 | 1.6378 | 0.137482 | 0.026547 | 0.156 | 0.181 | 0.150 | 1.00 | ⋯ | 13.74 | 1.339 | 1.222 | 1.445 | 34.8 | 15.2 | 0.0783 | 0.0847 | 4.911 | -0.127025 |\n", + "| 0.04601005 | 1 | 7.655864 | 0.1345 | 0.164598 | 0.044446 | 0.080 | 0.097 | 0.058 | 1.00 | ⋯ | 8.25 | 1.076 | 1.143 | 1.013 | 32.1 | 19.4 | 0.0525 | 0.0572 | 30.929 | -0.004592 |\n", + "| 0.06659809 | 1 | 7.675082 | 0.0898 | 0.188016 | 0.045678 | 0.269 | 0.338 | 0.200 | 1.00 | ⋯ | 5.80 | 0.687 | 0.745 | 0.630 | 37.5 | 16.4 | 0.0906 | 0.0959 | 25.000 | 0.191066 |\n", + "| -0.01138424 | 1 | 7.830028 | 0.4880 | 0.136287 | 0.046730 | 0.146 | 0.193 | 0.094 | 1.00 | ⋯ | 6.42 | 0.950 | 1.129 | 0.772 | 39.1 | 23.8 | 0.0764 | 0.0866 | 40.500 | -0.007018 |\n", + "| -0.10098990 | 1 | 8.498622 | 0.0010 | 0.189867 | 0.031747 | 0.181 | 0.190 | 0.159 | 0.97 | ⋯ | 7.63 | 0.801 | 0.850 | 0.752 | 30.2 | 16.8 | 0.2131 | 0.1437 | 4.285 | 0.168536 |\n", + "| 0.05475087 | 1 | 6.216606 | 0.7557 | 0.214345 | 0.073495 | 0.023 | 0.051 | 0.006 | 0.73 | ⋯ | 0.44 | 0.282 | 0.488 | 0.051 | 50.6 | 21.8 | 0.0232 | 0.0407 | 8.876 | -0.084064 |\n", + "| 0.09461817 | 1 | 8.414496 | 0.0000 | 0.374328 | 0.000000 | 0.101 | 0.147 | 0.053 | 1.00 | ⋯ | 11.80 | 1.846 | 2.369 | 1.301 | 31.1 | 24.3 | 0.5958 | 0.5819 | 4.935 | 0.021808 |\n", + "| 0.04571529 | 1 | 6.383507 | 0.3556 
| 0.130682 | 0.109921 | 0.086 | 0.130 | 0.042 | 0.79 | ⋯ | 1.00 | 0.446 | 0.713 | 0.163 | 42.3 | 19.8 | 0.0188 | 0.0222 | 8.653 | -0.012443 |\n", + "| 0.06549111 | 1 | 8.782323 | 0.0000 | 0.167210 | 0.003311 | 0.246 | 0.331 | 0.160 | 0.99 | ⋯ | 15.52 | 1.969 | 2.121 | 1.828 | 25.3 | 17.5 | 0.1032 | 0.0958 | 296.800 | -0.057094 |\n", + "| 0.02124651 | 1 | 7.251345 | 0.0516 | 0.263813 | 0.045225 | 0.090 | 0.053 | 0.030 | 0.88 | ⋯ | 4.00 | 0.817 | 1.205 | 0.413 | 34.7 | 21.1 | 0.0730 | 0.2227 | 0.320 | 0.128443 |\n", + "| 0.14144548 | 1 | 7.511525 | 0.1053 | 0.228424 | 0.029328 | 0.103 | 0.139 | 0.054 | 1.00 | ⋯ | 9.33 | 1.700 | 2.369 | 1.060 | 51.7 | 37.1 | 0.0903 | 0.1229 | 484.000 | 0.007257 |\n", + "| 0.09681623 | 1 | 7.713785 | 0.0050 | 0.185240 | 0.015453 | 0.031 | 0.042 | 0.016 | 0.91 | ⋯ | 4.35 | 0.891 | 1.255 | 0.517 | 31.8 | 27.3 | 0.1922 | 0.1821 | 2.402 | 0.030424 |\n", + "| 0.04053420 | 1 | 6.728629 | 0.6190 | 0.171181 | 0.058937 | 0.019 | 0.027 | 0.009 | 0.46 | ⋯ | 1.01 | 0.670 | 1.039 | 0.271 | 40.1 | 18.0 | 0.0281 | 0.0459 | 9.900 | -0.012137 |\n", + "| 0.01058841 | 1 | 7.186144 | 0.0760 | 0.179508 | 0.035842 | 0.184 | 0.173 | 0.217 | 1.00 | ⋯ | 5.34 | 0.943 | 1.049 | 0.837 | 29.0 | 31.5 | 0.0703 | 0.0716 | 7.248 | 0.009640 |\n", + "| 0.18552649 | 1 | 8.326033 | 0.0050 | 0.321658 | 0.005106 | 0.090 | 0.109 | 0.075 | 1.00 | ⋯ | 4.64 | 1.127 | 1.427 | 0.817 | 30.5 | 23.1 | 0.7470 | 0.8489 | 2.371 | 0.051395 |\n", + "| 0.09310491 | 1 | 7.894691 | 0.1062 | 0.247626 | 0.037392 | 0.121 | 0.175 | 0.063 | 0.96 | ⋯ | 1.47 | 0.481 | 0.761 | 0.200 | 33.8 | 19.6 | 0.0797 | 0.1018 | 3.017 | 0.207492 |\n", + "| 0.06522856 | 1 | 7.175490 | 0.0000 | 0.179933 | 0.046376 | 0.035 | 0.040 | 0.027 | 0.83 | ⋯ | 1.23 | 0.332 | 0.451 | 0.219 | 27.9 | 27.2 | 0.0636 | 0.0721 | 20.379 | 0.018019 |\n", + "| 0.03809502 | 1 | 9.030974 | 0.0000 | 0.293138 | 0.005517 | 0.245 | 0.251 | 0.238 | 1.00 | ⋯ | 7.50 | 1.167 | 1.210 | 1.128 | 22.5 | 15.5 | 0.1662 | 0.1617 | 4.286 | -0.006642 |\n", + "| 0.03421300 | 1 | 8.995537 | 0.0000 | 0.304720 | 0.011658 | 0.246 | 0.260 | 0.190 | 1.00 | ⋯ | 6.75 | 0.667 | 0.776 | 0.575 | 23.5 | 15.0 | 0.2597 | 0.2288 | 2.460 | -0.003241 |\n", + "| 0.05275914 | 1 | 8.234830 | 0.0363 | 0.288405 | 0.011589 | 0.183 | 0.222 | 0.142 | 1.00 | ⋯ | 8.18 | 1.010 | 1.220 | 0.821 | 30.2 | 28.3 | 0.1044 | 0.1796 | 32.051 | -0.034352 |\n", + "| 0.03841564 | 1 | 8.332549 | 0.0000 | 0.345485 | 0.006503 | 0.188 | 0.248 | 0.136 | 1.00 | ⋯ | 13.12 | 1.576 | 1.567 | 1.585 | 31.0 | 14.3 | 0.2866 | 0.3500 | 0.452 | -0.001660 |\n", + "| 0.03189479 | 1 | 8.645586 | 0.0000 | 0.288440 | 0.005995 | 0.256 | 0.301 | 0.199 | 1.00 | ⋯ | 6.91 | 1.307 | 1.579 | 1.062 | 18.9 | 11.3 | 0.1296 | 0.1458 | 652.850 | -0.046278 |\n", + "| 0.03119598 | 1 | 8.991064 | 0.0000 | 0.371898 | 0.014586 | 0.255 | 0.336 | 0.170 | 0.98 | ⋯ | 11.41 | 2.226 | 2.494 | 1.971 | 27.5 | 15.9 | 0.4407 | 0.4257 | 2.529 | -0.011883 |\n", + "| 0.03409566 | 1 | 8.025189 | 0.0050 | 0.296437 | 0.013615 | 0.108 | 0.117 | 0.093 | 1.00 | ⋯ | 1.95 | 0.510 | 0.694 | 0.362 | 20.2 | 15.7 | 0.1669 | 0.2201 | 25.553 | -0.039080 |\n", + "| 0.04690046 | 1 | 9.030137 | 0.0000 | 0.265778 | 0.008629 | 0.288 | 0.337 | 0.237 | 1.00 | ⋯ | 25.64 | 2.727 | 2.664 | 2.788 | 20.4 | 9.4 | 0.3238 | 0.3134 | 4.152 | 0.005175 |\n", + "| 0.03977337 | 1 | 8.865312 | 0.0000 | 0.282939 | 0.005048 | 0.188 | 0.236 | 0.139 | 1.00 | ⋯ | 10.76 | 1.888 | 1.920 | 1.860 | 20.0 | 16.0 | 0.1845 | 0.1940 | 0.452 | -0.029551 |\n", + "| 0.04064154 | 1 | 8.912339 | 0.0000 | 0.150366 | 
0.024377 | 0.257 | 0.338 | 0.215 | 1.00 | ⋯ | 24.40 | 3.051 | 3.235 | 2.875 | 18.5 | 29.1 | 0.1876 | 0.2007 | 0.886 | -0.036482 |\n", + "\n" + ], "text/plain": [ " Outcome intercept gdpsh465 bmp1l freeop freetar h65 hm65 hf65 \n", "1 -0.02433575 1 6.591674 0.2837 0.153491 0.043888 0.007 0.013 0.001\n", @@ -447,85 +574,119 @@ "90 -0.036482" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" } + ], + "source": [ + "growth" ] }, { "cell_type": "markdown", + "metadata": { + "id": "-AMcbsgefhTg" + }, "source": [ "Thus $p \\approx 60$, $n=90$ and $p/n$ is not small. We expect the least squares method to provide a poor estimate of $\\beta_1$. We expect the method based on partialling-out with Lasso to provide a high quality estimate of $\\beta_1$.\n", "To check this hypothesis, we analyze the relation between the output variable $Y$ and the other country's characteristics by running a linear regression in the first step." - ], - "metadata": { - "id": "-AMcbsgefhTg" - } + ] }, { "cell_type": "code", - "source": [ - "## Create the outcome variable y and covariates X\n", - "y <- growth$Outcome\n", - "X <- growth[-which(colnames(growth) %in% c(\"intercept\"))]" - ], + "execution_count": 4, "metadata": { - "id": "DncWsRS9mgAp" + "id": "DncWsRS9mgAp", + "vscode": { + "languageId": "r" + } }, - "execution_count": 4, - "outputs": [] + "outputs": [], + "source": [ + "## Create the outcome variable y and covariates x\n", + "y <- growth$Outcome\n", + "x <- growth[-which(colnames(growth) %in% c(\"intercept\"))]" + ] }, { "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vPO08MjomqfZ", + "outputId": "afa89548-e9ef-4060-d5db-eb8e632e8e95", + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The estimated coefficient on gdpsh465 is -0.009377989 and the corresponding robust standard error is 0.032421195% Confidence Interval: [ -0.07292335 , 0.05416737 ]" + ] + } + ], "source": [ - "fit <- lm(Outcome ~ ., data=X)\n", - "est <- summary(fit)$coef[\"gdpsh465\",1]\n", + "fit <- lm(Outcome ~ ., data=x)\n", + "est <- summary(fit)$coef[\"gdpsh465\", 1]\n", "\n", - "HCV.coefs <- vcovHC(fit, type = 'HC1'); # HC - \"heteroskedasticity cosistent\"\n", + "HCV.coefs <- vcovHC(fit, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", "se <- sqrt(diag(HCV.coefs))[2] # Estimated std errors\n", "\n", "# print unconditional effect of gdpsh465 and the corresponding standard error\n", - "cat (\"The estimated coefficient on gdpsh465 is\",est,\" and the corresponding robust standard error is\",se)\n", + "cat (\"The estimated coefficient on gdpsh465 is\", est,\n", + " \" and the corresponding robust standard error is\", se)\n", "\n", "# Calculate the 95% confidence interval for 'gdpsh465'\n", "lower_ci <- est - 1.96 * se\n", "upper_ci <- est + 1.96 * se\n", "\n", "cat (\"95% Confidence Interval: [\", lower_ci, \",\", upper_ci, \"]\")" - ], + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D7nJZzhGfjQT" + }, + "source": [ + "## Summarize OLS results" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "metadata": { - "id": "vPO08MjomqfZ", - "outputId": "afa89548-e9ef-4060-d5db-eb8e632e8e95", "colab": { "base_uri": "https://localhost:8080/" + }, + "id": "EwGVcIVAfRe5", + "outputId": "87f41279-8907-415b-f8eb-589f736089b2", + "vscode": { + "languageId": "r" } }, - "execution_count": 6, "outputs": [ { - "output_type": "stream", "name": 
"stdout", + "output_type": "stream", "text": [ - "The estimated coefficient on gdpsh465 is -0.009377989 and the corresponding robust standard error is 0.032421195% Confidence Interval: [ -0.07292335 , 0.05416737 ]" + " Method Estimate Std. Error lower bound CI\n", + "1 OLS -0.00937798878257774 0.0324211014253974 -0.0729233475763566\n", + " upper bound CI\n", + "1 0.0541673700112012\n" ] } - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Summarize OLS results" ], - "metadata": { - "id": "D7nJZzhGfjQT" - } - }, - { - "cell_type": "code", "source": [ "# Create an empty data frame with column names\n", "table <- data.frame(\n", - " Method = character(0),\n", - " Estimate = character(0),\n", + " Method = character(0),\n", + " Estimate = character(0),\n", " `Std. Error` = numeric(0),\n", " `Lower Bound CI` = numeric(0),\n", " `Upper Bound CI` = numeric(0)\n", @@ -539,152 +700,137 @@ "\n", "# Print the table\n", "print(table)" - ], - "metadata": { - "id": "EwGVcIVAfRe5", - "outputId": "87f41279-8907-415b-f8eb-589f736089b2", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "execution_count": 7, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - " Method Estimate Std. Error lower bound CI\n", - "1 OLS -0.00937798878257774 0.0324211014253974 -0.0729233475763566\n", - " upper bound CI\n", - "1 0.0541673700112012\n" - ] - } ] }, { "cell_type": "markdown", + "metadata": { + "id": "KfrhJqKhfwKB" + }, "source": [ "Least squares provides a rather noisy estimate (high standard error) of the\n", "speed of convergence, and does not allow us to answer the question\n", "about the convergence hypothesis since the confidence interval includes zero.\n", "\n", "In contrast, we can use the partialling-out approach based on lasso regression (\"Double Lasso\")." - ], - "metadata": { - "id": "KfrhJqKhfwKB" - } + ] }, { "cell_type": "code", - "source": [ - "y <- growth$Outcome\n", - "W <- growth[-which(colnames(growth) %in% c('Outcome', 'intercept', 'gdpsh465'))]\n", - "D <- growth$gdpsh465" - ], + "execution_count": 8, "metadata": { - "id": "D9Y2U1Ldf1eB" + "id": "D9Y2U1Ldf1eB", + "vscode": { + "languageId": "r" + } }, - "execution_count": 8, - "outputs": [] + "outputs": [], + "source": [ + "y <- growth$Outcome\n", + "w <- growth[-which(colnames(growth) %in% c('Outcome', 'intercept', 'gdpsh465'))]\n", + "d <- growth$gdpsh465" + ] }, { "cell_type": "markdown", - "source": [ - "## Method 1: Lasso with Theoretical Penalty using HDM" - ], "metadata": { "id": "8yNU2UgefzCZ" - } + }, + "source": [ + "## Method 1: Lasso with Theoretical Penalty using HDM" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "tQPxdzQ2f84M" + }, "source": [ "While cross validation is commonly employed for choosing penalty parameters in Lasso, it can be very noisy and tends to choose relatively small penalty leading to some overfitting. For this reason, one should not use cross validation to choose tuning parameters unless sample splitting is employed. We illustrate the use of sample combined with cross validation in later chapters in the book. Since we are using the full sample here, it is much better (and theoretically valid) to use penalties that provably control overfitting, which is what we do here.\n", "\n", "We report the results using cross validation at the end of this notebook for comparison. There, we observe overfitting for the prediction of the outcome." 
- ], - "metadata": { - "id": "tQPxdzQ2f84M" - } + ] }, { "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "DIzy51tZsoWp", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ - "double_lasso <- function(y, D, W) {\n", + "double_lasso <- function(y, d, w) {\n", " # residualize outcome with Lasso\n", - " yfit.rlasso <- rlasso(W,y, post=FALSE)\n", - " yhat.rlasso <- predict(yfit.rlasso, as.data.frame(W))\n", - " yres <- y - as.numeric(yhat.rlasso)\n", + " yfit_rlasso <- rlasso(w, y, post = FALSE)\n", + " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(w))\n", + " yres <- y - as.numeric(yhat_rlasso)\n", "\n", "\n", " # residualize treatment with Lasso\n", - " Dfit.rlasso <- rlasso(W,D, post=FALSE)\n", - " Dhat.rlasso <- predict(Dfit.rlasso, as.data.frame(W))\n", - " Dres <- D - as.numeric(Dhat.rlasso)\n", + " dfit_rlasso <- rlasso(w, d, post = FALSE)\n", + " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(w))\n", + " dres <- d - as.numeric(dhat_rlasso)\n", "\n", " # rest is the same as in the OLS case\n", - " hat <- mean(yres * Dres) / mean(Dres^2)\n", - " epsilon <- yres - hat * Dres\n", - " V <- mean(epsilon^2 * Dres^2) / mean(Dres^2)^2\n", - " stderr = sqrt(V / length(y))\n", + " hat <- mean(yres * dres) / mean(dres^2)\n", + " epsilon <- yres - hat * dres\n", + " v <- mean(epsilon^2 * dres^2) / mean(dres^2)^2\n", + " stderr <- sqrt(v / length(y))\n", "\n", " return(list(hat = hat, stderr = stderr))\n", "}" - ], - "metadata": { - "id": "DIzy51tZsoWp" - }, - "execution_count": 9, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "Ncz7Uqn5sqqU", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ - "results <- double_lasso(y, D, W)\n", + "results <- double_lasso(y, d, w)\n", "hat <- results$hat\n", "stderr <- results$stderr\n", "# Calculate the 95% confidence interval\n", "ci_lower <- hat - 1.96 * stderr\n", "ci_upper <- hat + 1.96 * stderr" - ], - "metadata": { - "id": "Ncz7Uqn5sqqU" - }, - "execution_count": 10, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "P5PEjKw9gLvC" + }, "source": [ "The least square method provides a rather noisy estimate of the speed of convergence. We can not answer the question if poor countries grow faster than rich countries. The least square method does not work when the ratio $p/n$ is large.\n", "\n", "In sharp contrast, partialling-out via Lasso provides a more precise estimate. The Lasso based point estimate is $-5\\%$ and the $95\\%$ confidence interval for the (annual) rate of convergence $[-7.8\\%,-2.2\\%]$ only includes negative numbers. This empirical evidence does support the convergence hypothesis." - ], - "metadata": { - "id": "P5PEjKw9gLvC" - } + ] }, { "cell_type": "code", - "source": [ - "# Add Double Lasso results to the table\n", - "table <- rbind(table, c(\"Double Lasso\", hat, stderr, ci_lower, ci_upper))\n", - "\n", - "# Print the table\n", - "print(table)" - ], + "execution_count": 11, "metadata": { - "id": "tNLVM4WEgL9v", - "outputId": "1f2683b7-630a-43c5-e110-74c527603850", "colab": { "base_uri": "https://localhost:8080/" + }, + "id": "tNLVM4WEgL9v", + "outputId": "1f2683b7-630a-43c5-e110-74c527603850", + "vscode": { + "languageId": "r" } }, - "execution_count": 11, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ " Method Estimate Std. 
Error lower bound CI\n", "1 OLS -0.00937798878257774 0.0324211014253974 -0.0729233475763566\n", @@ -694,28 +840,43 @@ "2 -0.00975949506187093\n" ] } + ], + "source": [ + "# Add Double Lasso results to the table\n", + "table <- rbind(table, c(\"Double Lasso\", hat, stderr, ci_lower, ci_upper))\n", + "\n", + "# Print the table\n", + "print(table)" ] }, { "cell_type": "markdown", - "source": [ - "## Method 2: Lasso with Cross-Validation" - ], "metadata": { "id": "smPkxqCpgMR8" - } + }, + "source": [ + "## Method 2: Lasso with Cross-Validation" + ] }, { "cell_type": "markdown", - "source": [ - "This section is for illustration purposes only. Given that we are using the full sample, cross validation *should not* be used for choosing tuning parameters here. Cross validation tends to (mildly) overfit, and this overfitting can lead to substantial problems when inference about parameters is the goal." - ], "metadata": { "id": "MH-eUye8liRq" - } + }, + "source": [ + "This section is for illustration purposes only. Given that we are using the full sample, cross validation *should not* be used for choosing tuning parameters here. Cross validation tends to (mildly) overfit, and this overfitting can lead to substantial problems when inference about parameters is the goal." + ] }, { "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "YhpTUkE_wQz9", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# Choose penalty based on KFold cross validation\n", "set.seed(123)\n", @@ -725,91 +886,88 @@ "\n", "# Define LassoCV models for y and D\n", "model_y <- cv.glmnet(\n", - " x = as.matrix(W),\n", + " x = as.matrix(w),\n", " y = y,\n", - " alpha = 1, # Lasso penalty\n", + " alpha = 1, # Lasso penalty\n", " nfolds = n_folds,\n", " family = \"gaussian\"\n", ")\n", "\n", - "model_D <- cv.glmnet(\n", - " x = as.matrix(W),\n", - " y = D,\n", - " alpha = 1, # Lasso penalty\n", + "model_d <- cv.glmnet(\n", + " x = as.matrix(w),\n", + " y = d,\n", + " alpha = 1, # Lasso penalty\n", " nfolds = n_folds,\n", " family = \"gaussian\"\n", ")\n", "\n", "# Get the best lambda values for y and D\n", "best_lambda_y <- model_y$lambda.min\n", - "best_lambda_D <- model_D$lambda.min\n", + "best_lambda_d <- model_d$lambda.min\n", "\n", "# Fit Lasso models with the best lambda values\n", - "lasso_model_y <- glmnet(as.matrix(W), y, alpha = 1, lambda = best_lambda_y)\n", - "lasso_model_D <- glmnet(as.matrix(W), D, alpha = 1, lambda = best_lambda_D)\n", + "lasso_model_y <- glmnet(as.matrix(w), y, alpha = 1, lambda = best_lambda_y)\n", + "lasso_model_d <- glmnet(as.matrix(w), d, alpha = 1, lambda = best_lambda_d)\n", "\n", "# Calculate the residuals\n", - "res_y <- y - predict(lasso_model_y, s = best_lambda_y, newx = as.matrix(W))\n", - "res_D <- D - predict(lasso_model_D, s = best_lambda_D, newx = as.matrix(W))" - ], - "metadata": { - "id": "YhpTUkE_wQz9" - }, - "execution_count": 12, - "outputs": [] + "res_y <- y - predict(lasso_model_y, s = best_lambda_y, newx = as.matrix(w))\n", + "res_d <- d - predict(lasso_model_d, s = best_lambda_d, newx = as.matrix(w))" + ] }, { "cell_type": "code", - "source": [ - "tmp_df = as.data.frame(cbind(res_y, res_D))\n", - "colnames(tmp_df) = c(\"res_y\",\"res_D\")" - ], + "execution_count": 13, "metadata": { - "id": "cbVsr86tyqTY" + "id": "cbVsr86tyqTY", + "vscode": { + "languageId": "r" + } }, - "execution_count": 13, - "outputs": [] + "outputs": [], + "source": [ + "tmp_df = as.data.frame(cbind(res_y, res_d))\n", + "colnames(tmp_df) = c(\"res_y\", \"res_D\")" + ] 
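For reference, the final-stage regression in the next cell is the same partialling-out estimator that the double_lasso function above computes directly: with residualized outcome $\tilde{y}_i$ and residualized treatment $\tilde{d}_i$,

$$\hat{\beta}_1 = \frac{\tfrac{1}{n}\sum_i \tilde{d}_i \tilde{y}_i}{\tfrac{1}{n}\sum_i \tilde{d}_i^2}, \qquad \widehat{V} = \frac{\tfrac{1}{n}\sum_i \hat{\varepsilon}_i^2 \tilde{d}_i^2}{\big(\tfrac{1}{n}\sum_i \tilde{d}_i^2\big)^2}, \qquad \widehat{\mathrm{se}} = \sqrt{\widehat{V}/n},$$

where $\hat{\varepsilon}_i = \tilde{y}_i - \hat{\beta}_1 \tilde{d}_i$. The only change relative to Method 1 is that the residuals now come from cross-validated Lasso fits rather than from the theoretically tuned ones.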
}, { "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "D7SzuZ2P0P0X", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ - "fit.cv <- lm(res_y ~ res_D, data = tmp_df)\n", - "est.cv <- summary(fit.cv)$coef[\"res_D\",1]\n", + "fit_cv <- lm(res_y ~ res_d, data = tmp_df)\n", + "est_cv <- summary(fit_cv)$coef[\"res_D\", 1]\n", "\n", - "HCV.cv.coefs <- vcovHC(fit.cv, type = 'HC1'); # HC - \"heteroskedasticity cosistent\"\n", - "se.cv <- sqrt(diag(HCV.cv.coefs))[2] # Estimated std errors\n", + "hcv_cv_coefs <- vcovHC(fit_cv, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", + "se_cv <- sqrt(diag(hcv_cv_coefs))[2] # Estimated std errors\n", "\n", "# Calculate the 95% confidence interval for 'gdpsh465'\n", - "lower_ci.cv <- est.cv - 1.96 * se.cv\n", - "upper_ci.cv <- est.cv + 1.96 * se.cv" - ], - "metadata": { - "id": "D7SzuZ2P0P0X" - }, - "execution_count": 14, - "outputs": [] + "lower_ci_cv <- est_cv - 1.96 * se_cv\n", + "upper_ci_cv <- est_cv + 1.96 * se_cv" + ] }, { "cell_type": "code", - "source": [ - "# Add LassoCV results to the table\n", - "table <- rbind(table, c(\"Double Lasso CV\", est.cv, se.cv, lower_ci.cv, upper_ci.cv))\n", - "\n", - "# Print the table\n", - "print(table)" - ], + "execution_count": 15, "metadata": { - "id": "Ctl5T5vUygRk", - "outputId": "1fc3990f-10c2-4e94-b1e9-a13b7a08cbab", "colab": { "base_uri": "https://localhost:8080/" + }, + "id": "Ctl5T5vUygRk", + "outputId": "1fc3990f-10c2-4e94-b1e9-a13b7a08cbab", + "vscode": { + "languageId": "r" } }, - "execution_count": 15, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ " Method Estimate Std. Error lower bound CI\n", "1 OLS -0.00937798878257774 0.0324211014253974 -0.0729233475763566\n", @@ -821,99 +979,126 @@ "3 0.0140233785280299\n" ] } + ], + "source": [ + "# Add LassoCV results to the table\n", + "table <- rbind(table, c(\"Double Lasso CV\", est_cv, se_cv, lower_ci_cv, upper_ci_cv))\n", + "\n", + "# Print the table\n", + "print(table)" ] }, { "cell_type": "markdown", - "source": [ - "We find that the outcome model chooses too small of a penalty based on cross-validation, leading to overfitting of the outcome and tiny outcome residuals. This leads to artificially small standard errors and a zero treatment effect. Theoretically driven penalty should be preferred for such small sample sizes." - ], "metadata": { "id": "0LzDsUi8gmQM" - } + }, + "source": [ + "We find that the outcome model chooses too small of a penalty based on cross-validation, leading to overfitting of the outcome and tiny outcome residuals. This leads to artificially small standard errors and a zero treatment effect. Theoretically driven penalty should be preferred for such small sample sizes." 
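One way to make this overfitting visible is to compare the spread of the cross-validated outcome residuals with the spread of the outcome itself and of the residuals from the theoretically tuned Lasso. This is an illustrative check rather than part of the original analysis, and it assumes the objects y, w and res_y from the cells above are still in memory:

```r
# Illustrative diagnostic (assumes y, w, res_y from the cells above).
# An overfit outcome model leaves CV residuals that are far smaller than
# both the outcome's own spread and the plug-in Lasso residuals.
yfit_plugin <- rlasso(w, y, post = FALSE)  # theoretically tuned penalty
res_y_plugin <- y - as.numeric(predict(yfit_plugin, as.data.frame(w)))
c(sd_outcome    = sd(y),
  sd_res_plugin = sd(res_y_plugin),
  sd_res_cv     = sd(as.numeric(res_y)))
```

The out-of-sample $R^2$ plots in the next cell tell the same story as a function of the penalty level.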
+ ] }, { "cell_type": "code", - "source": [ - "# Create a data frame to store the results\n", - "results_y <- data.frame(\n", - " Alphas = model_y$lambda,\n", - " OutOfSampleR2 = 1 - model_y$cvm / var(y)\n", - ")\n", - "\n", - "results_D <- data.frame(\n", - " Alphas = model_D$lambda,\n", - " OutOfSampleR2 = 1 - model_D$cvm / var(D)\n", - ")\n", - "\n", - "# Plot Outcome Lasso-CV Model\n", - "ggplot(data = results_y, aes(x = Alphas, y = OutOfSampleR2)) +\n", - " geom_line() +\n", - " labs(\n", - " title = \"Outcome Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n", - " x = \"Penalty Level\",\n", - " y = \"Out-of-sample R-squared\"\n", - " )\n", - "\n", - "# Plot Treatment Lasso-CV Model\n", - "ggplot(data = results_D, aes(x = (Alphas), y = OutOfSampleR2)) +\n", - " geom_line() +\n", - " labs(\n", - " title = \"Treatment Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n", - " x = \"Penalty Level\",\n", - " y = \"Out-of-sample R-squared\"\n", - " )\n" - ], + "execution_count": 16, "metadata": { - "id": "7uzcIGhVgmei", - "outputId": "ecff8a3f-60da-4b92-c6f7-cb40a116ec82", "colab": { "base_uri": "https://localhost:8080/", "height": 857 + }, + "id": "7uzcIGhVgmei", + "outputId": "ecff8a3f-60da-4b92-c6f7-cb40a116ec82", + "vscode": { + "languageId": "r" } }, - "execution_count": 16, "outputs": [ { - "output_type": "display_data", "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA0gAAANICAMAAADKOT/pAAADAFBMVEUAAAABAQECAgIDAwMEBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUWFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJycoKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6Ojo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tMTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1eXl5fX19gYGBhYWFiYmJjY2NkZGRlZWVmZmZnZ2doaGhpaWlqampra2tsbGxtbW1ubm5vb29wcHBxcXFycnJzc3N0dHR1dXV2dnZ3d3d4eHh5eXl6enp7e3t8fHx9fX1+fn5/f3+AgICBgYGCgoKDg4OEhISFhYWGhoaHh4eIiIiJiYmKioqLi4uMjIyNjY2Ojo6Pj4+QkJCRkZGSkpKTk5OUlJSVlZWWlpaXl5eYmJiZmZmampqbm5ucnJydnZ2enp6fn5+goKChoaGioqKjo6OkpKSlpaWmpqanp6eoqKipqamqqqqrq6usrKytra2urq6vr6+wsLCxsbGysrKzs7O0tLS1tbW2tra3t7e4uLi5ubm6urq7u7u8vLy9vb2+vr6/v7/AwMDBwcHCwsLDw8PExMTFxcXGxsbHx8fIyMjJycnKysrLy8vMzMzNzc3Ozs7Pz8/Q0NDR0dHS0tLT09PU1NTV1dXW1tbX19fY2NjZ2dna2trb29vc3Nzd3d3e3t7f39/g4ODh4eHi4uLj4+Pk5OTl5eXm5ubn5+fo6Ojp6enq6urr6+vs7Ozt7e3u7u7v7+/w8PDx8fHy8vLz8/P09PT19fX29vb39/f4+Pj5+fn6+vr7+/v8/Pz9/f3+/v7////isF19AAAACXBIWXMAABJ0AAASdAHeZh94AAAgAElEQVR4nO3deaBM9f/H8fddrddyUdakFLIVkaVIi+qXaEUpUimlpEUqRAqlpH0laVPas5WkolIpVL7JEsp6nTZEcnF+Z5mZ+5l7z3vOLO85M4fX8497586d+7nvc8zDnTt35gzpCKGEo1QPgNCBECAhJBAgISQQICEkECAhJBAgISQQICEkECBF0RLqGH7GUHpMZuFWOeXWJbKA1CAxVmKHsBeMaQOH0FOxjRH/7ou046LdqWGXSxDSlpHtquXkHz9kbclPvfheYksvoSMTW8CoYGSbqrm12j6w1Th9No0Pnd+GpoS+C3ULnjudaCgzSsfwM5x2dTy7ogl1umkrP757zL+5sVVWZRpctyqR5bmihhT1Blq76L66L8c0RgK7z95xzv8wKYD0TBnKbdn5hNKUPabE56pfmdDSEpCmlKWc408/Jpcqf6jrM6lh8PxllP9v6LtkZBcETl+YkQCkeHbFbqpQ6LoREWMhle9h1L1jZSrzUWLfwbFoIUW/gfFcWxLZffaOc/6u3kN6lrLv3W683zWhNI0q9rlfKOWQplLm0G3G+4KBlLVY31ePPgt84ka6pei7HBv8SfVnqSbxQ4prV/xFdd23ImIspMC+23ktHbYvwe/huH7HqC4X9QbGdW1JZPdZO475rp5D2lKG3gic/DAz82ddH0CTzQ8W0tn6BeYti/a6vu+J48uVP8W6Bu95+PjypY4csNE4OYze+7Jj+ap9tu+f0KBMozH7jbP2P9emfOmGw/4JrR4GadvtDUvn1r/VZKG/0alyTo0zZxU7qaweaHul0E3uIXS1ccOBLrU/2p2fEbrBs4SurNbEPvkk3WRBUlda16NKmeaT7OuNMqG9C7NofSy7IlDR8t2sm1+hUZSNUTa35K66id6a16FC+ZPmhgaJsO/2VaClRbtE+RZru+cbW7aFWoTNGvat76T3nqhZMXx5dYdE/KcxC2ygur6xOcvOrVaq+avWcMErR2AX2b8jhV9Pii4dcfepO0UdOGwFZVRzx9n
ftT3NsFb4JPRvVHyndqT3rfOn0ykO14JAiUAaZe15u0toQNi/yIzLqc2EacbNJTrmul7l6UVjr51FDW8cfhbVWGd+5R2VegyoTZcNqdO/d675Wf1SqnHLnW3o2O3BFVVIe06ilrfe2IBa7TX/7692zfAr8jNeDDuprh7oSTo+ePLfX403WqnSf1ofvUKd9aLvckV/WmSdbH3oNBOSutKfdajDsGuq97WuN8qEJSBFsyvslOXfH0OVJ0z4O/AJZWPUzS25q4bQ9WW63tItI+uT4CCR9l19+jo0mfIt/qhNne7pX/1a89qjXtHVbz2Sbi57cb+w5cN2SKR/GqvABqrrj6KRFTvfdDbRx7py5QjsIgtS+PVEuXTE3afuFHVgdQV1VHPH2d91Il1grXAtPR34HsV36jPUxzr/MnM7SlwLAiUCqSO9FDo9k44O/xd5w/qxOZXOMqb+uWy5HcZebrtbN/+L6K7rY6mUscG/ZuU0/EPXJ1IXXX+dWhqj7b+ebg+uqEJ6i9oYy/zX0PyvoSmtNs5Zn9cm7KS6eqALlHsXrHrRw9b7k+md0HlLqM83dJ15ajnd/IYJSV3pLuphnNxc3bzeqBPau/Djj3bHsCsCqcuH3TZRNkbd3JK7aihlTjcu+QC1CQwSad+tyMz52+lb3EUXmlt2qLll6qzqtx5DFY3fLcOWV3dIxH8aO3sD1fXHUq65p241r5vKlcPeRRak8OtJ0aUj7z51p6gDqyuoo1o7zvqu28vm/m5cYu8hpf4KrRW+U//MrbzHOHt3xTLbHa4FgRKBVIeWhE5vpOz9DteezrTAPGfCLb/o7cm+1ZKbu8vYvDPM08fSE8bbLdRI10+jOdaez6kRXFGFtPZt64fGELrX+K4Zm83T/+lhJ9XVAx1nL1nUF9TYfLcyo/be0HkGJL1pZfPf5nZaZkFSV2pOC83Td5vXG3XCEreio9gVgdTlwyApG6NubsldNdS+DbK7bMYf9iD8vtvydn3zx6PDt2hOn5snRxSHFP6tLRHq8uoOsWP+aeycIFnDf01tw64cCqTw60nRpSPvPnWnqAOrK6ijFkEyftA8arz9yPqfJbBWsZ16Dn1gnHyXeka4FiQCqVLR7Xt9B9FOh2tPOQper/eXJvu/xqbGLamxNMQ82cHak7vocF0vT/Yt/OPo18AXFL+zYfvmzXebV/TrqOGkzfZZRSfDVg9U3/5HV2pOXxhvb1PvDTAhPUxTjdsMtVvrJiR1pX259vgfmNcbdcISkKLYFQ67IQySsl3q5pbcVUPpNusSzeh7exCnfRfqyv+cvoWxZTvN97OLQwr/1jeZHyrLh+2QkrMW2wRnSNb9PCvpWPXKoUAqdj0punTk3afuFHV/FF8hOKoCaZ75e6Lej94rWqvYTp1KV+nmTfaZEa4FiUCqW3Sl1ddTrsPtmX+odPAC280LmJ1Cs43Ne8A82ZGWG2//NfbHrqJ/+S8DXxAG6Z32pa1PGrtgzzU5RMfctkZXT6qr7+1o9rHegmbp4T1t/oDfc2j2pqKzTEi/lzJ+Z/rQ+Ec0IakrbQuMv8i43oRNWAKS+64ITBW2G6xrQnBcZbvUzS2xq4xvbt9k7Wj852gO4rjvrLu/T6SGP5kflvwW26hUaMvCIYV9a/OefHV5dYdE/qexc4Jkbc4qaq5eORRITtcT89J2JXdfIGWnhO2PsBWUURVI++vRD3phlap7itYqtlN3lq+6V/8375DCCNeCRCCdQc+ETk+nZg7Xnl2UtT9wgR2UY5842fg5WeLa8S9ljAgU/GdQIT1DeYNenTn7Gvvu6U1Pd8uj3NfVk+rqhdZ2TtUvLnE/9D8Vyvytv0kXKWeZkPTumb/plxifMiGpK/0duLp9ZVxvwiYsAcl9VwSmCtsN1jUhOK6yXermOkGaYK3QwfgF2hyE33e7jjCubrrTtwhu2cLikEp+a3V5dYe4/dOYRYSkXDkUSE7XkyJIJXdfIGWnhO0PdQV1VAWScfv2ZuNH8w3KWsV3ai9jV79FA/UI14JEII0vuulq/NwzfrReT8+bp98uuj2TR1rwEmXJ/mWuMX3ncO2oSMX/RK1CqmX/Ceiu0N95/n0qu9Ju9aS6eqBJdGTw73X7Ry233g8wftM4k+aFfZc+5k+j0dvL9NItSOpKe7PsGx/vmtcbdcISkKLYFQ67oeQfQuyNUTfXCdIw67LNaJk9CL/vZtNh24t9yv4We7PtLXvb3DJ1VodvrSwftkPsIv3TBDZQXV+9YitXDuV3JIfrSREkdvepO0XdH+oK6qgqpLUZtfZfptykKLlTZ9J1ek/rEuy1IBFIf1UI3WM4LzP3N/O+Eevn6x1F155O1v+D+phTvzD+r7Dur/8ju8xuh2tHZ3rTWuiP0OoKpN1U3ny3v7W5C9bZN8w60E/qSXX1QLsOCdxu1vV7qZ31/n90UkFW6AEO9nfpY/zGcFirF837Ry1I6koN6Svz9K3m9UadsASkKHZFIHX5MEhFGxO2uU6QrHvvt+dmbbMHibDvLqJrlSmV/dXIvhl4k7llyqxO31pdXt0hVtw/TWC3WBuo7gv1iq1cORRIDtcTBRK3+9Sdog6srBA2qgrJGGRW+Ua6slbxnVpYtc6ucg2L7wy5RzaYDx0w/8fb/WRZ606lJ6mj8cP6p6rmHptO5xrnvEAtjV/O1lYu+6c+hdqZv/feYv7iVvLa8To1ManPzw7dd6L+RMon47q5f2R1GqAvpVPMZbbXzvpdORm2erDpGdTX/N1383VUIfBP2zHjDnpE3QQLkj4i89R6+wOQ1JVutu5NX1PZvvu7aEJ7F376SQhtFLsikLq8ek1QN0bZXEdIWea9KI9Sp9A9tey+21gh4zPHb3ErnW/Mt7qSuWXqrA7fWl1e3SGR/mkCn7Q3UF1fpaFcOexdZEFyuJ4okLjdp+4UdWB1BXXbrB0X/IeZQvVIeWCXw069lkab9/U5XQsCJfZYu5fLU+4J/9e+HJW2HkNQUIHa3tSz/AN0lvFvlJFzRX99Xxeq2/+yPHrO2IJu1GTwnafS0b87XTuM32hq3Tzigpy80J8Pl1DZs+1eMf79jrr33jYNPqQq962/hI4YcNd1delG80ZU6KS6eqg3K1Bm89Ob5FKdHwLnvEbZZf9SLhGAtC6T7tYDkNSVNlal42/oWdH+D1iZsMQfZKPYFYHU5cN+Iikbo26uE6Q+eX3u6Z2VszA4iMO+C/4n9AjV3+X0LTZVoxNu7VVxgLll6qwO31pdPmyHWDH/NHb2Bqrrq1ds5cph7yILksP1RIHE7T51p6gDqyuoo1o7LvgPszOPjN+SlbVK7NQFlJextvjOkHz099a721bNyW89LHA/2I+nlC1/wrsanWycvq9qqRbGD8XxzcqU62D9VlL4cIuypRreYV6PHSDte65tXnbt3stDSxfdhTtC/3fokaXqXPe7fnm56j/se6Jd1ayKJz2/33yISeikunpRv49qUzW7coeJwYeo6nuqF3t4lQ1JPz3TvOfYghS20vJulUo3fe4POiF8QgdI7rsikLJ8GCRlY9TNdYL05NyO5c
t3nB8axGHfhR4i1LLocYVh++tnc8smLrVEKLM6fOuw5dUdYsX809gFNlBZP4yGcuWwdpH9EKGS1xMFErf71J2iDqyuoI5q77jgP8yV5oN/lLVK7NT9h9NJevGdIQkJeZ/kc5Cifj5EupfYThmjPDAlzgDJdwGSQwntlD11qu52v1TkAMl3AZJDCe2UQSX+4hh7gOS7AMmh+HfK8iEnUvNd7pdzCZB8FyA5FP9O+Tiz/MUJPd3fDpAQEgiQEBIIkBASCJAQEgiQEBIIkBASCJAQEgiQEBIIkBASKAFI//zt1r+F210vE3//7Eri4n8X/pfM1XcndccU7kzi6tuSu2MKtyVx9Z3SO6boifwJQPpbc2u3/qfrZeJvx84kLq7pe5K5+p5k7pjt+o4krv5HYRIX1/bovydx9X+kd8yfgOQWIDEBkhIguQZITICkBEiuARITICkBkmuAxARISoDkGiAxAZISILkGSEyApARIrgESEyApAZJrgMQESEqA5BogMQGSEiC5BkhMgKQESK4BEhMgKQGSa4DEBEhKgOQaIDEBkhIguQZITICkBEiuARITICkBkmuAxARISoDkGiAxAZISILkGSEyApARIrgESEyApAZJrgMQESEqA5BogMQGSEiC5BkhMgKQESK4BEhMgKQGSa4DEBEhKgOQaIDEBkhIguQZITICkBEiuARITICkBkmuAxARISoDkGiAxAZISILkGSEyApARIrgESEyApeQbp3YnCkysBEhMgcfkXUrOsn4VHLwqQmACJy7+QjqYvhEcvCpCYAInLv5COpE+FRy8KkJgAicu/kOrSHOHRiwIkJkDi8i+k2jRTePSiAIkJkLj8C6k6vSM8elGAxARIXP6FVI2mCY9eFCAxARKXfyHl0wPCoxcFSEyAxOVfSBWor/DoRQESEyBx+RdSWeomPHpRgMQESFz+hVSKThIevShAYgIkLv9CyqLjhEcvCpCYAInLt5AKiZoIj14UIDEBEpdvIe0kaig8elGAxARIXL6F9BfREcKjFwVITIDE5VtIBUR1hEcvCpCYAInLt5DWE1UXHr0oQGICJC7fQlpDlC88elGAxARIXL6FtIIoT3j0ogCJCZC4fAtpGVFp4dGLAiQmQOLyLaQlRFnCoxcFSEyAxOVbSIuIqEB49lCAxARIXL6FtNCAtEF49lCAxARIXL6FNN+AtFZ49lCAxARIXL6F9LEBaaXw7KEAiQmQuHwL6QMD0nLh2UMBEhMgcfkW0vsGpB+EZw8FSEyAxOVbSG8ZkBYLzx4KkJgAicu3kF4zIH0tPHsoQGICJC7fQnrRgJS0g38DEhMgcfkW0iTKTN7BvwGJCZC40hPSjr/d2vM0laVPXS8WZ7t2J2tlM31vMlcv3J7ExXfpu5K4+vbk7hh9WxJX/1d6x2yXgLTbtb2PURWa7365+Crcm6yVzfR9yVx9339JXLxQL0zi6v8ld8dEcbWKv0LxHSMBKYqbdg9RLZou/NM0FG7aMeGmHVd63rSLAtL9VD95R9EHJCZA4vItpNHUNHlH0QckJkDi8i2kkdSaXhGePRQgMQESl28hDacONEV49lCAxARIXL6FdAedTs8Lzx4KkJgAicu3kG6js+k54dlDARITIHH5FtLNdB49LTx7KEBiAiQu30K6kXrQE8KzhwIkJkDi8i2kAXQZPSY8eyhAYgIkLt9C6k9X0sPCs4cCJCZA4vItpH7Un8YLzx4KkJgAicu3kPrSjTROePZQgMQESFy+hXQZDaaxwrOHAiQmQOLyLaRLaCiNFp49FCAxARKXbyF1p5E0Snj2UIDEBEhcvoV0AY2lEcKzhwIkJkDi8i2kc+lBGiY8eyhAYgIkLt9COocepTuFZw8FSEyAxOVbSGfTkzREePZQgMQESFy+hXQWPUe3Cs8eCpCYAInLt5A602S6SXj2UIDEBEhcvoV0Gr1CNwrPHgqQmACJy7eQTqFpdL3w7KEAiQmQuHwL6WR6m64Vnj0UIDEBEpdvIXWg6XS18OyhAIkJkLh8C6k9zaarhGcPBUhMgMTlW0jtaC71FZ49FCAxARKXbyGdQJ9Rb+HZQwESEyBx+RZS68wF1Et49lCAxARIXL6FdHz2l9RTePZQgMQESFy+hdQi52u6SHj2UIDEBEhcvoV0bKlv6QLh2UMBEhMgcfkWUvPSS6ib8OyhAIkJkLh8C6lp2R+oi/DsoQCJCZC4fAupcbmf6Czh2UMBEhMgcfkWUqO8FdRZePZQgMQESFy+hdSw4mo6RXj2UIDEBEhcvoV0dOW1dLLw7KEAiQmQuHwL6aj89XSi8OyhAIkJkLh8C+mIqhuprfDsoQCJCZC4fAvp8GoF1Ep49lCAxARIXL6FdNihWkZL4dlDARITIHH5FlLtGlp2c+HZQwESEyBx+RZSzVpaqSbCs4cCJCZA4vItpOp1tLKNhGcPBUhMgMTlW0iH1tUqHC08eyhAYgIkLt9CqlZPq3yE8OyhAIkJkLh8C6nKkVrVw4RnDwVITIDE5VtIletrh9YWnj0UIDEBEpdvIVU8WqtVQ3j2UIDEBEhcvoWU11CrU0149lCAxARIXL6FVO4YrV6+8OyhAIkJkLh8C6lME61+ReHZQwESEyBx+RZSqWZaw/LCs4cCJCZA4vItpJxjtWNKC88eCpCYAInLt5CyjtOa5QrPHgqQmACJy7eQMlpqx2YJzx4KkJgAicu3kOh4rWWG8OyhAIkJkLj8CmkXtdZa0Vbh4YMBEhMgcfkV0k46QTuBNgsPHwyQmACJy6+QdlBbrR1tFB4+tDogOQdIXH6FtJ3aaSfSeuHhgwESEyBx+RXS33Si1pHWCQ8fDJCYAInLr5D+opO0TvSL8PDBAIkJkLj8CukP6qidSquEhw8GSEyAxOVXSL8bkDrTz8LDBwMkJkDi8iskjU7WzqTlwsMHAyQmQOLyK6QC6qSdTcuEhw8GSEyAxOVXSFvoVO0c+l54+GCAxARIXH6FtJlO07rREuHhgwESEyBx+RXSJjpdO5++Ex4+GCAxARKXfyF11i6ib4SHDwZITIDE5VdIG+hMrQd9JTx8MEBiAiQuv0JaT2dpF9MXwsMHAyQmQOLyK6Tf6P+0XjRfePhggMQESFx+hfQrna31ps+Ehw8GSEyAxOVXSOuoi9aX5gkPHwyQmACJy6+Q1tA52pU0V3j4YIDEBEhcfoX0C3XV+tGHwsMHAyQmQOLyK6TV1E3rT7OEhw8GSEyAxOVXSCvpPO06mik8fDBAYgIkLr9CWkHnazfQ+8LDBwMkJkDi8iukn+kCbRC9Izx8MEBiAiQuv0L6iS7SbqK3hIcPBkhMgMTlV0j/o+7arTRNePhggMQESFx+hbSMemi30WvCwwcDJCZA4vIrpB+pp3YHvSI8fDBAYgIkLr9C+p4u0YbSS8LDBwMkJkDi8iukpdRLG05ThIcPBkhMgMTlV0hLDEgjabLw8MEAiQmQuPwKaTFdqo2iicLDBwMkJkDi8iuk7+gybQw9Izx8MEBiAiQuv0L6lnprY+lJ4eGDARITIHH5FdIi6qM9QI8JDx8Mk
JgAicuvkL6hy7WH6BHh4YMBEhMgcfkV0tfUV3uEHhIePhggMQESl18hfUVXaI/ROOHhgwESEyBx+RXSQrpSe4rGCg8fDJCYAInLr5C+pKu0Z2m08PDBAIkJkLj8CukL6qdNoruFhw8GSEyAxOVXSJ/T1doLdJfw8MEAiQmQuPwM6SUaKjx8MEBiAiQuv0JaQNdor9IdwsMHAyQmQOLyL6T+2us0WHj4YIDEBEhcfoU0n67V3qSbhYcPBkhMgMTlZ0jv0CDh4YMBEhMgcfkV0mcGpPfpBuHhgwESEyBx+RXSp3SdNsvAlJwAiQmQuPwK6RMD0gfUT3j4YIDEBEhcfoU0j67XPqKrhIcPBkhMgMTlZ0jzqK/w8MEAiQmQuPwK6WO6QZtPlwkPHwyQmACJy6+Q5tJA7XO6RHj4YIDEBEhcfoa0kHoKDx8MkJgAicuvkD6iG7VFdKHw8MEAiQmQuFIIacf4PpfcXWCf/uOBS7vfviIGSHNokLaYzhMePhggMQESVwoh3TNkzcYHBuyzTt805JdND/b6NzZIS6mr8PDBAIkJkLhSB0nr+ovxU+ncpebp7WN+0/Wt56yMHtKHdJP2I3URHj4YIDEBElfqIH15wX7j7fWvh85Y3s382l0bjLQ/3fqQbvlzBZ3lern42rkrSQtb6YXJXL3w7yQu/o++M4mr/7U3iYv/uUf/K4mr75TeMX9HC+mDy823Q58Nfrz9usnmu3ktjb6O/KVGc2mo/jud7Xo5hPzZvtApN0h9zbchSOuvftL8CaUvG2K0fLdbc+j23Vuos+vl4qtwb5IWttL3JXP1ff8lcfFCvTCJq/+3P4mL796nJ3P1QvEdEy2kr+ybdm/YHy29ZLryOfffkWbRrdo66ih8uzQYfkdiwu9IXKn7HemPrqt0fVu3ZdYH/7v4W/Vz7pBm0mBtPZ0oPHwwQGICJK4U3v09dtCaDSNv3q/PeV//r99U82tjuPvbhLSR2ggPHwyQmACJK4WQdk7o3WuMcfFxw/Sl51jNiB7SDLpN20qthIcPBkhMgMTl14cImZC07ObCwwcDJCZA4vIrpOk0RNNKHyM8fDBAYgIkLr9Cet+EVOFo4eGDARITIHH5GlL+4cLDBwMkJkDi8iuk90xIh9YWHj4YIDEBEpd/Id2uaXUOER4+GCAxARKXXyG9ax5Av16+8PDBAIkJkLj8CukdE1KDPOHhgwESEyBx+RfSnZrWuJTw8MEAiQmQuPwK6W0T0nFZwsMHAyQmQOLyL6ShmtaKCoSnDwRITIDE5VdIb5mQ2tMG4ekDARITIHH5FdKbNEzTTqa1wtMHAiQmQOLyL6ThmnYarRSePhAgMQESl18hvWFC+j/6n/D0gQCJCZC4/AvpLk3rSkuFpw8ESEyAxOVXSNNMSBfQIuHpAwESEyBx+RfSCE3rSV8KTx8IkJgAicuvkF43IV1G84WnDwRITIDE5WtIV9Jc4ekDARITIHH5FdJrdLemXUOzhacPBEhMgMTlV0hTTUgDaIbw9IEAiQmQuPwLaZSm3UjvCk8fCJCYAInLr5BeNSHdQm8ITx8IkJgAicu/kO7RtNvoNeHpAwESEyBx+RXSKyakofSy8PSBAIkJkLh8DWkETRaePhAgMQESl18hvUyjNW0UTRSePhAgMQESl18hvWRCGktPC08fCJCYAInL15DG0WPC0wcCJCZA4vIrpBdpjKY9RA8LTx8IkJgAicuvkKaYkB6jB4WnDwRITIDE5V9IYzXtSfNNMgIkJkDi8iukF+g+TXuW7hWePhAgMQESl18hTTYhTaaRwtMHAiQmQOLyL6T7zXschgpPHwiQmACJy9eQXjVf2yUZARITIHH5F9I488ANtwpPHwiQmACJy6+QnqcHzAOADxKePhAgMQESl18hTTL/hPQ+3SA8fSBAYgIkLr9CmkjjNW0W9ReePhAgMQESl18hPWdCmkNXCU8fCJCYAInLv5Ae0rR51Fd4+kCAxARIXH6F9CxN0LT5dKnw9IEAiQmQuPwK6Rnzgd9fUk/h6QMBEhMgcfkV0tMmpG/oIuHpAwESEyBx+RfSI5q2mM4Vnj4QIDEBEpdfIT1Fj2raD9RFePpAgMQESFx+hfSk+Szzn+gs4ekDARITIHH5GtJKOk14+kCAxARIXH6F9IQJ6RfqJDx9IEBiAiQuv0J6nB7XtF+pg/D0gQCJCZC4/ArpMRPSRmonPH0gQGICJC7/QnpC07ZQa+HpAwESEyBx+RXSo/Sk8TajpfD0gQCJCZC4/ArpEXrKeJt9rPD0gQCJCZC4/ArpYeuw36WaCE8fCJCYAInLr5AmWJDKNRKePhAgMQESl38hPWO8rXiU8PSBAIkJkLj8CukhetZ4m19PePpAgMQESFx+hTTegnRIHeHpAwESEyBx+RXSg9aL9dWoJTx9IEBiAiQuv0J6wIJU51Dh6QMBEhMgcfkX0iTj7eFVhKcPBEhMgMTlV0jj6Hnjbf1KwtMHAiQmQOLyK6T7abLxtkF54ekDARITIHH5FdJ9FqRjSgtPHwiQmACJy9+QmuUITx8IkJgAicuvkMbSFONti0zh6QMBEhMgcfkV0hgLUisqEB7fDpCYAInL35Da0kbh8e0AiQmQuPwKaTS9aLw9kX4THt8OkJgAicuvkO61IJ1Ma4XHtwMkJkDi8jekU2ml8Ph2gMQESFx+hXQPvWy8PYN+Fh7fDpCYAInLr5BGWZDOpmXC49sBEhMgcfkX0ivG2660VHh8O0BiAiQuv0K6m1413p5Hi4XHtwMkJkDi8jek7vS18Ph2gMQESFx+hSNkk4kAACAASURBVDSCphpve9KXwuPbARITIHH5G9KlNF94fDtAYgIkLv9Ces14ezl9Ijy+HSAxARKXJ5DKKeXKQLrLgnQVfSQ8vh0gMQESlyeQehg1yGl7wbnHZrS8XgrS68bbq+kD4fHtAIkJkLi8umn3RpNN5rufG74vA2m4Bek6mik8vh0gMQESl1eQmkyz3z/VXBLSQHpXeHw7QGICJC6vIOXODfxkKiUDaRhNM97eRG8Jj28HSEyAxOUVpJq9rHf7e9SQhDTYeisfIDEBEpdXkEZQ04H33DOgEd0uBelN4+3t1uMb5AMkJkDi8grS/vtrkFHV4XslIQ21npUkHyAxARKXd3+Q3f/r11/9si8aRtFAGmr9djTCOiiXfIDEBEhc3kH695u3Nb1QFtK99Jzw+HaAxARIXJ5BejCPaKF+5+VRUXKHdAe9o5lHt3tKeHw7QGICJC6vID1LXZ82IE3JHicD6XYL0jh6THh8O0BiAiQuryA166//a0DS7zhaBtIQC9JD9LDw+HaAxARIXF5BKv2RDenDHBlIu3Xz+vIoPSg8vh0gMQESl1eQDpluQ5pWQRLSE3Sf8Ph2gMQESFxeQTqt4y4T0h9NOktCeppGC49vB0hMgMTlFaRPsurfSFf0qZDzuSSkiTRKeHw7QGICJC7P7v6ee5z5yIbWn0bjKGpIk+ku4fHtAIkJkLg8fKp5wZIlf+rRFS2kl2io8Ph2gMQESFxeQWo7M0pDMUGaSkOE
x7cDJCZA4vIKUu3xyYA0jW4VHt8OkJgAicsrSO81emePPKS3aZDw+HaAxARIXF5BOqkp5dasayYJ6X26QXh8O0BiAiQuryC1P+XUQJKQZtG1wuPbARITIHF5fYDIHSslIX1I/YTHtwMkJkDi8hrS3HxJSB/TFcLj2wESEyBxeQZpRq+T2rdv3yavqiSkz6i38Ph2gMQESFxeQZpK2bWpZmnqFNXfk6KF9DldIjy+HSAxARKXV5Banrldz/qx8NGTt0tC+op6CI9vB0hMgMTlFaS8Gbqe9YOuDxogCelbukB4fDtAYgIkLs+e2Ddb1yvM1/UFNSUhLaFuwuPbARITIHF5Bem4C//TGw/V9ffKSUL6gboIj28HSEyAxOUVpJfoVH14Vr+7a7WThPQTnSk8vh0gMQESl2d3f08dq+88najOomggbf/Trf/0v423q6mz6yXj6Z9/k7JsIL0wmasXbkvi4v/oO5O4+t9J3TF79L+SuPou6R3zNwPJatVP0T1ydfcet/bphcZbjTq7XjKe9u5LyrKB9P3JXH1/YRIX36vvTeLqhcndMXoyV98rvWP+iwQp2qK9afcrdRD+gWqHm3ZMuGnH5dVNuyrB8iQhbaR2wuPbARITIHF5BambVesyTUT/jrSFWguPbwdITIDE5fGDVjd3mCEJSctoKTy+HSAxARKX14/+XtRSFFJOc+Hx7QCJCZC4vIa0uYwopNKNhce3AyQmQOLyGNL+0bVFIZVvKDy+HSAxARKXV5CaWzWpSreKQqpUX3h8O0BiAiQubyEdd8oj/5VQkwik/MOFx7cDJCZA4vL6d6ToihrSIXWEx7cDJCZA4vI5pJo1hce3AyQmQOLyClJ22XJKYpDqHCI8vh0gMQESl1eQrjsmu8355x6bcWzPHkZikA6vIjy+HSAxARKXV5DeaLrRfLe8wXQ3RDFBql9JeHw7QGICJC6vIDV+w37/VHNRSA3KC49vB0hMgMTlFaTcj+z300qJQjqmjPD4doDEBEhcXkGqecl+893ec2qIQmqWKzy+HSAxARKXV5DuovrXjhgx4Bi6QxTScZnC49sBEhMgcXkFad+YGuZryFYbsVcUUivaKjy/FSAxARKXd3+Q3f/r11/9si8aRjFAakObhOe3AiQmQOLyDNLOTbq+a/KDv8hCak/rhee3AiQmQOLyCtLyQ8bqhccTVVwsCqkjrROe3wqQmACJyytI5zddrb9ET65ud6EopFNotfD8VoDEBEhcXkE65BVdP6+Jrr9SRxRSZ1ohPL8VIDEBEpdnf5Cdp++tfJuuz8kVhXQW/SQ8vxUgMQESl1eQ6kzU59A8XZ8k+wfZLvSD8PxWgMQESFxeQbqy+u11j9yrFzST/R3pXFosPL8VIDEBEpdXkDa1oaoLdb1Hxe9FIV1A3wrPbwVITIDE5d0fZLeZx89ftCUaR9FD6kFfCc9vBUhMgMTl4VPNd80uiEpRLJAuoc+F57cCJCZA4vIQ0lp6RxzSZTRfeH4rQGICJC6fQ+pL84TntwIkJkDi8jmkq2iO8PxWgMQESFw+h3QNzRKe3wqQmACJy0NI/y35W4+yqCENoBnC81sBEhMgcXl+gMi1opBupHeF57cCJCZA4vIG0med63eeZZ7Yfa/sy7rcTG8Kz28FSEyAxOUJpIU5GYflZEzT9Q+PogaikAbTNOH5rQCJCZC4PIHUreJSveD4RusvpEoT9ohCup1eFZ7fCpCYAInLE0iH32i8mU2ls67VomEUA6Sh9KLw/FaAxARIXJ5Ayn7CeLOOOv4YHaMYII2gycLzWwESEyBxeQKJnjPebKbZ0TqKHtIomig8vxUgMQESl88hjaGnhee3AiQmQOLyOaT76Qnh+a0AiQmQuLyBdMfChQtn0oSFZqKQHqRHhee3AiQmQOLyBpKaKKSH6SHh+a0AiQmQuDyBNEJNFNJjNE54fitAYgIkLp+/GPOTNFZ4fitAYgIkLp9DepbuFZ7fCpCYAInL55Cep5HC81sBEhMgcfkc0gs0XHh+K0BiAiQun0N6me4Unt8KkJgAicvnkF6n24TntwIkJkDi8g7Sv9+8remFwpDepJuF57cCJCZA4vIM0oN5RAv1Oy+PilLUkN6lgcLzWwESEyBxeQXpWer6tAFpSvY4UUgz6Trh+a0AiQmQuLyC1Ky//q8BSb/jaFFIH1I/4fmtAIkJkLi8glT6IxvShzmikOZRX+H5rQCJCZC4PHvpy+k2pGkVRCHNp8uE57cCJCZA4vIK0mkdd5mQ/mjSWRTSl9RTeH4rQGICJC6vIH2SVf9GuqJPhZzPRSEtoguF57cCJCZA4vLs7u+5x5lPRmr9aTSOooe0mM4Vnt8KkJgAicvDRzYULFnypx5dUUP6kboIz28FSEyAxOXzhwj9TGcIz28FSEyAxOUJpAZqopBW0anC81sBEhMgcXkCqb2aKKR11FF4fitAYgIkLp/ftNtI7YTntwIkJkDi8g7SllmTp3ywRRjSFmolPL8VIDEBEpdXkP66KNu8+zuj1z+ikLTMFsLzWwESEyBxeQXp8pwrp8x495lu1F8WUm5T4fmtAIkJkLi8glR5iv1+SBVZSGUbCc9vBUhMgMTlFaRSm+3388rKQqpwlPD8VoDEBEhcXkFq8YX9/skOspDyDxee3wqQmACJyytIc49fsF/X985s/J0spENrC89vBUhMgMTlFaQ21ajcEUeUoToNo3l0Q/SQalcXnt8KkJgAicuzm3ZtY3l0Q/SQ6lYRnt8KkJgAicvnj2zQ6lcUnt8KkJgAictDSNv/spKF1Kic8PxWgMQESFxeQfrl7HLJeKExrUmu8PxWgMQESFxeQTq5Yq9bh1jJQjouU3h+K0BiAiQuryCV+yIaQLFDakUFwhtgBkhMgMTl2eG4NiYHUlvaKLwBZoDEBEhcXkG65Z7kQOpAvwpvgBkgMQESl1eQ/jut/a1jrWQhnUKrhTfADJCYAInLK0hjiZJyr11n+ll4A8wAiQmQuLyCVOOCz1evtZKFdDYtE94AM0BiAiQuz55GkaQ7G7rREuENMAMkJkDi8grScUuTA+kCWiS8AWaAxARIXF5B+uyU75MCqSctFN4AM0BiAiQuryC1r03l61rJQrqU5gtvgBkgMQESl1eQTjo1mCyky2me8AaYARITIHF5/TSKHStlIfWjD4U3wAyQmACJy2tIc/NlId1A04U3wAyQmACJyzNIM3qd1L59+zZ5VWUhDaZpwhtgBkhMgMTlFaSplF2bapamTjNlId1FLwpvgBkgMQESl1eQWp65Xc/6sfDRk7fLQhpLzwhvgBkgMQESl1eQ8mboetYPuj5ogCykCfSI8AaYARITIHF5Ban0bF2vMF/XF9SUhfQU3Se8AWaAxARIXJ49ROjC//TGQ3X9vXKykCbTSOENMAMkJkDi8grSS3SqPjyr39212slCep2GCG+AGSAxARKXZ3d/Tx2r7zydqM4iWUjv0Y3CG2AGSEyAxOXtH2RX/bQnGkcxQPqQrhbeADNAYgIkLs8g7dyk67smP/iLMKT51Ft4A8wAiQmQuLy
CtPyQsXrh8UQVF8tCWkTdhTfADJCYAInLK0jnN12tv0RPrm53oSykH+kc4Q0wAyQmQOLy7Lh2r+j6eU10/ZU6spBW0WnCG2AGSEyAxOUVpNx5+t7Kt+n6nFxZSBvoROENMAMkJkDi8gpSnYn6HJqn65NqyELSMo8X3gAzQGICJC6vIF1Z/fa6R+7VC5oJ/46klWksvAFmgMQESFxeQdrUhqou1PUeFaM6BkoMkPKPEN4AM0BiAiQu7/4gu838W+yiLdE4igVSrZrCG2AGSEyAxOXhIxt2zS6ISlFskI7MF94AM0BiAiQuDyGtpXeSAKlxGeENMAMkJkDi8j2kVhmbhLdAAyQ2QOLyPaSu9J3wFmiAxAZIXL6HdAO9I7wFGiCxARKXJ5DW79TX/qf/t+TvMC07xve55O7g/Q8bbukWH6RxyThoAyAxARKXJ5BKT9ep5BP67hmyZuMDA/ZZp+f3nhAnpNfpZuEt0ACJDZC4PIFU5rL5NHFBoOCZWtdfjJ9K59ov9/Lx1oVxQlpIFwlvgQZIbIDE5QmkS0gpeOaXF+w33l7/euDDIKTCbUZ//u7Wbv0v+8TGzBNcLxxz/+ySX7MovTCZq+/5K4mL79D/SeLqfyZ3x+h/JHH1ndI75i8HSIUzXqARkwMFz/zgcvPt0GeLQZrX0ujrErcD+Y4s/08Ml0bIH+0LnQq71+7UFcUv+EFf820JSEuvNfpxj1v79MLAqcH0guulY23vXvEllfT9yVw9qYvv1ZO6Z5K7Y/Rkri5+lfnPGZKu/z7j2YkfKAcs/sq+afdGMUhWMfyOpC2gDsK3TvE7Eht+R+Ly6u9I+27JMX9BKjcudM4fXVfp+rZuyxKFpLXK+ER4GwCJC5C4vII0js6bNGvGM2fQlNBZYwet2TDy5v36nPd1Q8Wcbpr2b1yQXqQTpR/cAEhMgMTlFaRGN9vvr24ROmvnhN69xhgXHzdM1688x+y9uCAVtKT8LbIbAUhMgMTlFaRSH9vvZ5bRoygmSNqmk2mx7EYAEhMgcXkFqdx0+/275eUhaddLP94OkJgAicsrSCd2su7O+7fzyUmA9CBNkN0IQGICJC6vIM3MOKz/PaP61cz8KAmQ3pQ+kj4gMQESl2dPo3inoXn3d9OoXkI2VkjfUVfZjQAkJkDi8vD5SBu/ifLQJzFD2pLbTHYjAIkJkLh8f/ATq3p5shsBSEyAxOX7Z8hadaIVohsBSEyAxHVgQLqSPhDdCEBiAiSuAwPSUHpRdCMAiQmQuA4MSA8J/yEJkJgAictDSMUPfiIIaQoNE90IQGICJC6vILX8yX7/ZqNkQJpJ14puBCAxARKXV5ACRxEqvFv4hcbsvhJ+JVlAYgIkLm8gKcc+aeHgJmFIq+hU0Y0AJCZA4vIG0tJHqNuVZlfdtT4ZkLZmHyu6EYDEBEhcXt20O2NlNIDihaRVqyO6EYDEBEhcHt5rl0xIjWRf3AWQmACJyytIVYLlJQXSifSb5EYAEhMgcXkFqZtV6zJNBiQFUjdaIrkRgMQESFwe37Tb3GFGUiBdQXMlNwKQmACJy+vfkRa1TAqkwfS65EYAEhMgcXkNaXMSjiJkdB89IbkRgMQESFweQ9o/unZSID1HoyQ3ApCYAInLK0jNrZpUpVuTAultGiS5EYDEBEhc3kI67pRH/iuhRgLSfLpUciMAiQmQuA6QP8guo/+T3AhAYgIkLs8grX7/lRkbkgZpU0YbyY0AJCZA4vII0ntNrId+t/00SZC0vKMlNwKQmACJyxtI46lsr4cnT7i4bObzSYJUt4rkRgASEyBxeQJpaWb7TdaJje1ySrwEpgykFlkFghsBSEyAxOUJpMsr/x449Xvla5ID6TRaKbgRgMQESFyeQDq8X+jk1fWTA6k7fSW4EYDEBEhcnkAq9UDo5EPJeYiQ1p9mCW4EIDEBEpcnkMqPDZ28LznPR9LupJcENwKQmACJyxNITS8KnTyneXIgPUiPCG4EIDEBEpcnkG7LWRY49WXmsORAmkx3CW4EIDEBEpcnkDZVrDXbfL9van6V30uykYD0Ht0guBGAxARIXN78QXZuBTr8/D5da1DVL6NxFAekBdRLcCMAiQmQuDx6iNC662oRUb1bN0flKA5I/6OzBDcCkJgAicu7R39v27AjOkVxQdqU0VpwIwCJCZC4DpCnUWhaxfqCGwFITIDEdcBAqpcvuBGAxARIXAcMpJaZW+Q2ImZI3z557RMbor0wIDEBklKqIJ0u+XrMsUDaOGtUl0PMJ1vlX78ouq8AJCZAUkoVpIvpc7mNiBbS8hdvOKGUYajKmXdNu7YyZZ7yYjQ/FgGJCZCUUgXpVslDREYFafMNRxqGMhte9pj9wPP1j7Ygqn3n/1y/EJCYAEkpVZAm0ENyGxENpI1dqOyJN09drZ43t1cZyun2rstXAhITICmlCtI0ukVuI6KAtLEznbCm5NmrRx9FdPTYXyJ9KSAxAZJSqiB9Tj3lNsId0vpOdOKvjp/Z+vY5OVR2bISvBSQmQFJKFaR1dJLcRrhC+vVE6rSe/eyPQ/IzJ/NfDEhMgKSUKkhapXpyG+EGad2JdGrEPxvNK1v6I/aTgMQESEopg9Q4d6vYRrhAWn08dd4YeYXJGbV/4j4HSEyApJQySJ1pudhGRIa0qgV12+S2xK3UirMGSEyApJQySFcQf2Mq1iJCWn4Mnb/ZdYmt51J35lOAxARISimDNIxeENuISJB+akSXRnMwyvXH0QjnzwASEyAppQzSUzRabCMiQFp6BPWJ7qCuP9bIfNnxE4DEBEhKKYM0na4T2wge0uLDqH+0d2rMLpW3wOl8QGICJKWUQVpM3cQ2goW0pTndGP0yT1Bdp4ekAxITICmlDNKmzFZiG8FCuj82rQOpncPde4DEBEhKKYOk5cv9RZaDtCK//A+xrFNwJl1c8lxAYgIkpdRBOrKS2EZwkC6mkbEttO4YGlPiTEBiAiSl1EFqleH+150oYyDNyWzg+ofYYi2umjW1+HmAxARISqmD1FnuyebOkLY0pbdjXmpmbom77gCJCZCUUgepBy2U2ghnSGPpgjjWeqzEXXeAxARISqmDJPgSSY6QlleM7Z6GYNdQx/DbnIDEBEhKqYN0Jzk/kCCOHCH1pFFxLbalM10VdgYgMQGSUuogPUCPSW2EE6SZGQ1jvach0NqG9ID6MSAxAZJS6iBNorulNsIB0ubGGe/Hu9w3lXM+UT4EJCZAUkodpLdpkNRGOEAaTRfFv95L1EJ5oCsgMQGSUuogfUq9pTaiJKSfKub9mMCCXei+og8AiQmQlFIH6Xs6R2ojSkK6KLEnaSyrkFd0jx8gMQGSUuogracTpTaiBKQZGY3ivKch0BjqGjoNSEyApJQ6SFrpY6Q2ojikzY0z3ktsxYLji+6cByQmQFJKIaSaNaQ2ojikUYkfffKznNrrAicBiQmQlFIIqXEpqY0oBmlZXkX22FpR15+uD5wCJCZAUkohpJPI+SDCsVcM0mUOz4WIuXW1sz+1TwESEyAppRBSV1oqtBHhkL7KPszlcJBRNZWOs189CZ
CYAEkphZAup08cLhlP4ZC60HMiq55N91vvAYkJkJRSCGkQvSW0EWGQ5mQ0ju74W24F/5gESEyApJRCSKNootBGhEE6id4QWvZeOtd8B0hMgKSUQkiP0TihjVAhvU7thFbVClrSKxogsQGSUgohvUJ3CG2EAmlrs4zZQqtq2qfZdX4FJDZAUkohpFl0jdBGKJCeVh7bk3hX00BAYgMkpRRCWkg9hDaiCNKmw7O/ElrUbG3NnPmAxAVISimE9DOdIbQRRZDGUh+hNe1eoBYFgMQESEophLQp4wShjQhB+vWQ0t8LrRnoLHoAkJgASSmFkLRyDYU2IgTpdrln3QZaWi5vIyA5B0hKqYRUq7rQRgQhrcirtFJoyVCj6GJAcg6QlFIJ6ZjSQhsRhHRNrMf6jqItzehd8UWVAIkJkJxyhNSONshsRADS4twav8ksqPZR1mFSj1J3CpCYAMkpR0j/R/+T2YgApB70iMx64Q0U/8VLDZCYAMkpR0gX0xcyG2FDmp9VX+z1LdS21wo+MykZARITIDnlCOlaqaN/25A60xSZ5Yqlv04tZR5P7hQgMQGSU46Q7qBXZTbCgvQdtYj2ZZdjS99zBo1PyspmgMQESE45QrqPnpTZCAvSWBors1rx9D1LylX6OTlrAxIbIDnlCOlpqau+Belk+k5mteLpe7S76IrkrA1IbIDklCOk12iIzEaYkNblNpJZrEQGpI31soq/jp9UgMQESE45Qpot9TwKE9LzSbuT2nys3UTqnKTVAYkJkJxyhLQw8eM42pmQesq9AGCxrAetnkDTkrM6IDEBklOOkJbTmTIbYUAqqFZli8xiJbIgfZBxTHLWByQmQHLKEdJGaiOzEQakWVI/3UpmP43iPHo4KasDEhMgOeUISSsrdP+AAWkQPS+zVslsSEvLVFuTjNUBiQmQnHKGJHUYfQNSo5xfZNYqWeCJfTfSLclYHZCYAMkpZ0iNyshsxI6dS6ijzFIOBSCtPaT0kiSsDkhMgOSUM6S2JHGQbhPS2MReoS9iwaeajxM7WIsaIDEBklPOkM6ixF9+xWzHzk70jchKTgUhbWmU+ZH86oDEBEhOOUPqSV+KbMSOraUaiCzkWOjgJ1PlDuJaFCAxAZJTzpD6C/0RdcdUukFkIceKjiLUKQlP1AAkJkByyhnS7TRVZCN29KbpIgs5VgTp8+zDZX6pUwIkJkByyhnSWHpKZCO2HVo5Kc+NtVOOa3eZ/H0agMQESE45Q3pK6HkUn9BFIus4p0BanldphfDqgMQESE45Q5pKt4tsxG1Cr9HnnHqk1Tuov/DqgMQESE45Q5oldLVslr1aZB3nVEjr6+R8Lbs6IDEBklPOkL6UeaTp9xknSSzDFXbs7yepi+zqgMR0MELa9rtbu/W/HM5dQae7fmUUPUD3SSzDpRcqH2jH0XTR1fc47Ripduj/JHH1PwvdLxN/e/Q/krj6Tukd85cEpD173dqvO527J6uV61dG0Vn0s8QyXPp+9aPPM45139wY2u9+kfjbp+9L5vJJnd35KiPVPukdUygBKd6bdlqVwwR+qK4vc9RO90vFX7GXdelCT0iujpt2TAfjTbu4ITUoL7AJL9FALyF9K3uEcUBiAiSnGEhtJQ6jfxnN8hKS1l/sRaTNAIkJkJxiIHWhxF9gb2uNCn96CmlVfrllcqsDEhMgOcVAupzmJbwFH9H5OzyFpI2my+RWByQmQHKKgXQLvZHwFgympz2GtOnIzI/FVgckJkByioE0hp5OeAuOzV7pMSRtCp0stjogMQGSUwykZxJ/NPWarOM1ryFp7YSe/6EBEhsgOcVAeoNuTnQDptEA7yF9knmU1PM2AIkJkJxiIH1CfRLdgFtpsveQtJ40Tmh1QGICJKcYSD8k/hDQjrQsBZB+KFtF6Dh6gMQESE4xkBI/aPGWvLpaCiBpt9BAmdUBiQmQnGIgaXlHJzj/p+aTY1MAaV31Ut+KrA5ITIDkFAepbn6C899HD6QEkva40BOTAIkJkJziILXMTPDFUi6gz1IDaWsLelNidUBiAiSnOEidKcFXOa6TtyU1kLTZGY0lXjEJkJgAySkO0sX0RULj/0idtBRBMn4YPiSwOiAxAZJTHKTr6f2Exn+ebtNSBemHslUF7gIHJCZAcoqDNIImJzR+f+sXldRA0gbTgMRXByQmQHKKg/SoeadbArXMMl9GL0WQ1tfJ/Srh1QGJCZCc4iC9ktizTdfnNjHfpQiS9rTAq0kDEhMgOcVB+oD6JTL9+3Sl+S5VkLaeQNMSXR2QmADJKQ7St3R+ItMPs4/CnypI2seZDRJ9FDggMQGSUxykNYk9Re4MWmy+SxkkrQfdn+DqgMQESE5xkDT7l5w425p/qPU+dZB+LJe/KrHVAYkJkJxiIdWsnsDwC+kc633qIGl30tWJrQ5ITIDkFAupWc7W+Id/hO6x3qcQ0oY62QsSWh2QmADJKRZSJ0rgplEv+sB6n0JI2iTqmNDqgMQESE6xkLpTAn/TPLq0/ZKuqYSktaPXElkdkJgAySkW0nU0I+7ZV2W2tU+kFNInWUdtSmB1QGICJKdYSMMTeLDdq3SjfSKlkIwbmIkcUwyQmADJKRbSI/Rg3LMPopftE6mFtLxCIq/QDEhMgOQUC+mVBF6PuV1G4FmBqYWk3UVXxL86IDEBklMspA/pqnhH31SmfuBUiiFtPCJrftyrAxITIDnFQvqOzo139Dl0SeBUiiFpUyj+V4MGJCZAcoqF9Fv818HRNCFwKtWQtE70UryrAxITIDnFQtLKNIp39G6h4z2kHNLn2YdvjHN1QGICJKd4SHWqxTt6zUoFgVMph6RdTiPjXB2QmADJKR5Si6w4D2q1mE4Pnkw9pJWVy/8vvtUBiQmQnOIhnU5x/hHmGRoaPJl6SNo91Du+1QGJCZCc4iFdTJ/HN/lV9G7wZBpA2lQ/c25cqwMSEyA5xUO6ochDbDXP/jV4Mg0gaa9Q+7hW9x7ShqVzXn1seP8BsxJ4AosVICmlHNLdNCmuwddmtwidTgdIWqf4HjXoEaRVX06ffN/gvmefUD+Pgh12U2JPpQIkpZRDboLdGAAAGDBJREFUepzui2vw1+i60Om0gPR5dt0NcayeREibfvxi9rOjru95WrMauSE9lY5qc86Vt42bMnNKt9JEjUcsjX99QFJKOaTXaXBcgw+kV0Kn0wKS8UvbsDhWTwakn4df1a3t0fkhPDnVG5/S/dqRj7827wf1z11rH++URZntxq+M89sAklLKIX1MfeMavGXW6tDp9IC0snL5ZbGvLg9py/2VTD3l6rU664q7Hpz4/hcR7hb9aUxLotyzJq6P5xsBklLKIS0NHMAkxtZlNy/6ID0gaWPp4thXF4c0uxmVG/7BEotGNPfaLbr9KKLyPabF/uc8QFJKOaQN1DaeuV+na4s+SBNImxtkfhDz6sKQfr4kg877IfhRlHd/f3xtDaJq/WIdHpCUUg5JyzsqnrlvDD6pzyxNIGlvUeOYD7wqCsm8Vdfg7aKPo/47UsHbvSoSHX7Ll7F8N0BSSj2kenG9jOzxm
UW/IqUNJO0iujvW1SUhzW1JZQardyfE8gfZjS93L0vUYPB3UX8FICmlHlKrjDiOHbIup5nyUdpAWpFfJvrroZ0cpJX9MqnzkrCzYnxkwy+Pd86mzNajo3zQFiAppR7SWRTHwz2nqb8ipQ8k7SE6NcbVpSAVPJ5PRxZ/bYzYHyK0YnzrDMrt/Piv7hcFJLXUQ7rMfF3yWBsU9ky69IG09YRYH98gBGlWUyo/ssSP9rgea/fV4COIKlz8puvdeICklHpIg+M5wGKrTPUArekDSVuQc2hsLywrAsnhVp1VvA9aXTDwUKL83jMiPxoPkJRSD+nhOA7I9VtuU/XDNIJk/KyM7aj6ApAcb9VZxf/o74IZvfOI6gyMdBxcQFJKPaQ36KaYp55G16gfphOk9XWzYno+ReKQnG/VWSX0NIr1z/9fLlHLd9gLAJJS6iEtpO4xTz2IXlQ/TCdI2uvUPJYHCSQKafnFGXT+j8wnE30+0qoJ7TMzevzMfBaQlFIPaX1Gu5inbpUZdg9tWkHSzqUxMVw6MUj8rTorgSf2fXwsVRxd4PgpQFJKPSQt/7BYh/6t2Ov8pRek/1Us/330l04I0kctqOzgCMcvkniG7ObR5am14wEwAUkpDSA1y3H+D4+v2K9IaQZJu5+6RH/hBCBx99UVJfNU8x/Ooex+60qeD0hKaQDp/4i7ic81iKaEfZxmkApaxXC8yLghudyqs5I6ZsPLdahGyb+PAZJSGkDqR7NjHLp1+K9I6QZJm59T2+E/cOfiheR2q85K7OAnvw3OKfnDD5CU0gDSSJoY28y/5TYOPyPdIGkDaEC0F40PkvutOivBowh9ejyVHR5+dyQgKaUBpImxHqX0jeJ/80w7SL8dlj0vyovGAymaW3VWkofj2mp80yZhT1kCJKU0gDSb+sU28830QvgZaQdJe4VaRnkPShyQfmxJ5e+O6iHzsse1W3YuZV21puhjQFJKA0g/0v/FNvMJxX5FSkNIWhe6P7oLxg5pTnXqEuXdM9IHiJx2OFV/PvQRICmlAaSC3Gbs55z6LfeYYuekIaRlFfJ+cL+UFgekiWUyBkd7aEfxI62uH5xLJwUffwdISmkASTsstufIvlnipmAaQtLGRPkKajFC2jo8o9wL7hcLlIRDFn/dkUoH7i0EJKV0gNSOfotl5JtLPOUnHSEVtFQOvBeh2CD9ejbV/Dj6iyfj2N9bH69CR75lngIkpXSA1J0WxjJym4ziT4VOR0jap9l1onmaaUyQvm9OrX6K4fLJOYj+qn6ZGd1/BqSw0gHSTfRGDBOvzy3xGn9pCUm7mgZFcalYIM2qRhfGdFjkZL0axYyGVHn8VkBSSgdID9LDMUz8VsnXQU9PSGtr5kTxYucxQHo0N+ue2EZI2su6bBxamk78CpCKSgdIr8V0+O9r1CPa2aUnJO0Fau3+x6SoIW25liq+HuMESXx9pG9PoVJD43nVgGgDJKciQloQ06F+65UpcddEmkLSzqSHXC8TLaS1Z9DhMb8kW1JfaOzlWnR4NA+viDNAcioipHXUIfqB5zv8+TZdIS0pV2m522WihLSoAbXlnqnKl9xX7Ft7fRZ1juGpV7EFSE5FhKRVrhv9wEPpkRLnpSskbRRd6HaR6CC9VZmuivloyMl/6cs5TaniA7E+myzKAMmpyJBaZ0b9rAOtZWbJ40mmLaTNTelNl4tEBWl8TvbYeL5/0l9DtmB8HjX7KCmrA5JTkSFdTh9GO+9Pma1Knpm2kLQ5WfVcfiGPAtLmflT5bddLOeXBizEv607Z/dYmYXVAcioypPsdbq4xPUTDS56ZvpC0K+mWyBdwh7SyAx0Z01+si/LkVc1frUPV43sd4IgBklORIb1f7BgMETqDvih5ZhpDWlMj12FgJVdIX9WnU2I7eGtRnkCyHsnaebH06oDkVGRIq6K+2259mcMdzk1jSNpEahvxsdpukF6vSP1ifzG9QN5A0rT5rYq9noxAgORUZEhajSpRjvui8lrmRaUzJK0zPRbp0y6QRmflRvzyyHkFyXr67DGzRFcHJKdcIJ1KUT4Usxe953BuWkP6rkx+pNcbighpY086JJGrp2eQNO2nCynzeskfSoDklAukG+itqKYtqFbZ6a8paQ1JG06tI/xZNhKkn1pR06WJfGsPIWnaW3WoScyPveADJKdcID1B90Y17Sy6yOns9Ia06Wyq8wn72QiQPqlN50TzTAw+TyFpa7pT6bHRPnnXNUByygXSPOoV1bQ30vNOZ6c3JPNZrWXZlx/jIb2alzEwwUcNeAtJ0ybnU8fonmHvHiA55QJpY3aLqKZtkOv4p780h2QdZ4EzwUHaOjyz1JOJfl+vIWn/O5XypzhcNI4AySkXSNpRZaP5v3cRneJ4ftpD0j6pw91KYyBtuIiqz0n423oOSds6Ope6R/+ArwgBklNukLrRN1EMew9zjKv0h6QtO56aOB4Z1RnSoubUclni39V7SJr2aSOq94HTJ2IMkJxyg3Q7RXODoH2G831YPoCkbehB1WY6nO8E6eercukiiSfNpQKStuHqjOwhcTxUvViA5JQbpCl0u/usK7ObOn/CD5A0bWRWrsNDCktCWjekPNV5QuRbpgSSpk2rTq0WJbo6IDnlBumbaF5S6EnuKen+gKRNrUDXlHi0T3FIm+6rRvn3CP1lM0WQtBVdqHzUj0NmAiSn3CBtrVPK/VeCrsQc0s0nkLQvjqBOq4udFw5p67OHU5lB8T5GtUSpgqRpj5SnLpEe0OEeIDnlBkkb6/ggurBWl6/F/LXPL5C0VR1LPCMiDNIbzSm7j8CdDMFSB0lb1IqqJ3REB0ByyhXS+mrl3P4DG8T+HuUbSOZz9IodCkiBNLcjZZzzVfEvSaQUQtI235adcXUC95gAklOukLS73J4Bt6xsFe6JmP6BpGkTcrPCnpoYgrS4dya1miH5rVILSdM+PJIaRPsyUSUDJKfcIf2aX6H4rw/hXU7sYQv8BEmbWY0uWF/0YQDSioG51FD8eaaphaSt70c5g+N9lBMgOeUOSbuNhkb69Dc5h7F3ZfkKkra4sXoAbwvS2lvLUZ0n5Q/Hk2JImvZCPnWI84BdgORUFJBWls+9NcJN6vOIf+SZvyBp67pQrdBNHgPSxjFVqMpo4SeYWqUckrasE1WK8SWCAwGSU1FA0l46hOqx9/N8nHEM/x+2zyBpW2/JKBO8Gbfn96frUtlb1kT8gnhLPSRt69jS1COerQMkp6KBpK3um0l3MJ/rRK/yX+g3SJo2qUzGLfZ9+TOaUk7fkofqkykNIGnagiZ0WBx3ogCSU1FB0rQPazK/KL1DbSJ8mf8gafNqUZd1mjbnJMro9nUS1rdLC0jaxuszs26K6sWj1QDJqSghad/UcDpunba1JTk94DOYDyGZTyNv/HbXDDolhhfgi7n0gGT8P1iLjo310HyA5FS0kLSvq9OIkudOpjMjfZEfIWkbLyaiptNif1XzGEoXSNrq86jsg7GtDkhORQ1JW3govVb8vC+PzIr4kl2+hKRpo497piD2VzWPpbSBpGlPVqAzXF+aQw2Q
nIoekvZR1pHh9wQv7plFvSN+iU8hWR0skLTFbahahDuMSgRITsUASetDdykf/TywFB02PvKxRgGJKZ0g2c9Cj/6oSIDkVCyQVlQu/2Po9PVlqM4jbk+3BCSmtIKkaXOPoqPmRnthQHIqFkjafcGj160dkkeHjHX/kz8gMaUZJO23vhk5Q6M8kjkgORUTpC3HZJh3dv86qgpVGhbNbQFAYko3SJr2ajVqG90LVwCSUzFB0t6jZpP7t8imcjdFfkB4MEBiSj9I2vIzqEJUx+sDJKdig6SdS0RZzQZFe3cpIDGlISRNe7AMnRfFf5CA5FSMkJZddOu0GF5OEZCY0hKStvBYqvWO66UAyakYIcUYIDGlJyRt8+CsjH5u9yEBklOAxHUwQtK02fWo0aeRLwJITgES18EJSVvTg0r3nhbpIeEHLKQd4/tccndBydOAlFAHKSRNm1iVqGL3F37jPn/AQrpnyJqNDwzYV+I0ICXUQQtJ2zKjX3Wi0p0fd74L70CFpHX9xfhJdO7S4qcBKbEOXkhGBXMH1yfKaj3a4aCYByqkLy/Yb7y9/vXipwEpsQ5qSGYLBjcnymw9vPjzhA9USB9cbr4d+myx0ws6GX273y1dd71I2pbc2ZO7Y/yx+i8T2mcQHXPXorDF/XWV2Rs1pL4KpKLTiy41+r7Qrf36XtfLxN++fUlcvFDfn8zV9ydzx+zVk7pnBHfMmqfOziGqd/2ne0KL63Krl0z8KrMnWkhf2Tfn3ih+2gw37eLvoL9pF2rl451ziWr3ftl+1syBetPuj66rdH1bt2XFTwNSYgGS0m8vd88jyu/+8sYDF5I+dtCaDSNv3q/Peb/oNCAlHCCFt+Hl3tWIynZ+/I8DFdLOCb17jTEuPm5Y0WlASjhAKtGWGf1qEJXu8lBMB0uJLTxEyClAYvInJLMFg4+2/sD0o/tF4wqQnAIkJv9CMnbMsrtaE2U2Hyz60mvBAMkpQGLyNSTjd6TFo1tnEDUYHPURU6IOkJwCJCa/QzL6+fHOOUR1+81gXj84zgDJKUBiOgAgaeYfmLqWJarV++WYD8TPB0hOARLTgQHJaO1z3coRVb3sNbdDHEYbIDkFSEwHDCTN/ANTz3zD0tUyr94BSE4BEtOBBMlo81u9KxE1vOuHxFcHJKcAiekAg2S08eWuOZTZevy6BFcHJKcAienAg2S0anxrotJdX07o1yVAcgqQmA5ISEZfDD6MqEY/l+MQRQqQnAIkpgMVkqYVzOhdjqjB8HgfjgdITgES04ELyWj9pM7ZlNXxcfY4RJECJKcAiemAhmT0w+gm5jG93oz9UQ+A5BQgMR3okIwWDKxKVGfgohhXBySnAInpIICkaVve7F6GqPnoFbGsDkhOARLTQQHJaPXjHTOoVOdJ0T8YD5CcAiSmgwWS0ZLhhxMd2i/aJ1wAklOAxHQQQTKa2y/fvEf8f9FcFpCcAiSmgwuS+cBW8wFEHR93f+FhQHIKkJgONkhGK80HEOW53iMOSE4BEtNBCEkz7xE/lKjmwIjHegAkpwCJ6eCEpGkFb3Yva94j/jN7CUByCpCYDlZIRmvMe8RzO09iXqoWkJwCJKaDGJLR98OPJKrUe4bT5wDJKUBiOrghaeY94lWIjhr8XYlPAJJTgMR00EPStI2Tz8qlzA4Pzw1/Ti0gOQVITIBktmLscWRU++Srxr0dPAQyIDkFSEyAFOiLe3q3rWJqogoteg574ctNgOQUIDEBktrquZMGd22QZXLKrtep3/g3JV/tApBcAyQmv0Gy2/Tty8N7ty5v/Xiq1Lz78Je/LZBYFpBcAyQmf0Ky+2f9jPEDO9fNMDnlNug68PG5cT1jvShAcg2QmHwNyd4xv5i39pqXtn48Hdqx9+g3l8W7ICC5BkhMBwAku83fvjm6d8dqoVt7kxZsiXlBQHINkJgOGEiBVs193Li1l2lyyqnbeeD4GbEczhWQXAMkpgMNkt3GBZOGd29e1v7x1Nq4tfdtVMcoAiTXAInpwIRkt2XRqyMvPSHf4lSx5Q3uCwKSa4DEdCBDCmTe2uvaIOsU90sCkmuAxHQQQLLbGMV9eYDkGiAxHTSQogmQXAMkJkBSAiTXAIkJkJQAyTVAYgIkJUByDZCYAEkJkFwDJCZAUgIk1wCJCZCUAMk1QGICJCVAcg2QmABJCZBcAyQmQFICJNcAiQmQlADJNUBiAiQlQHINkJgASQmQXAMkJkBSAiTXAIkJkJQAyTVAYgIkJUByDZCYAEkJkFwDJCZAUgIk1wCJCZCUAMk1QGICJCVAcg2QmABJCZBcAyQmQFICJNcAiQmQlADJNUBiAiQlQHINkJgASQmQXAMkJkBSAiTXAIkJkJQAyTVAYgIkJUByDZCYAEkJkFwDJCZAUgIk1wCJCZCUAMk1QGICJCURSO69O/pP9wulZ/tHT0n1CHH3w+jvUj1C3L0y+r9UjxBfSYU0rOX6ZC6fzPa17JvqEeJuRstpqR4h7q5ruTPVI8QXIDkHSKkJkJwCpJQESCkIkJwDpNQESAgdzAESQgIBEkICARJCAolC2jG+zyV3F4SfLv4+XYs0+w3nGF2U2vki5TS7vuGWbsU/l4ZFGj3dd3tYopDuGbJm4wMD9oWdLv4+XYs0e9/pmqb9keIBI+Q0+/zeE7oV/1waFmn0dN/tYUlC0rr+Yvy3cu5S9XTx94LfTrRIs+sXLkr1eBFzml3/eOvCbsU+l4ZFGj3dd3t4kpC+vGC/8fb619XTxd8LfjvRIs2+55xHb7xizIaUzhcpp9mNN9a1Mc33e6TR0323hycJ6YPLzbdDn1VPF38v+O1EizT735c9tGLFyMv+Sd10kXOaXQ9cG9N8v0caPd13e3iikKwHAwT2SuB08feC3060SLNbF9h10ZwUjeaa0+x6EFJ67/dIo1ul8W4PTxLSV/bP5jfU08XfC3470SLNbl/iuldTNpxLTrPrgWtjmu/3SKPbpe9uD08S0h9dV+n6tm7L1NPF3wt+O9Eizb7usUJd//eieamekctpdj1wbUzz/R5p9HTf7eGJ3v09dtCaDSNv3q/Peb/odPH36VqE2bdfMmHzhjF9d6d6RDan2f/U5nTTtH/Tfb9HGD3td3tYopB2Tujda8yfuj5uWNHp4u/TtUiz/zKsx6X3bEn1hHxOs19p/jXznPfSfb9HGj3dd3tYeIgQQgIBEkICARJCAgESQgIBEkICARJCAgESQgIBEkICAVKaNILM8jq85XrJ9g2YBRZG832Yr0YJBkhp0gi647nnnhl2GD3sdkmTwpKS/26AlNIAKU0KONheN+9fl0uaFB4FpDQLkNKkoIOb6Wtd//S0vDLHTTI+OunExafkVetpHhFkaqsyeS2n6haFM4xbgS3bV7FeuKFj1T1hC+hFX96+SqH54Qk19hatCEjJCZDSpKCDYfS5Pjerw/Q5/elBXT+1TquPCt7M6qPrr9F5M2acSTMsCiu70aKfJtGbxuU3Zw4MX0Av+vInyHxW3K8ZNysrAlJyAqQ0KejgxOy/9ePqmwfA7mrcyDvVYGVwqqnrY04xfv5
sy+5lU7jS+HfbUf4c43OP0XfhC+hFX65lX22ceICWKCsCUnICpDRpBM3cvHnTN1fQtXoB3fiv0dP0jX5qWfNzfTKDl6p9UhEkvW+2cZPvpCahBQKQlC8/65B9ut6qsXoWICUnQEqT7Lu/Kfu63foSCvS2fmpd83Mmm23Dm1TIyqL2CqQFNF7fmDEutEAAkvLlL9En+lq6Tz0LkJITIKVJI2jC7NkfLPhLNyVcsdBKUyB1yLpz/g8/1lQh6Uc30x/O2hRaIAQp9OU7yg7Q78/4TT0LkJITIKVJyp1uf1Cf4MkQpFXUzzhRWDoM0lha1vrMEgsoX673qKkff3LYWYCUnAApTVL/DNS6ovmDacrQwiJIP9HduvnnozY2havIvGN7U9YlNLXkAkVfrr9H79CksLMAKTkBUpqkQvo0p9mUD4flXK78RNpTp9Z7n99y8sl58/4xKdxFd5v3fZ9NFXYVLXDLY2afKV+u78k/ovS2sBUBKTkBUpoU9sCEBafn5Rw9rlCBpC9qW/bQa7ZNr1p5hUlh/XE5Joi36CplAbsBypfr+tV0UfiKgJScAMnPvW8+DAKlQ4Dk4/Yc3ybVI6BAgOTbfnvvzKxvUj0ECgRIvm1SRr2ZqZ4BBQMkhAQCJIQEAiSEBAIkhAQCJIQEAiSEBAIkhAQCJIQE+n/r20q2hiJ7owAAAABJRU5ErkJggg==", "text/plain": [ "plot without title" - ], - "image/png": "iVBORw0KGgoAAAANSUhEUgAAA0gAAANICAMAAADKOT/pAAADAFBMVEUAAAABAQECAgIDAwME\nBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUW\nFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJyco\nKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6\nOjo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tM\nTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1e\nXl5fX19gYGBhYWFiYmJjY2NkZGRlZWVmZmZnZ2doaGhpaWlqampra2tsbGxtbW1ubm5vb29w\ncHBxcXFycnJzc3N0dHR1dXV2dnZ3d3d4eHh5eXl6enp7e3t8fHx9fX1+fn5/f3+AgICBgYGC\ngoKDg4OEhISFhYWGhoaHh4eIiIiJiYmKioqLi4uMjIyNjY2Ojo6Pj4+QkJCRkZGSkpKTk5OU\nlJSVlZWWlpaXl5eYmJiZmZmampqbm5ucnJydnZ2enp6fn5+goKChoaGioqKjo6OkpKSlpaWm\npqanp6eoqKipqamqqqqrq6usrKytra2urq6vr6+wsLCxsbGysrKzs7O0tLS1tbW2tra3t7e4\nuLi5ubm6urq7u7u8vLy9vb2+vr6/v7/AwMDBwcHCwsLDw8PExMTFxcXGxsbHx8fIyMjJycnK\nysrLy8vMzMzNzc3Ozs7Pz8/Q0NDR0dHS0tLT09PU1NTV1dXW1tbX19fY2NjZ2dna2trb29vc\n3Nzd3d3e3t7f39/g4ODh4eHi4uLj4+Pk5OTl5eXm5ubn5+fo6Ojp6enq6urr6+vs7Ozt7e3u\n7u7v7+/w8PDx8fHy8vLz8/P09PT19fX29vb39/f4+Pj5+fn6+vr7+/v8/Pz9/f3+/v7////i\nsF19AAAACXBIWXMAABJ0AAASdAHeZh94AAAgAElEQVR4nO3deaBM9f/H8fddrddyUdakFLIV\nkaVIi+qXaEUpUimlpEUqRAqlpH0laVPas5WkolIpVL7JEsp6nTZEcnF+Z5mZ+5l7z3vOLO85\nM4fX8497586d+7nvc8zDnTt35gzpCKGEo1QPgNCBECAhJBAgISQQICEkECAhJBAgISQQICEk\nECBF0RLqGH7GUHpMZuFWOeXWJbKA1CAxVmKHsBeMaQOH0FOxjRH/7ou046LdqWGXSxDSlpHt\nquXkHz9kbclPvfheYksvoSMTW8CoYGSbqrm12j6w1Th9No0Pnd+GpoS+C3ULnjudaCgzSsfw\nM5x2dTy7ogl1umkrP757zL+5sVVWZRpctyqR5bmihhT1Blq76L66L8c0RgK7z95xzv8wKYD0\nTBnKbdn5hNKUPabE56pfmdDSEpCmlKWc408/Jpcqf6jrM6lh8PxllP9v6LtkZBcETl+YkQCk\neHbFbqpQ6LoREWMhle9h1L1jZSrzUWLfwbFoIUW/gfFcWxLZffaOc/6u3kN6lrLv3W683zWh\nNI0q9rlfKOWQplLm0G3G+4KBlLVY31ePPgt84ka6pei7HBv8SfVnqSbxQ4prV/xFdd23ImIs\npMC+23ktHbYvwe/huH7HqC4X9QbGdW1JZPdZO475rp5D2lKG3gic/DAz82ddH0CTzQ8W0tn6\nBeYti/a6vu+J48uVP8W6Bu95+PjypY4csNE4OYze+7Jj+ap9tu+f0KBMozH7jbP2P9emfOmG\nw/4JrR4GadvtDUvn1r/VZKG/0alyTo0zZxU7qaweaHul0E3uIXS1ccOBLrU/2p2fEbrBs4Su\nrNbEPvkk3WRBUlda16NKmeaT7OuNMqG9C7NofSy7IlDR8t2sm1+hUZSNUTa35K66id6a16FC\n+ZPmhgaJsO/2VaClRbtE+RZru+cbW7aFWoTNGvat76T3nqhZMXx5dYdE/KcxC2ygur6xOcvO\nrVaq+avWcMErR2AX2b8jhV9Pii4dcfepO0UdOGwFZVRzx9nftT3NsFb4JPRvVHyndqT3rfOn\n0ykO14JAiUAaZe15u0toQNi/yIzLqc2EacbNJTrmul7l6UVjr51FDW8cfhbVWGd+5R2Vegyo\nTZcNqdO/d675Wf1SqnHLnW3o2O3BFVVIe06ilrfe2IBa7TX/7692zfAr8jNeDDuprh7oSTo+\nePLfX403WqnSf1ofvUKd9aLvckV/WmSdbH3oNBOSutKfdajDsGuq97WuN8qEJSBFsyvslOXf\nH0OVJ0z4O/AJZWPUzS25q4bQ9WW63tItI+uT4CCR9l19+jo0mfIt/qhNne7pX/1a89qjXtHV\nbz2Sbi57cb+w5cN2SKR/GqvABqrrj6KRFTvfdDbRx7py5QjsIgtS+PVEuXTE3afuFHVgdQV1\nVHPH2d91Il1grXAtPR34HsV36jPUxzr/MnM7SlwLAiUCqSO9FDo9k44O/xd5w/qxOZXOMqb+\nuWy5HcZebrtbN/+L6K7rY6mUscG/ZuU0/EPXJ1IXXX+dWhqj7b+ebg+uqEJ6i9oYy/zX0Pyv\noSmtNs5Zn9cm7KS6eqALlHsXrHrRw9b7k+md0HlLqM83dJ15ajn
d/IYJSV3pLuphnNxc3bze\nqBPau/Djj3bHsCsCqcuH3TZRNkbd3JK7aihlTjcu+QC1CQwSad+tyMz52+lb3EUXmlt2qLll\n6qzqtx5DFY3fLcOWV3dIxH8aO3sD1fXHUq65p241r5vKlcPeRRak8OtJ0aUj7z51p6gDqyuo\no1o7zvqu28vm/m5cYu8hpf4KrRW+U//MrbzHOHt3xTLbHa4FgRKBVIeWhE5vpOz9DteezrTA\nPGfCLb/o7cm+1ZKbu8vYvDPM08fSE8bbLdRI10+jOdaez6kRXFGFtPZt64fGELrX+K4Zm83T\n/+lhJ9XVAx1nL1nUF9TYfLcyo/be0HkGJL1pZfPf5nZaZkFSV2pOC83Td5vXG3XCEreio9gV\ngdTlwyApG6NubsldNdS+DbK7bMYf9iD8vtvydn3zx6PDt2hOn5snRxSHFP6tLRHq8uoOsWP+\naeycIFnDf01tw64cCqTw60nRpSPvPnWnqAOrK6ijFkEyftA8arz9yPqfJbBWsZ16Dn1gnHyX\neka4FiQCqVLR7Xt9B9FOh2tPOQper/eXJvu/xqbGLamxNMQ82cHak7vocF0vT/Yt/OPo18AX\nFL+zYfvmzXebV/TrqOGkzfZZRSfDVg9U3/5HV2pOXxhvb1PvDTAhPUxTjdsMtVvrJiR1pX25\n9vgfmNcbdcISkKLYFQ67IQySsl3q5pbcVUPpNusSzeh7exCnfRfqyv+cvoWxZTvN97OLQwr/\n1jeZHyrLh+2QkrMW2wRnSNb9PCvpWPXKoUAqdj0punTk3afuFHV/FF8hOKoCaZ75e6Lej94r\nWqvYTp1KV+nmTfaZEa4FiUCqW3Sl1ddTrsPtmX+odPAC280LmJ1Cs43Ne8A82ZGWG2//NfbH\nrqJ/+S8DXxAG6Z32pa1PGrtgzzU5RMfctkZXT6qr7+1o9rHegmbp4T1t/oDfc2j2pqKzTEi/\nlzJ+Z/rQ+Ec0IakrbQuMv8i43oRNWAKS+64ITBW2G6xrQnBcZbvUzS2xq4xvbt9k7Wj852gO\n4rjvrLu/T6SGP5kflvwW26hUaMvCIYV9a/OefHV5dYdE/qexc4Jkbc4qaq5eORRITtcT89J2\nJXdfIGWnhO2PsBWUURVI++vRD3phlap7itYqtlN3lq+6V/8375DCCNeCRCCdQc+ETk+nZg7X\nnl2UtT9wgR2UY5842fg5WeLa8S9ljAgU/GdQIT1DeYNenTn7Gvvu6U1Pd8uj3NfVk+rqhdZ2\nTtUvLnE/9D8Vyvytv0kXKWeZkPTumb/plxifMiGpK/0duLp9ZVxvwiYsAcl9VwSmCtsN1jUh\nOK6yXermOkGaYK3QwfgF2hyE33e7jjCubrrTtwhu2cLikEp+a3V5dYe4/dOYRYSkXDkUSE7X\nkyJIJXdfIGWnhO0PdQV1VAWScfv2ZuNH8w3KWsV3ai9jV79FA/UI14JEII0vuulq/NwzfrRe\nT8+bp98uuj2TR1rwEmXJ/mWuMX3ncO2oSMX/RK1CqmX/Ceiu0N95/n0qu9Ju9aS6eqBJdGTw\n73X7Ry233g8wftM4k+aFfZc+5k+j0dvL9NItSOpKe7PsGx/vmtcbdcISkKLYFQ67oeQfQuyN\nUTfXCdIw67LNaJk9CL/vZtNh24t9yv4We7PtLXvb3DJ1VodvrSwftkPsIv3TBDZQXV+9YitX\nDuV3JIfrSREkdvepO0XdH+oK6qgqpLUZtfZfptykKLlTZ9J1ek/rEuy1IBFIf1UI3WM4LzP3\nN/O+Eevn6x1F155O1v+D+phTvzD+r7Dur/8ju8xuh2tHZ3rTWuiP0OoKpN1U3ny3v7W5C9bZ\nN8w60E/qSXX1QLsOCdxu1vV7qZ31/n90UkFW6AEO9nfpY/zGcFirF837Ry1I6koN6Svz9K3m\n9UadsASkKHZFIHX5MEhFGxO2uU6QrHvvt+dmbbMHibDvLqJrlSmV/dXIvhl4k7llyqxO31pd\nXt0hVtw/TWC3WBuo7gv1iq1cORRIDtcTBRK3+9Sdog6srBA2qgrJGGRW+Ua6slbxnVpYtc6u\ncg2L7wy5RzaYDx0w/8fb/WRZ606lJ6mj8cP6p6rmHptO5xrnvEAtjV/O1lYu+6c+hdqZv/fe\nYv7iVvLa8To1ManPzw7dd6L+RMon47q5f2R1GqAvpVPMZbbXzvpdORm2erDpGdTX/N1383VU\nIfBP2zHjDnpE3QQLkj4i89R6+wOQ1JVutu5NX1PZvvu7aEJ7F376SQhtFLsikLq8ek1QN0bZ\nXEdIWea9KI9Sp9A9tey+21gh4zPHb3ErnW/Mt7qSuWXqrA7fWl1e3SGR/mkCn7Q3UF1fpaFc\nOexdZEFyuJ4okLjdp+4UdWB1BXXbrB0X/IeZQvVIeWCXw069lkab9/U5XQsCJfZYu5fLU+4J\n/9e+HJW2HkNQUIHa3tSz/AN0lvFvlJFzRX99Xxeq2/+yPHrO2IJu1GTwnafS0b87XTuM32hq\n3Tzigpy80J8Pl1DZs+1eMf79jrr33jYNPqQq962/hI4YcNd1delG80ZU6KS6eqg3K1Bm89Ob\n5FKdHwLnvEbZZf9SLhGAtC6T7tYDkNSVNlal42/oWdH+D1iZsMQfZKPYFYHU5cN+Iikbo26u\nE6Q+eX3u6Z2VszA4iMO+C/4n9AjV3+X0LTZVoxNu7VVxgLll6qwO31pdPmyHWDH/NHb2Bqrr\nq1ds5cph7yILksP1RIHE7T51p6gDqyuoo1o7LvgPszOPjN+SlbVK7NQFlJextvjOkHz099a7\n21bNyW89LHA/2I+nlC1/wrsanWycvq9qqRbGD8XxzcqU62D9VlL4cIuypRreYV6PHSDte65t\nXnbt3stDSxfdhTtC/3fokaXqXPe7fnm56j/se6Jd1ayKJz2/33yISeikunpRv49qUzW7coeJ\nwYeo6nuqF3t4lQ1JPz3TvOfYghS20vJulUo3fe4POiF8QgdI7rsikLJ8GCRlY9TNdYL05NyO\n5ct3nB8axGHfhR4i1LLocYVh++tnc8smLrVEKLM6fOuw5dUdYsX809gFNlBZP4yGcuWwdpH9\nEKGS1xMFErf71J2iDqyuoI5q77jgP8yV5oN/lLVK7NT9h9NJevGdIQkJeZ/kc5Cifj5EupfY\nThmjPDAlzgDJdwGSQwntlD11qu52v1TkAMl3AZJDCe2UQSX+4hh7gOS7AMmh+HfK8iEnUvNd\n7pdzCZB8FyA5FP9O+Tiz/MUJPd3fDpAQEgiQEBIIkBASCJAQEgiQEBIIkBASCJAQEgiQEBII\nkBASKAFI//zt1r+F210vE3//7Eri4n8X/pfM1XcndccU7kzi6tuSu2MKtyVx9Z3SO6boifwJ\nQPpbc2u3/qfrZeJvx84kLq7pe5K5+p5k7pjt+o4krv5HYRIX1/bovydx9X+kd8yfgOQWIDEB\nkhIguQZITICkBEiuARITICkBkmuAxARISoDkGiAxAZISILkGSEyApA
RIrgESEyApAZJrgMQE\nSEqA5BogMQGSEiC5BkhMgKQESK4BEhMgKQGSa4DEBEhKgOQaIDEBkhIguQZITICkBEiuARIT\nICkBkmuAxARISoDkGiAxAZISILkGSEyApARIrgESEyApAZJrgMQESEqA5BogMQGSEiC5BkhM\ngKQESK4BEhMgKQGSa4DEBEhKgOQaIDEBkhIguQZITICkBEiuARITICkBkmuAxARISoDkGiAx\nAZISILkGSEyApARIrgESEyApeQbp3YnCkysBEhMgcfkXUrOsn4VHLwqQmACJy7+QjqYvhEcv\nCpCYAInLv5COpE+FRy8KkJgAicu/kOrSHOHRiwIkJkDi8i+k2jRTePSiAIkJkLj8C6k6vSM8\nelGAxARIXP6FVI2mCY9eFCAxARKXfyHl0wPCoxcFSEyAxOVfSBWor/DoRQESEyBx+RdSWeom\nPHpRgMQESFz+hVSKThIevShAYgIkLv9CyqLjhEcvCpCYAInLt5AKiZoIj14UIDEBEpdvIe0k\naig8elGAxARIXL6F9BfREcKjFwVITIDE5VtIBUR1hEcvCpCYAInLt5DWE1UXHr0oQGICJC7f\nQlpDlC88elGAxARIXL6FtIIoT3j0ogCJCZC4fAtpGVFp4dGLAiQmQOLyLaQlRFnCoxcFSEyA\nxOVbSIuIqEB49lCAxARIXL6FtNCAtEF49lCAxARIXL6FNN+AtFZ49lCAxARIXL6F9LEBaaXw\n7KEAiQmQuHwL6QMD0nLh2UMBEhMgcfkW0vsGpB+EZw8FSEyAxOVbSG8ZkBYLzx4KkJgAicu3\nkF4zIH0tPHsoQGICJC7fQnrRgJS0g38DEhMgcfkW0iTKTN7BvwGJCZC40hPSjr/d2vM0laVP\nXS8WZ7t2J2tlM31vMlcv3J7ExXfpu5K4+vbk7hh9WxJX/1d6x2yXgLTbtb2PURWa7365+Crc\nm6yVzfR9yVx9339JXLxQL0zi6v8ld8dEcbWKv0LxHSMBKYqbdg9RLZou/NM0FG7aMeGmHVd6\n3rSLAtL9VD95R9EHJCZA4vItpNHUNHlH0QckJkDi8i2kkdSaXhGePRQgMQESl28hDacONEV4\n9lCAxARIXL6FdAedTs8Lzx4KkJgAicu3kG6js+k54dlDARITIHH5FtLNdB49LTx7KEBiAiQu\n30K6kXrQE8KzhwIkJkDi8i2kAXQZPSY8eyhAYgIkLt9C6k9X0sPCs4cCJCZA4vItpH7Un8YL\nzx4KkJgAicu3kPrSjTROePZQgMQESFy+hXQZDaaxwrOHAiQmQOLyLaRLaCiNFp49FCAxARKX\nbyF1p5E0Snj2UIDEBEhcvoV0AY2lEcKzhwIkJkDi8i2kc+lBGiY8eyhAYgIkLt9COocepTuF\nZw8FSEyAxOVbSGfTkzREePZQgMQESFy+hXQWPUe3Cs8eCpCYAInLt5A602S6SXj2UIDEBEhc\nvoV0Gr1CNwrPHgqQmACJy7eQTqFpdL3w7KEAiQmQuHwL6WR6m64Vnj0UIDEBEpdvIXWg6XS1\n8OyhAIkJkLh8C6k9zaarhGcPBUhMgMTlW0jtaC71FZ49FCAxARKXbyGdQJ9Rb+HZQwESEyBx\n+RZS68wF1Et49lCAxARIXL6FdHz2l9RTePZQgMQESFy+hdQi52u6SHj2UIDEBEhcvoV0bKlv\n6QLh2UMBEhMgcfkWUvPSS6ib8OyhAIkJkLh8C6lp2R+oi/DsoQCJCZC4fAupcbmf6Czh2UMB\nEhMgcfkWUqO8FdRZePZQgMQESFy+hdSw4mo6RXj2UIDEBEhcvoV0dOW1dLLw7KEAiQmQuHwL\n6aj89XSi8OyhAIkJkLh8C+mIqhuprfDsoQCJCZC4fAvp8GoF1Ep49lCAxARIXL6FdNihWkZL\n4dlDARITIHH5FlLtGlp2c+HZQwESEyBx+RZSzVpaqSbCs4cCJCZA4vItpOp1tLKNhGcPBUhM\ngMTlW0iH1tUqHC08eyhAYgIkLt9CqlZPq3yE8OyhAIkJkLh8C6nKkVrVw4RnDwVITIDE5VtI\nletrh9YWnj0UIDEBEpdvIVU8WqtVQ3j2UIDEBEhcvoWU11CrU0149lCAxARIXL6FVO4YrV6+\n8OyhAIkJkLh8C6lME61+ReHZQwESEyBx+RZSqWZaw/LCs4cCJCZA4vItpJxjtWNKC88eCpCY\nAInLt5CyjtOa5QrPHgqQmACJy7eQMlpqx2YJzx4KkJgAicu3kOh4rWWG8OyhAIkJkLj8CmkX\ntdZa0Vbh4YMBEhMgcfkV0k46QTuBNgsPHwyQmACJy6+QdlBbrR1tFB4+tDogOQdIXH6FtJ3a\naSfSeuHhgwESEyBx+RXS33Si1pHWCQ8fDJCYAInLr5D+opO0TvSL8PDBAIkJkLj8CukP6qid\nSquEhw8GSEyAxOVXSL8bkDrTz8LDBwMkJkDi8iskjU7WzqTlwsMHAyQmQOLyK6QC6qSdTcuE\nhw8GSEyAxOVXSFvoVO0c+l54+GCAxARIXH6FtJlO07rREuHhgwESEyBx+RXSJjpdO5++Ex4+\nGCAxARKXfyF11i6ib4SHDwZITIDE5VdIG+hMrQd9JTx8MEBiAiQuv0JaT2dpF9MXwsMHAyQm\nQOLyK6Tf6P+0XjRfePhggMQESFx+hfQrna31ps+Ehw8GSEyAxOVXSOuoi9aX5gkPHwyQmACJ\ny6+Q1tA52pU0V3j4YIDEBEhcfoX0C3XV+tGHwsMHAyQmQOLyK6TV1E3rT7OEhw8GSEyAxOVX\nSCvpPO06mik8fDBAYgIkLr9CWkHnazfQ+8LDBwMkJkDi8iukn+kCbRC9Izx8MEBiAiQuv0L6\niS7SbqK3hIcPBkhMgMTlV0j/o+7arTRNePhggMQESFx+hbSMemi30WvCwwcDJCZA4vIrpB+p\np3YHvSI8fDBAYgIkLr9C+p4u0YbSS8LDBwMkJkDi8iukpdRLG05ThIcPBkhMgMTlV0hLDEgj\nabLw8MEAiQmQuPwKaTFdqo2iicLDBwMkJkDi8iuk7+gybQw9Izx8MEBiAiQuv0L6lnprY+lJ\n4eGDARITIHH5FdIi6qM9QI8JDx8MkJgAicuvkL6hy7WH6BHh4YMBEhMgcfkV0tfUV3uEHhIe\nPhggMQESl18hfUVXaI/ROOHhgwESEyBx+RXSQrpSe4rGCg8fDJCYAInLr5C+pKu0Z2m08PDB\nAIkJkLj8CukL6qdNoruFhw8GSEyAxOVXSJ/T1doLdJfw8MEAiQmQuPwM6SUaKjx8MEBiAiQu\nv0JaQNdor9IdwsMHAyQmQOLyL6T+2us0WHj4YIDEBEhcfoU0n67V3qSbhYcPBkhMgMTlZ0jv\n0CDh4YMBEhMgcfkV0mcGpPfpBuHhgwESEyBx+RXSp3SdNsvAlJwAiQmQuPwK6RMD0gfUT3j4\nYIDEBEhcfoU0j67XPqKrhIcPBkhMgMTlZ0jzqK/w8MEAiQmQuPwK6WO6Q
ZtPlwkPHwyQmACJ\ny6+Q5tJA7XO6RHj4YIDEBEhcfoa0kHoKDx8MkJgAicuvkD6iG7VFdKHw8MEAiQmQuFIIacf4\nPpfcXWCf/uOBS7vfviIGSHNokLaYzhMePhggMQESVwoh3TNkzcYHBuyzTt805JdND/b6NzZI\nS6mr8PDBAIkJkLhSB0nr+ovxU+ncpebp7WN+0/Wt56yMHtKHdJP2I3URHj4YIDEBElfqIH15\nwX7j7fWvh85Y3s382l0bjLQ/3fqQbvlzBZ3lern42rkrSQtb6YXJXL3w7yQu/o++M4mr/7U3\niYv/uUf/K4mr75TeMX9HC+mDy823Q58Nfrz9usnmu3ktjb6O/KVGc2mo/jud7Xo5hPzZvtAp\nN0h9zbchSOuvftL8CaUvG2K0fLdbc+j23Vuos+vl4qtwb5IWttL3JXP1ff8lcfFCvTCJq/+3\nP4mL796nJ3P1QvEdEy2kr+ybdm/YHy29ZLryOfffkWbRrdo66ih8uzQYfkdiwu9IXKn7HemP\nrqt0fVu3ZdYH/7v4W/Vz7pBm0mBtPZ0oPHwwQGICJK4U3v09dtCaDSNv3q/PeV//r99U82tj\nuPvbhLSR2ggPHwyQmACJK4WQdk7o3WuMcfFxw/Sl51jNiB7SDLpN20qthIcPBkhMgMTl14cI\nmZC07ObCwwcDJCZA4vIrpOk0RNNKHyM8fDBAYgIkLr9Cet+EVOFo4eGDARITIHH5GlL+4cLD\nBwMkJkDi8iuk90xIh9YWHj4YIDEBEpd/Id2uaXUOER4+GCAxARKXXyG9ax5Av16+8PDBAIkJ\nkLj8CukdE1KDPOHhgwESEyBx+RfSnZrWuJTw8MEAiQmQuPwK6W0T0nFZwsMHAyQmQOLyL6Sh\nmtaKCoSnDwRITIDE5VdIb5mQ2tMG4ekDARITIHH5FdKbNEzTTqa1wtMHAiQmQOLyL6ThmnYa\nrRSePhAgMQESl18hvWFC+j/6n/D0gQCJCZC4/AvpLk3rSkuFpw8ESEyAxOVXSNNMSBfQIuHp\nAwESEyBx+RfSCE3rSV8KTx8IkJgAicuvkF43IV1G84WnDwRITIDE5WtIV9Jc4ekDARITIHH5\nFdJrdLemXUOzhacPBEhMgMTlV0hTTUgDaIbw9IEAiQmQuPwLaZSm3UjvCk8fCJCYAInLr5Be\nNSHdQm8ITx8IkJgAicu/kO7RtNvoNeHpAwESEyBx+RXSKyakofSy8PSBAIkJkLh8DWkETRae\nPhAgMQESl18hvUyjNW0UTRSePhAgMQESl18hvWRCGktPC08fCJCYAInL15DG0WPC0wcCJCZA\n4vIrpBdpjKY9RA8LTx8IkJgAicuvkKaYkB6jB4WnDwRITIDE5V9IYzXtSfNNMgIkJkDi8iuk\nF+g+TXuW7hWePhAgMQESl18hTTYhTaaRwtMHAiQmQOLyL6T7zXschgpPHwiQmACJy9eQXjVf\n2yUZARITIHH5F9I488ANtwpPHwiQmACJy6+QnqcHzAOADxKePhAgMQESl18hTTL/hPQ+3SA8\nfSBAYgIkLr9CmkjjNW0W9ReePhAgMQESl18hPWdCmkNXCU8fCJCYAInLv5Ae0rR51Fd4+kCA\nxARIXH6F9CxN0LT5dKnw9IEAiQmQuPwK6Rnzgd9fUk/h6QMBEhMgcfkV0tMmpG/oIuHpAwES\nEyBx+RfSI5q2mM4Vnj4QIDEBEpdfIT1Fj2raD9RFePpAgMQESFx+hfSk+Szzn+gs4ekDARIT\nIHH5GtJKOk14+kCAxARIXH6F9IQJ6RfqJDx9IEBiAiQuv0J6nB7XtF+pg/D0gQCJCZC4/Arp\nMRPSRmonPH0gQGICJC7/QnpC07ZQa+HpAwESEyBx+RXSo/Sk8TajpfD0gQCJCZC4/ArpEXrK\neJt9rPD0gQCJCZC4/ArpYeuw36WaCE8fCJCYAInLr5AmWJDKNRKePhAgMQESl38hPWO8rXiU\n8PSBAIkJkLj8CukhetZ4m19PePpAgMQESFx+hTTegnRIHeHpAwESEyBx+RXSg9aL9dWoJTx9\nIEBiAiQuv0J6wIJU51Dh6QMBEhMgcfkX0iTj7eFVhKcPBEhMgMTlV0jj6Hnjbf1KwtMHAiQm\nQOLyK6T7abLxtkF54ekDARITIHH5FdJ9FqRjSgtPHwiQmACJy9+QmuUITx8IkJgAicuvkMbS\nFONti0zh6QMBEhMgcfkV0hgLUisqEB7fDpCYAInL35Da0kbh8e0AiQmQuPwKaTS9aLw9kX4T\nHt8OkJgAicuvkO61IJ1Ma4XHtwMkJkDi8jekU2ml8Ph2gMQESFx+hXQPvWy8PYN+Fh7fDpCY\nAInLr5BGWZDOpmXC49sBEhMgcfkX0ivG2660VHh8O0BiAiQuv0K6m1413p5Hi4XHtwMkJkDi\n8jek7vS18Ph2gMQESFx+hSNkk4kAACAASURBVDSCphpve9KXwuPbARITIHH5G9KlNF94fDtA\nYgIkLv9Ces14ezl9Ijy+HSAxARKXJ5DKKeXKQLrLgnQVfSQ8vh0gMQESlyeQehg1yGl7wbnH\nZrS8XgrS68bbq+kD4fHtAIkJkLi8umn3RpNN5rufG74vA2m4Bek6mik8vh0gMQESl1eQmkyz\n3z/VXBLSQHpXeHw7QGICJC6vIOXODfxkKiUDaRhNM97eRG8Jj28HSEyAxOUVpJq9rHf7e9SQ\nhDTYeisfIDEBEpdXkEZQ04H33DOgEd0uBelN4+3t1uMb5AMkJkDi8grS/vtrkFHV4XslIQ21\nnpUkHyAxARKXd3+Q3f/r11/9si8aRtFAGmr9djTCOiiXfIDEBEhc3kH695u3Nb1QFtK99Jzw\n+HaAxARIXJ5BejCPaKF+5+VRUXKHdAe9o5lHt3tKeHw7QGICJC6vID1LXZ82IE3JHicD6XYL\n0jh6THh8O0BiAiQuryA166//a0DS7zhaBtIQC9JD9LDw+HaAxARIXF5BKv2RDenDHBlIu3Xz\n+vIoPSg8vh0gMQESl1eQDpluQ5pWQRLSE3Sf8Ph2gMQESFxeQTqt4y4T0h9NOktCeppGC49v\nB0hMgMTlFaRPsurfSFf0qZDzuSSkiTRKeHw7QGICJC7P7v6ee5z5yIbWn0bjKGpIk+ku4fHt\nAIkJkLg8fKp5wZIlf+rRFS2kl2io8Ph2gMQESFxeQWo7M0pDMUGaSkOEx7cDJCZA4vIKUu3x\nyYA0jW4VHt8OkJgAicsrSO81emePPKS3aZDw+HaAxARIXF5BOqkp5dasayYJ6X26QXh8O0Bi\nAiQuryC1P+XUQJKQZtG1wuPbARITIHF5fYDIHSslIX1I/YTHtwMkJkDi8hrS3HxJSB/TFcLj\n2wESEyBxeQZpRq+T2rdv3yavqiSkz6i38Ph2gMQESFxeQZpK2bWpZmnqFNXfk6KF9DldIjy+\nHSAxARKXV5Banrldz/qx8NGTt0tC+op6CI9vB0hMgMTlFaS8Gbqe9YOuDxog
CelbukB4fDtA\nYgIkLs+e2Ddb1yvM1/UFNSUhLaFuwuPbARITIHF5Bem4C//TGw/V9ffKSUL6gboIj28HSEyA\nxOUVpJfoVH14Vr+7a7WThPQTnSk8vh0gMQESl2d3f08dq+88najOomggbf/Trf/0v423q6mz\n6yXj6Z9/k7JsIL0wmasXbkvi4v/oO5O4+t9J3TF79L+SuPou6R3zNwPJatVP0T1ydfcet/bp\nhcZbjTq7XjKe9u5LyrKB9P3JXH1/YRIX36vvTeLqhcndMXoyV98rvWP+iwQp2qK9afcrdRD+\ngWqHm3ZMuGnH5dVNuyrB8iQhbaR2wuPbARITIHF5BambVesyTUT/jrSFWguPbwdITIDE5fGD\nVjd3mCEJSctoKTy+HSAxARKX14/+XtRSFFJOc+Hx7QCJCZC4vIa0uYwopNKNhce3AyQmQOLy\nGNL+0bVFIZVvKDy+HSAxARKXV5CaWzWpSreKQqpUX3h8O0BiAiQubyEdd8oj/5VQkwik/MOF\nx7cDJCZA4vL6d6ToihrSIXWEx7cDJCZA4vI5pJo1hce3AyQmQOLyClJ22XJKYpDqHCI8vh0g\nMQESl1eQrjsmu8355x6bcWzPHkZikA6vIjy+HSAxARKXV5DeaLrRfLe8wXQ3RDFBql9JeHw7\nQGICJC6vIDV+w37/VHNRSA3KC49vB0hMgMTlFaTcj+z300qJQjqmjPD4doDEBEhcXkGqecl+\n893ec2qIQmqWKzy+HSAxARKXV5DuovrXjhgx4Bi6QxTScZnC49sBEhMgcXkFad+YGuZryFYb\nsVcUUivaKjy/FSAxARKXd3+Q3f/r11/9si8aRjFAakObhOe3AiQmQOLyDNLOTbq+a/KDv8hC\nak/rhee3AiQmQOLyCtLyQ8bqhccTVVwsCqkjrROe3wqQmACJyytI5zddrb9ET65ud6EopFNo\ntfD8VoDEBEhcXkE65BVdP6+Jrr9SRxRSZ1ohPL8VIDEBEpdnf5Cdp++tfJuuz8kVhXQW/SQ8\nvxUgMQESl1eQ6kzU59A8XZ8k+wfZLvSD8PxWgMQESFxeQbqy+u11j9yrFzST/R3pXFosPL8V\nIDEBEpdXkDa1oaoLdb1Hxe9FIV1A3wrPbwVITIDE5d0fZLeZx89ftCUaR9FD6kFfCc9vBUhM\ngMTl4VPNd80uiEpRLJAuoc+F57cCJCZA4vIQ0lp6RxzSZTRfeH4rQGICJC6fQ+pL84TntwIk\nJkDi8jmkq2iO8PxWgMQESFw+h3QNzRKe3wqQmACJy0NI/y35W4+yqCENoBnC81sBEhMgcXl+\ngMi1opBupHeF57cCJCZA4vIG0med63eeZZ7Yfa/sy7rcTG8Kz28FSEyAxOUJpIU5GYflZEzT\n9Q+PogaikAbTNOH5rQCJCZC4PIHUreJSveD4RusvpEoT9ohCup1eFZ7fCpCYAInLE0iH32i8\nmU2ls67VomEUA6Sh9KLw/FaAxARIXJ5Ayn7CeLOOOv4YHaMYII2gycLzWwESEyBxeQKJnjPe\nbKbZ0TqKHtIomig8vxUgMQESl88hjaGnhee3AiQmQOLyOaT76Qnh+a0AiQmQuLyBdMfChQtn\n0oSFZqKQHqRHhee3AiQmQOLyBpKaKKSH6SHh+a0AiQmQuDyBNEJNFNJjNE54fitAYgIkLp+/\nGPOTNFZ4fitAYgIkLp9DepbuFZ7fCpCYAInL55Cep5HC81sBEhMgcfkc0gs0XHh+K0BiAiQu\nn0N6me4Unt8KkJgAicvnkF6n24TntwIkJkDi8g7Sv9+8remFwpDepJuF57cCJCZA4vIM0oN5\nRAv1Oy+PilLUkN6lgcLzWwESEyBxeQXpWer6tAFpSvY4UUgz6Trh+a0AiQmQuLyC1Ky//q8B\nSb/jaFFIH1I/4fmtAIkJkLi8glT6IxvShzmikOZRX+H5rQCJCZC4PHvpy+k2pGkVRCHNp8uE\n57cCJCZA4vIK0mkdd5mQ/mjSWRTSl9RTeH4rQGICJC6vIH2SVf9GuqJPhZzPRSEtoguF57cC\nJCZA4vLs7u+5x5lPRmr9aTSOooe0mM4Vnt8KkJgAicvDRzYULFnypx5dUUP6kboIz28FSEyA\nxOXzhwj9TGcIz28FSEyAxOUJpAZqopBW0anC81sBEhMgcXkCqb2aKKR11FF4fitAYgIkLp/f\ntNtI7YTntwIkJkDi8g7SllmTp3ywRRjSFmolPL8VIDEBEpdXkP66KNu8+zuj1z+ikLTMFsLz\nWwESEyBxeQXp8pwrp8x495lu1F8WUm5T4fmtAIkJkLi8glR5iv1+SBVZSGUbCc9vBUhMgMTl\nFaRSm+3388rKQqpwlPD8VoDEBEhcXkFq8YX9/skOspDyDxee3wqQmACJyytIc49fsF/X985s\n/J0spENrC89vBUhMgMTlFaQ21ajcEUeUoToNo3l0Q/SQalcXnt8KkJgAicuzm3ZtY3l0Q/SQ\n6lYRnt8KkJgAicvnj2zQ6lcUnt8KkJgAictDSNv/spKF1Kic8PxWgMQESFxeQfrl7HLJeKEx\nrUmu8PxWgMQESFxeQTq5Yq9bh1jJQjouU3h+K0BiAiQuryCV+yIaQLFDakUFwhtgBkhMgMTl\n2eG4NiYHUlvaKLwBZoDEBEhcXkG65Z7kQOpAvwpvgBkgMQESl1eQ/jut/a1jrWQhnUKrhTfA\nDJCYAInLK0hjiZJyr11n+ll4A8wAiQmQuLyCVOOCz1evtZKFdDYtE94AM0BiAiQuz55GkaQ7\nG7rREuENMAMkJkDi8grScUuTA+kCWiS8AWaAxARIXF5B+uyU75MCqSctFN4AM0BiAiQuryC1\nr03l61rJQrqU5gtvgBkgMQESl1eQTjo1mCyky2me8AaYARITIHF5/TSKHStlIfWjD4U3wAyQ\nmACJy2tIc/NlId1A04U3wAyQmACJyzNIM3qd1L59+zZ5VWUhDaZpwhtgBkhMgMTlFaSplF2b\napamTjNlId1FLwpvgBkgMQESl1eQWp65Xc/6sfDRk7fLQhpLzwhvgBkgMQESl1eQ8mboetYP\nuj5ogCykCfSI8AaYARITIHF5Ban0bF2vMF/XF9SUhfQU3Se8AWaAxARIXJ49ROjC//TGQ3X9\nvXKykCbTSOENMAMkJkDi8grSS3SqPjyr39212slCep2GCG+AGSAxARKXZ3d/Tx2r7zydqM4i\nWUjv0Y3CG2AGSEyAxOXtH2RX/bQnGkcxQPqQrhbeADNAYgIkLs8g7dyk67smP/iLMKT51Ft4\nA8wAiQmQuLyCtPyQsXrh8UQVF8tCWkTdhTfADJCYAInLK0jnN12tv0RPrm53oSykH+kc4Q0w\nAyQmQOLy7Lh2r+j6eU10/ZU6spBW0WnCG2AGSEyAxOUVpNx5+t7Kt+n6nFxZSBvoROENMAMk\nJkDi8gpSnYn6HJqn65NqyELSMo8X3gAzQGICJC6vIF1Z/fa6R+7VC5oJ/46klWk
svAFmgMQE\nSFxeQdrUhqou1PUeFaM6BkoMkPKPEN4AM0BiAiQu7/4gu838W+yiLdE4igVSrZrCG2AGSEyA\nxOXhIxt2zS6ISlFskI7MF94AM0BiAiQuDyGtpXeSAKlxGeENMAMkJkDi8j2kVhmbhLdAAyQ2\nQOLyPaSu9J3wFmiAxAZIXL6HdAO9I7wFGiCxARKXJ5DW79TX/qf/t+TvMC07xve55O7g/Q8b\nbukWH6RxyThoAyAxARKXJ5BKT9ep5BP67hmyZuMDA/ZZp+f3nhAnpNfpZuEt0ACJDZC4PIFU\n5rL5NHFBoOCZWtdfjJ9K59ov9/Lx1oVxQlpIFwlvgQZIbIDE5QmkS0gpeOaXF+w33l7/euDD\nIKTCbUZ//u7Wbv0v+8TGzBNcLxxz/+ySX7MovTCZq+/5K4mL79D/SeLqfyZ3x+h/JHH1ndI7\n5i8HSIUzXqARkwMFz/zgcvPt0GeLQZrX0ujrErcD+Y4s/08Ml0bIH+0LnQq71+7UFcUv+EFf\n820JSEuvNfpxj1v79MLAqcH0guulY23vXvEllfT9yVw9qYvv1ZO6Z5K7Y/Rkri5+lfnPGZKu\n/z7j2YkfKAcs/sq+afdGMUhWMfyOpC2gDsK3TvE7Eht+R+Ly6u9I+27JMX9BKjcudM4fXVfp\n+rZuyxKFpLXK+ER4GwCJC5C4vII0js6bNGvGM2fQlNBZYwet2TDy5v36nPd1Q8Wcbpr2b1yQ\nXqQTpR/cAEhMgMTlFaRGN9vvr24ROmvnhN69xhgXHzdM1688x+y9uCAVtKT8LbIbAUhMgMTl\nFaRSH9vvZ5bRoygmSNqmk2mx7EYAEhMgcXkFqdx0+/275eUhaddLP94OkJgAicsrSCd2su7O\n+7fzyUmA9CBNkN0IQGICJC6vIM3MOKz/PaP61cz8KAmQ3pQ+kj4gMQESl2dPo3inoXn3d9Oo\nXkI2VkjfUVfZjQAkJkDi8vD5SBu/ifLQJzFD2pLbTHYjAIkJkLh8f/ATq3p5shsBSEyAxOX7\nZ8hadaIVohsBSEyAxHVgQLqSPhDdCEBiAiSuAwPSUHpRdCMAiQmQuA4MSA8J/yEJkJgAictD\nSMUPfiIIaQoNE90IQGICJC6vILX8yX7/ZqNkQJpJ14puBCAxARKXV5ACRxEqvFv4hcbsvhJ+\nJVlAYgIkLm8gKcc+aeHgJmFIq+hU0Y0AJCZA4vIG0tJHqNuVZlfdtT4ZkLZmHyu6EYDEBEhc\nXt20O2NlNIDihaRVqyO6EYDEBEhcHt5rl0xIjWRf3AWQmACJyytIVYLlJQXSifSb5EYAEhMg\ncXkFqZtV6zJNBiQFUjdaIrkRgMQESFwe37Tb3GFGUiBdQXMlNwKQmACJy+vfkRa1TAqkwfS6\n5EYAEhMgcXkNaXMSjiJkdB89IbkRgMQESFweQ9o/unZSID1HoyQ3ApCYAInLK0jNrZpUpVuT\nAultGiS5EYDEBEhc3kI67pRH/iuhRgLSfLpUciMAiQmQuA6QP8guo/+T3AhAYgIkLs8grX7/\nlRkbkgZpU0YbyY0AJCZA4vII0ntNrId+t/00SZC0vKMlNwKQmACJyxtI46lsr4cnT7i4bObz\nSYJUt4rkRgASEyBxeQJpaWb7TdaJje1ySrwEpgykFlkFghsBSEyAxOUJpMsr/x449Xvla5ID\n6TRaKbgRgMQESFyeQDq8X+jk1fWTA6k7fSW4EYDEBEhcnkAq9UDo5EPJeYiQ1p9mCW4EIDEB\nEpcnkMqPDZ28LznPR9LupJcENwKQmACJyxNITS8KnTyneXIgPUiPCG4EIDEBEpcnkG7LWRY4\n9WXmsORAmkx3CW4EIDEBEpcnkDZVrDXbfL9van6V30uykYD0Ht0guBGAxARIXN78QXZuBTr8\n/D5da1DVL6NxFAekBdRLcCMAiQmQuDx6iNC662oRUb1bN0flKA5I/6OzBDcCkJgAicu7R39v\n27AjOkVxQdqU0VpwIwCJCZC4DpCnUWhaxfqCGwFITIDEdcBAqpcvuBGAxARIXAcMpJaZW+Q2\nImZI3z557RMbor0wIDEBklKqIJ0u+XrMsUDaOGtUl0PMJ1vlX78ouq8AJCZAUkoVpIvpc7mN\niBbS8hdvOKGUYajKmXdNu7YyZZ7yYjQ/FgGJCZCUUgXpVslDREYFafMNRxqGMhte9pj9wPP1\nj7Ygqn3n/1y/EJCYAEkpVZAm0ENyGxENpI1dqOyJN09drZ43t1cZyun2rstXAhITICmlCtI0\nukVuI6KAtLEznbCm5NmrRx9FdPTYXyJ9KSAxAZJSqiB9Tj3lNsId0vpOdOKvjp/Z+vY5OVR2\nbISvBSQmQFJKFaR1dJLcRrhC+vVE6rSe/eyPQ/IzJ/NfDEhMgKSUKkhapXpyG+EGad2JdGrE\nPxvNK1v6I/aTgMQESEopg9Q4d6vYRrhAWn08dd4YeYXJGbV/4j4HSEyApJQySJ1pudhGRIa0\nqgV12+S2xK3UirMGSEyApJQySFcQf2Mq1iJCWn4Mnb/ZdYmt51J35lOAxARISimDNIxeENuI\nSJB+akSXRnMwyvXH0QjnzwASEyAppQzSUzRabCMiQFp6BPWJ7qCuP9bIfNnxE4DEBEhKKYM0\nna4T2wge0uLDqH+0d2rMLpW3wOl8QGICJKWUQVpM3cQ2goW0pTndGP0yT1Bdp4ekAxITICml\nDNKmzFZiG8FCuj82rQOpncPde4DEBEhKKYOk5cv9RZaDtCK//A+xrFNwJl1c8lxAYgIkpdRB\nOrKS2EZwkC6mkbEttO4YGlPiTEBiAiSl1EFqleH+150oYyDNyWzg+ofYYi2umjW1+HmAxARI\nSqmD1FnuyebOkLY0pbdjXmpmbom77gCJCZCUUgepBy2U2ghnSGPpgjjWeqzEXXeAxARISqmD\nJPgSSY6QlleM7Z6GYNdQx/DbnIDEBEhKqYN0Jzk/kCCOHCH1pFFxLbalM10VdgYgMQGSUuog\nPUCPSW2EE6SZGQ1jvach0NqG9ID6MSAxAZJS6iBNorulNsIB0ubGGe/Hu9w3lXM+UT4EJCZA\nUkodpLdpkNRGOEAaTRfFv95L1EJ5oCsgMQGSUuogfUq9pTaiJKSfKub9mMCCXei+og8AiQmQ\nlFIH6Xs6R2ojSkK6KLEnaSyrkFd0jx8gMQGSUuogracTpTaiBKQZGY3ivKch0BjqGjoNSEyA\npJQ6SFrpY6Q2ojikzY0z3ktsxYLji+6cByQmQFJKIaSaNaQ2ojikUYkfffKznNrrAicBiQmQ\nlFIIqXEpqY0oBmlZXkX22FpR15+uD5wCJCZAUkohpJPI+SDCsVcM0mUOz4WIuXW1sz+1TwES\nEyAppRBSV1oqtBHhkL7KPszlcJBRNZWOs189CZCYAEkphZAup08cLhlP4ZC60HMiq55N91vv\nAYkJkJRSCGkQvSW0EWGQ5mQ0ju74W24F/5gESEyApJRCSKNootBGhEE6id4QWvZeOt
d8B0hM\ngKSUQkiP0TihjVAhvU7thFbVClrSKxogsQGSUgohvUJ3CG2EAmlrs4zZQqtq2qfZdX4FJDZA\nUkohpFl0jdBGKJCeVh7bk3hX00BAYgMkpRRCWkg9hDaiCNKmw7O/ElrUbG3NnPmAxAVISimE\n9DOdIbQRRZDGUh+hNe1eoBYFgMQESEophLQp4wShjQhB+vWQ0t8LrRnoLHoAkJgASSmFkLRy\nDYU2IgTpdrln3QZaWi5vIyA5B0hKqYRUq7rQRgQhrcirtFJoyVCj6GJAcg6QlFIJ6ZjSQhsR\nhHRNrMf6jqItzehd8UWVAIkJkJxyhNSONshsRADS4twav8ksqPZR1mFSj1J3CpCYAMkpR0j/\nR/+T2YgApB70iMx64Q0U/8VLDZCYAMkpR0gX0xcyG2FDmp9VX+z1LdS21wo+MykZARITIDnl\nCOlaqaN/25A60xSZ5Yqlv04tZR5P7hQgMQGSU46Q7qBXZTbCgvQdtYj2ZZdjS99zBo1Pyspm\ngMQESE45QrqPnpTZCAvSWBors1rx9D1LylX6OTlrAxIbIDnlCOlpqau+Belk+k5mteLpe7S7\n6IrkrA1IbIDklCOk12iIzEaYkNblNpJZrEQGpI31soq/jp9UgMQESE45Qpot9TwKE9LzSbuT\n2nys3UTqnKTVAYkJkJxyhLQw8eM42pmQesq9AGCxrAetnkDTkrM6IDEBklOOkJbTmTIbYUAq\nqFZli8xiJbIgfZBxTHLWByQmQHLKEdJGaiOzEQakWVI/3UpmP43iPHo4KasDEhMgOeUISSsr\ndP+AAWkQPS+zVslsSEvLVFuTjNUBiQmQnHKGJHUYfQNSo5xfZNYqWeCJfTfSLclYHZCYAMkp\nZ0iNyshsxI6dS6ijzFIOBSCtPaT0kiSsDkhMgOSUM6S2JHGQbhPS2MReoS9iwaeajxM7WIsa\nIDEBklPOkM6ixF9+xWzHzk70jchKTgUhbWmU+ZH86oDEBEhOOUPqSV+KbMSOraUaiCzkWOjg\nJ1PlDuJaFCAxAZJTzpD6C/0RdcdUukFkIceKjiLUKQlP1AAkJkByyhnS7TRVZCN29KbpIgs5\nVgTp8+zDZX6pUwIkJkByyhnSWHpKZCO2HVo5Kc+NtVOOa3eZ/H0agMQESE45Q3pK6HkUn9BF\nIus4p0BanldphfDqgMQESE45Q5pKt4tsxG1Cr9HnnHqk1Tuov/DqgMQESE45Q5oldLVslr1a\nZB3nVEjr6+R8Lbs6IDEBklPOkL6UeaTp9xknSSzDFXbs7yepi+zqgMR0MELa9rtbu/W/HM5d\nQae7fmUUPUD3SSzDpRcqH2jH0XTR1fc47Ripduj/JHH1PwvdLxN/e/Q/krj6Tukd85cEpD17\n3dqvO527J6uV61dG0Vn0s8QyXPp+9aPPM45139wY2u9+kfjbp+9L5vJJnd35KiPVPukdUygB\nKd6bdlqVwwR+qK4vc9RO90vFX7GXdelCT0iujpt2TAfjTbu4ITUoL7AJL9FALyF9K3uEcUBi\nAiSnGEhtJQ6jfxnN8hKS1l/sRaTNAIkJkJxiIHWhxF9gb2uNCn96CmlVfrllcqsDEhMgOcVA\nupzmJbwFH9H5OzyFpI2my+RWByQmQHKKgXQLvZHwFgympz2GtOnIzI/FVgckJkByioE0hp5O\neAuOzV7pMSRtCp0stjogMQGSUwykZxJ/NPWarOM1ryFp7YSe/6EBEhsgOcVAeoNuTnQDptEA\n7yF9knmU1PM2AIkJkJxiIH1CfRLdgFtpsveQtJ40Tmh1QGICJKcYSD8k/hDQjrQsBZB+KFtF\n6Dh6gMQESE4xkBI/aPGWvLpaCiBpt9BAmdUBiQmQnGIgaXlHJzj/p+aTY1MAaV31Ut+KrA5I\nTIDkFAepbn6C899HD6QEkva40BOTAIkJkJziILXMTPDFUi6gz1IDaWsLelNidUBiAiSnOEid\nKcFXOa6TtyU1kLTZGY0lXjEJkJgAySkO0sX0RULj/0idtBRBMn4YPiSwOiAxAZJTHKTr6f2E\nxn+ebtNSBemHslUF7gIHJCZAcoqDNIImJzR+f+sXldRA0gbTgMRXByQmQHKKg/SoeadbArXM\nMl9GL0WQ1tfJ/Srh1QGJCZCc4iC9ktizTdfnNjHfpQiS9rTAq0kDEhMgOcVB+oD6JTL9+3Sl\n+S5VkLaeQNMSXR2QmADJKQ7St3R+ItMPs4/CnypI2seZDRJ9FDggMQGSUxykNYk9Re4MWmy+\nSxkkrQfdn+DqgMQESE5xkDT7l5w425p/qPU+dZB+LJe/KrHVAYkJkJxiIdWsnsDwC+kc633q\nIGl30tWJrQ5ITIDkFAupWc7W+Id/hO6x3qcQ0oY62QsSWh2QmADJKRZSJ0rgplEv+sB6n0JI\n2iTqmNDqgMQESE6xkLpTAn/TPLq0/ZKuqYSktaPXElkdkJgAySkW0nU0I+7ZV2W2tU+kFNIn\nWUdtSmB1QGICJKdYSMMTeLDdq3SjfSKlkIwbmIkcUwyQmADJKRbSI/Rg3LMPopftE6mFtLxC\nIq/QDEhMgOQUC+mVBF6PuV1G4FmBqYWk3UVXxL86IDEBklMspA/pqnhH31SmfuBUiiFtPCJr\nftyrAxITIDnFQvqOzo139Dl0SeBUiiFpUyj+V4MGJCZAcoqF9Fv818HRNCFwKtWQtE70Uryr\nAxITIDnFQtLKNIp39G6h4z2kHNLn2YdvjHN1QGICJKd4SHWqxTt6zUoFgVMph6RdTiPjXB2Q\nmADJKR5Si6w4D2q1mE4Pnkw9pJWVy/8vvtUBiQmQnOIhnU5x/hHmGRoaPJl6SNo91Du+1QGJ\nCZCc4iFdTJ/HN/lV9G7wZBpA2lQ/c25cqwMSEyA5xUO6ochDbDXP/jV4Mg0gaa9Q+7hW9x7S\nhqVzXn1seP8BsxJ4AosVICmlHNLdNCmuwddmtwidTgdIWqf4HjXoEaRVX06ffN/gvmefUD+P\ngh12U2JPpQIkpZRDboLdGAAAGDBJREFUepzui2vw1+i60Om0gPR5dt0NcayeREibfvxi9rOj\nru95WrMauSE9lY5qc86Vt42bMnNKt9JEjUcsjX99QFJKOaTXaXBcgw+kV0Kn0wKS8UvbsDhW\nTwakn4df1a3t0fkhPDnVG5/S/dqRj7827wf1z11rH++URZntxq+M89sAklLKIX1MfeMavGXW\n6tDp9IC0snL5ZbGvLg9py/2VTD3l6rU664q7Hpz4/hcR7hb9aUxLotyzJq6P5xsBklLKIS0N\nHMAkxtZlNy/6ID0gaWPp4thXF4c0uxmVG/7BEotGNPfaLbr9KKLyPabF/uc8QFJKOaQN1Dae\nuV+na4s+SBNImxtkfhDz6sKQfr4kg877IfhRlHd/f3xtDaJq/WIdHpCUUg5JyzsqnrlvDD6p\nzyxNIGlvUeOYD7wqCsm8Vdfg7aKPo/47UsHbvSoSHX7Ll7F8N0BSSj2kenG9jOzxmUW/I
qUN\nJO0iujvW1SUhzW1JZQardyfE8gfZjS93L0vUYPB3UX8FICmlHlKrjDiOHbIup5nyUdpAWpFf\nJvrroZ0cpJX9MqnzkrCzYnxkwy+Pd86mzNajo3zQFiAppR7SWRTHwz2nqb8ipQ8k7SE6NcbV\npSAVPJ5PRxZ/bYzYHyK0YnzrDMrt/Piv7hcFJLXUQ7rMfF3yWBsU9ky69IG09YRYH98gBGlW\nUyo/ssSP9rgea/fV4COIKlz8puvdeICklHpIg+M5wGKrTPUArekDSVuQc2hsLywrAsnhVp1V\nvA9aXTDwUKL83jMiPxoPkJRSD+nhOA7I9VtuU/XDNIJk/KyM7aj6ApAcb9VZxf/o74IZvfOI\n6gyMdBxcQFJKPaQ36KaYp55G16gfphOk9XWzYno+ReKQnG/VWSX0NIr1z/9fLlHLd9gLAJJS\n6iEtpO4xTz2IXlQ/TCdI2uvUPJYHCSQKafnFGXT+j8wnE30+0qoJ7TMzevzMfBaQlFIPaX1G\nu5inbpUZdg9tWkHSzqUxMVw6MUj8rTorgSf2fXwsVRxd4PgpQFJKPSQt/7BYh/6t2Ov8pRek\n/1Us/330l04I0kctqOzgCMcvkniG7ObR5am14wEwAUkpDSA1y3H+D4+v2K9IaQZJu5+6RH/h\nBCBx99UVJfNU8x/Ooex+60qeD0hKaQDp/4i7ic81iKaEfZxmkApaxXC8yLghudyqs5I6ZsPL\ndahGyb+PAZJSGkDqR7NjHLp1+K9I6QZJm59T2+E/cOfiheR2q85K7OAnvw3OKfnDD5CU0gDS\nSJoY28y/5TYOPyPdIGkDaEC0F40PkvutOivBowh9ejyVHR5+dyQgKaUBpImxHqX0jeJ/80w7\nSL8dlj0vyovGAymaW3VWkofj2mp80yZhT1kCJKU0gDSb+sU28830QvgZaQdJe4VaRnkPShyQ\nfmxJ5e+O6iHzsse1W3YuZV21puhjQFJKA0g/0v/FNvMJxX5FSkNIWhe6P7oLxg5pTnXqEuXd\nM9IHiJx2OFV/PvQRICmlAaSC3Gbs55z6LfeYYuekIaRlFfJ+cL+UFgekiWUyBkd7aEfxI62u\nH5xLJwUffwdISmkASTsstufIvlnipmAaQtLGRPkKajFC2jo8o9wL7hcLlIRDFn/dkUoH7i0E\nJKV0gNSOfotl5JtLPOUnHSEVtFQOvBeh2CD9ejbV/Dj6iyfj2N9bH69CR75lngIkpXSA1J0W\nxjJym4ziT4VOR0jap9l1onmaaUyQvm9OrX6K4fLJOYj+qn6ZGd1/BqSw0gHSTfRGDBOvzy3x\nGn9pCUm7mgZFcalYIM2qRhfGdFjkZL0axYyGVHn8VkBSSgdID9LDMUz8VsnXQU9PSGtr5kTx\nYucxQHo0N+ue2EZI2su6bBxamk78CpCKSgdIr8V0+O9r1CPa2aUnJO0Fau3+x6SoIW25liq+\nHuMESXx9pG9PoVJD43nVgGgDJKciQloQ06F+65UpcddEmkLSzqSHXC8TLaS1Z9DhMb8kW1Jf\naOzlWnR4NA+viDNAcioipHXUIfqB5zv8+TZdIS0pV2m522WihLSoAbXlnqnKl9xX7Ft7fRZ1\njuGpV7EFSE5FhKRVrhv9wEPpkRLnpSskbRRd6HaR6CC9VZmuivloyMl/6cs5TaniA7E+myzK\nAMmpyJBaZ0b9rAOtZWbJ40mmLaTNTelNl4tEBWl8TvbYeL5/0l9DtmB8HjX7KCmrA5JTkSFd\nTh9GO+9Pma1Knpm2kLQ5WfVcfiGPAtLmflT5bddLOeXBizEv607Z/dYmYXVAcioypPsdbq4x\nPUTDS56ZvpC0K+mWyBdwh7SyAx0Z01+si/LkVc1frUPV43sd4IgBklORIb1f7BgMETqDvih5\nZhpDWlMj12FgJVdIX9WnU2I7eGtRnkCyHsnaebH06oDkVGRIq6K+2259mcMdzk1jSNpEahvx\nsdpukF6vSP1ifzG9QN5A0rT5rYq9noxAgORUZEhajSpRjvui8lrmRaUzJK0zPRbp0y6QRmfl\nRvzyyHkFyXr67DGzRFcHJKdcIJ1KUT4Usxe953BuWkP6rkx+pNcbighpY086JJGrp2eQNO2n\nCynzeskfSoDklAukG+itqKYtqFbZ6a8paQ1JG06tI/xZNhKkn1pR06WJfGsPIWnaW3WoScyP\nveADJKdcID1B90Y17Sy6yOns9Ia06Wyq8wn72QiQPqlN50TzTAw+TyFpa7pT6bHRPnnXNUBy\nygXSPOoV1bQ30vNOZ6c3JPNZrWXZlx/jIb2alzEwwUcNeAtJ0ybnU8fonmHvHiA55QJpY3aL\nqKZtkOv4p780h2QdZ4EzwUHaOjyz1JOJfl+vIWn/O5XypzhcNI4AySkXSNpRZaP5v3cRneJ4\nftpD0j6pw91KYyBtuIiqz0n423oOSds6Ope6R/+ArwgBklNukLrRN1EMew9zjKv0h6QtO56a\nOB4Z1RnSoubUclni39V7SJr2aSOq94HTJ2IMkJxyg3Q7RXODoH2G831YPoCkbehB1WY6nO8E\n6eercukiiSfNpQKStuHqjOwhcTxUvViA5JQbpCl0u/usK7ObOn/CD5A0bWRWrsNDCktCWjek\nPNV5QuRbpgSSpk2rTq0WJbo6IDnlBumbaF5S6EnuKen+gKRNrUDXlHi0T3FIm+6rRvn3CP1l\nM0WQtBVdqHzUj0NmAiSn3CBtrVPK/VeCrsQc0s0nkLQvjqBOq4udFw5p67OHU5lB8T5GtUSp\ngqRpj5SnLpEe0OEeIDnlBkkb6/ggurBWl6/F/LXPL5C0VR1LPCMiDNIbzSm7j8CdDMFSB0lb\n1IqqJ3REB0ByyhXS+mrl3P4DG8T+HuUbSOZz9IodCkiBNLcjZZzzVfEvSaQUQtI235adcXUC\n95gAklOukLS73J4Bt6xsFe6JmP6BpGkTcrPCnpoYgrS4dya1miH5rVILSdM+PJIaRPsyUSUD\nJKfcIf2aX6H4rw/hXU7sYQv8BEmbWY0uWF/0YQDSioG51FD8eaaphaSt70c5g+N9lBMgOeUO\nSbuNhkb69Dc5h7F3ZfkKkra4sXoAbwvS2lvLUZ0n5Q/Hk2JImvZCPnWI84BdgORUFJBWls+9\nNcJN6vOIf+SZvyBp67pQrdBNHgPSxjFVqMpo4SeYWqUckrasE1WK8SWCAwGSU1FA0l46hOqx\n9/N8nHEM/x+2zyBpW2/JKBO8Gbfn96frUtlb1kT8gnhLPSRt69jS1COerQMkp6KBpK3um0l3\nMJ/rRK/yX+g3SJo2qUzGLfZ9+TOaUk7fkofqkykNIGnagiZ0WBx3ogCSU1FB0rQPazK/KL1D\nbSJ8mf8gafNqUZd1mjbnJMro9nUS1rdLC0jaxuszs26K6sWj1QDJqSghad/UcDpunba1JTk9\n4DOYDyGZTyNv/HbXDDolhhfgi7n0gGT8P1iLjo310HyA5FS0kLSvq9OIkudOpjMjfZEfIWkb
\nLyaiptNif1XzGEoXSNrq86jsg7GtDkhORQ1JW3govVb8vC+PzIr4kl2+hKRpo497piD2VzWP\npbSBpGlPVqAzXF+aQw2QnIoekvZR1pHh9wQv7plFvSN+iU8hWR0skLTFbahahDuMSgRITsUA\nSetDdykf/TywFB02PvKxRgGJKZ0g2c9Cj/6oSIDkVCyQVlQu/2Po9PVlqM4jbk+3BCSmtIKk\naXOPoqPmRnthQHIqFkjafcGj160dkkeHjHX/kz8gMaUZJO23vhk5Q6M8kjkgORUTpC3HZJh3\ndv86qgpVGhbNbQFAYko3SJr2ajVqG90LVwCSUzFB0t6jZpP7t8imcjdFfkB4MEBiSj9I2vIz\nqEJUx+sDJKdig6SdS0RZzQZFe3cpIDGlISRNe7AMnRfFf5CA5FSMkJZddOu0GF5OEZCY0hKS\ntvBYqvWO66UAyakYIcUYIDGlJyRt8+CsjH5u9yEBklOAxHUwQtK02fWo0aeRLwJITgES18EJ\nSVvTg0r3nhbpIeEHLKQd4/tccndBydOAlFAHKSRNm1iVqGL3F37jPn/AQrpnyJqNDwzYV+I0\nICXUQQtJ2zKjX3Wi0p0fd74L70CFpHX9xfhJdO7S4qcBKbEOXkhGBXMH1yfKaj3a4aCYByqk\nLy/Yb7y9/vXipwEpsQ5qSGYLBjcnymw9vPjzhA9USB9cbr4d+myx0ws6GX273y1dd71I2pbc\n2ZO7Y/yx+i8T2mcQHXPXorDF/XWV2Rs1pL4KpKLTiy41+r7Qrf36XtfLxN++fUlcvFDfn8zV\n9ydzx+zVk7pnBHfMmqfOziGqd/2ne0KL63Krl0z8KrMnWkhf2Tfn3ih+2gw37eLvoL9pF2rl\n451ziWr3ftl+1syBetPuj66rdH1bt2XFTwNSYgGS0m8vd88jyu/+8sYDF5I+dtCaDSNv3q/P\neb/oNCAlHCCFt+Hl3tWIynZ+/I8DFdLOCb17jTEuPm5Y0WlASjhAKtGWGf1qEJXu8lBMB0uJ\nLTxEyClAYvInJLMFg4+2/sD0o/tF4wqQnAIkJv9CMnbMsrtaE2U2Hyz60mvBAMkpQGLyNSTj\nd6TFo1tnEDUYHPURU6IOkJwCJCa/QzL6+fHOOUR1+81gXj84zgDJKUBiOgAgaeYfmLqWJarV\n++WYD8TPB0hOARLTgQHJaO1z3coRVb3sNbdDHEYbIDkFSEwHDCTN/ANTz3zD0tUyr94BSE4B\nEtOBBMlo81u9KxE1vOuHxFcHJKcAiekAg2S08eWuOZTZevy6BFcHJKcAienAg2S0anxrotJd\nX07o1yVAcgqQmA5ISEZfDD6MqEY/l+MQRQqQnAIkpgMVkqYVzOhdjqjB8HgfjgdITgES04EL\nyWj9pM7ZlNXxcfY4RJECJKcAiemAhmT0w+gm5jG93oz9UQ+A5BQgMR3okIwWDKxKVGfgohhX\nBySnAInpIICkaVve7F6GqPnoFbGsDkhOARLTQQHJaPXjHTOoVOdJ0T8YD5CcAiSmgwWS0ZLh\nhxMd2i/aJ1wAklOAxHQQQTKa2y/fvEf8f9FcFpCcAiSmgwuS+cBW8wFEHR93f+FhQHIKkJgO\nNkhGK80HEOW53iMOSE4BEtNBCEkz7xE/lKjmwIjHegAkpwCJ6eCEpGkFb3Yva94j/jN7CUBy\nCpCYDlZIRmvMe8RzO09iXqoWkJwCJKaDGJLR98OPJKrUe4bT5wDJKUBiOrghaeY94lWIjhr8\nXYlPAJJTgMR00EPStI2Tz8qlzA4Pzw1/Ti0gOQVITIBktmLscWRU++Srxr0dPAQyIDkFSEyA\nFOiLe3q3rWJqogoteg574ctNgOQUIDEBktrquZMGd22QZXLKrtep3/g3JV/tApBcAyQmv0Gy\n2/Tty8N7ty5v/Xiq1Lz78Je/LZBYFpBcAyQmf0Ky+2f9jPEDO9fNMDnlNug68PG5cT1jvShA\ncg2QmHwNyd4xv5i39pqXtn48Hdqx9+g3l8W7ICC5BkhMBwAku83fvjm6d8dqoVt7kxZsiXlB\nQHINkJgOGEiBVs193Li1l2lyyqnbeeD4GbEczhWQXAMkpgMNkt3GBZOGd29e1v7x1Nq4tfdt\nVMcoAiTXAInpwIRkt2XRqyMvPSHf4lSx5Q3uCwKSa4DEdCBDCmTe2uvaIOsU90sCkmuAxHQQ\nQLLbGMV9eYDkGiAxHTSQogmQXAMkJkBSAiTXAIkJkJQAyTVAYgIkJUByDZCYAEkJkFwDJCZA\nUgIk1wCJCZCUAMk1QGICJCVAcg2QmABJCZBcAyQmQFICJNcAiQmQlADJNUBiAiQlQHINkJgA\nSQmQXAMkJkBSAiTXAIkJkJQAyTVAYgIkJUByDZCYAEkJkFwDJCZAUgIk1wCJCZCUAMk1QGIC\nJCVAcg2QmABJCZBcAyQmQFICJNcAiQmQlADJNUBiAiQlQHINkJgASQmQXAMkJkBSAiTXAIkJ\nkJQAyTVAYgIkJUByDZCYAEkJkFwDJCZAUgIk1wCJCZCUAMk1QGICJCURSO69O/pP9wulZ/tH\nT0n1CHH3w+jvUj1C3L0y+r9UjxBfSYU0rOX6ZC6fzPa17JvqEeJuRstpqR4h7q5ruTPVI8QX\nIDkHSKkJkJwCpJQESCkIkJwDpNQESAgdzAESQgIBEkICARJCAolC2jG+zyV3F4SfLv4+XYs0\n+w3nGF2U2vki5TS7vuGWbsU/l4ZFGj3dd3tYopDuGbJm4wMD9oWdLv4+XYs0e9/pmqb9keIB\nI+Q0+/zeE7oV/1waFmn0dN/tYUlC0rr+Yvy3cu5S9XTx94LfTrRIs+sXLkr1eBFzml3/eOvC\nbsU+l4ZFGj3dd3t4kpC+vGC/8fb619XTxd8LfjvRIs2+55xHb7xizIaUzhcpp9mNN9a1Mc33\ne6TR0323hycJ6YPLzbdDn1VPF38v+O1EizT735c9tGLFyMv+Sd10kXOaXQ9cG9N8v0caPd13\ne3iikKwHAwT2SuB08feC3060SLNbF9h10ZwUjeaa0+x6EFJ67/dIo1ul8W4PTxLSV/bP5jfU\n08XfC3470SLNbl/iuldTNpxLTrPrgWtjmu/3SKPbpe9uD08S0h9dV+n6tm7L1NPF3wt+O9Ei\nzb7usUJd//eieamekctpdj1wbUzz/R5p9HTf7eGJ3v09dtCaDSNv3q/Peb/odPH36VqE2bdf\nMmHzhjF9d6d6RDan2f/U5nTTtH/Tfb9HGD3td3tYopB2Tujda8yfuj5uWNHp4u/TtUiz/zKs\nx6X3bEn1hHxOs19p/jXznPfSfb9HGj3dd3tYeIgQQgIBEkICARJCAgESQgIBEkICARJCAgES\nQgIBEkICAVKaNILM8jq85XrJ9g2YBRZG832Yr0YJBkhp0gi647nnnhl2GD3sdkmTwpKS/26A\nlNIAKU0KONheN+9fl0uaFB4FpDQLkNKkoIOb6Wtd//S0vDLHTTI+OunExafkVetpHhFkaqsy\ne
S2n6haFM4xbgS3bV7FeuKFj1T1hC+hFX96+SqH54Qk19hatCEjJCZDSpKCDYfS5Pjerw/Q5\n/elBXT+1TquPCt7M6qPrr9F5M2acSTMsCiu70aKfJtGbxuU3Zw4MX0Av+vInyHxW3K8ZNysr\nAlJyAqQ0KejgxOy/9ePqmwfA7mrcyDvVYGVwqqnrY04xfv5sy+5lU7jS+HfbUf4c43OP0Xfh\nC+hFX65lX22ceICWKCsCUnICpDRpBM3cvHnTN1fQtXoB3fiv0dP0jX5qWfNzfTKDl6p9UhEk\nvW+2cZPvpCahBQKQlC8/65B9ut6qsXoWICUnQEqT7Lu/Kfu63foSCvS2fmpd83Mmm23Dm1TI\nyqL2CqQFNF7fmDEutEAAkvLlL9En+lq6Tz0LkJITIKVJI2jC7NkfLPhLNyVcsdBKUyB1yLpz\n/g8/1lQh6Uc30x/O2hRaIAQp9OU7yg7Q78/4TT0LkJITIKVJyp1uf1Cf4MkQpFXUzzhRWDoM\n0lha1vrMEgsoX673qKkff3LYWYCUnAApTVL/DNS6ovmDacrQwiJIP9HduvnnozY2havIvGN7\nU9YlNLXkAkVfrr9H79CksLMAKTkBUpqkQvo0p9mUD4flXK78RNpTp9Z7n99y8sl58/4xKdxF\nd5v3fZ9NFXYVLXDLY2afKV+u78k/ovS2sBUBKTkBUpoU9sCEBafn5Rw9rlCBpC9qW/bQa7ZN\nr1p5hUlh/XE5Joi36CplAbsBypfr+tV0UfiKgJScAMnPvW8+DAKlQ4Dk4/Yc3ybVI6BAgOTb\nfnvvzKxvUj0ECgRIvm1SRr2ZqZ4BBQMkhAQCJIQEAiSEBAIkhAQCJIQEAiSEBAIkhAQCJIQE\n+n/r20q2hiJ7owAAAABJRU5ErkJggg==" + ] }, "metadata": { "image/png": { - "width": 420, - "height": 420 + "height": 420, + "width": 420 } - } + }, + "output_type": "display_data" }, { - "output_type": "display_data", "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA0gAAANICAMAAADKOT/pAAADAFBMVEUAAAABAQECAgIDAwMEBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUWFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJycoKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6Ojo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tMTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1eXl5fX19gYGBhYWFiYmJjY2NkZGRlZWVmZmZnZ2doaGhpaWlqampra2tsbGxtbW1ubm5vb29wcHBxcXFycnJzc3N0dHR1dXV2dnZ3d3d4eHh5eXl6enp7e3t8fHx9fX1+fn5/f3+AgICBgYGCgoKDg4OEhISFhYWGhoaHh4eIiIiJiYmKioqLi4uMjIyNjY2Ojo6Pj4+QkJCRkZGSkpKTk5OUlJSVlZWWlpaXl5eYmJiZmZmampqbm5ucnJydnZ2enp6fn5+goKChoaGioqKjo6OkpKSlpaWmpqanp6eoqKipqamqqqqrq6usrKytra2urq6vr6+wsLCxsbGysrKzs7O0tLS1tbW2tra3t7e4uLi5ubm6urq7u7u8vLy9vb2+vr6/v7/AwMDBwcHCwsLDw8PExMTFxcXGxsbHx8fIyMjJycnKysrLy8vMzMzNzc3Ozs7Pz8/Q0NDR0dHS0tLT09PU1NTV1dXW1tbX19fY2NjZ2dna2trb29vc3Nzd3d3e3t7f39/g4ODh4eHi4uLj4+Pk5OTl5eXm5ubn5+fo6Ojp6enq6urr6+vs7Ozt7e3u7u7v7+/w8PDx8fHy8vLz8/P09PT19fX29vb39/f4+Pj5+fn6+vr7+/v8/Pz9/f3+/v7////isF19AAAACXBIWXMAABJ0AAASdAHeZh94AAAgAElEQVR4nO3de4CMdf//8fee7bJECNFRRSkVioSi033fodItUXSgRHdHpaJbErrJV+V3d5eSziqdnA9JRZEUKpGzsNbux3FZpz1cv7lm5vp8rtmdw/W55jPNzHW9nn/szs5c3vu5rp2Hnd2dA2kIoaijeC8AIScESAgpCJAQUhAgIaQgQEJIQYCEkIIACSEFAZJUK6lD4BlDaIKawa0yqm6NZoCqhUhW6YCE3FBqBwfT/+SWYf/whTtwVg+qZ7soIA0jUY2IW78zTW66afuVdKbs2ipV8Ezr2pkntxlb6Dn9DxrHz29Nb/PPQl2Nc2cQDQk6xhKkXc9cViejVsvBWyr/+5BHoRld+XBh6OVHLsTXfKX/C5R9zoAN0YwPlWVIlnfQe4ieP/U9qWVEcfh8By74F+avgTSlq6dLqI7+rmfErevdLTfdtL0CSG/nUEbLq8/NpJrzNG0WNTHOX021jvDPkpJe4D99c0oUkF7LpswW11xahdJHVfr3oY7CUapeEnEnwhYSUrVbPHXvUJOyv4zuMwTNKiTrOyh7RZGbXjnfgQv+Wf8aSN4+p06WtttEcsfHvH30kKZQ6pADnvcFD1DaCq3sdPrWf8GD9Kj4LBca36n2ZjWzD2kipT9X5Hl/eHwVerbCZSGPwj46NfJehC0kJP+xK76PTimL8nMEnd/B0naWd1D2iiI3PUjeAxfis8YD0lM07b8NPLfwyl9vXa1Kk6GH9PMOPNGkSmbjQZ6rcDf95kVbbShNW9KhWu0+ReXjz8luOqpcC9jec+nqG+pkNf+Ab+8rAJKYqWlTr6yZUf+62RVOHn+xZbWsMwfmiX9TdAK/yT2Y7vHccKDbfB8drZXCb/CspLvrNPOdfIUe9kIyT9p6y4nZzSf5rjemNfsOdRptN8bsyqap/pPzUlP/0LSBNFn/YCn9I3CvzOO7em9+8aWYdsa0u5UP3sP06cL21au1W8AXYj76FY5dWXVaJQ6J6VNs6V7Ls2e76OKAtQZ86mBfXPMBCful0fPvoHm+6avtWdx/W1at1vFb/oX3/YxkOv4BW4c9fOaDEuLaFbBU/cD5Pmtbmumd8DX/GlU8qB1ouvf8GdSx4rVAGaRn6JGcW/tp2m1U/9GnWtOFnv+Uj7ejFoMePIdalWoz76DW4z/WnqUnT7hlYEO6fXCj/r0z6R0tYPtn6Zka1zz8D6KvjO19mSGZZnr+769z79N31Up5J+Bk2d+oyYNP/43qix8+X6GWxskjf3resKwqe70fvU/X8I1W0l39abn35CUnfaxDMk/a24jaD7233p3e641pzZU
gPeu9EvrqSQMDrpwBe2UeP30U1Rw/fr//AtPOmHe38sEbTPdnd3m0a0ra18ZCTCurdOwa0zK+MtOn2NOQrhzRv959+rXHfEU3f+ogX9yAAxLuS+PNv4Pm+aavtue2NJ07oFc1zz75D5EXkvn4B2wd9vCZD0qIa1fAUvUD5/usb1A374T76FX/56h4UF+jPt7zb9f3I/BaoAzSKKrh+fFD+4haeAaX309PaNqn1Nqz0GNNdMZTvd87R1OWZ/f+TMtoskfT3qDrA7cfTZnvejYapK92aoibduaZ59NGzznbc1sHnJxIbY5q+n9A3fk/6mb67YK3XvSi9/0V9Lnps/T5kQbop9bSI1N1SOZJ/6ZbPCfz6+nXG/OafYf6qy+PGmM60Lt85Cw6O/DKad6rgPEBt01MO2Pe3coHbwilzvBsOZZa+xdiXlnFY7cuNWN/sE/xb7pZ37OT9D0zr9X8qYN8cc0HJOyXxpdvB83zzV/tKfQ3zz/9I6fqQf8h8kIyHyDz1uEPn/mghLp2mZfqPXDez1qUk7nbs0Vp3ax9fFbgQd2bWfO45+yjNbKLKl4LlEEaTd6DdhXN9x63jPqe2wyfef+DH0zPCUjX6udcSP/1vN1FTQO3H+37lrqM2oSGZJ7ZKCVfP31MCzjZlny3iTIzDxv/6CLfJxF9T+fp79anNCw1fZY+2vk19a/NE7TaC8k8qTkt1U8P16835jVXuhXdiFby03mUXh4aknl8ACTTzph3t/LBG+I7YEdzUvb4FmJeWeCx2/VZY/3bY5BP0Zy+008Oqwgp8FNX+uKaD4ivEF8aX8Egia/2NbRYPz3+0U1mSOYDZN46/OEzH5RQ1y7zUgUkzzealz1vv/T+z+KfVeGgdqa5npNfUI9K1wKFkB7W31Uj3+3zi+hP7/ui/Pzh+pXSgDRYP7O997gdptMCtx/t+8l/PV0YGpJ55gBqMinfd5Y4WV6FfP/xnu+/nabpN2qWVlh2c/re8/Zx828DdEgv0hTPbYaGl2g6JPOkskzyspyrX2/Ma64E6QTxo452kKg4JKSAhQZAMu2XeXcrH7wh9Lh3iwvoF99CKh598etvT3cfC/YpPHtWrL+fUxFS4Keu+MUNOCCV11phF4JDEl/tqsT/0xOQAg6Qeevwh898UEJdu8xLNUFaqP+cqPWjaWJWhYM6hfpq+k32WZWuBQoh6b/sPSy+bks8F7at4j1pgjRW37YDrfW8PeLZ+4Dt/ZduoOZhIJlmHr83g+jcxzdr5pNFlOnbsCPNKe2g95V2Mc3WAntV/wZ//KT0neIsHdLuLM/PTPM8X0QdknnSAariPbncc70JWHMlSKcKv9p2fUIlSP5Vmcf7rgnGck37Zd7dSgfP88l9N1k7eP5z1BcSePT9e+X99ffl1GSN/mHlT3GAsvieBUIK+NQVv7jmAxL+S+MrGCT+1T7kH6YnIAUcIPN1w1vlw+fPdFBCXrvMSzVBKj+dftVKTqx9XMyqcFCLq9Uu1Y7k1i2pdC1QCElf5xFKGeZvs+dHs9yHPpg1594wkAK2twLJPFPTdr7aNZcyPzKfPEgZvi2voLkl3v2cot1a6ffQh6pn79c+oX+aztIhad1Tt2k9PRfpkMyT9vuvbj94rjcBa64E6Vp6jZ+eQRcEgeRflXm875pgLNe0X+bdDQZpvHdCe88P0PpCAlYWcOwOn+G5umnBPoWxZ0srQqr8qc3jzQck0pdGLyykw5RWbmwoIAUcoEqQKh8+f6aDEvLaZV6qCZLn9u0jnm/N/zLNqnhQe3kO9af0gFbpWqAYklaDxB+YT/b9uebfYSAFbG8FknmmtyP/Sz/hqPlkDvl+VDyPfjb+0SQ60/h7Xfmza73vB3p+0riOFpr2xAtpHo0syu6leSGZJ5Wm+W58fKFfb8xrrgRpnLgV77kJ4LmVcT+9qZ/+rNLPSOaFVv5DiG9nzLsbDNJQ77YX0GrfQswrCzx2c+iUogoX+T5Fabpvzz7T98y81iCf2jQ+4ID4Cvel8e+geb75q51LzJhi+hnJfIAqQQp5+MwHJdS1y7xUM6QtKSeX3266SVH5oM6iAVoP7xYVrgWqIV1Dn3g/3KP/ubmafqr8knCQTNtbgRQwc6vvhll7WmM+2d7314A96dn8N2mH6/pvN2vac3SZ9/3v1K4gjd/BwfdZ+nh+Yjil1Tv670e9kMyTmtAP+ulB+vXGvOZKkPZV5788XZiauU3/J96bGk9WgmQeHwBJ7EzA7gaD5P3tfVFm2gHfQswrq3Ds/kn3mVZpOl5NfTcDH9b3zLTWYJ/aPN58QLyF+tL4D4t3B83HwvzVvtL7TVIb1el7MyTzAaoMKdThMx+UENeugKWaIXkWMrtaU800q+JBLand6HDVJhUPRgwgfUTNdKiL0m/WtFrkuR6VP1NP/3XRDLpBC3ZdMG9vPli+7X2ZvyOZZq6ijvqPz0UN03abTmpv02X66Ue9Pxb6m5FCd+o/++YPoOr+L22HlCfpJfOeeCFpw1I7nV7uh2Se9Ij3t+mba/p+/S3W7IP0zdccrfdeFPp//kdfyfH+fu0V6uAZuKa2fuUx71XAePM1wbwz5kMYDFKa/luUl+lK/ptasbIKxy6vesq3QT/FILrJs76NJ+h7Zl5rkE9tHm8+IOG+NP4LfTtonm/+ar9FLTw/uW+pmbPXf4i8kMwHqDKkUIfPfFBCXbvM++Y9cMYX5m06nUx37ApyUO+jkfrv+ipdC1RD8vw8cvIjw7pl5C7Tj/VZzz3X+px5dOLz2zemZNzVP8h1wby9eVd92/taSTn/8PV+wMyedMbAfw84lR7Ub0Txk+VdqdljT3Wis3eblvlJdUptfnWzTGr0q/+cDyk9Z59pCz+krak0XPNDMk/Kq00t/9Wjhu8/YNOaK/1BVtPeq0aZl/69bVWq4r07RUF1avNwj2pj6W+BexUwPuA7kmlnzLsbDFKf3D4jeqdlLDUWYlqZsVfGf0IvUePDwT7Fzjp06aBeNQbqe2Zea5BPbR4fcEC8hfjS+PLtoHm++atddj2d2v/2XHrdOEReSOYDVBlSqMNnPiihrl3mpXoPnPGFKc4lz0/JplmVDupiyk3ZUvFgxAJS2ettctMb9vZ+sYecmdVowG7tjqr1ftWer511cTBIpu0DDpZ3e1/iV7jDAmaW/fey2mk12r1Zrt/FhJ/USl68OCeryZMBSrTdz7aunV6z/RvGXVS14/Uq3L3KB0m7OlX/zbEXUsCktV1PqHL+63vo0sA1B4GkFQ5vUzuj1iVD/b8S/K1jTrVLv2B0ReBeBYwPgGTaGfPuBoP0yoIO1ap1WMQXYlqZsVf8LkItxP0KA47XH/qevbHKK8K01iCfOmC8+YB4C/Gl8eXfQdP8gK92ybgLsqu29/7I6j1EvrsImQ5QZUihDp/5oIS6dpmX6jtwxhfmbv3OP6ZZlQ5q+WnUTqt4MBRAQnFM5WOQLD8eItGL7qCMMt0xRSpASuIAKUhRHZTjjWofjbxVsAApiQOkIEV1UB6q9BdHqwFSEgdIQbJ/UNYOvpyaH468XdAAKYkDpC
DZPyhfpVa71fbD/QEJIQUBEkIKAiSEFARICCkIkBBSECAhpCBAQkhBgISQggAJIQVFAenQ/kgdLimKuI2tDhyNzVzPig/GZvCBY7GZu784VivefzxGc4tLIl9z7BWrFR8KuWLx8P0oIO1nkTqi7Yu4ja32HI/NXHbYwl7ZandJbOayYu1AjCaXxmjuIa0oRpPLYjT3oHYwxCV7ASl4gCQCJCNAkg6QRIBkBEjSAZIIkIwASTpAEgGSESBJB0giQDICJOkASQRIRoAkHSCJAMkIkKQDJBEgGQGSdIAkAiQjQJIOkESAZARI0gGSCJCMAEk6QBIBkhEgSQdIIkAyAiTpAEkESEaAJB0giQDJCJCkAyQRIBkBknSAJAIkI0CSDpBEgGQESNIBkgiQjABJOkASAZIRIEkHSCJAMgIk6QBJBEhGgCQdIIkAyQiQpAMkESAZAZJ0gCQCJCNAkg6QRIBkBEjSAZIIkIwASTpAEgGSESBJB0giQDICJOkASQRIRoAkHSCJAMkoMSEtenXwUyNfm7Mumj0DJB4giVwEaecbl5C/c/p+kGd3zwCJB0gi10D64O56RO2fe2/K68/2vTyLKLfbh4W29gyQeIAkcgmkjdcT5fRZZHy4/dN+DYjOemGbjT0DJB4gidwBqaATtfx8R8DlhbNvzKDaz/wpvWeAxAMkkTsgPUqX51feZNX91aj2szsqXxA2QOIBksgFkBa/eQM1WB10o3UPVqVT3pLbM0DiAZLI+ZC6E1HTFaE2+6NfOl3xvcyeARIPkESOhzSZzhnywa4wGy5qRxmDg9zwCxUg8QBJ5HRIBeekLY606aST6ELr35QAiQdIIqdDeoVuirzYdTdQlREFFvcMkHiAJHI2pKUfnZT5o5Xlvl6LrrR4xyFA4gGSyNGQ3iaiIdbWu7oDNZhtaUtA4gGSyNGQbqRe71q9H9CuQakZI6xsDEg8QBI5GlLdE2XuTvdxLbp+a+TNAIkHSCInQ/qe/i615lWt6PxfIm4FSDxAEjkZ0nM0Tm7RebdQvQWRNgIkHiCJnAzpRloiu+whKTmR7jEESDxAEjkZUsu0ndLrnlQl9d/htwAkHiCJnAypQT0bC59Xl24I+ysHQOIBksjBkPZmNLez8lUt6OzvwlwOSDxAEjkY0la60tbS8x+gqhNDXwxIPEASORjSCrrZ5uJfzaHeIZ8cBZB4gCRyMKSvqK/d1S9qTBf9EOIyQOIBksjBkD6lR20vf/MNlPNC8LtFABIPkEQOhjSJnotiByadQFf+FuwCQOIBksjBkP6PJkSzByvaUO33gpwPSDxAEjkY0jB6O6pdKHg6k3r8XulsQOIBksjBkB6mz6PciYVNqOqQik/XBUg8QBI5GNJdtDDavdg5siY1ej3wlw6AxAMkkYMh3UQhn4TLehseyKSLAx46C0g8QBI5GFIn2qhiT5ZcRSkdJ4u7vwISD5BEDobUIs3qEwNF6OOLier8a5n/I0DiAZLIwZDOqKVsbxbeWZ1S2r7q/b0DIPEASeRgSCecqXB/tk24hKhm368ByRQgiZwLqTS1pdpd+q7/iUQXPL8JkIwASeRcSHvoKrW7xFjem53SKKvHp4p+9qoQIIkAySj+kDZRN7W75O2XJ08jqt/nPTuv9RchQBIBkpEKSAfH9ek5vMB3evuzvW554ncZSD/TXWp3yd/u+f+sTlTl6rGrFA8GJBEgGamANGLw5ryxA8v0k+X9JhQffe+fRRKQvqYH1e6Svz3H2c7P7juTiJo9PDvc68XIBkgiQDJSAIl12eT5rnTDKi+czms1bW/ndRKQpll92m/J/L+1W/ZsuwyimjdOWKNqMCCJAMlIAaQl3co9b+//yPvBY+OLjnzQ95gEpHfpebW75E/8+nvTGz3qEqU0f3iWkm9MgCQCJCMFkObeob8dMtH7wZ6BnTv33qifWtjC07Lw/1TvFXon8kbRVr5iZLt0zzem7h8Vx/6TIWSqjJ+KBOlO/a0PUsmDE/YXT+2lI1x+m6dfSiJV9h/6JOJGtiqv8DGbckd9oqq3fHIwurllWml0A0JWccWqwopFWozmlmllIS45bhXSD76bdlP10yu6HPG8vWu6cZmFm3bD6GO132T9BbtnQ+GCfzUiyu3+gfxTu4pw006Em3ZGCm7a7emyQdMOdF2tn/65s37TqbcMpMdoltpd8hfiLkKFc+71fF+q2Wuq7Z+XAEkESEYqfv09+qHNO555pFybP10r7j3h4LFPu+2UgDSQvlK7S/5C39euYMbddYjqPmzzD0yAJAIkIxWQisf37jXKs/mYoZq2dXivHo//yi+yAOkusv5K5TKFvdPqrk9vy6W0v38i8wJnRoAkAiSj+N9F6Fb6We0u+Yt07++tY5sSNR4p/6BCQBIBklH8Id1Iq9Xukj8LD6NY0D2Dsrp/LTkYkESAZBR/SH+jDWp3yZ+lxyOteboRUfNx22UGA5IIkIziD6kjxeAe2szyA/vyJ7dLoXrDLby6sxEgiQDJKP6QLiOV9ygVWX+E7Pd3Z1PNQeusbg5IIkAyij+kFplq98hI5qHmfzxSg3LusfjrcEASAZJR/CGdW13tHhnJPWfD1pH1KaO7pV/EA5IIkIziD+mMumr3yEj2yU/yXjiNUq//MvKGgCQCJKP4Q6rfUO0eGck/i1DBexcSXRLslS0CAiQRIBnFH1Ktxmr3yMjO03EVfnApUevZ4TcCJBEgGcUfUtWmavfIyObz2s28kugfS8JtAUgiQDKKP6T0C9XukZHtJ4icfSmldgnzvP6AJAIko7hDKqZWavfIyP4zrRZOOoOyHwp5JzxAEgGSUdwh7aXL1e6RUTRPWbxzTF2q9Wxe8AsBSQRIRnGHlE+d1O6RUXTP/b3t6erUYFzQ+1wAkgiQjOIOaTtdp3aPjKJ9Ev0/+mVS0ylBLgAkESAZxR3SRuqido+Mon81ip+6pdJl8yqdDUgiQDKKO6Q1dLPaPTJS8bIuC6+k1EovmA5IIkAyijuk7+h2tXtkpOb1kT4+m3KfCfytAyCJAMko7pCG0r1q98hI0QuN7XyuBjX+0HwOIIkAySjukN6gx9TukZGyV+xbe1sqXbNMfAxIIkAyijukgnuXRdzGVgpf+nJBK8p8YIvxESCJAMko7pCOaPvU7pGRyteQLZzUkE4a538BQEASAZIRIFlr22NZdKHvbuGAJAIkI0Cy2vIulNJd/1U4IIkAyQiQrPfRWVR9RD4gmQIkI0CSaOfw6nTePEASAZIRIEm15mZK7X8AkIwAyQiQJJt2FtV/NSaTAUkESI6HxHYMyaKrwzyA1n6AxAMk50Nih9e3p+zHonnNvxABEg+Q3ABJ2/f/atF5c5UPBiQeILkC0n62pjul9t6seDAg8QDJJZAY+6IxnfT/1A4GJB4guQYS2/5YJl2j9JcOgMQDJPdAYmxRK8p+WuFr0gASD5DcBIkVjK1Bzb9VNhiQeIDkKkiMre5KGYNCPP+ddIDEAySXQWJsSgNqUvmZhmwFSDxAch0ktql3Snq/P1UMBiQeILkPEmMfN6TTvlAwGJB4gORGS
GzbA6kpvSVeFz1EgMQDJFdCYmzmmXTKp9EOBiQeILkUEtv+QFpKtPcZAiQeILkVEmOzz6Z670Y1GJB4gOReSGzHYxnUZV0UgwGJB0guhsTYN+dT3cn2BwMSD5BcDYntfDqTuvxhdzAg8QDJ3ZC835RqT7I5GJB4gOR2SCx/SCb9fbWtwYDEAyTXQ2Lsu1ZU46VCG4MBiQdIgMRY4biq1MbGa2wAEg+QAElvxRVURf4hf4DEAyRA8jWpFrX6TnIwIPEACZD8/X49ZTwg95A/QOIBEiDx3qtHTefLDAYkHiABkmhjb5J6yB8g8QAJkMx91JBO+8zyYEDiARIgBeR9yJ/VR1cAEg+QAKlCs86iem9b2xSQeIAESBWz/ugKQOIBEiBVbsF5VMfKHVkBiQdIgBSknU9mUrdNETcDJB4gAVLQvmtOjaZH2giQeIAESMHLfywtpV+EOzoAEg+QAClUs0+jpovCbgFIPEACpJBt7k1VRoZ7nBIg8QAJkMI0qSZd+VvoiwGJB0iAFK6VbenE0M99B0g8QAKksBWOzKTuoe7HCkg8QAKkCC1qSmctCH4RIPEACZAitaNfSvpjQR+GDkg8QAKkyH1cj1otD3I+IPEACZAs9Md1lDuu8tmAxAOkCh3YHakj2v6I29hqb0ls5u4+bGGvIvXfHOq6oeKZe0qjnhu8Yq0oRpPLYjT3UNKt2AMpxCX7VEA6VhKpMq004jb2Ko/RXCUrXt+GTllQ8cyEXnHQkm/FWozmlmllIS45rgISbtqFKsid73DTjoebdoBkuRmN6IIl5jMAiQdIgGS9TTdTNfMD/gCJB0iAJNOEKikD8vlHgMQDJECS6pvTqDV/DRhA4gESIMm1+XqqbTzzHSDxAAmQJCt8Oi39ad9JQOIBEiBJ90Ud+rv3mVEAiQdIgCTfLy2p8WIGSKYACZBslNfP+3twQOIBEiDZ6pXslH47AYkHSIBkr69PpTZrAMkIkADJZpv/QfXDP1uX/QBJBEiyJRkkVvh0qvF7cNUBkgiQZEs2SIx9UZdukniBP+sBkgiQZEs+SLs3t6CzZF8K3UqAJAIk2ZIQUkleb6r2pvrBgCQCJNmSERJj/0//PbjqwYAkAiTZkhMSW3gqXfa74sGAJAIk2ZIUElvfkerPVjsYkESAJFuyQmIFg1Iz/6N0MCCJAEm2pIXE2Acn0N1Bn4vVZoAkAiTZkhgS++lsukbhX5QASQRIsiUzJLbhcjr3F2WDAUkESLIlNSSW153qf61qMCCJAEm25IbECh9LqfahosGAJAIk2ZIcEmMTMtJfUDMYkESAJFvSQ2Kf1qB+4V682XKAJAIk2ZIfElvciLruUDAYkESAJJsDILHVzemSddEPBiQRIMnmBEjsz2vpnBVRDwYkESDJ5ghIbFdfqjs/2sGAJAIk2ZwBibGRqTnvRDkYkESAJJtTILHJVdJGRzcYkESAJJtjILG5talfQTSDAUkESLI5BxL76Sz6+7YoBgOSCJBkcxAktr4NXbzG/mBAEgGSbE6CxPK60Snf2x4MSCJAks1RkFjhY3TCF3YHA5IIkGRzFiTGXsrIfMXmYEASAZJsToPEpuamPGZvMCCJAEk2x0Fii06mW2095x0giQBJNudBYr+dTx022RgMSCJAks2BkNjWq6npSvnBgCQCJNmcCInt7EX1F0sPBiQRIMnmSEiMPZVSa6HsYEASAZJsDoXEXkitLvuMxoAkAiTZnAqJvZqe84ncYEASAZJsjoXE3sjIfFdqMCCJAEk250JiH2RlTpYZDEgiQJLNwZDYF1XTJkgMBiQRIMnmZEhsZm7qi9YHA5IIkGRzNCS2oFbKc5Y3BiQRIMnmbEjsu5NosNVtAUkESLI5HBJb2oAesLgpIIkASTanQ2IrTqO7rT01OCCJAEk2x0Niq86g3paeXQiQRIAkm/MhsTXnUrd8C9sBkgiQZHMBJLbhYupq4aF+gCQCJNncAIltakVXR37hF0ASAZJsroDE/uxAbbdG2giQRIAkmzsgsbzr6NLNEbYBJBEgyeYSSCzvemoe4cXIAEkESLK5BRLb1YPOWR12C0ASAZJsroHECm6jxr+E2wCQRIAkm3sgscJ7qdHyMJcDkgiQZHMRJMYG0ck/hL4UkESAJJurILGnqc63IS8EJBEgyeYuSOz5lBPmhboMkESAJJvLILFxoZ+mC5BEgCSb2yCFeZouQBIBkmyug8Tezcx8J+gFgCQCJNncB4m9n5X5VrDzAUkESLK5EBL7JCcj2BPeAZIIkGRzIyQ2LagkQBIlMqSD4/r0HF7g/2BW3xvv/xGQ7KQAEptZNW1ipTMBSZTIkEYM3pw3dmCZ9/SC3ssLvuhXDEg2UgGJzaia8XbF8wBJlMCQWJdNnu9KN6zyftDvq4DLAEkiJZCCSQIkUQJDWtKt3PP2/o/007s7f/Wvmx9dq58sOeBp7+5IHdH2R9zGVnuPx2bu7sPagdgM3lOiZMyMnIx3A88p1oqUTK5caYzmHorZiscXYRQAACAASURBVMtiNNcDKcQl+6xCmnuH/nbIRP3tus5Pbi+a2GO/5+TCFp6Whf+nKBZ9WzXzi3ivAfHK+KlIkO7U3xqQPLfwSm9d4Dm56j5Pvx2PVJlWEnEbe5XHaG7ir3h2duZn5o9LE37FFSvVSmM0+a9f8TGrkH7w3bSbqp9mnTd43g6calyGn5EkUvMzkt7UKgEvRYafkUQJ/DPSni4ePAe6rvZ+G+s9Q9OOdV8ESDZSB4l9nJX5nvgIkEQJDEkb/dDmHc88Uq7Nn65pU3utZC/3PgJINlIIKVASIIkSGVLx+N69Rnk2HzPU8y3p7dtvfGIbvwiQJFIJKUASIIkSGVKYAEkipZB0Se/7TwKSCJBkczsk9lFW5ge+U4AkAiTZXA9JSAIkESDJBkjsw8wq3sfMApIIkGQDJC4JkESAJBsgMa+kTwHJHCDJBkh6UzKzPwMkU4AkGyB5+8AjCZBEgCQbIPmanJE9G5B4gCQbIPmbnJGzEJCMAEk2QDLySJoZm8mAxAMk6ZIOEns/PfuL2EwGJCNAki75IBVPTc+JjSRAMgIk6ZIQkvZOes60WEwGJCNAki4ZIR2YFBtJgGQESNIlJST2RnrOdPWTAckIkKRLTkgeSblzlE8GJCNAki5JIcVEEiAZAZJ0yQqJvZ5efa7iyYBkBEjSJS0kNjFNtSRAMgIk6ZIXEpuQWmOB0smAZARI0iUxJI+kE5eqnAxIRoAkXTJDYmOowQqFkwHJCJCkS2pI7DE6Y426yYBkBEjSJTck1p8u2qJsMiAZAZJ0SQ6psAddvkPVZEAyAiTpkhwS23k1/S1f0WRAMgIk6ZIdEtvemnoUqpkMSEaAJF3SQ2KbzqcH1UwGJCNAki75IbG1Z9IwJZMByQiQpHMAJPbTSSnjVUwGJCNAks4JkNiimmlvKpgMSEaAJJ0jILG5OZkfRz8ZkIzkIFU1lQlIivtLIbFPMqt9GfVkQDKSg3SLp3My2nS74cKUFvcDkuL+WkjstdRa30c7GZCMpG/aTW22U3/3R5PpgKS4vxiS
ijuwApKRNKRmH/ve/685ICnur4ak4A6sgGQkDSlzgf87UxYgKe4vhxT9HVgByUgaUoNe3nflt9QHJMX99ZCivgMrIBlJQxpG5z8wYsTApvQEICnur4cU9R1YAclIGlL5f+qTp9pPlwKS4uIASb8D6y1R3IEVkIxs/EG2/M9lP2wqs8IIkKSKB6Qo78AKSEY2IB358TOmlQCS8uICKbo7sAKSkTykF3KJlmpP3WGJEiBJFB9IUd2BFZCMpCFNpC6veiC9nT4GkBQXJ0jR3IEVkIykIV3QXzvigaQ9eTYgKS5ekKK4AysgGUlDqvKlD9K8DEBSXNwg2b8DKyAZSUOqO8MH6ePqgKS4+EGyfQdWQDKShnRVh8M6pD3NrgEkxcURkt07sAKSkTSkr9MaP0h39ame8R0gKS6ekNjjtu7ACkhG8r/+XnCRfs+GS76x4giQZIorJHt3YAUkIzsPNS9YuXKvZi1Akii+kGzdgRWQjKQhtZll0RAgyRZfSLbuwApIRtKQGo4DJGdCsnMHVkAykoY0rennxwEpJsUbkn4H1gfkJgOSkTSkdudTZoNT9QBJcXGHxNY2lrwDKyAZSUNq27GTP0BSXPwhSd+BFZCMbD9B5MH1gKS4BICk34F1ksRkQDKyDWlBLUBSXCJAYnOrytyBFZCM5CHN7NWubdu2rXNrA5LiEgIS+zCj2kLLGwOSkTSkKZTekBpUoSst/T0JkCRKDEjsfyknWb7bHSAZSUNqcV2RlvZbyctXFAGS4hIEEhtKZ6+3uCkgGUlDyp2paWm/atpDAwFJcYkCifWl1hbvLARIRvIP7JujadUXadriBoCkuISBVHA9dSmwtCUgGUlDuujmY9p5QzRtWlVAUlzCQGLbW9F9ljYEJCNpSO9SJ+3ptH7DT74MkBSXOJDYusb0nJXtAMlI/tffU0ZrxVcTNVoOSIpLIEjspzqpVp5ZCJCMbP5BdsMaa/dcBSSJEgkS+zIna2bkrQDJCK8hK507ILEP0msuibgRIBlJQzrRKBeQFJdYkNj/UaPfI20DSEbSkLp6uyS7Gf6OpLoEg8Qeogu2RtgEkIzs3rTLbz8TkBSXaJAKe1CnCI89ByQj2z8jLW8BSIpLNEhs5xXUK/wWgGRkG1J+NiApLuEgsc3N6PGwGwCSkV1I5SMbWoFUtDdSR7UDEbex1f6S2Mzde8TCXtlqX2ls5u49rB20+S/XNEyZEO7yMptzI3VYOxSjybFacbFWHOKS/cEhNffWrDYNsgLpWEmkyrTSiNvYqzxGc1214l9qZswMc3ECrjhCWozmlmllIS4Rf3ENAumiji8dswIJN+0kSrybdp6mZVb7KvSluGlnhD/ISucuSGxiapjH+QGSESBJ5zJIYR/nB0hG0pDSc6qaAiSFJSikcI/zAyQjaUgDzk1vfdMNF6Zc2OMWT4CksESFFOZxfoBkJA1p6vl5+ru158yIhAiQJEtUSGEe5wdIRtKQzpvqe/+/5oCkuISFxNadGeJxfoBkJA0p80vf+4+zAElxiQsp5OP8AMlIGlKDnuX6u9LO9QFJcQkMKdTj/ADJSBrSv6nxfcOGDTyXngQkxSUypBCP8wMkI2lIZaPq668hW2dYKSApLqEhBX+cHyAZ2fiDbPmfy37YVGaFESBJldiQgj7OD5CM5CEV79S0w5Nf2ARIqktwSMEe5wdIRtKQ1tYdrZW0JKqxApAUl+CQgj3OD5CMpCHddP5G7V16ZeNlNwOS4hIdEtt8XsXH+QGSkTSkuu9r2o3NNO39RoCkuISHxH5rmPJSwBmAZCT/B9mFWmnNxzVtfiYgKS7xIbHFJ2R8ZP4YkIykITV6Q5tPCzVtEv4gq7okgFTxcX6AZCQN6e56T5x6ZqlWcAF+RlJdMkCq8Dg/QDKShrSzNdVeqmm31PgFkBSXFJACH+cHSEY2/iB7QH82h+W7rDgCJJmSA1LA4/wAycjOQ80PzymwpAiQ5EoSSObH+QGSkR1IW+hzQIpBSQLJ/Dg/QDICJOlcD8n0OD9AMgIk6QBJPM4PkIwASTpAEo/zAyQjO5COrdyvWQyQJEoiSMbj/ADJyP4TRG4BJMUlEyQ2hk75HZBEkpC+vabxNbP1E0efw8u6qC6pILEH6aJtgMSTg7Q0I+WUjJSPNW3eWXQOICkuuSAVdqerdwGSkRykrjVWaQUtm26/mU4YfzwYHECKouSCxPIup/6AZCQH6bQHPW/mUJW0+5gVRoAkVZJBYusb04uxmMucDyn9v543W6nDb9YYAZJUyQaJ/VQ79Z2YDHY8JHrd8yaf5lh1BEgyJR0kNi8ne15MBgMSINkv+SCxD1NPWhmLuYAESPZLQkilQ+mcjTGY63hITy5dunQWjV+qB0iKS0ZI7E5qm6d+ruMhmQMkxSUlpF3XUE/1c50OaZg5QFJcUkJiW86jocrnOh2SdIAkUXJCYr81SHlF9VxAAiT7JSkktrBq5ueK5wISINkvWSGxD9Nr/aB2LiABkv2SFhIbR6f+oXQuIAGS/ZIXEutPl+4Iu6FkgARI9ktiSAXXU9eCsFvK5QpIR378jGklgKS8JIbEtrekhxTOdQOkF3KJlmpP3WGJEiBJlMyQ2LrTaay6uS6ANJG6vOqB9Hb6GEBSXFJDYt+fkDFV2VwXQLqgv3bEA0l78mxAUlxyQ2LTM3O/VTXXBZCqfOmDNC8DkBSX5JDYayn1f1E01wWQ6s7wQfq4OiApLtkhsUF0wVY1c10A6aoOh3VIe5pdA0iKS3pIhbfQVbuUzHUBpK/TGj9Id/WpnvEdICku6SGxne2pj5K5LoCkLbhIfzDSJd9YcQRIMiU/JLapKY1UMdcNkDStYOXKvZq1AEkiB0BiP9dJfUvBXHdAkgiQJHICJPZldpU50c91OqRzzAGS4hwBiU1OPXF51HOdDqmtOUBSnDMgsWF09oZo5zodknSAJJFDILG76bJon1jIFZB2zZ789txdgKQ8p0DadR3dGuVcF0Da9890/dffKb0OAZLinAKJbWlGT0Y31wWQ7si4++2ZX7zWlfoDkuIcA4n9dnLKf6Oa6wJINd/2vR98IiApzjmQ2KLqmZ9FM9cFkLLyfe8X5gCS4hwEiX2UXnNpFHNdAOni733vX2kPSIpzEiQ2nk5Za3+uCyAtaLm4XNNKZ533MyApzlGQ2ED9xZrt5gJIretQ1TPOyKZGTazcuwGQJHIWpMKbqLPtJxZyAaSL28jcuwGQJHIWJLa9FT1gd64LIMkFSBI5DBJbdwb9x+Zcd0Aq2ucNkBTnNEhsea209+zNdQGkTf+oihcai81gx0FiMzKrfWNrrgsgXVGj16DB3gBJcc6DxCam1FtlZ64LIFX93gogQLKRAyGxwdR0k425LoBUNw+QAMkoIqTCHtQpX36uCyA9OgKQAMkoIiS2swPdLj/XBZCOXdV20Ghv/KyD4/r0HF7AP1zQeSkg2cmRkNjmpvSs9FwXQBpNVPG3diMGb84bO7DM/9G+27sBkq2cCYmtqJM6WXauCyDV7/bdxi3ejHNYl02e70o3rDKkTbodkGzlUEhsXna
VLyXnugBSVqVfNizpVu55e/9H/o/6HvFBKjngae/uSB3R9kfcxlZ7j8dm7u7D2oHYDN5TEpu5u4u1ohhNLrW01Vspp2yQm3soZisui9FcD6QQl4h7LgRAumiVVqG5d+hvh0z0fnCw90rNB2lhC0/LKm6M3NhTdFVpvNcQt8r4qQBI33b8pcKGc+/U3/ohvfii5oe0Wv+j7dqjkSrVjkXcxlbHymIz17Pi47EZfKw8NnOPlsRqxUctrvjwVTRUam7cVyxdiVYS6qLgkNo2pGqnejPO+cF3026qfnpl7yIDkjf8jCSRU39G8rS2QeqHMnNd8DNSu05Gxjl7umzQtANdV+unx3Tr2bNnl+6jAMlGDobE5mbW/FlirgsgGR1cz0+OfmjzjmceKdfmT/ft/m3zDwCSjZwMiY2iiySeNdJFkBbU4ieLx/fuNcqz+Zihvo9x085ejobEutOd1jd2A6SZvdq1bdu2dW7tYLwqBkgSORvSn01oguWNXQBpCqU3pAZV6MpZgKQ4Z0NiP+RmLbS6rQsgtbiuSEv7reTlK4oASXEOh8TeoNOtPqTCBZByZ2pa2q+a9tBAQFKc0yGxe+m6QmtbugBSlTmaVn2Rpi1uAEiKczyk/NY0zNqWLoB00c3HtPOGaNq0qoCkOMdDYr+flD7N0oYugPQuddKeTus3/OTLAElxzofEvkiv86uV7VwASZsyWiu+mqjRckBSnAsgsaHUysrfZd0AyduGNcetOAIkmdwAqfB6GmBhMzdAKt6paYcnv7AJkFTnBkhsy1kpkyJv5QJIa+uO1kpaEtVYAUiKcwUktii76vcRN3IBpJvO36i9S69svOxmQFKcOyCx/1HjLZG2cQGkuu9r2o3NNO39RoCkOJdAYr3pxkibuABS5kKttObjmjY/E5AU5xZIeRfS8xE2cQGkRm9o82mhpk2qD0iKcwsktrJWxqzwW7gA0t31njj1zFKt4AL8jKQ610Bin6Sd/EfYDVwAaWdrqr1U026pUfE5UAAp2twDiT1K7XeFu9wFkDTtgP632OW7rDgCJJlcBKmgIw0Kd7krIGmH5xRoFgMkiVwEia1vlPJOmIvdAWkLfQ5IMchNkNi8zBN+Cn0pIAGS/VwFiT1P520PeSEgAZL93AWJ9aDeIS8DJECyn8sgbW9GL4e6zOmQthdrW45px1buB6QY5DJIbFn1rK9CXOR0SFVmaGTpAX2AZCO3QWLvpDRaH/wSp0PKvn0RvbHYHyApznWQ2AC6JvjTCjkdUk8yBUiKcx+k/DY0NOgFTodUMvMtGjbZHyApzn2Q2Jp6qVODne90SJ46rbMCCJBs5EJIbHZG7WBPK+QCSJq2e+bEN+ZaesJiQJLKjZDYMGoZ5GmFXACp7NEM/QekqmMASXWuhFTYme6tfK4LII2hGyfNnvnatfQ2ICnOlZDYlrPplUpnugBS00d87++5GJAU505IbFFO1e8qnucCSFlf+d7PygYkxbkUEnuNGm+ucJYLIFWd4Xv/RTVAUpxbIbE76YYK57gA0uVXHtPfHbnmCkBSnGsh7WxFowLPcQGkWSmn9B/xbL8GqV8CkuJcC4mtqpUxM+AMF0DSPm+i//r7fEsvIQtIMrkXEvs0re5q88dugKRpeT9afOoTQJLKxZDY43S5+WmF3AEJT34Sm8FuhlTQiR4yfegOSHiEbGwGuxkS23BKylviI0ACJPu5GhL7ukqN5fwDQAIk+7kbEnuBzt1mnAYkQLKfyyGxW+mfxkl3QMKTn8RmsNsh7biAxvtPugBSizW+9580BSTFuR0S+7FG1gLfKRdA8j+LUMlwvNCY6lwPib2X0mid94TjIZme+wQPo1AdILEH6OoC/b3jIa16ibrerdf339sBSXGAxHZdQU/q7x0PSdOuXW8FECDZCJAYW1s/9WPmCkhyAZJEgORpTmbNFa6AdKJRLiApDpD0RtDFeW6A1NXbJdnNBgKS4gDJW1fq6wZI/vLbzwQkxQGSty1n0UT3QNKWtwAkxQGSr8XZNda6B1I+nkVIdYDk7z90WalbIJWPbAhIigMkf4VX0wuOh9TcW7PaNAiQFAdIRmvrZC2JzeREg3RRx5eOAZLiAIk3hZruiM3khIEkFyBJBEi8Q33o/thMThxIG6e/P3MHIMUgQOIdOnhG6ucxmZwokKY18971u803gKQ8QOId0uanNdgQi8kJAmkc5fR6cfL4W3NS3wQk1QES75BW9AB1j8XkxIC0KrXtTu+JvMsyLL0EJiBJBEg8D6SdF9LrMZicGJDuqLnbf2p3zXsBSXGAxNPvIvRdlRqr1E9ODEin9eMn72kMSIoDJJ73vnYj6IpC5ZMTA1LWWH7y/3AXIdUBEs8LqbAjjVY+OTEgVRvNTz6PxyOpDpB4vnt//1oza5HqyYkB6fx/8pOdmwOS4gCJ538YxWQ6P0/x5MSA9HjGav+pJalDAUlxgMQzHo90Ez2seHJiQNpZ4+Q5+vuyKbVO3F2ZDSBFFSDxDEgbG6ZOVzs5MSBpC6rTaTf16VKfai+x4giQZAIkHn+E7PTUU7YonZwgkLStA04motMH5VtyBEgyARJPPNS8P/VSOjlRIHk6sOOgNUWAJBcg8QSkvHPpTZWTEwiSTEX7InXUwja2OlASm7meFR+MzeD9pbGZu++IdihGk8tiNPewWPHSrBPXKZwcqxUXa8UhLjmgAtLR45Eq00oibmOrkvLYzI3dio/HasWlSbjiUn56JF1zTN3kv2LFgYlHwOKBfQHhpp0o9jftGCtoS+PUTU7Sm3aAJBEg8QKe127VCTk/KJsMSLIBEi/JIbEJdOFOVZMBSTZA4iU7JNaVBquaDEiyARIv6SFtaJA+V9FkQJINkHhJD4lNTTltq5rJgCQbIPGSHxK7i+5SMxmQZAMkngMgbWuc8oGSyYAkGyDxHACJfZVRe62KyYAkGyDxnACJPUF/VzEZkGQDJJ4jIOW3ogkKJgOSbIDEcwQk9lO1qj9GPxmQZAMknjMgsXF0ya6oJwOSbIDEcwgkdi0NjXoyIMkGSDynQPqjTsb8aCcDkmyAxHMKJPYunb09ysmAJBsg8RwDid1G90Y5GZBkAySecyD9eUbKh9FNBiTZAInnHEhsdlr99VFNBiTZAInnIEjsEbo+qsmAJBsg8ZwEKf9iejWayYAkGyDxnASJLcmuvjKKyYAkGyDxHAWJjabWBfYnA5JsgMRzFqTCq2iE/cmAJBsg8ZwFif1WK/Nb25MBSTZA4jkMEnuLmuywOxmQZAMkntMgsVvoX3YnA5JsgMRzHKRNjVK/sDkZkGQDJJ7jILGZaQ022JsMSLIBEs95kNj91MPeZECSDZB4DoSUdx69YWsyIMkGSDwHQmLfVTlhlZ3JgCQbIPGcCIk9S1cU2pgMSLIBEs+RkAra0fM2JgOSbIDEcyQk9mvNrMXykwFJNkDiORMS+x9dkCc9GZBkAySeQyGxG+lR6cmAJBsg8ZwKaWPD1OmykwFJNkDiORUS+zTl1C2SkwFJNkDiORYSu4dul5wMSLIBEs+5kPKa0mS5yYAkGyDxnAuJLc
w88XepyYAkGyDxHAyJDaGOUndwACTZAInnZEgFl9H/yUwGJNkAiedkSOzn3JwfJCYDkmyAxHM0JPYyXbjT+mRAkg2QeM6GxLrQk9Y3BiTZAInncEjrTkqfa3ljQJINkHgOh8Q+Tjl9q9VtAUk2QOI5HRK7k+62uikgyQZIPMdD2tY4ZYrFTQFJNkDiOR4Sm5d+0jprWwKSbIDEcz4k9jjdYm1DQJINkHgugJTfjKZa2hCQZAMkngsgsXlpjf60sh0gyQZIPDdAYn3pISubAZJsgMRzBaQtJ6cvtLAZIMkGSDxXQGJT6KJdkbcCJNkAiecOSOx6Gh15I0CSDZB4LoH0e41qkZ8OHJBkAySeSyCxMXRVxG0ASTZA4rkFUsGl9GakbQBJNkDiuQUSW5pZN9IL+QGSbIDEcw0k9ijdEWELQJINkHjugZR3durM8FsAkmyAxHMPJDY9pXH4V6gAJNkAieciSKwXDQ57OSDJBkg8N0HaVC/z+3CXA5JsgMRzEyQ2kdqEe+pVQJINkHiugsSupfFhLgUk2QCJ5y5IK6tW/y30pYAkGyDx3AWJPUs3hr4QkGQDJJ7LIO1qTu+GvBCQZAMknssgsW8zGoZ8wkhAkg2QeG6DxAZS/1AXJTKkg+P69Bxe4Du9Z+xt3Z9YB0h2AiRelJC2nZo6J8RFiQxpxODNeWMHlnlPPzx4084Xeh0BJBsBEi9KSOxjOjfES70kMCTWZZPnu9INq/TTRaO2aVph5/WAZCNA4kULiXWjYcEvSGBIS7qVe97e/xE/Y21X/d8e3uGJ7Y3UUe1AxG1sta8kNnP3HtGKYjN4X2ls5u49rB2M0eTYrfhQdAM21M7+OegFZdHNDVmxVhzikv1WIc29Q387ZKLxcdGAyfq7hS08LQv/TxGKUW/StfFegr8yfioSpDv1txzS9nte0b9DaasHe1p7NFKl2rGI29jqWFls5npWfDw2g4+Vx2bu0ZJYrfho4q74SEeaHOz82K24JNRFViH94LtpN9X30aqeM0yX4WckifAzEi/qn5EYW1al1h9Bzk7gn5H2dNmgaQe6rvZ+8PutP5kvAySJAImnABJ7im4Ncm4CQ9JGP7R5xzOPlGvzp2vH+nlf8Am//rYTIPFUQMpvlvJJ5XMTGVLx+N69Rnk2HzNUW9XZ20xAshEg8VRAYvPSzthe6cxEhhQmQJIIkHhKILG7g7xABSDJBkg8t0La3CD964rnAZJsgMRzKyT2AV1c8QUqAEk2QOK5FhL7R6UXqAAk2QCJ515Iq2tU+yXwHECSDZB47oXE/kNXB54BSLIBEs/FkAouockBZwCSbIDEczEktiTzpI3mjwFJNkDiuRkSe4TuMn8ISLIBEs/VkPLOTp1l+hCQZAMknqshsWkpZ5leoAKQZAMknrshsZ70hPgAkGQDJJ7LIa2vY3qBCkCSDZB4LofEXjW9QAUgyQZIPLdDYtfQi8ZJQJINkHiuh7Sias01/pOAJBsg8VwPiQ2nm/ynAEk2QOIB0q7m9J7vFCDJBkg8QGLfGC9QAUiyARIPkBgbQAO87wFJNkDiAZL+AhVpC/T3gCQbIPEAydNHdJ7+AhWAJBsg8QBJ7yYazgBJPkDiAZLeulrZPwGSfIDEAyRvL9KVgCQfIPEAyVthe3oVkKQDJB4g+VqWVWsdIMkGSDxA8vcE9QQk2QCJB0j+8s9L+TIWcxkg2QiQREkGic1NPWtHTAYDknyAJEo2SOwueiQ2gwFJOkASJR2kzQ3Tv4nJYECSDpBESQeJfUItCmIxF5CkAyRR8kEq+zv9JxZzAUk6QBIlIaTVNXJ/ibyZdIAkHSCJkhASG13xBSqUBEjSAZIoGSEVtKK31M8FJOkASZSMkNiijMAXqFASIEkHSKKkhMQeoruVzwUk6QBJlJyQ8s4KeIEKJQGSdIAkSk5IbFpK07xIG0oGSNIBkihJIbFb6SnFcwFJOkASJSuk9bUzl6idC0jSAZIoWSGx/9FlheE3lAyQpAMkUdJCYtfQy0rnApJ0gCRKXkgrqtZcq3IuIEkHSKLkhcSGUTeVcwFJOkASJTGk/AvofYVzAUk6QBIlMST2TXqjrermApJ0gCRKZkisPw1UNxeQpAMkUVJD2naK7wUqlARI0gGSKKkhsQ/pvHxVcwFJOkASJTckdiM9q2ouIEkHSKIkh7TmBP0FKpQESNIBkijJIbHx1FHRXECSDpBEyQ6psB1NVDMXkKQDJFGyQ2JLM+usVzIXkKQDJFHSQ2JP0C1K5gKSdIAkSn5I+RfQOyrmApJ0gCRKfkjsm4x6Kp5TCJCkAySRAyCxh6iPgrmAJB0giZwAKa9JytTo5wKSdIAkcgIkNjdNwd3AAUk6QBI5AhLrT/dEPReQpAMkkTMgbTs9dUa0cwFJOkASOQMSm5Zy5vYo5wKSdIAkcggkdic9EOVcQJIOkEROgfTnqWnzo5sLSNIBksgpkNinKWdH92zggCQdIIkcA4n1oMeimgtI0gGSyDmQNtVPXxjNXECSDpBEzoHE3qdmO6OYC0jSAZLIQZDYjTQ0irmAJB0giZwEaV3tzO/sz40xpKJ9kTpqYRtbHSiJzVzPig/GZvD+0tjM3XdEOxSjyWUxmns4HiueRC13255brBWHuOSACkhHI1aqHYu8kZ2OlcVmrmfFx2M0uTxGc0uwYl64Fd9AY23PLdFKQl2kAhJu2kmEm3a8eNy0Y+z3mtk/2p2Ln5GkAySRsyCxCdTW7uv4AZJ0gCRyGCTWkV6wOReQpAMkkdMg/Vojd6W9uYAk0i8KqwAAC59JREFUHSCJnAaJjaEO9m7cAZJ0gCRyHKTCDjTB1lxAkg6QRI6DxH6uWv1XO3MBSTpAEjkPEhtB19qZC0jSAZLIgZAKLqXXbcwFJOkASeRASGxpVq218nMBSTpAEjkREhtC3eTnApJ0gCRyJKT8C+kt6bmAJB0giRwJiX2bedIG2bmAJB0giZwJiQ2inrJzAUk6QBI5FFJeU/pIci4gSQdIIodCYgszGm6RmwtI0gGSyKmQ2L/oLrm5gCQdIIkcCynvnJRPpOYCknSAJHIsJDY79ZQ/ZeYCknSAJHIuJNaXBsjMBSTpAEnkYEjbTkudJTEXkKQDJJGDIbHPU87aYX0uIEkHSCInQ2K308PWNwYk6QBJ5GhIm09OX2B5Y0CSDpBEjobEptC5lp9XH5CkAySRsyGx7vSE1U0BSTpAEjkc0vq6mYssbgpI0gGSyOGQ2Lt08S5rWwKSdIAkcjok1pmesbYhIEkHSCLHQ1p3Yub3ljYEJOkASeR4SOwValVgZTtAkg6QRM6HxP5Oo61sBkjSAZLIBZB+OyFnuYXNAEk6QBK5ABIbT+0sPK8+IEkHSCI3QCq8ksZH3gqQpAMkkRsgsRXVcn+JuBEgSQdIIldAYqPp6ojbAJJ0gCRyB6SCNvS/SNsAknSAJHIHJPZDlVprImwCSNIBksglkNgw6hJhC0CSDpBEboFU0Iomh98CkKQDJJFbILElWSf+EXYDQJIOkESugcSeoO5hLwck6QBJ5B5I+c3pnXCXA5J0g
CRyDyT2TUa9jWEuBiTpAEnkIkjsIeod5lJAkg6QRG6ClNck5ePQlwKSdIAkchMkNjet0daQFwKSdIAkchUkdh/1C3kZIEkHSCJ3Qdp2RuqMUJcBknSAJHIXJDYt5cztIS4CJOkASeQySOwu+leISwBJOkASuQ3Sn6emzQ9+CSBJB0git0Fin6WcnRf0AkCSDpBEroPEbqXHgp4PSNIBksh9kDY1SP8q2PmAJB0gidwHib1PzYK9aBIgSQdIIhdCYjfRkCDnApJ0gCRyI6R1dTIXVz4XkKQDJJEbIbHXqUXlF00CJOkASeRKSOx6GlHpPECSDpBE7oT0e83sHyueB0jSAZLInZDYBGpb8Xn1AUk6QBK5FBK7jsZWOAeQpAMkkVsh/Vojd2XgOYAkHSCJ3AqJjaX2gTfuAEk6QBK5FlLhFTQh4AxAkg6QRK6FxFZUrf6r+WNAkg6QRO6FxJ6ja8wfApJ0gCRyMaSCS2mi6UNAkg6QRC6GxJZm1VorPgIk6QBJ5GZIbAjdJD4AJOkASeRqSPkX0lv8A0CSDpBErobEvs08aYNxGpCkAySRuyGxx+hW46QKSAfH9ek5vKDyaUCSC5B4SQIprylN859UAWnE4M15YweWVToNSHIBEi9JILH5T+X7TymAxLps8nwnumFVxdOAJBkg8ZIFkkgBpCXdyj1v7/+owumiNZ4K9kXqqFYUcRtbHSiJzVzPig/GZvD+0tjM3XdEOxSjyWUxmns46VZcrBWHuOSAVUhz79DfDplY4fTCFp6Whf+nCDk+/nNOREh36m/9kMTpdSM9rT8SqVLtWMRtbHW0LDZzj5TEasVHkm/F5TGaW6Idj9HkWK34eOgVW4X0g+/m3NSKp/XwM5JE+BmJ58qfkfZ02aBpB7qurngakCQDJJ4rIWmjH9q845lHyrX508VpQJIPkHjuhFQ8vnevUZ7NxwwVpwFJPkDiuRNSmABJIkDiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiAZJ0c0buiuX4GPTVyD/jvQTJFo/cGO8lSLZs5O/xXoJkK0eujLhNTCGNbvFHLMfHoJda/BzvJUg2scV38V6CZO+3mBfvJUj2eYvPI24DSAEBUuwDJOkAKfYBUuwDJOkAKfYBEkIoRICEkIIACSEFARJCCooJpIPj+vQcXlD5dOJmXuWesbd1f2JdfNcTuQrHdUHnpXFcjKUCVjyr7433/xjX5VjJvOTtz/a65YnQf0qOCaQRgzfnjR1YVul04mZe5cODN+18odeROK8oUoHHdd/t3RIeknnFC3ovL/iiX3GcVxQx05LL+00oPvreP4tCbRoLSKzLJg/mG1ZVPJ24mVdZNGqbphV2Xh/vNYWvwnEdPen2RIcUsOJ+X8V5NZYyL3l/57WatrdzyFsqsYC0pFu55+39H1U8nbhVWuXarntDb50IBa54Sd8jCQ/JvOLdnb/6182Pro3ziiIWcJAfG1905IO+x0JtGwtIc+/Q3w6ZWPF04lZxlUUDJsdtLdYKWPHB3iu1hIdkXvG6zk9uL5rYY398VxSxgIO8Z2Dnzr1D30E4JpDuFJ/ffDpxq7DK7fe8Uh7H1VgpYMUvvqglASTTitd19txcKr11QXxXFDHzkksenLC/eGqvkDdUYgHpB993xKkVTydugatc1XNGXFdjJfOKV/YuSgJI5hWzzhs8bwcm+LUiYMkruui/frpreqhtYwFpTxfPYTrQdXXF04lbwCp/v/WnOC/HQuYVj+nWs2fPLt1HxXtN4TOvuKy357+qY90XxXtNETIv+efO+u8Ye/+lkLTRD23e8cwj5dr86eJ0Ymda8bF+U/THPib6r79NK/Y+4PS2+QfivaQIma8VU3utZC/3TvRjbF5yce8JB4992m1nqE1jAql4fO9eozy3JscMFacTO9OKV3X2NjPeS4qQ+RjrJfxNu4AVl719+41PbIv3iiJmXvLW4b16PP5ryE1xFyGEFARICCkIkBBSECAhpCBAQkhBgISQggAJIQUBEkIKAqQEaRjp5bb/NOKWbc8JMcDSn2RD/GsUZYCUIA2jJ19//bWhp9CLkbbUKays/HUDpLgGSAmS30HRqbmR7oGmU3gZkBIsQEqQDAeP0DJN++aq3OyLJnk+anf5io65dXroz78xpVV2bospmpfCtZ5bgS3anuh9uGaH2scDBmjin7c9sUT/8NL6pWIiIMUmQEqQDAdD6TttQVr7GfP70wua1qlRqy8LPknro2kf0o0zZ15HM70U1nel5Wsm0See7fNTHwgcoIl//l+a7/nwz5RHTBMBKTYBUoJkOLg8fb92UWP9sS9dPDfyOpH+zN6dGmjaqI6e7z8H0nv5KNzt+bodrNbZc9kE+jlwgCb+OUu/x3NiLK00TQSk2ARICdIwmpWfv/PHu+g+rYAePOLpVfpR65SjX9Yn1diqYTsBSbsz3XOTr10zPsAPyfTP/1a3TNNanWc+C5BiEyAlSL5ff1P6gKPaSvL3mdbpVP0ync2Bp5tVT0ujtiZIi2mclpcyhg/wQzL983fpa20LPW8+C5BiEyAlSMNo/Jw5cxfv03QJdy31xkyQ2qc9tejX3xqYIWlnX6C9mLaTD+CQ+D8/mDNQ+0/KNvNZgBSbAClBMv3SbQ/1MU5ySBuon+dESZUASKNp9SXXVRpg+ufaLQ20llcEnAVIsQmQEiTzn4EuqaF/Y3p7SImAtIaGa/qfj1r7KPQl/RfbO9N60pTKA8Q/16bR5zQp4CxAik2AlCCZIX2TccHb84Zm3GH6jnS80cnTvnv0iityFx7SKfybhuu/+/4HVT8sBjw6Qe9b0z/Xjtc6o8qBgImAFJsAKUEKuGPC4qtzM84eU2KCpC1vk3PSvQdm1K65Tqew/aIMHcSn1Nc0wNdA0z/XtHvon4ETASk2AVIyN12/GwRKhAApiTvesnW8l4D8AVLStm3adWmJ/1pdbgmQkrZJKafPivcakBEgIaQgQEJIQYCEkIIACSEFARJCCgIkhBQESAgpCJAQUtD/B4iXRsAuE6zrAAAAAElFTkSuQmCC", "text/plain": [ "plot without title" - ], - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA0gAAANICAMAAADKOT/pAAADAFBMVEUAAAABAQECAgIDAwME\nBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUW\nFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJyco\nKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6\nOjo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tM\nTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1e\nXl5fX19gYGBhYWFiYmJjY2NkZGRlZWVmZmZnZ2doaGhpaWlqampra2tsbGxtbW1ubm5vb29w\ncHBxcXFycnJzc3N0dHR1dXV2dnZ3d3d4eHh5eXl6enp7e3t8fHx9fX1+fn5/f3+AgICBgYGC\ngoKDg4OEhISFhYWGhoaHh4eIiIiJiYmKioqLi4uMjIyNjY2Ojo6Pj4+QkJCRkZGSkpKTk5OU\nlJSVlZWWlpaXl5eYmJiZmZmampqbm5ucnJydnZ2enp6fn5+goKChoaGioqKjo6OkpKSlpaWm\npqanp6eoqKipqamqqqqrq6usrKytra2urq6vr6+wsLCxsbGysrKzs7O0tLS1tbW2tra3t7e4\nuLi5ubm6urq7u7u8vLy9vb2+vr6/v7/AwMDBwcHCwsLDw8PExMTFxcXGxsbHx8fIyMjJycnK\nysrLy8vMzMzNzc3Ozs7Pz8/Q0NDR0dHS0tLT09PU1NTV1dXW1tbX19fY2NjZ2dna2trb29vc\n3Nzd3d3e3t7f39/g4ODh4eHi4uLj4+Pk5OTl5eXm5ubn5+fo6Ojp6enq6urr6+vs7Ozt7e3u\n7u7v7+/w8PDx8fHy8vLz8/P09PT19fX29vb39/f4+Pj5+fn6+vr7+/v8/Pz9/f3+/v7////i\nsF19AAAACXBIWXMAABJ0AAASdAHeZh94AAAgAElEQVR4nO3de4CMdf//8fee7bJECNFRRSkV\nioSi033fodItUXSgRHdHpaJbErrJV+V3d5eSziqdnA9JRZEUKpGzsNbux3FZpz1cv7lm5vp8\nrtmdw/W55jPNzHW9nn/szs5c3vu5rp2Hnd2dA2kIoaijeC8AIScESAgpCJAQUhAgIaQgQEJI\nQYCEkIIACSEFAZJUK6lD4BlDaIKawa0yqm6NZoCqhUhW6YCE3FBqBwfT/+SWYf/whTtwVg+q\nZ7soIA0jUY2IW78zTW66afuVdKbs2ipV8Ezr2pkntxlb6Dn9DxrHz29Nb/PPQl2Nc2cQDQk6\nxhKkXc9cViejVsvBWyr/+5BHoRld+XBh6OVHLsTXfKX/C5R9zoAN0YwPlWVIlnfQe4ieP/U9\nqWVEcfh8By74F+avgTSlq6dLqI7+rmfErevdLTfdtL0CSG/nUEbLq8/NpJrzNG0WNTHOX021\njvDPkpJe4D99c0oUkF7LpswW11xahdJHVfr3oY7CUapeEnEnwhYSUrVbPHXvUJOyv4zuMwTN\nKiTrOyh7RZGbXjnfgQv+Wf8aSN4+p06WtttEcsfHvH30kKZQ6pADnvcFD1DaCq3sdPrWf8GD\n9Kj4LBca36n2ZjWzD2kipT9X5Hl/eHwVerbCZSGPwj46NfJehC0kJP+xK76PTimL8nMEnd/B\n0naWd1D2iiI3PUjeAxfis8YD0lM07b8NPLfwyl9vXa1Kk6GH9PMOPNGkSmbjQZ6rcDf95kVb\nbShNW9KhWu0+ReXjz8luOqpcC9jec+nqG+pkNf+Ab+8rAJKYqWlTr6yZUf+62RVOHn+xZbWs\nMwfmiX9TdAK/yT2Y7vHccKDbfB8drZXCb/CspLvrNPOdfIUe9kIyT9p6y4nZzSf5rjemNfsO\ndRptN8bsyqap/pPzUlP/0LSBNFn/YCn9I3CvzOO7em9+8aWYdsa0u5UP3sP06cL21au1W8AX\nYj76FY5dWXVaJQ6J6VNs6V7Ls2e76OKAtQZ86mBfXPMBCful0fPvoHm+6avtWdx/W1at1vFb\n/oX3/YxkOv4BW4c9fOaDEuLaFbBU/cD5Pmtbmumd8DX/GlU8qB1ouvf8GdSx4rVAGaRn6JGc\nW/tp2m1U/9GnWtOFnv+Uj7ejFoMePIdalWoz76DW4z/WnqUnT7hlYEO6fXCj/r0z6R0tYPtn\n6Zka1zz8D6KvjO19mSGZZnr+769z79N31Up5J+Bk2d+oyYNP/43qix8+X6GWxskjf3resKwq\ne70fvU/X8I1W0l39abn35CUnfaxDMk/a24jaD7233p3e641pzZUgPeu9EvrqSQMDrpwBe2Ue\nP30U1Rw/fr//AtPOmHe38sEbTPdnd3m0a0ra18ZCTCurdOwa0zK+MtOn2NOQrhzRv959+rXH\nfEU3f+ogX9yAAxLuS+PNv4Pm+aavtue2NJ07oFc1zz75D5EXkvn4B2wd9vCZD0qIa1fAUvUD\n5/usb1A374T76FX/56h4UF+jPt7zb9f3I/BaoAzSKKrh+fFD+4haeAaX309PaNqn1Nqz0GNN\ndMZTvd87R1OWZ/f+TMtoskfT3qDrA7cfTZnvejYapK92aoibduaZ59NGzznbc1sHnJxIbY5q\n+n9A3fk/6mb67YK3XvSi9/0V9Lnps/T5kQbop9bSI1N1SOZJ/6ZbPCfz6+nXG/OafYf6qy+P\nGmM60Lt85Cw6O/DKad6rgPEBt01MO2Pe3coHbwilzvBsOZZa+xdiXlnFY7cuNWN/sE/xb7pZ\n37OT9D0zr9X8qYN8cc0HJOyXxpdvB83zzV/tKfQ3zz/9I6fqQf8h8kIyHyDz1uEPn/mghLp2\nmZfqPXDez1qUk7nbs0Vp3ax9fFbgQd2bWfO45+yjNbKLKl4LlEEaTd6DdhXN9x63jPqe2wyf\nef+DH0zPCUjX6udcSP/1vN1FTQO3H+37lrqM2oSGZJ7ZKCVfP31MCzjZlny3iTIzDxv/6CLf\nJxF9T+fp79anNCw1fZY+2vk19a/NE7TaC8k8qTkt1U8P16835jVXuhXdiFby03mUXh4aknl8\nACTTzph3t/LBG+I7YEdzUvb4FmJeWeCx2/VZY/3bY5BP0Zy+008Oqwgp8FNX+uKaD4ivEF8a\nX8Egia/2NbRYPz3+0U1mSOYDZN46/OEzH5RQ1y7zUgUkzzealz1vv/T+z+KfVeGgdqa5npNf\nUI9K1wKFkB7W31Uj3+3zi+hP7/ui/Pzh+pXSgDRYP7O997gdptMCtx/t+8l/PV0YGpJ55gBq\nMinfd5Y4WV6FfP/xnu+/nabpN2qWVlh2c/re8/Zx828DdEgv0hTPbYaGl2g6JPOkskzyspyr\nX2
/Ma64E6QTxo452kKg4JKSAhQZAMu2XeXcrH7wh9Lh3iwvoF99CKh598etvT3cfC/YpPHtW\nrL+fUxFS4Keu+MUNOCCV11phF4JDEl/tqsT/0xOQAg6Qeevwh898UEJdu8xLNUFaqP+cqPWj\naWJWhYM6hfpq+k32WZWuBQoh6b/sPSy+bks8F7at4j1pgjRW37YDrfW8PeLZ+4Dt/ZduoOZh\nIJlmHr83g+jcxzdr5pNFlOnbsCPNKe2g95V2Mc3WAntV/wZ//KT0neIsHdLuLM/PTPM8X0Qd\nknnSAariPbncc70JWHMlSKcKv9p2fUIlSP5Vmcf7rgnGck37Zd7dSgfP88l9N1k7eP5z1BcS\nePT9e+X99ffl1GSN/mHlT3GAsvieBUIK+NQVv7jmAxL+S+MrGCT+1T7kH6YnIAUcIPN1w1vl\nw+fPdFBCXrvMSzVBKj+dftVKTqx9XMyqcFCLq9Uu1Y7k1i2pdC1QCElf5xFKGeZvs+dHs9yH\nPpg1594wkAK2twLJPFPTdr7aNZcyPzKfPEgZvi2voLkl3v2cot1a6ffQh6pn79c+oX+aztIh\nad1Tt2k9PRfpkMyT9vuvbj94rjcBa64E6Vp6jZ+eQRcEgeRflXm875pgLNe0X+bdDQZpvHdC\ne88P0PpCAlYWcOwOn+G5umnBPoWxZ0srQqr8qc3jzQck0pdGLyykw5RWbmwoIAUcoEqQKh8+\nf6aDEvLaZV6qCZLn9u0jnm/N/zLNqnhQe3kO9af0gFbpWqAYklaDxB+YT/b9uebfYSAFbG8F\nknmmtyP/Sz/hqPlkDvl+VDyPfjb+0SQ60/h7Xfmza73vB3p+0riOFpr2xAtpHo0syu6leSGZ\nJ5Wm+W58fKFfb8xrrgRpnLgV77kJ4LmVcT+9qZ/+rNLPSOaFVv5DiG9nzLsbDNJQ77YX0Grf\nQswrCzx2c+iUogoX+T5Fabpvzz7T98y81iCf2jQ+4ID4Cvel8e+geb75q51LzJhi+hnJfIAq\nQQp5+MwHJdS1y7xUM6QtKSeX3266SVH5oM6iAVoP7xYVrgWqIV1Dn3g/3KP/ubmafqr8knCQ\nTNtbgRQwc6vvhll7WmM+2d7314A96dn8N2mH6/pvN2vac3SZ9/3v1K4gjd/BwfdZ+nh+Yjil\n1Tv670e9kMyTmtAP+ulB+vXGvOZKkPZV5788XZiauU3/J96bGk9WgmQeHwBJ7EzA7gaD5P3t\nfVFm2gHfQswrq3Ds/kn3mVZpOl5NfTcDH9b3zLTWYJ/aPN58QLyF+tL4D4t3B83HwvzVvtL7\nTVIb1el7MyTzAaoMKdThMx+UENeugKWaIXkWMrtaU800q+JBLand6HDVJhUPRgwgfUTNdKiL\n0m/WtFrkuR6VP1NP/3XRDLpBC3ZdMG9vPli+7X2ZvyOZZq6ijvqPz0UN03abTmpv02X66Ue9\nPxb6m5FCd+o/++YPoOr+L22HlCfpJfOeeCFpw1I7nV7uh2Se9Ij3t+mba/p+/S3W7IP0zdcc\nrfdeFPp//kdfyfH+fu0V6uAZuKa2fuUx71XAePM1wbwz5kMYDFKa/luUl+lK/ptasbIKxy6v\nesq3QT/FILrJs76NJ+h7Zl5rkE9tHm8+IOG+NP4LfTtonm/+ar9FLTw/uW+pmbPXf4i8kMwH\nqDKkUIfPfFBCXbvM++Y9cMYX5m06nUx37ApyUO+jkfrv+ipdC1RD8vw8cvIjw7pl5C7Tj/VZ\nzz3X+px5dOLz2zemZNzVP8h1wby9eVd92/taSTn/8PV+wMyedMbAfw84lR7Ub0Txk+Vdqdlj\nT3Wis3eblvlJdUptfnWzTGr0q/+cDyk9Z59pCz+krak0XPNDMk/Kq00t/9Wjhu8/YNOaK/1B\nVtPeq0aZl/69bVWq4r07RUF1avNwj2pj6W+BexUwPuA7kmlnzLsbDFKf3D4jeqdlLDUWYlqZ\nsVfGf0IvUePDwT7Fzjp06aBeNQbqe2Zea5BPbR4fcEC8hfjS+PLtoHm++atddj2d2v/2XHrd\nOEReSOYDVBlSqMNnPiihrl3mpXoPnPGFKc4lz0/JplmVDupiyk3ZUvFgxAJS2ettctMb9vZ+\nsYecmdVowG7tjqr1ftWer511cTBIpu0DDpZ3e1/iV7jDAmaW/fey2mk12r1Zrt/FhJ/USl68\nOCeryZMBSrTdz7aunV6z/RvGXVS14/Uq3L3KB0m7OlX/zbEXUsCktV1PqHL+63vo0sA1B4Gk\nFQ5vUzuj1iVD/b8S/K1jTrVLv2B0ReBeBYwPgGTaGfPuBoP0yoIO1ap1WMQXYlqZsVf8LkIt\nxP0KA47XH/qevbHKK8K01iCfOmC8+YB4C/Gl8eXfQdP8gK92ybgLsqu29/7I6j1EvrsImQ5Q\nZUihDp/5oIS6dpmX6jtwxhfmbv3OP6ZZlQ5q+WnUTqt4MBRAQnFM5WOQLD8eItGL7qCMMt0x\nRSpASuIAKUhRHZTjjWofjbxVsAApiQOkIEV1UB6q9BdHqwFSEgdIQbJ/UNYOvpyaH468XdAA\nKYkDpCDZPyhfpVa71fbD/QEJIQUBEkIKAiSEFARICCkIkBBSECAhpCBAQkhBgISQggAJIQVF\nAenQ/kgdLimKuI2tDhyNzVzPig/GZvCBY7GZu784VivefzxGc4tLIl9z7BWrFR8KuWLx8P0o\nIO1nkTqi7Yu4ja32HI/NXHbYwl7ZandJbOayYu1AjCaXxmjuIa0oRpPLYjT3oHYwxCV7ASl4\ngCQCJCNAkg6QRIBkBEjSAZIIkIwASTpAEgGSESBJB0giQDICJOkASQRIRoAkHSCJAMkIkKQD\nJBEgGQGSdIAkAiQjQJIOkESAZARI0gGSCJCMAEk6QBIBkhEgSQdIIkAyAiTpAEkESEaAJB0g\niQDJCJCkAyQRIBkBknSAJAIkI0CSDpBEgGQESNIBkgiQjABJOkASAZIRIEkHSCJAMgIk6QBJ\nBEhGgCQdIIkAyQiQpAMkESAZAZJ0gCQCJCNAkg6QRIBkBEjSAZIIkIwASTpAEgGSESBJB0gi\nQDICJOkASQRIRoAkHSCJAMkoMSEtenXwUyNfm7Mumj0DJB4giVwEaecbl5C/c/p+kGd3zwCJ\nB0gi10D64O56RO2fe2/K68/2vTyLKLfbh4W29gyQeIAkcgmkjdcT5fRZZHy4/dN+DYjOemGb\njT0DJB4gidwBqaATtfx8R8DlhbNvzKDaz/wpvWeAxAMkkTsgPUqX51feZNX91aj2szsqXxA2\nQOIBksgFkBa/eQM1WB10o3UPVqVT3pLbM0DiAZLI+ZC6E1HTFaE2+6NfOl3xvcyeARIPkESO\nhzSZzhnywa4wGy5qRxmDg9zwCxUg8QBJ5HRIBeekLY606aST6ELr35QAiQdIIqdDeoVuirzY\ndTdQlREFFvcMkHiAJHI2pKUfnZT5o5Xlvl6LrrR4xyFA4gGSyNGQ3iaiIdbWu7oDNZhtaUtA\n4gGSy
NGQbqRe71q9H9CuQakZI6xsDEg8QBI5GlLdE2XuTvdxLbp+a+TNAIkHSCInQ/qe/i61\n5lWt6PxfIm4FSDxAEjkZ0nM0Tm7RebdQvQWRNgIkHiCJnAzpRloiu+whKTmR7jEESDxAEjkZ\nUsu0ndLrnlQl9d/htwAkHiCJnAypQT0bC59Xl24I+ysHQOIBksjBkPZmNLez8lUt6OzvwlwO\nSDxAEjkY0la60tbS8x+gqhNDXwxIPEASORjSCrrZ5uJfzaHeIZ8cBZB4gCRyMKSvqK/d1S9q\nTBf9EOIyQOIBksjBkD6lR20vf/MNlPNC8LtFABIPkEQOhjSJnotiByadQFf+FuwCQOIBksjB\nkP6PJkSzByvaUO33gpwPSDxAEjkY0jB6O6pdKHg6k3r8XulsQOIBksjBkB6mz6PciYVNqOqQ\nik/XBUg8QBI5GNJdtDDavdg5siY1ej3wlw6AxAMkkYMh3UQhn4TLehseyKSLAx46C0g8QBI5\nGFIn2qhiT5ZcRSkdJ4u7vwISD5BEDobUIs3qEwNF6OOLier8a5n/I0DiAZLIwZDOqKVsbxbe\nWZ1S2r7q/b0DIPEASeRgSCecqXB/tk24hKhm368ByRQgiZwLqTS1pdpd+q7/iUQXPL8JkIwA\nSeRcSHvoKrW7xFjem53SKKvHp4p+9qoQIIkAySj+kDZRN7W75O2XJ08jqt/nPTuv9RchQBIB\nkpEKSAfH9ek5vMB3evuzvW554ncZSD/TXWp3yd/u+f+sTlTl6rGrFA8GJBEgGamANGLw5ryx\nA8v0k+X9JhQffe+fRRKQvqYH1e6Svz3H2c7P7juTiJo9PDvc68XIBkgiQDJSAIl12eT5rnTD\nKi+czms1bW/ndRKQpll92m/J/L+1W/ZsuwyimjdOWKNqMCCJAMlIAaQl3co9b+//yPvBY+OL\njnzQ95gEpHfpebW75E/8+nvTGz3qEqU0f3iWkm9MgCQCJCMFkObeob8dMtH7wZ6BnTv33qif\nWtjC07Lw/1TvFXon8kbRVr5iZLt0zzem7h8Vx/6TIWSqjJ+KBOlO/a0PUsmDE/YXT+2lI1x+\nm6dfSiJV9h/6JOJGtiqv8DGbckd9oqq3fHIwurllWml0A0JWccWqwopFWozmlmllIS45bhXS\nD76bdlP10yu6HPG8vWu6cZmFm3bD6GO132T9BbtnQ+GCfzUiyu3+gfxTu4pw006Em3ZGCm7a\n7emyQdMOdF2tn/65s37TqbcMpMdoltpd8hfiLkKFc+71fF+q2Wuq7Z+XAEkESEYqfv09+qHN\nO555pFybP10r7j3h4LFPu+2UgDSQvlK7S/5C39euYMbddYjqPmzzD0yAJAIkIxWQisf37jXK\ns/mYoZq2dXivHo//yi+yAOkusv5K5TKFvdPqrk9vy6W0v38i8wJnRoAkAiSj+N9F6Fb6We0u\n+Yt07++tY5sSNR4p/6BCQBIBklH8Id1Iq9Xukj8LD6NY0D2Dsrp/LTkYkESAZBR/SH+jDWp3\nyZ+lxyOteboRUfNx22UGA5IIkIziD6kjxeAe2szyA/vyJ7dLoXrDLby6sxEgiQDJKP6QLiOV\n9ygVWX+E7Pd3Z1PNQeusbg5IIkAyij+kFplq98hI5qHmfzxSg3LusfjrcEASAZJR/CGdW13t\nHhnJPWfD1pH1KaO7pV/EA5IIkIziD+mMumr3yEj2yU/yXjiNUq//MvKGgCQCJKP4Q6rfUO0e\nGck/i1DBexcSXRLslS0CAiQRIBnFH1Ktxmr3yMjO03EVfnApUevZ4TcCJBEgGcUfUtWmavfI\nyObz2s28kugfS8JtAUgiQDKKP6T0C9XukZHtJ4icfSmldgnzvP6AJAIko7hDKqZWavfIyP4z\nrRZOOoOyHwp5JzxAEgGSUdwh7aXL1e6RUTRPWbxzTF2q9Wxe8AsBSQRIRnGHlE+d1O6RUXTP\n/b3t6erUYFzQ+1wAkgiQjOIOaTtdp3aPjKJ9Ev0/+mVS0ylBLgAkESAZxR3SRuqido+Mon81\nip+6pdJl8yqdDUgiQDKKO6Q1dLPaPTJS8bIuC6+k1EovmA5IIkAyijuk7+h2tXtkpOb1kT4+\nm3KfCfytAyCJAMko7pCG0r1q98hI0QuN7XyuBjX+0HwOIIkAySjukN6gx9TukZGyV+xbe1sq\nXbNMfAxIIkAyijukgnuXRdzGVgpf+nJBK8p8YIvxESCJAMko7pCOaPvU7pGRyteQLZzUkE4a\n538BQEASAZIRIFlr22NZdKHvbuGAJAIkI0Cy2vIulNJd/1U4IIkAyQiQrPfRWVR9RD4gmQIk\nI0CSaOfw6nTePEASAZIRIEm15mZK7X8AkIwAyQiQJJt2FtV/NSaTAUkESI6HxHYMyaKrwzyA\n1n6AxAMk50Nih9e3p+zHonnNvxABEg+Q3ABJ2/f/atF5c5UPBiQeILkC0n62pjul9t6seDAg\n8QDJJZAY+6IxnfT/1A4GJB4guQYS2/5YJl2j9JcOgMQDJPdAYmxRK8p+WuFr0gASD5DcBIkV\njK1Bzb9VNhiQeIDkKkiMre5KGYNCPP+ddIDEAySXQWJsSgNqUvmZhmwFSDxAch0ktql3Snq/\nP1UMBiQeILkPEmMfN6TTvlAwGJB4gORGSGzbA6kpvSVeFz1EgMQDJFdCYmzmmXTKp9EOBiQe\nILkUEtv+QFpKtPcZAiQeILkVEmOzz6Z670Y1GJB4gOReSGzHYxnUZV0UgwGJB0guhsTYN+dT\n3cn2BwMSD5BcDYntfDqTuvxhdzAg8QDJ3ZC835RqT7I5GJB4gOR2SCx/SCb9fbWtwYDEAyTX\nQ2Lsu1ZU46VCG4MBiQdIgMRY4biq1MbGa2wAEg+QAElvxRVURf4hf4DEAyRA8jWpFrX6TnIw\nIPEACZD8/X49ZTwg95A/QOIBEiDx3qtHTefLDAYkHiABkmhjb5J6yB8g8QAJkMx91JBO+8zy\nYEDiARIgBeR9yJ/VR1cAEg+QAKlCs86iem9b2xSQeIAESBWz/ugKQOIBEiBVbsF5VMfKHVkB\niQdIgBSknU9mUrdNETcDJB4gAVLQvmtOjaZH2giQeIAESMHLfywtpV+EOzoAEg+QAClUs0+j\npovCbgFIPEACpJBt7k1VRoZ7nBIg8QAJkMI0qSZd+VvoiwGJB0iAFK6VbenE0M99B0g8QAKk\nsBWOzKTuoe7HCkg8QAKkCC1qSmctCH4RIPEACZAitaNfSvpjQR+GDkg8QAKkyH1cj1otD3I+\nIPEACZAs9Md1lDuu8tmAxAOkCh3YHakj2v6I29hqb0ls5u4+bGGvIvXfHOq6oeKZe0qjnhu8\nYq0oRpPLYjT3UNKt2AMpxCX7VEA6VhKpMq004jb2Ko/RXCUrXt+GTllQ8cyEXnHQkm/FWozm\nlmllIS45rgISbtqFKsid73DTjoebdoBkuRmN6IIl5jMAiQdIgGS9TTdTNfMD/gCJB0iAJNOE\nKikD8vlH
gMQDJECS6pvTqDV/DRhA4gESIMm1+XqqbTzzHSDxAAmQJCt8Oi39ad9JQOIBEiBJ\n90Ud+rv3mVEAiQdIgCTfLy2p8WIGSKYACZBslNfP+3twQOIBEiDZ6pXslH47AYkHSIBkr69P\npTZrAMkIkADJZpv/QfXDP1uX/QBJBEiyJRkkVvh0qvF7cNUBkgiQZEs2SIx9UZdukniBP+sB\nkgiQZEs+SLs3t6CzZF8K3UqAJAIk2ZIQUkleb6r2pvrBgCQCJNmSERJj/0//PbjqwYAkAiTZ\nkhMSW3gqXfa74sGAJAIk2ZIUElvfkerPVjsYkESAJFuyQmIFg1Iz/6N0MCCJAEm2pIXE2Acn\n0N1Bn4vVZoAkAiTZkhgS++lsukbhX5QASQRIsiUzJLbhcjr3F2WDAUkESLIlNSSW153qf61q\nMCCJAEm25IbECh9LqfahosGAJAIk2ZIcEmMTMtJfUDMYkESAJFvSQ2Kf1qB+4V682XKAJAIk\n2ZIfElvciLruUDAYkESAJJsDILHVzemSddEPBiQRIMnmBEjsz2vpnBVRDwYkESDJ5ghIbFdf\nqjs/2sGAJAIk2ZwBibGRqTnvRDkYkESAJJtTILHJVdJGRzcYkESAJJtjILG5talfQTSDAUkE\nSLI5BxL76Sz6+7YoBgOSCJBkcxAktr4NXbzG/mBAEgGSbE6CxPK60Snf2x4MSCJAks1RkFjh\nY3TCF3YHA5IIkGRzFiTGXsrIfMXmYEASAZJsToPEpuamPGZvMCCJAEk2x0Fii06mW2095x0g\niQBJNudBYr+dTx022RgMSCJAks2BkNjWq6npSvnBgCQCJNmcCInt7EX1F0sPBiQRIMnmSEiM\nPZVSa6HsYEASAZJsDoXEXkitLvuMxoAkAiTZnAqJvZqe84ncYEASAZJsjoXE3sjIfFdqMCCJ\nAEk250JiH2RlTpYZDEgiQJLNwZDYF1XTJkgMBiQRIMnmZEhsZm7qi9YHA5IIkGRzNCS2oFbK\nc5Y3BiQRIMnmbEjsu5NosNVtAUkESLI5HBJb2oAesLgpIIkASTanQ2IrTqO7rT01OCCJAEk2\nx0Niq86g3paeXQiQRIAkm/MhsTXnUrd8C9sBkgiQZHMBJLbhYupq4aF+gCQCJNncAIltakVX\nR37hF0ASAZJsroDE/uxAbbdG2giQRIAkmzsgsbzr6NLNEbYBJBEgyeYSSCzvemoe4cXIAEkE\nSLK5BRLb1YPOWR12C0ASAZJsroHECm6jxr+E2wCQRIAkm3sgscJ7qdHyMJcDkgiQZHMRJMYG\n0ck/hL4UkESAJJurILGnqc63IS8EJBEgyeYuSOz5lBPmhboMkESAJJvLILFxoZ+mC5BEgCSb\n2yCFeZouQBIBkmyug8Tezcx8J+gFgCQCJNncB4m9n5X5VrDzAUkESLK5EBL7JCcj2BPeAZII\nkGRzIyQ2LagkQBIlMqSD4/r0HF7g/2BW3xvv/xGQ7KQAEptZNW1ipTMBSZTIkEYM3pw3dmCZ\n9/SC3ssLvuhXDEg2UgGJzaia8XbF8wBJlMCQWJdNnu9KN6zyftDvq4DLAEkiJZCCSQIkUQJD\nWtKt3PP2/o/007s7f/Wvmx9dq58sOeBp7+5IHdH2R9zGVnuPx2bu7sPagdgM3lOiZMyMnIx3\nA88p1oqUTK5caYzmHorZiscXYRQAACAASURBVMtiNNcDKcQl+6xCmnuH/nbIRP3tus5Pbi+a\n2GO/5+TCFp6Whf+nKBZ9WzXzi3ivAfHK+KlIkO7U3xqQPLfwSm9d4Dm56j5Pvx2PVJlWEnEb\ne5XHaG7ir3h2duZn5o9LE37FFSvVSmM0+a9f8TGrkH7w3bSbqp9mnTd43g6calyGn5EkUvMz\nkt7UKgEvRYafkUQJ/DPSni4ePAe6rvZ+G+s9Q9OOdV8ESDZSB4l9nJX5nvgIkEQJDEkb/dDm\nHc88Uq7Nn65pU3utZC/3PgJINlIIKVASIIkSGVLx+N69Rnk2HzPU8y3p7dtvfGIbvwiQJFIJ\nKUASIIkSGVKYAEkipZB0Se/7TwKSCJBkczsk9lFW5ge+U4AkAiTZXA9JSAIkESDJBkjsw8wq\n3sfMApIIkGQDJC4JkESAJBsgMa+kTwHJHCDJBkh6UzKzPwMkU4AkGyB5+8AjCZBEgCQbIPma\nnJE9G5B4gCQbIPmbnJGzEJCMAEk2QDLySJoZm8mAxAMk6ZIOEns/PfuL2EwGJCNAki75IBVP\nTc+JjSRAMgIk6ZIQkvZOes60WEwGJCNAki4ZIR2YFBtJgGQESNIlJST2RnrOdPWTAckIkKRL\nTkgeSblzlE8GJCNAki5JIcVEEiAZAZJ0yQqJvZ5efa7iyYBkBEjSJS0kNjFNtSRAMgIk6ZIX\nEpuQWmOB0smAZARI0iUxJI+kE5eqnAxIRoAkXTJDYmOowQqFkwHJCJCkS2pI7DE6Y426yYBk\nBEjSJTck1p8u2qJsMiAZAZJ0SQ6psAddvkPVZEAyAiTpkhwS23k1/S1f0WRAMgIk6ZIdEtve\nmnoUqpkMSEaAJF3SQ2KbzqcH1UwGJCNAki75IbG1Z9IwJZMByQiQpHMAJPbTSSnjVUwGJCNA\nks4JkNiimmlvKpgMSEaAJJ0jILG5OZkfRz8ZkIzkIFU1lQlIivtLIbFPMqt9GfVkQDKSg3SL\np3My2nS74cKUFvcDkuL+WkjstdRa30c7GZCMpG/aTW22U3/3R5PpgKS4vxiSijuwApKRNKRm\nH/ve/685ICnur4ak4A6sgGQkDSlzgf87UxYgKe4vhxT9HVgByUgaUoNe3nflt9QHJMX99ZCi\nvgMrIBlJQxpG5z8wYsTApvQEICnur4cU9R1YAclIGlL5f+qTp9pPlwKS4uIASb8D6y1R3IEV\nkIxs/EG2/M9lP2wqs8IIkKSKB6Qo78AKSEY2IB358TOmlQCS8uICKbo7sAKSkTykF3KJlmpP\n3WGJEiBJFB9IUd2BFZCMpCFNpC6veiC9nT4GkBQXJ0jR3IEVkIykIV3QXzvigaQ9eTYgKS5e\nkKK4AysgGUlDqvKlD9K8DEBSXNwg2b8DKyAZSUOqO8MH6ePqgKS4+EGyfQdWQDKShnRVh8M6\npD3NrgEkxcURkt07sAKSkTSkr9MaP0h39ame8R0gKS6ekNjjtu7ACkhG8r/+XnCRfs+GS76x\n4giQZIorJHt3YAUkIzsPNS9YuXKvZi1Akii+kGzdgRWQjKQhtZll0RAgyRZfSLbuwApIRtKQ\nGo4DJGdCsnMHVkAykoY0rennxwEpJsUbkn4H1gfkJgOSkTSkdudTZoNT9QBJcXGHxNY2lrwD\nKyAZSUNq27GTP0BSXPwhSd+BFZCMbD9B5MH1gKS4BICk34F1ksRkQDKyDWlBLUBSXCJAYnOr\nytyBFZCM5CHN7NWubdu2rXNrA5LiEgIS+zCj2kLLGwOSkTSkKZTekBpUoSst/T0JkCRKDEjs\nfyknWb7bHSA
ZSUNqcV2RlvZbyctXFAGS4hIEEhtKZ6+3uCkgGUlDyp2paWm/atpDAwFJcYkC\nifWl1hbvLARIRvIP7JujadUXadriBoCkuISBVHA9dSmwtCUgGUlDuujmY9p5QzRtWlVAUlzC\nQGLbW9F9ljYEJCNpSO9SJ+3ptH7DT74MkBSXOJDYusb0nJXtAMlI/tffU0ZrxVcTNVoOSIpL\nIEjspzqpVp5ZCJCMbP5BdsMaa/dcBSSJEgkS+zIna2bkrQDJCK8hK507ILEP0msuibgRIBlJ\nQzrRKBeQFJdYkNj/UaPfI20DSEbSkLp6uyS7Gf6OpLoEg8Qeogu2RtgEkIzs3rTLbz8TkBSX\naJAKe1CnCI89ByQj2z8jLW8BSIpLNEhs5xXUK/wWgGRkG1J+NiApLuEgsc3N6PGwGwCSkV1I\n5SMbWoFUtDdSR7UDEbex1f6S2Mzde8TCXtlqX2ls5u49rB20+S/XNEyZEO7yMptzI3VYOxSj\nybFacbFWHOKS/cEhNffWrDYNsgLpWEmkyrTSiNvYqzxGc1214l9qZswMc3ECrjhCWozmlmll\nIS4Rf3ENAumiji8dswIJN+0kSrybdp6mZVb7KvSluGlnhD/ISucuSGxiapjH+QGSESBJ5zJI\nYR/nB0hG0pDSc6qaAiSFJSikcI/zAyQjaUgDzk1vfdMNF6Zc2OMWT4CksESFFOZxfoBkJA1p\n6vl5+ru158yIhAiQJEtUSGEe5wdIRtKQzpvqe/+/5oCkuISFxNadGeJxfoBkJA0p80vf+4+z\nAElxiQsp5OP8AMlIGlKDnuX6u9LO9QFJcQkMKdTj/ADJSBrSv6nxfcOGDTyXngQkxSUypBCP\n8wMkI2lIZaPq668hW2dYKSApLqEhBX+cHyAZ2fiDbPmfy37YVGaFESBJldiQgj7OD5CM5CEV\n79S0w5Nf2ARIqktwSMEe5wdIRtKQ1tYdrZW0JKqxApAUl+CQgj3OD5CMpCHddP5G7V16ZeNl\nNwOS4hIdEtt8XsXH+QGSkTSkuu9r2o3NNO39RoCkuISHxH5rmPJSwBmAZCT/B9mFWmnNxzVt\nfiYgKS7xIbHFJ2R8ZP4YkIykITV6Q5tPCzVtEv4gq7okgFTxcX6AZCQN6e56T5x6ZqlWcAF+\nRlJdMkCq8Dg/QDKShrSzNdVeqmm31PgFkBSXFJACH+cHSEY2/iB7QH82h+W7rDgCJJmSA1LA\n4/wAycjOQ80PzymwpAiQ5EoSSObH+QGSkR1IW+hzQIpBSQLJ/Dg/QDICJOlcD8n0OD9AMgIk\n6QBJPM4PkIwASTpAEo/zAyQjO5COrdyvWQyQJEoiSMbj/ADJyP4TRG4BJMUlEyQ2hk75HZBE\nkpC+vabxNbP1E0efw8u6qC6pILEH6aJtgMSTg7Q0I+WUjJSPNW3eWXQOICkuuSAVdqerdwGS\nkRykrjVWaQUtm26/mU4YfzwYHECKouSCxPIup/6AZCQH6bQHPW/mUJW0+5gVRoAkVZJBYusb\n04uxmMucDyn9v543W6nDb9YYAZJUyQaJ/VQ79Z2YDHY8JHrd8yaf5lh1BEgyJR0kNi8ne15M\nBgMSINkv+SCxD1NPWhmLuYAESPZLQkilQ+mcjTGY63hITy5dunQWjV+qB0iKS0ZI7E5qm6d+\nruMhmQMkxSUlpF3XUE/1c50OaZg5QFJcUkJiW86jocrnOh2SdIAkUXJCYr81SHlF9VxAAiT7\nJSkktrBq5ueK5wISINkvWSGxD9Nr/aB2LiABkv2SFhIbR6f+oXQuIAGS/ZIXEutPl+4Iu6Fk\ngARI9ktiSAXXU9eCsFvK5QpIR378jGklgKS8JIbEtrekhxTOdQOkF3KJlmpP3WGJEiBJlMyQ\n2LrTaay6uS6ANJG6vOqB9Hb6GEBSXFJDYt+fkDFV2VwXQLqgv3bEA0l78mxAUlxyQ2LTM3O/\nVTXXBZCqfOmDNC8DkBSX5JDYayn1f1E01wWQ6s7wQfq4OiApLtkhsUF0wVY1c10A6aoOh3VI\ne5pdA0iKS3pIhbfQVbuUzHUBpK/TGj9Id/WpnvEdICku6SGxne2pj5K5LoCkLbhIfzDSJd9Y\ncQRIMiU/JLapKY1UMdcNkDStYOXKvZq1AEkiB0BiP9dJfUvBXHdAkgiQJHICJPZldpU50c91\nOqRzzAGS4hwBiU1OPXF51HOdDqmtOUBSnDMgsWF09oZo5zodknSAJJFDILG76bJon1jIFZB2\nzZ789txdgKQ8p0DadR3dGuVcF0Da9890/dffKb0OAZLinAKJbWlGT0Y31wWQ7si4++2ZX7zW\nlfoDkuIcA4n9dnLKf6Oa6wJINd/2vR98IiApzjmQ2KLqmZ9FM9cFkLLyfe8X5gCS4hwEiX2U\nXnNpFHNdAOni733vX2kPSIpzEiQ2nk5Za3+uCyAtaLm4XNNKZ533MyApzlGQ2ED9xZrt5gJI\nretQ1TPOyKZGTazcuwGQJHIWpMKbqLPtJxZyAaSL28jcuwGQJHIWJLa9FT1gd64LIMkFSBI5\nDBJbdwb9x+Zcd0Aq2ucNkBTnNEhsea209+zNdQGkTf+oihcai81gx0FiMzKrfWNrrgsgXVGj\n16DB3gBJcc6DxCam1FtlZ64LIFX93gogQLKRAyGxwdR0k425LoBUNw+QAMkoIqTCHtQpX36u\nCyA9OgKQAMkoIiS2swPdLj/XBZCOXdV20Ghv/KyD4/r0HF7AP1zQeSkg2cmRkNjmpvSs9FwX\nQBpNVPG3diMGb84bO7DM/9G+27sBkq2cCYmtqJM6WXauCyDV7/bdxi3ejHNYl02e70o3rDKk\nTbodkGzlUEhsXnaVLyXnugBSVqVfNizpVu55e/9H/o/6HvFBKjngae/uSB3R9kfcxlZ7j8dm\n7u7D2oHYDN5TEpu5u4u1ohhNLrW01Vspp2yQm3soZisui9FcD6QQl4h7LgRAumiVVqG5d+hv\nh0z0fnCw90rNB2lhC0/LKm6M3NhTdFVpvNcQt8r4qQBI33b8pcKGc+/U3/ohvfii5oe0Wv+j\n7dqjkSrVjkXcxlbHymIz17Pi47EZfKw8NnOPlsRqxUctrvjwVTRUam7cVyxdiVYS6qLgkNo2\npGqnejPO+cF3026qfnpl7yIDkjf8jCSRU39G8rS2QeqHMnNd8DNSu05Gxjl7umzQtANdV+un\nx3Tr2bNnl+6jAMlGDobE5mbW/FlirgsgGR1cz0+OfmjzjmceKdfmT/ft/m3zDwCSjZwMiY2i\niySeNdJFkBbU4ieLx/fuNcqz+Zihvo9x085ejobEutOd1jd2A6SZvdq1bdu2dW7tYLwqBkgS\nORvSn01oguWNXQBpCqU3pAZV6MpZgKQ4Z0NiP+RmLbS6rQsgtbiuSEv7reTlK4oASXEOh8Te\noNOtPqTCBZByZ2pa2q+a9tBAQFKc0yGxe+m6QmtbugBSlTmaVn2Rpi1uAEiKczyk/NY0zNqW\nLoB00c3HtPOGaN
q0qoCkOMdDYr+flD7N0oYugPQuddKeTus3/OTLAElxzofEvkiv86uV7VwA\nSZsyWiu+mqjRckBSnAsgsaHUysrfZd0AyduGNcetOAIkmdwAqfB6GmBhMzdAKt6paYcnv7AJ\nkFTnBkhsy1kpkyJv5QJIa+uO1kpaEtVYAUiKcwUktii76vcRN3IBpJvO36i9S69svOxmQFKc\nOyCx/1HjLZG2cQGkuu9r2o3NNO39RoCkOJdAYr3pxkibuABS5kKttObjmjY/E5AU5xZIeRfS\n8xE2cQGkRm9o82mhpk2qD0iKcwsktrJWxqzwW7gA0t31njj1zFKt4AL8jKQ610Bin6Sd/EfY\nDVwAaWdrqr1U026pUfE5UAAp2twDiT1K7XeFu9wFkDTtgP632OW7rDgCJJlcBKmgIw0Kd7kr\nIGmH5xRoFgMkiVwEia1vlPJOmIvdAWkLfQ5IMchNkNi8zBN+Cn0pIAGS/VwFiT1P520PeSEg\nAZL93AWJ9aDeIS8DJECyn8sgbW9GL4e6zOmQthdrW45px1buB6QY5DJIbFn1rK9CXOR0SFVm\naGTpAX2AZCO3QWLvpDRaH/wSp0PKvn0RvbHYHyApznWQ2AC6JvjTCjkdUk8yBUiKcx+k/DY0\nNOgFTodUMvMtGjbZHyApzn2Q2Jp6qVODne90SJ46rbMCCJBs5EJIbHZG7WBPK+QCSJq2e+bE\nN+ZaesJiQJLKjZDYMGoZ5GmFXACp7NEM/QekqmMASXWuhFTYme6tfK4LII2hGyfNnvnatfQ2\nICnOlZDYlrPplUpnugBS00d87++5GJAU505IbFFO1e8qnucCSFlf+d7PygYkxbkUEnuNGm+u\ncJYLIFWd4Xv/RTVAUpxbIbE76YYK57gA0uVXHtPfHbnmCkBSnGsh7WxFowLPcQGkWSmn9B/x\nbL8GqV8CkuJcC4mtqpUxM+AMF0DSPm+i//r7fEsvIQtIMrkXEvs0re5q88dugKRpeT9afOoT\nQJLKxZDY43S5+WmF3AEJT34Sm8FuhlTQiR4yfegOSHiEbGwGuxkS23BKylviI0ACJPu5GhL7\nukqN5fwDQAIk+7kbEnuBzt1mnAYkQLKfyyGxW+mfxkl3QMKTn8RmsNsh7biAxvtPugBSizW+\n9580BSTFuR0S+7FG1gLfKRdA8j+LUMlwvNCY6lwPib2X0mid94TjIZme+wQPo1AdILEH6OoC\n/b3jIa16ibrerdf339sBSXGAxHZdQU/q7x0PSdOuXW8FECDZCJAYW1s/9WPmCkhyAZJEgORp\nTmbNFa6AdKJRLiApDpD0RtDFeW6A1NXbJdnNBgKS4gDJW1fq6wZI/vLbzwQkxQGSty1n0UT3\nQNKWtwAkxQGSr8XZNda6B1I+nkVIdYDk7z90WalbIJWPbAhIigMkf4VX0wuOh9TcW7PaNAiQ\nFAdIRmvrZC2JzeREg3RRx5eOAZLiAIk3hZruiM3khIEkFyBJBEi8Q33o/thMThxIG6e/P3MH\nIMUgQOIdOnhG6ucxmZwokKY18971u803gKQ8QOId0uanNdgQi8kJAmkc5fR6cfL4W3NS3wQk\n1QES75BW9AB1j8XkxIC0KrXtTu+JvMsyLL0EJiBJBEg8D6SdF9LrMZicGJDuqLnbf2p3zXsB\nSXGAxNPvIvRdlRqr1E9ODEin9eMn72kMSIoDJJ73vnYj6IpC5ZMTA1LWWH7y/3AXIdUBEs8L\nqbAjjVY+OTEgVRvNTz6PxyOpDpB4vnt//1oza5HqyYkB6fx/8pOdmwOS4gCJ538YxWQ6P0/x\n5MSA9HjGav+pJalDAUlxgMQzHo90Ez2seHJiQNpZ4+Q5+vuyKbVO3F2ZDSBFFSDxDEgbG6ZO\nVzs5MSBpC6rTaTf16VKfai+x4giQZAIkHn+E7PTUU7YonZwgkLStA04motMH5VtyBEgyARJP\nPNS8P/VSOjlRIHk6sOOgNUWAJBcg8QSkvHPpTZWTEwiSTEX7InXUwja2OlASm7meFR+MzeD9\npbGZu++IdihGk8tiNPewWPHSrBPXKZwcqxUXa8UhLjmgAtLR45Eq00oibmOrkvLYzI3dio/H\nasWlSbjiUn56JF1zTN3kv2LFgYlHwOKBfQHhpp0o9jftGCtoS+PUTU7Sm3aAJBEg8QKe127V\nCTk/KJsMSLIBEi/JIbEJdOFOVZMBSTZA4iU7JNaVBquaDEiyARIv6SFtaJA+V9FkQJINkHhJ\nD4lNTTltq5rJgCQbIPGSHxK7i+5SMxmQZAMkngMgbWuc8oGSyYAkGyDxHACJfZVRe62KyYAk\nGyDxnACJPUF/VzEZkGQDJJ4jIOW3ogkKJgOSbIDEcwQk9lO1qj9GPxmQZAMknjMgsXF0ya6o\nJwOSbIDEcwgkdi0NjXoyIMkGSDynQPqjTsb8aCcDkmyAxHMKJPYunb09ysmAJBsg8RwDid1G\n90Y5GZBkAySecyD9eUbKh9FNBiTZAInnHEhsdlr99VFNBiTZAInnIEjsEbo+qsmAJBsg8ZwE\nKf9iejWayYAkGyDxnASJLcmuvjKKyYAkGyDxHAWJjabWBfYnA5JsgMRzFqTCq2iE/cmAJBsg\n8ZwFif1WK/Nb25MBSTZA4jkMEnuLmuywOxmQZAMkntMgsVvoX3YnA5JsgMRzHKRNjVK/sDkZ\nkGQDJJ7jILGZaQ022JsMSLIBEs95kNj91MPeZECSDZB4DoSUdx69YWsyIMkGSDwHQmLfVTlh\nlZ3JgCQbIPGcCIk9S1cU2pgMSLIBEs+RkAra0fM2JgOSbIDEcyQk9mvNrMXykwFJNkDiORMS\n+x9dkCc9GZBkAySeQyGxG+lR6cmAJBsg8ZwKaWPD1OmykwFJNkDiORUS+zTl1C2SkwFJNkDi\nORYSu4dul5wMSLIBEs+5kPKa0mS5yYAkGyDxnAuJLcw88XepyYAkGyDxHAyJDaGOUndwACTZ\nAInnZEgFl9H/yUwGJNkAiedkSOzn3JwfJCYDkmyAxHM0JPYyXbjT+mRAkg2QeM6GxLrQk9Y3\nBiTZAInncEjrTkqfa3ljQJINkHgOh8Q+Tjl9q9VtAUk2QOI5HRK7k+62uikgyQZIPMdD2tY4\nZYrFTQFJNkDiOR4Sm5d+0jprWwKSbIDEcz4k9jjdYm1DQJINkHgugJTfjKZa2hCQZAMkngsg\nsXlpjf60sh0gyQZIPDdAYn3pISubAZJsgMRzBaQtJ6cvtLAZIMkGSDxXQGJT6KJdkbcCJNkA\niecOSOx6Gh15I0CSDZB4LoH0e41qkZ8OHJBkAySeSyCxMXRVxG0ASTZA4rkFUsGl9GakbQBJ\nNkDiuQUSW5pZN9IL+QGSbIDEcw0k9ijdEWELQJINkHjugZR3durM8FsAkmyAxHMPJDY9pXH4\nV6gAJNkAieciSKwXDQ57OSDJBkg8N0HaVC/z+3CXA5JsgMRzEyQ2kdqEe+pVQJINkHiugsSu\npfFhLgUk2QCJ5y5IK
6tW/y30pYAkGyDx3AWJPUs3hr4QkGQDJJ7LIO1qTu+GvBCQZAMknssg\nsW8zGoZ8wkhAkg2QeG6DxAZS/1AXJTKkg+P69Bxe4Du9Z+xt3Z9YB0h2AiRelJC2nZo6J8RF\niQxpxODNeWMHlnlPPzx4084Xeh0BJBsBEi9KSOxjOjfES70kMCTWZZPnu9INq/TTRaO2aVph\n5/WAZCNA4kULiXWjYcEvSGBIS7qVe97e/xE/Y21X/d8e3uGJ7Y3UUe1AxG1sta8kNnP3HtGK\nYjN4X2ls5u49rB2M0eTYrfhQdAM21M7+OegFZdHNDVmxVhzikv1WIc29Q387ZKLxcdGAyfq7\nhS08LQv/TxGKUW/StfFegr8yfioSpDv1txzS9nte0b9DaasHe1p7NFKl2rGI29jqWFls5npW\nfDw2g4+Vx2bu0ZJYrfho4q74SEeaHOz82K24JNRFViH94LtpN9X30aqeM0yX4WckifAzEi/q\nn5EYW1al1h9Bzk7gn5H2dNmgaQe6rvZ+8PutP5kvAySJAImnABJ7im4Ncm4CQ9JGP7R5xzOP\nlGvzp2vH+nlf8Am//rYTIPFUQMpvlvJJ5XMTGVLx+N69Rnk2HzNUW9XZ20xAshEg8VRAYvPS\nzthe6cxEhhQmQJIIkHhKILG7g7xABSDJBkg8t0La3CD964rnAZJsgMRzKyT2AV1c8QUqAEk2\nQOK5FhL7R6UXqAAk2QCJ515Iq2tU+yXwHECSDZB47oXE/kNXB54BSLIBEs/FkAouockBZwCS\nbIDEczEktiTzpI3mjwFJNkDiuRkSe4TuMn8ISLIBEs/VkPLOTp1l+hCQZAMknqshsWkpZ5le\noAKQZAMknrshsZ70hPgAkGQDJJ7LIa2vY3qBCkCSDZB4LofEXjW9QAUgyQZIPLdDYtfQi8ZJ\nQJINkHiuh7Sias01/pOAJBsg8VwPiQ2nm/ynAEk2QOIB0q7m9J7vFCDJBkg8QGLfGC9QAUiy\nARIPkBgbQAO87wFJNkDiAZL+AhVpC/T3gCQbIPEAydNHdJ7+AhWAJBsg8QBJ7yYazgBJPkDi\nAZLeulrZPwGSfIDEAyRvL9KVgCQfIPEAyVthe3oVkKQDJB4g+VqWVWsdIMkGSDxA8vcE9QQk\n2QCJB0j+8s9L+TIWcxkg2QiQREkGic1NPWtHTAYDknyAJEo2SOwueiQ2gwFJOkASJR2kzQ3T\nv4nJYECSDpBESQeJfUItCmIxF5CkAyRR8kEq+zv9JxZzAUk6QBIlIaTVNXJ/ibyZdIAkHSCJ\nkhASG13xBSqUBEjSAZIoGSEVtKK31M8FJOkASZSMkNiijMAXqFASIEkHSKKkhMQeoruVzwUk\n6QBJlJyQ8s4KeIEKJQGSdIAkSk5IbFpK07xIG0oGSNIBkihJIbFb6SnFcwFJOkASJSuk9bUz\nl6idC0jSAZIoWSGx/9FlheE3lAyQpAMkUdJCYtfQy0rnApJ0gCRKXkgrqtZcq3IuIEkHSKLk\nhcSGUTeVcwFJOkASJTGk/AvofYVzAUk6QBIlMST2TXqjrermApJ0gCRKZkisPw1UNxeQpAMk\nUVJD2naK7wUqlARI0gGSKKkhsQ/pvHxVcwFJOkASJTckdiM9q2ouIEkHSKIkh7TmBP0FKpQE\nSNIBkijJIbHx1FHRXECSDpBEyQ6psB1NVDMXkKQDJFGyQ2JLM+usVzIXkKQDJFHSQ2JP0C1K\n5gKSdIAkSn5I+RfQOyrmApJ0gCRKfkjsm4x6Kp5TCJCkAySRAyCxh6iPgrmAJB0giZwAKa9J\nytTo5wKSdIAkcgIkNjdNwd3AAUk6QBI5AhLrT/dEPReQpAMkkTMgbTs9dUa0cwFJOkASOQMS\nm5Zy5vYo5wKSdIAkcggkdic9EOVcQJIOkEROgfTnqWnzo5sLSNIBksgpkNinKWdH92zggCQd\nIIkcA4n1oMeimgtI0gGSyDmQNtVPXxjNXECSDpBEzoHE3qdmO6OYC0jSAZLIQZDYjTQ0irmA\nJB0giZwEaV3tzO/sz40xpKJ9kTpqYRtbHSiJzVzPig/GZvD+0tjM3XdEOxSjyWUxmns4Hiue\nRC13255brBWHuOSACkhHI1aqHYu8kZ2OlcVmrmfFx2M0uTxGc0uwYl64Fd9AY23PLdFKQl2k\nAhJu2kmEm3a8eNy0Y+z3mtk/2p2Ln5GkAySRsyCxCdTW7uv4AZJ0gCRyGCTWkV6wOReQpAMk\nkdMg/Vojd6W9uYAk0i8KqwAAC59JREFUHSCJnAaJjaEO9m7cAZJ0gCRyHKTCDjTB1lxAkg6Q\nRI6DxH6uWv1XO3MBSTpAEjkPEhtB19qZC0jSAZLIgZAKLqXXbcwFJOkASeRASGxpVq218nMB\nSTpAEjkREhtC3eTnApJ0gCRyJKT8C+kt6bmAJB0giRwJiX2bedIG2bmAJB0giZwJiQ2inrJz\nAUk6QBI5FFJeU/pIci4gSQdIIodCYgszGm6RmwtI0gGSyKmQ2L/oLrm5gCQdIIkcCynvnJRP\npOYCknSAJHIsJDY79ZQ/ZeYCknSAJHIuJNaXBsjMBSTpAEnkYEjbTkudJTEXkKQDJJGDIbHP\nU87aYX0uIEkHSCInQ2K308PWNwYk6QBJ5GhIm09OX2B5Y0CSDpBEjobEptC5lp9XH5CkAySR\nsyGx7vSE1U0BSTpAEjkc0vq6mYssbgpI0gGSyOGQ2Lt08S5rWwKSdIAkcjok1pmesbYhIEkH\nSCLHQ1p3Yub3ljYEJOkASeR4SOwValVgZTtAkg6QRM6HxP5Oo61sBkjSAZLIBZB+OyFnuYXN\nAEk6QBK5ABIbT+0sPK8+IEkHSCI3QCq8ksZH3gqQpAMkkRsgsRXVcn+JuBEgSQdIIldAYqPp\n6ojbAJJ0gCRyB6SCNvS/SNsAknSAJHIHJPZDlVprImwCSNIBksglkNgw6hJhC0CSDpBEboFU\n0Iomh98CkKQDJJFbILElWSf+EXYDQJIOkESugcSeoO5hLwck6QBJ5B5I+c3pnXCXA5J0gCRy\nDyT2TUa9jWEuBiTpAEnkIkjsIeod5lJAkg6QRG6ClNck5ePQlwKSdIAkchMkNjet0daQFwKS\ndIAkchUkdh/1C3kZIEkHSCJ3Qdp2RuqMUJcBknSAJHIXJDYt5cztIS4CJOkASeQySOwu+leI\nSwBJOkASuQ3Sn6emzQ9+CSBJB0git0Fin6WcnRf0AkCSDpBEroPEbqXHgp4PSNIBksh9kDY1\nSP8q2PmAJB0gidwHib1PzYK9aBIgSQdIIhdCYjfRkCDnApJ0gCRyI6R1dTIXVz4XkKQDJJEb\nIbHXqUXlF00CJOkASeRKSOx6GlHpPECSDpBE7oT0e83sHyueB0jSAZLInZDYBGpb8Xn1AUk6\nQBK5FBK7jsZWOAeQpAMkkVsh/Vojd2XgOYAkHSCJ3AqJjaX2gTfuAEk6QBK5FlLhFTQh4AxA\nkg6QRK6FxFZUrf6r+WNA
kg6QRO6FxJ6ja8wfApJ0gCRyMaSCS2mi6UNAkg6QRC6GxJZm1Vor\nPgIk6QBJ5GZIbAjdJD4AJOkASeRqSPkX0lv8A0CSDpBErobEvs08aYNxGpCkAySRuyGxx+hW\n46QKSAfH9ek5vKDyaUCSC5B4SQIprylN859UAWnE4M15YweWVToNSHIBEi9JILH5T+X7TymA\nxLps8nwnumFVxdOAJBkg8ZIFkkgBpCXdyj1v7/+owumiNZ4K9kXqqFYUcRtbHSiJzVzPig/G\nZvD+0tjM3XdEOxSjyWUxmns46VZcrBWHuOSAVUhz79DfDplY4fTCFp6Whf+nCDk+/nNOREh3\n6m/9kMTpdSM9rT8SqVLtWMRtbHW0LDZzj5TEasVHkm/F5TGaW6Idj9HkWK34eOgVW4X0g+/m\n3NSKp/XwM5JE+BmJ58qfkfZ02aBpB7qurngakCQDJJ4rIWmjH9q845lHyrX508VpQJIPkHju\nhFQ8vnevUZ7NxwwVpwFJPkDiuRNSmABJIkDiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0Di\nARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0Di\nARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0Di\nARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiAZJ0c0buiuX4GPTVyD/jvQTJFo/cGO8lSLZs\n5O/xXoJkK0eujLhNTCGNbvFHLMfHoJda/BzvJUg2scV38V6CZO+3mBfvJUj2eYvPI24DSAEB\nUuwDJOkAKfYBUuwDJOkAKfYBEkIoRICEkIIACSEFARJCCooJpIPj+vQcXlD5dOJmXuWesbd1\nf2JdfNcTuQrHdUHnpXFcjKUCVjyr7433/xjX5VjJvOTtz/a65YnQf0qOCaQRgzfnjR1YVul0\n4mZe5cODN+18odeROK8oUoHHdd/t3RIeknnFC3ovL/iiX3GcVxQx05LL+00oPvreP4tCbRoL\nSKzLJg/mG1ZVPJ24mVdZNGqbphV2Xh/vNYWvwnEdPen2RIcUsOJ+X8V5NZYyL3l/57Watrdz\nyFsqsYC0pFu55+39H1U8nbhVWuXarntDb50IBa54Sd8jCQ/JvOLdnb/6182Pro3ziiIWcJAf\nG1905IO+x0JtGwtIc+/Q3w6ZWPF04lZxlUUDJsdtLdYKWPHB3iu1hIdkXvG6zk9uL5rYY398\nVxSxgIO8Z2Dnzr1D30E4JpDuFJ/ffDpxq7DK7fe8Uh7H1VgpYMUvvqglASTTitd19txcKr11\nQXxXFDHzkksenLC/eGqvkDdUYgHpB993xKkVTydugatc1XNGXFdjJfOKV/YuSgJI5hWzzhs8\nbwcm+LUiYMkruui/frpreqhtYwFpTxfPYTrQdXXF04lbwCp/v/WnOC/HQuYVj+nWs2fPLt1H\nxXtN4TOvuKy357+qY90XxXtNETIv+efO+u8Ye/+lkLTRD23e8cwj5dr86eJ0Ymda8bF+U/TH\nPib6r79NK/Y+4PS2+QfivaQIma8VU3utZC/3TvRjbF5yce8JB4992m1nqE1jAql4fO9eozy3\nJscMFacTO9OKV3X2NjPeS4qQ+RjrJfxNu4AVl719+41PbIv3iiJmXvLW4b16PP5ryE1xFyGE\nFARICCkIkBBSECAhpCBAQkhBgISQggAJIQUBEkIKAqQEaRjp5bb/NOKWbc8JMcDSn2RD/GsU\nZYCUIA2jJ19//bWhp9CLkbbUKays/HUDpLgGSAmS30HRqbmR7oGmU3gZkBIsQEqQDAeP0DJN\n++aq3OyLJnk+anf5io65dXroz78xpVV2bospmpfCtZ5bgS3anuh9uGaH2scDBmjin7c9sUT/\n8NL6pWIiIMUmQEqQDAdD6TttQVr7GfP70wua1qlRqy8LPknro2kf0o0zZ15HM70U1nel5Wsm\n0See7fNTHwgcoIl//l+a7/nwz5RHTBMBKTYBUoJkOLg8fb92UWP9sS9dPDfyOpH+zN6dGmja\nqI6e7z8H0nv5KNzt+bodrNbZc9kE+jlwgCb+OUu/x3NiLK00TQSk2ARICdIwmpWfv/PHu+g+\nrYAePOLpVfpR65SjX9Yn1diqYTsBSbsz3XOTr10zPsAPyfTP/1a3TNNanWc+C5BiEyAlSL5f\nf1P6gKPaSvL3mdbpVP0ync2Bp5tVT0ujtiZIi2mclpcyhg/wQzL983fpa20LPW8+C5BiEyAl\nSMNo/Jw5cxfv03QJdy31xkyQ2qc9tejX3xqYIWlnX6C9mLaTD+CQ+D8/mDNQ+0/KNvNZgBSb\nAClBMv3SbQ/1MU5ySBuon+dESZUASKNp9SXXVRpg+ufaLQ20llcEnAVIsQmQEiTzn4EuqaF/\nY3p7SImAtIaGa/qfj1r7KPQl/RfbO9N60pTKA8Q/16bR5zQp4CxAik2AlCCZIX2TccHb84Zm\n3GH6jnS80cnTvnv0iityFx7SKfybhuu/+/4HVT8sBjw6Qe9b0z/Xjtc6o8qBgImAFJsAKUEK\nuGPC4qtzM84eU2KCpC1vk3PSvQdm1K65Tqew/aIMHcSn1Nc0wNdA0z/XtHvon4ETASk2AVIy\nN12/GwRKhAApiTvesnW8l4D8AVLStm3adWmJ/1pdbgmQkrZJKafPivcakBEgIaQgQEJIQYCE\nkIIACSEFARJCCgIkhBQESAgpCJAQUtD/B4iXRsAuE6zrAAAAAElFTkSuQmCC" + ] }, "metadata": { "image/png": { - "width": 420, - "height": 420 + "height": 420, + "width": 420 } - } + }, + "output_type": "display_data" } + ], + "source": [ + "# Create a data frame to store the results\n", + "results_y <- data.frame(\n", + " Alphas = model_y$lambda,\n", + " OutOfSampleR2 = 1 - model_y$cvm / var(y)\n", + ")\n", + "\n", + "results_d <- data.frame(\n", + " Alphas = model_d$lambda,\n", + " OutOfSampleR2 = 1 - model_d$cvm / var(d)\n", + ")\n", + "\n", + "# Plot Outcome Lasso-CV Model\n", + "ggplot(data = results_y, aes(x = Alphas, y = OutOfSampleR2)) +\n", + " geom_line() +\n", + " labs(\n", + " title = \"Outcome Lasso-CV Model: Out-of-sample R-squared as function of 
penalty level\",\n", + " x = \"Penalty Level\",\n", + " y = \"Out-of-sample R-squared\"\n", + " )\n", + "\n", + "# Plot Treatment Lasso-CV Model\n", + "ggplot(data = results_d, aes(x = (Alphas), y = OutOfSampleR2)) +\n", + " geom_line() +\n", + " labs(\n", + " title = \"Treatment Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n", + " x = \"Penalty Level\",\n", + " y = \"Out-of-sample R-squared\"\n", + " )\n" ] }, { "cell_type": "code", - "source": [], + "execution_count": null, "metadata": { - "id": "Mw8BaZ621Zvp" + "id": "Mw8BaZ621Zvp", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] } - ] -} \ No newline at end of file + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "name": "ir" + }, + "language_info": { + "name": "R" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 0d2f2d6ed21de76c8127a65c81a1a4e29c4fba46 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 05:10:10 -0700 Subject: [PATCH 056/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 4e83dbb1..3cef786f 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -140,6 +140,7 @@ jobs: git config --global user.email 'github-actions[bot]@users.noreply.github.com' git add ${{ matrix.directory }}/*.irnb ${{ matrix.directory }}/*.Rmd git commit -m 'Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in ${{ matrix.directory }}' + git pull git push env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From b0d857a30f6fb2d5baf18033e2573b4a30d42058 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 05:19:07 -0700 Subject: [PATCH 057/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 3cef786f..b154c9ca 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -61,7 +61,7 @@ jobs: run: | R -e ' library(lintr) - linters <- with_defaults(line_length_linter = line_length_linter(120)) + linters <- with_defaults(line_length_linter = line_length_linter(120), object_name_linter(styles = c("CamelCase", "dotted.case")) rmd_files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.Rmd$", full.names = TRUE) results <- lapply(rmd_files, function(file) { lints <- lint(file, linters) From 3f7bc9d75c7af9b16771b1e2ab68c51ae2c9dc55 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 05:20:07 -0700 Subject: [PATCH 058/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index b154c9ca..5cb0388c 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -61,7 +61,8 @@ jobs: run: | R -e ' library(lintr) - linters <- with_defaults(line_length_linter = line_length_linter(120), object_name_linter(styles = c("CamelCase", "dotted.case")) + linters <- with_defaults(line_length_linter = line_length_linter(120), + object_name_linter = object_name_linter(styles = c("CamelCase", "dotted.case")) 
rmd_files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.Rmd$", full.names = TRUE) results <- lapply(rmd_files, function(file) { lints <- lint(file, linters) From e53a828e8f5f3f9ea9f5edd92a923f18fbb9d646 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 05:26:48 -0700 Subject: [PATCH 059/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 5cb0388c..46a92393 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -62,7 +62,7 @@ jobs: R -e ' library(lintr) linters <- with_defaults(line_length_linter = line_length_linter(120), - object_name_linter = object_name_linter(styles = c("CamelCase", "dotted.case")) + object_name_linter = object_name_linter(styles = c("snake_case", "CamelCase", "dotted.case"))) rmd_files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.Rmd$", full.names = TRUE) results <- lapply(rmd_files, function(file) { lints <- lint(file, linters) From e63768d8eebfbe2db7e6244868f6e5183ed7ab4a Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 05:27:36 -0700 Subject: [PATCH 060/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 46a92393..06b90451 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -62,7 +62,7 @@ jobs: R -e ' library(lintr) linters <- with_defaults(line_length_linter = line_length_linter(120), - object_name_linter = object_name_linter(styles = c("snake_case", "CamelCase", "dotted.case"))) + object_name_linter = object_name_linter(styles = c("snake_case", "CamelCase"))) rmd_files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.Rmd$", full.names = TRUE) results <- lapply(rmd_files, function(file) { lints <- lint(file, linters) From 0595ea0ca016169b0aca471e9b4bb205741e23f8 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 05:30:20 -0700 Subject: [PATCH 061/261] Update r_convergence_hypothesis_double_lasso.irnb --- ...r_convergence_hypothesis_double_lasso.irnb | 64 ++++++++----------- 1 file changed, 26 insertions(+), 38 deletions(-) diff --git a/PM2/r_convergence_hypothesis_double_lasso.irnb b/PM2/r_convergence_hypothesis_double_lasso.irnb index ee0bdb8a..56196c9b 100644 --- a/PM2/r_convergence_hypothesis_double_lasso.irnb +++ b/PM2/r_convergence_hypothesis_double_lasso.irnb @@ -605,7 +605,7 @@ "source": [ "## Create the outcome variable y and covariates x\n", "y <- growth$Outcome\n", - "x <- growth[-which(colnames(growth) %in% c(\"intercept\"))]" + "X <- growth[-which(colnames(growth) %in% c(\"intercept\"))]" ] }, { @@ -631,21 +631,21 @@ } ], "source": [ - "fit <- lm(Outcome ~ ., data=x)\n", + "fit <- lm(Outcome ~ ., data=X)\n", "est <- summary(fit)$coef[\"gdpsh465\", 1]\n", "\n", - "HCV.coefs <- vcovHC(fit, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", - "se <- sqrt(diag(HCV.coefs))[2] # Estimated std errors\n", + "hcv.coefs <- vcovHC(fit, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", + "se <- sqrt(diag(hcv.coefs))[2] # Estimated std errors\n", "\n", "# print unconditional effect of gdpsh465 and the corresponding standard error\n", "cat (\"The estimated coefficient on gdpsh465 is\", est,\n", " 
\" and the corresponding robust standard error is\", se)\n", "\n", "# Calculate the 95% confidence interval for 'gdpsh465'\n", - "lower_ci <- est - 1.96 * se\n", - "upper_ci <- est + 1.96 * se\n", + "lower.ci <- est - 1.96 * se\n", + "upper.ci <- est + 1.96 * se\n", "\n", - "cat (\"95% Confidence Interval: [\", lower_ci, \",\", upper_ci, \"]\")" + "cat (\"95% Confidence Interval: [\", lower.ci, \",\", upper.ci, \"]\")" ] }, { @@ -727,8 +727,8 @@ "outputs": [], "source": [ "y <- growth$Outcome\n", - "w <- growth[-which(colnames(growth) %in% c('Outcome', 'intercept', 'gdpsh465'))]\n", - "d <- growth$gdpsh465" + "W <- growth[-which(colnames(growth) %in% c('Outcome', 'intercept', 'gdpsh465'))]\n", + "D <- growth$gdpsh465" ] }, { @@ -762,23 +762,23 @@ }, "outputs": [], "source": [ - "double_lasso <- function(y, d, w) {\n", + "double_lasso <- function(y, D, W) {\n", " # residualize outcome with Lasso\n", - " yfit_rlasso <- rlasso(w, y, post = FALSE)\n", - " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(w))\n", + " yfit_rlasso <- rlasso(W, y, post = FALSE)\n", + " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))\n", " yres <- y - as.numeric(yhat_rlasso)\n", "\n", "\n", " # residualize treatment with Lasso\n", - " dfit_rlasso <- rlasso(w, d, post = FALSE)\n", - " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(w))\n", - " dres <- d - as.numeric(dhat_rlasso)\n", + " dfit_rlasso <- rlasso(W, D, post = FALSE)\n", + " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W))\n", + " dres <- D - as.numeric(dhat_rlasso)\n", "\n", " # rest is the same as in the OLS case\n", " hat <- mean(yres * dres) / mean(dres^2)\n", " epsilon <- yres - hat * dres\n", - " v <- mean(epsilon^2 * dres^2) / mean(dres^2)^2\n", - " stderr <- sqrt(v / length(y))\n", + " V <- mean(epsilon^2 * dres^2) / mean(dres^2)^2\n", + " stderr <- sqrt(V / length(y))\n", "\n", " return(list(hat = hat, stderr = stderr))\n", "}" @@ -795,7 +795,7 @@ }, "outputs": [], "source": [ - "results <- double_lasso(y, d, w)\n", + "results <- double_lasso(y, D, W)\n", "hat <- results$hat\n", "stderr <- results$stderr\n", "# Calculate the 95% confidence interval\n", @@ -886,7 +886,7 @@ "\n", "# Define LassoCV models for y and D\n", "model_y <- cv.glmnet(\n", - " x = as.matrix(w),\n", + " x = as.matrix(W),\n", " y = y,\n", " alpha = 1, # Lasso penalty\n", " nfolds = n_folds,\n", @@ -894,8 +894,8 @@ ")\n", "\n", "model_d <- cv.glmnet(\n", - " x = as.matrix(w),\n", - " y = d,\n", + " x = as.matrix(W),\n", + " y = D,\n", " alpha = 1, # Lasso penalty\n", " nfolds = n_folds,\n", " family = \"gaussian\"\n", @@ -906,12 +906,12 @@ "best_lambda_d <- model_d$lambda.min\n", "\n", "# Fit Lasso models with the best lambda values\n", - "lasso_model_y <- glmnet(as.matrix(w), y, alpha = 1, lambda = best_lambda_y)\n", - "lasso_model_d <- glmnet(as.matrix(w), d, alpha = 1, lambda = best_lambda_d)\n", + "lasso_model_y <- glmnet(as.matrix(W), y, alpha = 1, lambda = best_lambda_y)\n", + "lasso_model_d <- glmnet(as.matrix(W), D, alpha = 1, lambda = best_lambda_d)\n", "\n", "# Calculate the residuals\n", - "res_y <- y - predict(lasso_model_y, s = best_lambda_y, newx = as.matrix(w))\n", - "res_d <- d - predict(lasso_model_d, s = best_lambda_d, newx = as.matrix(w))" + "res_y <- y - predict(lasso_model_y, s = best_lambda_y, newx = as.matrix(W))\n", + "res_d <- D - predict(lasso_model_d, s = best_lambda_d, newx = as.matrix(W))" ] }, { @@ -1052,7 +1052,7 @@ "\n", "results_d <- data.frame(\n", " Alphas = model_d$lambda,\n", - " OutOfSampleR2 = 1 - model_d$cvm / var(d)\n", + " 
OutOfSampleR2 = 1 - model_d$cvm / var(D)\n", ")\n", "\n", "# Plot Outcome Lasso-CV Model\n", @@ -1073,18 +1073,6 @@ " y = \"Out-of-sample R-squared\"\n", " )\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Mw8BaZ621Zvp", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [] } ], "metadata": { From 1eed96859137c960f9f6885332b28a7464d4309b Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 05:39:23 -0700 Subject: [PATCH 062/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 06b90451..e05069b8 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -139,9 +139,9 @@ jobs: run: | git config --global user.name 'github-actions[bot]' git config --global user.email 'github-actions[bot]@users.noreply.github.com' + git pull git add ${{ matrix.directory }}/*.irnb ${{ matrix.directory }}/*.Rmd git commit -m 'Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in ${{ matrix.directory }}' - git pull git push env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 96f81f1ccf5735ec96ee4186c604b066b1bd8ec8 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 05:39:53 -0700 Subject: [PATCH 063/261] Update r_experiment_non_orthogonal.irnb --- PM2/r_experiment_non_orthogonal.irnb | 789 ++++++++++++++++----------- 1 file changed, 459 insertions(+), 330 deletions(-) diff --git a/PM2/r_experiment_non_orthogonal.irnb b/PM2/r_experiment_non_orthogonal.irnb index 833da406..21b9b719 100644 --- a/PM2/r_experiment_non_orthogonal.irnb +++ b/PM2/r_experiment_non_orthogonal.irnb @@ -11,28 +11,39 @@ }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mS89_Re5ECjm", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "install.packages(\"hdm\")\n", "library(hdm)\n", "library(stats)" - ], - "metadata": { - "id": "mS89_Re5ECjm" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "## Generating RCT data" - ], "metadata": { "id": "VczKl9DMouLw" - } + }, + "source": [ + "## Generating RCT data" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zHRwbkqncrH_", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "gen_data <- function(n, d, p, delta, base) {\n", " X <- matrix(rnorm(n * d), nrow = n, ncol = d)\n", @@ -42,38 +53,44 @@ " y <- y1 * D + y0 * (1 - D)\n", " return(list(y=y, D=D, X=X))\n", "}" - ], - "metadata": { - "id": "zHRwbkqncrH_" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "B3BX59QhcrK4", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "n <- 100 # n samples\n", "d <- 100 # n features\n", "delta <- 1.0 # treatment effect\n", "base <- 0.3 # baseline outcome" - ], - "metadata": { - "id": "B3BX59QhcrK4" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "## Two Means Estimator" - ], "metadata": { "id": "A8lx8FOpoqzj" - } + }, + "source": [ + "## Two Means Estimator" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uKriPptNcrQo", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# Simple two means estimate and calculation of 
variance\n", "twomeans <- function(y, D) {\n", @@ -83,17 +100,20 @@ " V1 <- var(y[D == 1]) / mean(D) # asymptotic variance of the mean of outcome of treated\n", " hat <- hat1 - hat0 # estimate of the treatment effect\n", " stderr <- sqrt((V0 + V1) / n) # standard error of the estimate of the treatment effect\n", - " return(list(hat=hat, stderr=stderr))\n", + " return(list(hat = hat, stderr = stderr))\n", "}" - ], - "metadata": { - "id": "uKriPptNcrQo" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kUAwErtycrTf", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# Set the random seed for reproducibility\n", "set.seed(125)\n", @@ -106,36 +126,39 @@ "\n", "# Calculate estimation quantities\n", "twomeans(y, D)" - ], - "metadata": { - "id": "kUAwErtycrTf" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "## Partialling-Out Estimator" - ], "metadata": { "id": "LxS_DAjWoyAk" - } + }, + "source": [ + "## Partialling-Out Estimator" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Bszks8xNcrWV", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# We implement the partialling out version of OLS (for pedagogical purposes)\n", "partialling_out <- function(y, D, W) {\n", " # Residualize outcome with OLS\n", " yfit <- lm(y ~ W)\n", " yhat <- predict(yfit, as.data.frame(W))\n", - " yres <- y-as.numeric(yhat)\n", + " yres <- y - as.numeric(yhat)\n", "\n", " # Residualize treatment with OLS\n", " Dfit <- lm(D ~ W)\n", " Dhat <- predict(Dfit, as.data.frame(W))\n", - " Dres <- D-as.numeric(Dhat)\n", + " Dres <- D - as.numeric(Dhat)\n", "\n", " # Calculate final residual ~ residual OLS estimate\n", " hat <- mean(yres * Dres) / mean(Dres^2)\n", @@ -145,100 +168,112 @@ "\n", " # Calculate variance of the treatment effect\n", " V <- mean(epsilon^2 * Dres^2) / mean(Dres^2)^2\n", - " stderr = sqrt(V / length(y))\n", + " stderr <- sqrt(V / length(y))\n", "\n", " return(list(hat = hat, stderr = stderr))\n", "}" - ], - "metadata": { - "id": "Bszks8xNcrWV" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "partialling_out(y, D, cbind(D * X, X))" - ], + "execution_count": null, "metadata": { - "id": "eCoa9F1gcrY_" + "id": "eCoa9F1gcrY_", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "partialling_out(y, D, cbind(D * X, X))" + ] }, { "cell_type": "markdown", - "source": [ - "## Double Lasso Partialling-Out Estimator" - ], "metadata": { "id": "3C5agjr2o0wA" - } + }, + "source": [ + "## Double Lasso Partialling-Out Estimator" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YcHiwiJ1jtFz", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# Now we simply replace OLS with Lasso to implement the Double Lasso process\n", "\n", "double_lasso <- function(y, D, W) {\n", " # residualize outcome with Lasso\n", - " yfit.rlasso <- rlasso(W,y, post=FALSE)\n", - " yhat.rlasso <- predict(yfit.rlasso, as.data.frame(W))\n", - " yres <- y - as.numeric(yhat.rlasso)\n", + " yfit_rlasso <- rlasso(W, y, post = FALSE)\n", + " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))\n", + " yres <- y - as.numeric(yhat_rlasso)\n", "\n", "\n", " # residualize treatment with Lasso\n", - " Dfit.rlasso <- rlasso(W,D, post=FALSE)\n", - " Dhat.rlasso <- predict(Dfit.rlasso, 
as.data.frame(W))\n", - " Dres <- D - as.numeric(Dhat.rlasso)\n", + " dfit_rlasso <- rlasso(W, D, post = FALSE)\n", + " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W))\n", + " Dres <- D - as.numeric(dhat_rlasso)\n", "\n", " # rest is the same as in the OLS case\n", " hat <- mean(yres * Dres) / mean(Dres^2)\n", " epsilon <- yres - hat * Dres\n", " V <- mean(epsilon^2 * Dres^2) / mean(Dres^2)^2\n", - " stderr = sqrt(V / length(y))\n", + " stderr <- sqrt(V / length(y))\n", "\n", " return(list(hat = hat, stderr = stderr))\n", "}" - ], - "metadata": { - "id": "YcHiwiJ1jtFz" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "double_lasso(y, D, cbind(D * X, X))" - ], + "execution_count": null, "metadata": { - "id": "HBeGQAW9jtIa" + "id": "HBeGQAW9jtIa", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "double_lasso(y, D, cbind(D * X, X))" + ] }, { "cell_type": "markdown", - "source": [ - "# Simulation" - ], "metadata": { "id": "dnouR1CPo3tF" - } + }, + "source": [ + "# Simulation" + ] }, { "cell_type": "markdown", - "source": [ - "### Two-Means" - ], "metadata": { "id": "qS3i_zabo8e1" - } + }, + "source": [ + "### Two-Means" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "boYM55VIlzch", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# We now check the distributional properties of the different estimators across experiments\n", "# First is the simple two means estimate\n", @@ -271,82 +306,100 @@ " hats[i] <- hat\n", " stderrs[i] <- stderr\n", "}\n" - ], - "metadata": { - "id": "boYM55VIlzch" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "77Nr_nANngWz", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# Calculate average coverage (should be .95 ideally)\n", "coverage_rate <- mean(cov)\n", "\n", "cat(\"Coverage Rate (95% CI):\", coverage_rate, \"\\n\")" - ], - "metadata": { - "id": "77Nr_nANngWz" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" - ], + "execution_count": null, "metadata": { - "id": "3EdakeDKmAv4" + "id": "3EdakeDKmAv4", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" + ] }, { "cell_type": "code", - "source": [ - "mean(hats) # mean of estimate; measures how biased the estimate is (should be =delta ideally)" - ], + "execution_count": null, "metadata": { - "id": "hNt5QmEKmCLo" + "id": "hNt5QmEKmCLo", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "mean(hats) # mean of estimate; measures how biased the estimate is (should be =delta ideally)" + ] }, { "cell_type": "code", - "source": [ - "sd(hats)# standard deviation of estimates; should be close to the standard errors we calculated for the CIs" - ], + "execution_count": null, "metadata": { - "id": "ku_EVTfemM_I" + "id": "ku_EVTfemM_I", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "sd(hats)# standard deviation of estimates; should be close to the standard errors we calculated for the CIs" + ] }, { "cell_type": "code", - "source": 
[ - "mean(stderrs)" - ], + "execution_count": null, "metadata": { - "id": "4r5MP3PYmODP" + "id": "4r5MP3PYmODP", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "mean(stderrs)" + ] }, { "cell_type": "markdown", - "source": [ - "### Partialling Out" - ], "metadata": { "id": "gI1ph04ro9-7" - } + }, + "source": [ + "### Partialling Out" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GDcD1JEVmQ3A", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# Let's repeat this for the partialling out process (OLS), controlling for X\n", "\n", @@ -377,79 +430,97 @@ " hats[i] <- hat\n", " stderrs[i] <- stderr\n", "}" - ], - "metadata": { - "id": "GDcD1JEVmQ3A" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "mean(cov)" - ], + "execution_count": null, "metadata": { - "id": "fwgT8Nd3m1-S" + "id": "fwgT8Nd3m1-S", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "mean(cov)" + ] }, { "cell_type": "code", - "source": [ - "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" - ], + "execution_count": null, "metadata": { - "id": "Y0yYO1xBmt6Z" + "id": "Y0yYO1xBmt6Z", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" + ] }, { "cell_type": "code", - "source": [ - "mean(hats) # ols is heavily biased... mean of estimates very far from delta=1" - ], + "execution_count": null, "metadata": { - "id": "fd-9aNqImvLG" + "id": "fd-9aNqImvLG", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "mean(hats) # ols is heavily biased... 
mean of estimates very far from delta=1" + ] }, { "cell_type": "code", - "source": [ - "sd(hats)" - ], + "execution_count": null, "metadata": { - "id": "1kVDTlZunN-c" + "id": "1kVDTlZunN-c", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "sd(hats)" + ] }, { "cell_type": "code", - "source": [ - "mean(stderrs) # standard error severely under estimates the variance of the estimate; all this is due to overfitting" - ], + "execution_count": null, "metadata": { - "id": "Ta8s0QlunOrP" + "id": "Ta8s0QlunOrP", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "mean(stderrs) # standard error severely under estimates the variance of the estimate; all this is due to overfitting" + ] }, { "cell_type": "markdown", - "source": [ - "### Double Lasso" - ], "metadata": { "id": "cqiR8n54pAM3" - } + }, + "source": [ + "### Double Lasso" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CApI-UF0n3yx", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# Now let's try the double Lasso.\n", "n_experiments <- 100\n", @@ -479,73 +550,85 @@ " hats[i] <- hat\n", " stderrs[i] <- stderr\n", "}" - ], - "metadata": { - "id": "CApI-UF0n3yx" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "mean(cov)" - ], + "execution_count": null, "metadata": { - "id": "Ujw3sUicoOgK" + "id": "Ujw3sUicoOgK", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "mean(cov)" + ] }, { "cell_type": "code", - "source": [ - "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" - ], + "execution_count": null, "metadata": { - "id": "NveiO9xnoEgv" + "id": "NveiO9xnoEgv", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "spy0Fd8goGt6", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "mean(hats) # much closer to 1... 
(almost the same as two-means)\n", "sd(hats) # standard deviation much smaller than two means, which did not adjust for X\n", "mean(stderrs) # and close to the calculate standard errors; we correctly estimated uncertainty" - ], - "metadata": { - "id": "spy0Fd8goGt6" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "### Single Lasso" - ], "metadata": { "id": "PbroSXpNpCaj" - } + }, + "source": [ + "### Single Lasso" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Q7PkXPAdpEjh", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# Now we simply replace OLS with Lasso to implement the Double Lasso process\n", "\n", "double_lasso <- function(y, D, W) {\n", " # residualize outcome with Lasso\n", - " yfit.rlasso <- rlasso(W,y, post=FALSE)\n", - " yhat.rlasso <- predict(yfit.rlasso, as.data.frame(W))\n", - " yres <- y - as.numeric(yhat.rlasso)\n", + " yfit_rlasso <- rlasso(W, y, post = FALSE)\n", + " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))\n", + " yres <- y - as.numeric(yhat_rlasso)\n", "\n", "\n", " # residualize treatment with Lasso\n", - " Dfit.rlasso <- rlasso(W,D, post=FALSE)\n", - " Dhat.rlasso <- predict(Dfit.rlasso, as.data.frame(W))\n", - " Dres <- D - as.numeric(Dhat.rlasso)\n", + " dfit_rlasso <- rlasso(W, D, post=FALSE)\n", + " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W))\n", + " Dres <- D - as.numeric(dhat_rlasso)\n", "\n", " # rest is the same as in the OLS case\n", " hat <- mean(yres * Dres) / mean(Dres^2)\n", @@ -555,15 +638,18 @@ "\n", " return(list(hat = hat, stderr = stderr))\n", "}" - ], - "metadata": { - "id": "Q7PkXPAdpEjh" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "z86ziKegpLf_", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# Now let's try the double Lasso.\n", "\n", @@ -580,54 +666,63 @@ " # Calculate single lasso estimate\n", "\n", "\n", - " yfit.rlasso <- rlasso(cbind(D, X),y, post=FALSE)\n", - " hat <- yfit.rlasso$coefficients[2]\n", + " yfit_rlasso <- rlasso(cbind(D, X), y, post = FALSE)\n", + " hat <- yfit_rlasso$coefficients[2]\n", "\n", " hats[i] <- hat\n", "}" - ], - "metadata": { - "id": "z86ziKegpLf_" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")\n" - ], + "execution_count": null, "metadata": { - "id": "JVHyQxSNrLFw" + "id": "JVHyQxSNrLFw", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")\n" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5RK6CFCVrNVB", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# bias is comparable and larger than standard deviation.\n", "# Even if we could estimate the standard deviation, confidence intervals would undercover\n", "1 - mean(hats)\n", "sd(hats)" - ], - "metadata": { - "id": "5RK6CFCVrNVB" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "### Post-Lasso OLS" - ], "metadata": { "id": "-oeenku7sWC9" - } + }, + "source": [ + "### Post-Lasso OLS" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VjjbagsFsYLe", + "vscode": { + "languageId": "r" + } + }, + 
"outputs": [], "source": [ "# Now let's try the post-Lasso.\n", "n_experiments <- 100\n", @@ -645,8 +740,8 @@ "\n", " # run a big lasso y ~ D, X\n", " DX = cbind(D,X)\n", - " yfit.rlasso <- rlasso(DX, y, post=FALSE) # could just use this functionality\n", - " coefs <- yfit.rlasso$coefficients[2:n]\n", + " yfit_rlasso <- rlasso(DX, y, post=FALSE) # could just use this functionality\n", + " coefs <- yfit_rlasso$coefficients[2:n]\n", " selected_columns <- X[, abs(coefs) > 0.0]\n", " # run OLS on y ~ D, X[chosen by lasso]\n", " # calculate standard error as if lasso step never happened\n", @@ -662,72 +757,87 @@ " hats[i] <- hat\n", " stderrs[i] <- stderr\n", "}" - ], - "metadata": { - "id": "VjjbagsFsYLe" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "mean(cov)" - ], + "execution_count": null, "metadata": { - "id": "ZpFKqURXsdGg" + "id": "ZpFKqURXsdGg", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "mean(cov)" + ] }, { "cell_type": "code", - "source": [ - "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")\n" - ], + "execution_count": null, "metadata": { - "id": "-XlZuHuFsw3E" + "id": "-XlZuHuFsw3E", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")\n" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-_frAjzet5Oe", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "1 - mean(hats) # quite un-biased; bias < standard deviation\n", "sd(hats)" - ], - "metadata": { - "id": "-_frAjzet5Oe" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "K0sBTl3FtFeV", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# we under-estimated a bit the uncertainty; smaller estimated stderr than true std.\n", "# this is most prob a finite sample error, from ignoring the lasso variable selection step\n", "# this is an RCT and so even post lasso ols is Neyman orthogonal. 
We should expect good behavior.\n", "mean(stderrs)" - ], - "metadata": { - "id": "K0sBTl3FtFeV" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "### Not RCT Data" - ], "metadata": { "id": "EXqKbFLkuKZi" - } + }, + "source": [ + "### Not RCT Data" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "C3ZJTnpLt9-0", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "gen_data_nonRCT <- function(n, d, p, delta, base) {\n", " X <- matrix(rnorm(n * d), nrow = n, ncol = d)\n", @@ -735,15 +845,18 @@ " y <- delta * D + base - X[, 1] + rnorm(n, mean = 0, sd = 1)\n", " return(list(y = y, D = D, X = X))\n", "}" - ], - "metadata": { - "id": "C3ZJTnpLt9-0" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4dnILAzPuTR7", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# post-lasso\n", "n_experiments <- 100\n", @@ -760,9 +873,9 @@ "\n", "\n", " # run a big lasso y ~ D, X\n", - " DX = cbind(D,X)\n", - " yfit.rlasso <- rlasso(DX, y, post=FALSE) # could just use this functionality\n", - " coefs <- yfit.rlasso$coefficients[2:n]\n", + " DX = cbind(D, X)\n", + " yfit_rlasso <- rlasso(DX, y, post=FALSE) # could just use this functionality\n", + " coefs <- yfit_rlasso$coefficients[2:n]\n", " selected_columns <- X[, abs(coefs) > 0.0]\n", " # run OLS on y ~ D, X[chosen by lasso]\n", " # calculate standard error as if lasso step never happened\n", @@ -778,48 +891,60 @@ " hats[i] <- hat\n", " stderrs[i] <- stderr\n", "}" - ], - "metadata": { - "id": "4dnILAzPuTR7" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "mean(cov) # Oops! Post Lasso OLS severely undercovers; It is not Neyman orthogonal when D is correlated with X" - ], + "execution_count": null, "metadata": { - "id": "uOo3L6W9uXZd" + "id": "uOo3L6W9uXZd", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "mean(cov) # Oops! 
Post Lasso OLS severely undercovers; It is not Neyman orthogonal when D is correlated with X" + ] }, { "cell_type": "code", - "source": [ - "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" - ], + "execution_count": null, "metadata": { - "id": "oTTJJUyBux9u" + "id": "oTTJJUyBux9u", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" + ] }, { "cell_type": "code", - "source": [ - "mean(hats) # very heavily biased" - ], + "execution_count": null, "metadata": { - "id": "wrPu3ypku02M" + "id": "wrPu3ypku02M", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "mean(hats) # very heavily biased" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2ZU1ihfcu6z5", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# But now let's try the double Lasso.\n", "n_experiments <- 100\n", @@ -849,50 +974,57 @@ " hats[i] <- hat\n", " stderrs[i] <- stderr\n", "}" - ], - "metadata": { - "id": "2ZU1ihfcu6z5" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "mean(cov) # great coverage" - ], + "execution_count": null, "metadata": { - "id": "x1BfN7HEu_E9" + "id": "x1BfN7HEu_E9", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "mean(cov) # great coverage" + ] }, { "cell_type": "code", - "source": [ - "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" - ], + "execution_count": null, "metadata": { - "id": "4mSy5U0CvEBs" + "id": "4mSy5U0CvEBs", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ElXwDxR-vEzT", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "1 - mean(hats)\n", "sd(hats) # very small bias compared to standard deviation\n", "mean(stderrs)" - ], - "metadata": { - "id": "ElXwDxR-vEzT" - }, - "execution_count": null, - "outputs": [] + ] } ], "metadata": { + "colab": { + "provenance": [] + }, "hide_input": false, "kernelspec": { "display_name": "R", @@ -900,11 +1032,8 @@ }, "language_info": { "name": "R" - }, - "colab": { - "provenance": [] } }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} From 15f2820645b9531a4e25389780fdc20bd6429649 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 05:46:10 -0700 Subject: [PATCH 064/261] Update r_heterogenous_wage_effects.irnb --- PM2/r_heterogenous_wage_effects.irnb | 404 ++++++++++++++++++++++----- 1 file changed, 341 insertions(+), 63 deletions(-) diff --git a/PM2/r_heterogenous_wage_effects.irnb b/PM2/r_heterogenous_wage_effects.irnb index 4f0583c3..6524425f 100644 --- a/PM2/r_heterogenous_wage_effects.irnb +++ b/PM2/r_heterogenous_wage_effects.irnb @@ -20,16 +20,19 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "id": "3QN4EOYGQkmz", - "outputId": "a5a8f7f8-def7-4ca6-8c8f-973d00b7bd20", "colab": { "base_uri": "https://localhost:8080/" + }, + "id": "3QN4EOYGQkmz", + "outputId": "a5a8f7f8-def7-4ca6-8c8f-973d00b7bd20", + "vscode": { + "languageId": "r" } }, "outputs": [ { - "output_type": "stream", 
"name": "stderr", + "output_type": "stream", "text": [ "Installing package into ‘/usr/local/lib/R/site-library’\n", "(as ‘lib’ is unspecified)\n", @@ -54,17 +57,20 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "id": "fLiMuKqN_eQ-", - "outputId": "88233975-7a27-4614-d878-c718e6dcb072", "colab": { "base_uri": "https://localhost:8080/", "height": 1000 + }, + "id": "fLiMuKqN_eQ-", + "outputId": "88233975-7a27-4614-d878-c718e6dcb072", + "vscode": { + "languageId": "r" } }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "'data.frame':\t5150 obs. of 20 variables:\n", " $ wage : num 9.62 48.08 11.06 13.94 28.85 ...\n", @@ -90,7 +96,6 @@ ] }, { - "output_type": "display_data", "data": { "text/html": [ "\n", @@ -164,8 +169,144 @@ "\n", "
\n" ], - "text/markdown": "\nA data.frame: 5150 × 20\n\n| wage <dbl> | lwage <dbl> | sex <int> | shs <int> | hsg <int> | scl <int> | clg <int> | ad <int> | mw <int> | so <int> | we <int> | ne <int> | exp1 <dbl> | exp2 <dbl> | exp3 <dbl> | exp4 <dbl> | occ <dbl> | occ2 <int> | ind <dbl> | ind2 <int> |\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n| 9.615385 | 2.263364 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 7.0 | 0.4900 | 0.343000 | 0.24010000 | 3600 | 11 | 8370 | 18 |\n| 48.076923 | 3.872802 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 31.0 | 9.6100 | 29.791000 | 92.35210000 | 3050 | 10 | 5070 | 9 |\n| 11.057692 | 2.403126 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 18.0 | 3.2400 | 5.832000 | 10.49760000 | 6260 | 19 | 770 | 4 |\n| 13.942308 | 2.634928 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 25.0 | 6.2500 | 15.625000 | 39.06250000 | 420 | 1 | 6990 | 12 |\n| 28.846154 | 3.361977 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 22.0 | 4.8400 | 10.648000 | 23.42560000 | 2015 | 6 | 9470 | 22 |\n| 11.730769 | 2.462215 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1.0 | 0.0100 | 0.001000 | 0.00010000 | 1650 | 5 | 7460 | 14 |\n| 19.230769 | 2.956512 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 42.0 | 17.6400 | 74.088000 | 311.16960000 | 5120 | 17 | 7280 | 14 |\n| 19.230769 | 2.956512 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 37.0 | 13.6900 | 50.653000 | 187.41610000 | 5240 | 17 | 5680 | 9 |\n| 12.000000 | 2.484907 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 31.0 | 9.6100 | 29.791000 | 92.35210000 | 4040 | 13 | 8590 | 19 |\n| 19.230769 | 2.956512 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 4.0 | 0.1600 | 0.064000 | 0.02560000 | 3255 | 10 | 8190 | 18 |\n| 17.307692 | 2.851151 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 7.0 | 0.4900 | 0.343000 | 0.24010000 | 4020 | 13 | 8270 | 18 |\n| 12.019231 | 2.486508 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 30.0 | 9.0000 | 27.000000 | 81.00000000 | 4220 | 14 | 8270 | 18 |\n| 12.019231 | 2.486508 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 5.5 | 0.3025 | 0.166375 | 0.09150625 | 3600 | 11 | 8270 | 18 |\n| 13.461538 | 2.599837 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 20.5 | 4.2025 | 8.615125 | 17.66100625 | 3645 | 11 | 8190 | 18 |\n| 16.346154 | 2.793993 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 25.0 | 6.2500 | 15.625000 | 39.06250000 | 110 | 1 | 7870 | 17 |\n| 27.884615 | 3.328075 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 16.0 | 2.5600 | 4.096000 | 6.55360000 | 6355 | 19 | 770 | 4 |\n| 21.600000 | 3.072693 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 27.0 | 7.2900 | 19.683000 | 53.14410000 | 6320 | 19 | 770 | 4 |\n| 8.653846 | 2.158004 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 3.5 | 0.1225 | 0.042875 | 0.01500625 | 7410 | 20 | 570 | 3 |\n| 19.230769 | 2.956512 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 6.0 | 0.3600 | 0.216000 | 0.12960000 | 7000 | 20 | 4690 | 9 |\n| 13.186813 | 2.579217 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 8.0 | 0.6400 | 0.512000 | 0.40960000 | 4000 | 13 | 8680 | 20 |\n| 10.683761 | 2.368725 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 6.5 | 0.4225 | 0.274625 | 0.17850625 | 4300 | 15 | 8590 | 19 |\n| 11.538462 | 2.445686 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 8.5 | 0.7225 | 0.614125 | 0.52200625 | 3600 | 11 | 8190 | 18 |\n| 17.788462 | 2.878550 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 11.0 | 1.2100 | 1.331000 | 1.46410000 | 2310 | 8 | 7860 | 17 |\n| 19.230769 | 2.956512 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 11.0 | 1.2100 | 1.331000 | 1.46410000 | 6200 | 19 | 770 | 4 |\n| 16.304348 | 2.791432 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 
0 | 1 | 24.5 | 6.0025 | 14.706125 | 36.03000625 | 20 | 1 | 5480 | 9 |\n| 19.230769 | 2.956512 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 8.0 | 0.6400 | 0.512000 | 0.40960000 | 4800 | 16 | 7390 | 14 |\n| 14.423077 | 2.668829 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5.0 | 0.2500 | 0.125000 | 0.06250000 | 430 | 1 | 7390 | 14 |\n| 12.000000 | 2.484907 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 4.5 | 0.2025 | 0.091125 | 0.04100625 | 6330 | 19 | 770 | 4 |\n| 16.826923 | 2.822980 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 8.0 | 0.6400 | 0.512000 | 0.40960000 | 5400 | 17 | 4870 | 9 |\n| 19.670330 | 2.979111 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 26.0 | 6.7600 | 17.576000 | 45.69760000 | 6230 | 19 | 6570 | 11 |\n| ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |\n| 13.986014 | 2.638058 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 7.0 | 0.4900 | 0.343000 | 0.2401000 | 3820 | 12 | 9590 | 22 |\n| 15.865385 | 2.764140 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 2.0 | 0.0400 | 0.008000 | 0.0016000 | 2200 | 8 | 7870 | 17 |\n| 38.461538 | 3.649659 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 7.5 | 0.5625 | 0.421875 | 0.3164062 | 220 | 1 | 770 | 4 |\n| 28.846154 | 3.361977 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 5.0 | 0.2500 | 0.125000 | 0.0625000 | 1360 | 4 | 770 | 4 |\n| 24.475524 | 3.197674 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 24.0 | 5.7600 | 13.824000 | 33.1776000 | 2310 | 8 | 7860 | 17 |\n| 27.884615 | 3.328075 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 28.5 | 8.1225 | 23.149125 | 65.9750063 | 8140 | 21 | 770 | 4 |\n| 8.653846 | 2.158004 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 25.0 | 6.2500 | 15.625000 | 39.0625000 | 3930 | 12 | 7680 | 16 |\n| 12.019231 | 2.486508 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 19.0 | 3.6100 | 6.859000 | 13.0321000 | 2750 | 9 | 8560 | 19 |\n| 38.461538 | 3.649659 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 8.0 | 0.6400 | 0.512000 | 0.4096000 | 710 | 2 | 6870 | 12 |\n| 12.500000 | 2.525729 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 12.0 | 1.4400 | 1.728000 | 2.0736000 | 4010 | 13 | 8680 | 20 |\n| 35.256410 | 3.562647 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 6.0 | 0.3600 | 0.216000 | 0.1296000 | 2320 | 8 | 7860 | 17 |\n| 48.076923 | 3.872802 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 25.0 | 6.2500 | 15.625000 | 39.0625000 | 3255 | 10 | 8170 | 18 |\n| 9.615385 | 2.263364 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 20.0 | 4.0000 | 8.000000 | 16.0000000 | 4110 | 13 | 8680 | 20 |\n| 12.019231 | 2.486508 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 17.0 | 2.8900 | 4.913000 | 8.3521000 | 9620 | 22 | 6290 | 10 |\n| 12.019231 | 2.486508 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 31.5 | 9.9225 | 31.255875 | 98.4560063 | 5510 | 17 | 6380 | 10 |\n| 12.980769 | 2.563469 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 15.0 | 2.2500 | 3.375000 | 5.0625000 | 2010 | 6 | 9370 | 22 |\n| 26.442308 | 3.274965 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 34.0 | 11.5600 | 39.304000 | 133.6336000 | 2310 | 8 | 7860 | 17 |\n| 13.461538 | 2.599837 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 8.0 | 0.6400 | 0.512000 | 0.4096000 | 4720 | 16 | 8590 | 19 |\n| 19.711538 | 2.981204 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 8.0 | 0.6400 | 0.512000 | 0.4096000 | 2000 | 6 | 8090 | 18 |\n| 21.153846 | 3.051822 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 28.0 | 7.8400 | 21.952000 | 61.4656000 | 40 | 1 | 9170 | 21 |\n| 45.546559 | 3.818735 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 5.0 | 0.2500 | 0.125000 | 0.0625000 | 3255 | 10 | 8190 | 18 |\n| 22.596154 | 3.117780 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 
| 15.0 | 2.2500 | 3.375000 | 5.0625000 | 9620 | 22 | 5390 | 9 |\n| 16.826923 | 2.822980 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 11.0 | 1.2100 | 1.331000 | 1.4641000 | 7150 | 20 | 8770 | 21 |\n| 24.038462 | 3.179655 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 17.0 | 2.8900 | 4.913000 | 8.3521000 | 2550 | 8 | 9480 | 22 |\n| 13.846154 | 2.628007 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 10.0 | 1.0000 | 1.000000 | 1.0000000 | 800 | 2 | 770 | 4 |\n| 14.769231 | 2.692546 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 9.0 | 0.8100 | 0.729000 | 0.6561000 | 4700 | 16 | 4970 | 9 |\n| 23.076923 | 3.138833 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 12.0 | 1.4400 | 1.728000 | 2.0736000 | 4110 | 13 | 8680 | 20 |\n| 38.461538 | 3.649659 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 11.0 | 1.2100 | 1.331000 | 1.4641000 | 1550 | 4 | 3680 | 6 |\n| 32.967033 | 3.495508 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 10.0 | 1.0000 | 1.000000 | 1.0000000 | 2920 | 9 | 6570 | 11 |\n| 17.307692 | 2.851151 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 14.0 | 1.9600 | 2.744000 | 3.8416000 | 1610 | 5 | 7460 | 14 |\n\n", - "text/latex": "A data.frame: 5150 × 20\n\\begin{tabular}{llllllllllllllllllll}\n wage & lwage & sex & shs & hsg & scl & clg & ad & mw & so & we & ne & exp1 & exp2 & exp3 & exp4 & occ & occ2 & ind & ind2\\\\\n & & & & & & & & & & & & & & & & & & & \\\\\n\\hline\n\t 9.615385 & 2.263364 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 7.0 & 0.4900 & 0.343000 & 0.24010000 & 3600 & 11 & 8370 & 18\\\\\n\t 48.076923 & 3.872802 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 31.0 & 9.6100 & 29.791000 & 92.35210000 & 3050 & 10 & 5070 & 9\\\\\n\t 11.057692 & 2.403126 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 18.0 & 3.2400 & 5.832000 & 10.49760000 & 6260 & 19 & 770 & 4\\\\\n\t 13.942308 & 2.634928 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 25.0 & 6.2500 & 15.625000 & 39.06250000 & 420 & 1 & 6990 & 12\\\\\n\t 28.846154 & 3.361977 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 22.0 & 4.8400 & 10.648000 & 23.42560000 & 2015 & 6 & 9470 & 22\\\\\n\t 11.730769 & 2.462215 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 1.0 & 0.0100 & 0.001000 & 0.00010000 & 1650 & 5 & 7460 & 14\\\\\n\t 19.230769 & 2.956512 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 42.0 & 17.6400 & 74.088000 & 311.16960000 & 5120 & 17 & 7280 & 14\\\\\n\t 19.230769 & 2.956512 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 37.0 & 13.6900 & 50.653000 & 187.41610000 & 5240 & 17 & 5680 & 9\\\\\n\t 12.000000 & 2.484907 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 31.0 & 9.6100 & 29.791000 & 92.35210000 & 4040 & 13 & 8590 & 19\\\\\n\t 19.230769 & 2.956512 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 4.0 & 0.1600 & 0.064000 & 0.02560000 & 3255 & 10 & 8190 & 18\\\\\n\t 17.307692 & 2.851151 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 7.0 & 0.4900 & 0.343000 & 0.24010000 & 4020 & 13 & 8270 & 18\\\\\n\t 12.019231 & 2.486508 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 30.0 & 9.0000 & 27.000000 & 81.00000000 & 4220 & 14 & 8270 & 18\\\\\n\t 12.019231 & 2.486508 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 5.5 & 0.3025 & 0.166375 & 0.09150625 & 3600 & 11 & 8270 & 18\\\\\n\t 13.461538 & 2.599837 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 20.5 & 4.2025 & 8.615125 & 17.66100625 & 3645 & 11 & 8190 & 18\\\\\n\t 16.346154 & 2.793993 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 25.0 & 6.2500 & 15.625000 & 39.06250000 & 110 & 1 & 7870 & 17\\\\\n\t 27.884615 & 3.328075 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 16.0 & 2.5600 & 4.096000 & 6.55360000 & 6355 & 19 & 770 & 4\\\\\n\t 21.600000 & 3.072693 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 
27.0 & 7.2900 & 19.683000 & 53.14410000 & 6320 & 19 & 770 & 4\\\\\n\t 8.653846 & 2.158004 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 3.5 & 0.1225 & 0.042875 & 0.01500625 & 7410 & 20 & 570 & 3\\\\\n\t 19.230769 & 2.956512 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 6.0 & 0.3600 & 0.216000 & 0.12960000 & 7000 & 20 & 4690 & 9\\\\\n\t 13.186813 & 2.579217 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 8.0 & 0.6400 & 0.512000 & 0.40960000 & 4000 & 13 & 8680 & 20\\\\\n\t 10.683761 & 2.368725 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 6.5 & 0.4225 & 0.274625 & 0.17850625 & 4300 & 15 & 8590 & 19\\\\\n\t 11.538462 & 2.445686 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 8.5 & 0.7225 & 0.614125 & 0.52200625 & 3600 & 11 & 8190 & 18\\\\\n\t 17.788462 & 2.878550 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 11.0 & 1.2100 & 1.331000 & 1.46410000 & 2310 & 8 & 7860 & 17\\\\\n\t 19.230769 & 2.956512 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 11.0 & 1.2100 & 1.331000 & 1.46410000 & 6200 & 19 & 770 & 4\\\\\n\t 16.304348 & 2.791432 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 24.5 & 6.0025 & 14.706125 & 36.03000625 & 20 & 1 & 5480 & 9\\\\\n\t 19.230769 & 2.956512 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 8.0 & 0.6400 & 0.512000 & 0.40960000 & 4800 & 16 & 7390 & 14\\\\\n\t 14.423077 & 2.668829 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 5.0 & 0.2500 & 0.125000 & 0.06250000 & 430 & 1 & 7390 & 14\\\\\n\t 12.000000 & 2.484907 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 4.5 & 0.2025 & 0.091125 & 0.04100625 & 6330 & 19 & 770 & 4\\\\\n\t 16.826923 & 2.822980 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 8.0 & 0.6400 & 0.512000 & 0.40960000 & 5400 & 17 & 4870 & 9\\\\\n\t 19.670330 & 2.979111 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 26.0 & 6.7600 & 17.576000 & 45.69760000 & 6230 & 19 & 6570 & 11\\\\\n\t ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮\\\\\n\t 13.986014 & 2.638058 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 7.0 & 0.4900 & 0.343000 & 0.2401000 & 3820 & 12 & 9590 & 22\\\\\n\t 15.865385 & 2.764140 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 2.0 & 0.0400 & 0.008000 & 0.0016000 & 2200 & 8 & 7870 & 17\\\\\n\t 38.461538 & 3.649659 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 7.5 & 0.5625 & 0.421875 & 0.3164062 & 220 & 1 & 770 & 4\\\\\n\t 28.846154 & 3.361977 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 5.0 & 0.2500 & 0.125000 & 0.0625000 & 1360 & 4 & 770 & 4\\\\\n\t 24.475524 & 3.197674 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 24.0 & 5.7600 & 13.824000 & 33.1776000 & 2310 & 8 & 7860 & 17\\\\\n\t 27.884615 & 3.328075 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 28.5 & 8.1225 & 23.149125 & 65.9750063 & 8140 & 21 & 770 & 4\\\\\n\t 8.653846 & 2.158004 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 25.0 & 6.2500 & 15.625000 & 39.0625000 & 3930 & 12 & 7680 & 16\\\\\n\t 12.019231 & 2.486508 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 19.0 & 3.6100 & 6.859000 & 13.0321000 & 2750 & 9 & 8560 & 19\\\\\n\t 38.461538 & 3.649659 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 8.0 & 0.6400 & 0.512000 & 0.4096000 & 710 & 2 & 6870 & 12\\\\\n\t 12.500000 & 2.525729 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 12.0 & 1.4400 & 1.728000 & 2.0736000 & 4010 & 13 & 8680 & 20\\\\\n\t 35.256410 & 3.562647 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 6.0 & 0.3600 & 0.216000 & 0.1296000 & 2320 & 8 & 7860 & 17\\\\\n\t 48.076923 & 3.872802 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 25.0 & 6.2500 & 15.625000 & 39.0625000 & 3255 & 10 & 8170 & 18\\\\\n\t 9.615385 & 2.263364 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 20.0 & 4.0000 & 8.000000 & 16.0000000 & 4110 & 13 & 8680 
& 20\\\\\n\t 12.019231 & 2.486508 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 17.0 & 2.8900 & 4.913000 & 8.3521000 & 9620 & 22 & 6290 & 10\\\\\n\t 12.019231 & 2.486508 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 31.5 & 9.9225 & 31.255875 & 98.4560063 & 5510 & 17 & 6380 & 10\\\\\n\t 12.980769 & 2.563469 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 15.0 & 2.2500 & 3.375000 & 5.0625000 & 2010 & 6 & 9370 & 22\\\\\n\t 26.442308 & 3.274965 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 34.0 & 11.5600 & 39.304000 & 133.6336000 & 2310 & 8 & 7860 & 17\\\\\n\t 13.461538 & 2.599837 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 8.0 & 0.6400 & 0.512000 & 0.4096000 & 4720 & 16 & 8590 & 19\\\\\n\t 19.711538 & 2.981204 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 8.0 & 0.6400 & 0.512000 & 0.4096000 & 2000 & 6 & 8090 & 18\\\\\n\t 21.153846 & 3.051822 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 28.0 & 7.8400 & 21.952000 & 61.4656000 & 40 & 1 & 9170 & 21\\\\\n\t 45.546559 & 3.818735 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 5.0 & 0.2500 & 0.125000 & 0.0625000 & 3255 & 10 & 8190 & 18\\\\\n\t 22.596154 & 3.117780 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 15.0 & 2.2500 & 3.375000 & 5.0625000 & 9620 & 22 & 5390 & 9\\\\\n\t 16.826923 & 2.822980 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 11.0 & 1.2100 & 1.331000 & 1.4641000 & 7150 & 20 & 8770 & 21\\\\\n\t 24.038462 & 3.179655 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 17.0 & 2.8900 & 4.913000 & 8.3521000 & 2550 & 8 & 9480 & 22\\\\\n\t 13.846154 & 2.628007 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 10.0 & 1.0000 & 1.000000 & 1.0000000 & 800 & 2 & 770 & 4\\\\\n\t 14.769231 & 2.692546 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 9.0 & 0.8100 & 0.729000 & 0.6561000 & 4700 & 16 & 4970 & 9\\\\\n\t 23.076923 & 3.138833 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 12.0 & 1.4400 & 1.728000 & 2.0736000 & 4110 & 13 & 8680 & 20\\\\\n\t 38.461538 & 3.649659 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 11.0 & 1.2100 & 1.331000 & 1.4641000 & 1550 & 4 & 3680 & 6\\\\\n\t 32.967033 & 3.495508 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 10.0 & 1.0000 & 1.000000 & 1.0000000 & 2920 & 9 & 6570 & 11\\\\\n\t 17.307692 & 2.851151 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 14.0 & 1.9600 & 2.744000 & 3.8416000 & 1610 & 5 & 7460 & 14\\\\\n\\end{tabular}\n", + "text/latex": [ + "A data.frame: 5150 × 20\n", + "\\begin{tabular}{llllllllllllllllllll}\n", + " wage & lwage & sex & shs & hsg & scl & clg & ad & mw & so & we & ne & exp1 & exp2 & exp3 & exp4 & occ & occ2 & ind & ind2\\\\\n", + " & & & & & & & & & & & & & & & & & & & \\\\\n", + "\\hline\n", + "\t 9.615385 & 2.263364 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 7.0 & 0.4900 & 0.343000 & 0.24010000 & 3600 & 11 & 8370 & 18\\\\\n", + "\t 48.076923 & 3.872802 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 31.0 & 9.6100 & 29.791000 & 92.35210000 & 3050 & 10 & 5070 & 9\\\\\n", + "\t 11.057692 & 2.403126 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 18.0 & 3.2400 & 5.832000 & 10.49760000 & 6260 & 19 & 770 & 4\\\\\n", + "\t 13.942308 & 2.634928 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 25.0 & 6.2500 & 15.625000 & 39.06250000 & 420 & 1 & 6990 & 12\\\\\n", + "\t 28.846154 & 3.361977 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 22.0 & 4.8400 & 10.648000 & 23.42560000 & 2015 & 6 & 9470 & 22\\\\\n", + "\t 11.730769 & 2.462215 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 1.0 & 0.0100 & 0.001000 & 0.00010000 & 1650 & 5 & 7460 & 14\\\\\n", + "\t 19.230769 & 2.956512 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 42.0 & 17.6400 & 74.088000 & 311.16960000 & 5120 & 17 & 7280 & 14\\\\\n", + "\t 19.230769 & 
2.956512 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 37.0 & 13.6900 & 50.653000 & 187.41610000 & 5240 & 17 & 5680 & 9\\\\\n", + "\t 12.000000 & 2.484907 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 31.0 & 9.6100 & 29.791000 & 92.35210000 & 4040 & 13 & 8590 & 19\\\\\n", + "\t 19.230769 & 2.956512 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 4.0 & 0.1600 & 0.064000 & 0.02560000 & 3255 & 10 & 8190 & 18\\\\\n", + "\t 17.307692 & 2.851151 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 7.0 & 0.4900 & 0.343000 & 0.24010000 & 4020 & 13 & 8270 & 18\\\\\n", + "\t 12.019231 & 2.486508 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 30.0 & 9.0000 & 27.000000 & 81.00000000 & 4220 & 14 & 8270 & 18\\\\\n", + "\t 12.019231 & 2.486508 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 5.5 & 0.3025 & 0.166375 & 0.09150625 & 3600 & 11 & 8270 & 18\\\\\n", + "\t 13.461538 & 2.599837 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 20.5 & 4.2025 & 8.615125 & 17.66100625 & 3645 & 11 & 8190 & 18\\\\\n", + "\t 16.346154 & 2.793993 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 25.0 & 6.2500 & 15.625000 & 39.06250000 & 110 & 1 & 7870 & 17\\\\\n", + "\t 27.884615 & 3.328075 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 16.0 & 2.5600 & 4.096000 & 6.55360000 & 6355 & 19 & 770 & 4\\\\\n", + "\t 21.600000 & 3.072693 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 27.0 & 7.2900 & 19.683000 & 53.14410000 & 6320 & 19 & 770 & 4\\\\\n", + "\t 8.653846 & 2.158004 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 3.5 & 0.1225 & 0.042875 & 0.01500625 & 7410 & 20 & 570 & 3\\\\\n", + "\t 19.230769 & 2.956512 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 6.0 & 0.3600 & 0.216000 & 0.12960000 & 7000 & 20 & 4690 & 9\\\\\n", + "\t 13.186813 & 2.579217 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 8.0 & 0.6400 & 0.512000 & 0.40960000 & 4000 & 13 & 8680 & 20\\\\\n", + "\t 10.683761 & 2.368725 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 6.5 & 0.4225 & 0.274625 & 0.17850625 & 4300 & 15 & 8590 & 19\\\\\n", + "\t 11.538462 & 2.445686 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 8.5 & 0.7225 & 0.614125 & 0.52200625 & 3600 & 11 & 8190 & 18\\\\\n", + "\t 17.788462 & 2.878550 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 11.0 & 1.2100 & 1.331000 & 1.46410000 & 2310 & 8 & 7860 & 17\\\\\n", + "\t 19.230769 & 2.956512 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 11.0 & 1.2100 & 1.331000 & 1.46410000 & 6200 & 19 & 770 & 4\\\\\n", + "\t 16.304348 & 2.791432 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 24.5 & 6.0025 & 14.706125 & 36.03000625 & 20 & 1 & 5480 & 9\\\\\n", + "\t 19.230769 & 2.956512 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 8.0 & 0.6400 & 0.512000 & 0.40960000 & 4800 & 16 & 7390 & 14\\\\\n", + "\t 14.423077 & 2.668829 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 5.0 & 0.2500 & 0.125000 & 0.06250000 & 430 & 1 & 7390 & 14\\\\\n", + "\t 12.000000 & 2.484907 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 4.5 & 0.2025 & 0.091125 & 0.04100625 & 6330 & 19 & 770 & 4\\\\\n", + "\t 16.826923 & 2.822980 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 8.0 & 0.6400 & 0.512000 & 0.40960000 & 5400 & 17 & 4870 & 9\\\\\n", + "\t 19.670330 & 2.979111 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 26.0 & 6.7600 & 17.576000 & 45.69760000 & 6230 & 19 & 6570 & 11\\\\\n", + "\t ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮\\\\\n", + "\t 13.986014 & 2.638058 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 7.0 & 0.4900 & 0.343000 & 0.2401000 & 3820 & 12 & 9590 & 22\\\\\n", + "\t 15.865385 & 2.764140 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 2.0 & 0.0400 & 0.008000 & 0.0016000 & 2200 & 8 & 7870 & 17\\\\\n", + "\t 38.461538 & 3.649659 & 
1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 7.5 & 0.5625 & 0.421875 & 0.3164062 & 220 & 1 & 770 & 4\\\\\n", + "\t 28.846154 & 3.361977 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 5.0 & 0.2500 & 0.125000 & 0.0625000 & 1360 & 4 & 770 & 4\\\\\n", + "\t 24.475524 & 3.197674 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 24.0 & 5.7600 & 13.824000 & 33.1776000 & 2310 & 8 & 7860 & 17\\\\\n", + "\t 27.884615 & 3.328075 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 28.5 & 8.1225 & 23.149125 & 65.9750063 & 8140 & 21 & 770 & 4\\\\\n", + "\t 8.653846 & 2.158004 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 25.0 & 6.2500 & 15.625000 & 39.0625000 & 3930 & 12 & 7680 & 16\\\\\n", + "\t 12.019231 & 2.486508 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 19.0 & 3.6100 & 6.859000 & 13.0321000 & 2750 & 9 & 8560 & 19\\\\\n", + "\t 38.461538 & 3.649659 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 8.0 & 0.6400 & 0.512000 & 0.4096000 & 710 & 2 & 6870 & 12\\\\\n", + "\t 12.500000 & 2.525729 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 12.0 & 1.4400 & 1.728000 & 2.0736000 & 4010 & 13 & 8680 & 20\\\\\n", + "\t 35.256410 & 3.562647 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 6.0 & 0.3600 & 0.216000 & 0.1296000 & 2320 & 8 & 7860 & 17\\\\\n", + "\t 48.076923 & 3.872802 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 25.0 & 6.2500 & 15.625000 & 39.0625000 & 3255 & 10 & 8170 & 18\\\\\n", + "\t 9.615385 & 2.263364 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 20.0 & 4.0000 & 8.000000 & 16.0000000 & 4110 & 13 & 8680 & 20\\\\\n", + "\t 12.019231 & 2.486508 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 17.0 & 2.8900 & 4.913000 & 8.3521000 & 9620 & 22 & 6290 & 10\\\\\n", + "\t 12.019231 & 2.486508 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 31.5 & 9.9225 & 31.255875 & 98.4560063 & 5510 & 17 & 6380 & 10\\\\\n", + "\t 12.980769 & 2.563469 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 15.0 & 2.2500 & 3.375000 & 5.0625000 & 2010 & 6 & 9370 & 22\\\\\n", + "\t 26.442308 & 3.274965 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 34.0 & 11.5600 & 39.304000 & 133.6336000 & 2310 & 8 & 7860 & 17\\\\\n", + "\t 13.461538 & 2.599837 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 8.0 & 0.6400 & 0.512000 & 0.4096000 & 4720 & 16 & 8590 & 19\\\\\n", + "\t 19.711538 & 2.981204 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 8.0 & 0.6400 & 0.512000 & 0.4096000 & 2000 & 6 & 8090 & 18\\\\\n", + "\t 21.153846 & 3.051822 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 28.0 & 7.8400 & 21.952000 & 61.4656000 & 40 & 1 & 9170 & 21\\\\\n", + "\t 45.546559 & 3.818735 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 5.0 & 0.2500 & 0.125000 & 0.0625000 & 3255 & 10 & 8190 & 18\\\\\n", + "\t 22.596154 & 3.117780 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 15.0 & 2.2500 & 3.375000 & 5.0625000 & 9620 & 22 & 5390 & 9\\\\\n", + "\t 16.826923 & 2.822980 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 11.0 & 1.2100 & 1.331000 & 1.4641000 & 7150 & 20 & 8770 & 21\\\\\n", + "\t 24.038462 & 3.179655 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 17.0 & 2.8900 & 4.913000 & 8.3521000 & 2550 & 8 & 9480 & 22\\\\\n", + "\t 13.846154 & 2.628007 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 10.0 & 1.0000 & 1.000000 & 1.0000000 & 800 & 2 & 770 & 4\\\\\n", + "\t 14.769231 & 2.692546 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 9.0 & 0.8100 & 0.729000 & 0.6561000 & 4700 & 16 & 4970 & 9\\\\\n", + "\t 23.076923 & 3.138833 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 12.0 & 1.4400 & 1.728000 & 2.0736000 & 4110 & 13 & 8680 & 20\\\\\n", + "\t 38.461538 & 3.649659 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 11.0 & 1.2100 & 1.331000 & 1.4641000 & 1550 & 4 & 3680 & 6\\\\\n", + "\t 32.967033 
& 3.495508 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 10.0 & 1.0000 & 1.000000 & 1.0000000 & 2920 & 9 & 6570 & 11\\\\\n", + "\t 17.307692 & 2.851151 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 14.0 & 1.9600 & 2.744000 & 3.8416000 & 1610 & 5 & 7460 & 14\\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A data.frame: 5150 × 20\n", + "\n", + "| wage <dbl> | lwage <dbl> | sex <int> | shs <int> | hsg <int> | scl <int> | clg <int> | ad <int> | mw <int> | so <int> | we <int> | ne <int> | exp1 <dbl> | exp2 <dbl> | exp3 <dbl> | exp4 <dbl> | occ <dbl> | occ2 <int> | ind <dbl> | ind2 <int> |\n", + "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", + "| 9.615385 | 2.263364 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 7.0 | 0.4900 | 0.343000 | 0.24010000 | 3600 | 11 | 8370 | 18 |\n", + "| 48.076923 | 3.872802 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 31.0 | 9.6100 | 29.791000 | 92.35210000 | 3050 | 10 | 5070 | 9 |\n", + "| 11.057692 | 2.403126 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 18.0 | 3.2400 | 5.832000 | 10.49760000 | 6260 | 19 | 770 | 4 |\n", + "| 13.942308 | 2.634928 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 25.0 | 6.2500 | 15.625000 | 39.06250000 | 420 | 1 | 6990 | 12 |\n", + "| 28.846154 | 3.361977 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 22.0 | 4.8400 | 10.648000 | 23.42560000 | 2015 | 6 | 9470 | 22 |\n", + "| 11.730769 | 2.462215 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1.0 | 0.0100 | 0.001000 | 0.00010000 | 1650 | 5 | 7460 | 14 |\n", + "| 19.230769 | 2.956512 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 42.0 | 17.6400 | 74.088000 | 311.16960000 | 5120 | 17 | 7280 | 14 |\n", + "| 19.230769 | 2.956512 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 37.0 | 13.6900 | 50.653000 | 187.41610000 | 5240 | 17 | 5680 | 9 |\n", + "| 12.000000 | 2.484907 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 31.0 | 9.6100 | 29.791000 | 92.35210000 | 4040 | 13 | 8590 | 19 |\n", + "| 19.230769 | 2.956512 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 4.0 | 0.1600 | 0.064000 | 0.02560000 | 3255 | 10 | 8190 | 18 |\n", + "| 17.307692 | 2.851151 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 7.0 | 0.4900 | 0.343000 | 0.24010000 | 4020 | 13 | 8270 | 18 |\n", + "| 12.019231 | 2.486508 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 30.0 | 9.0000 | 27.000000 | 81.00000000 | 4220 | 14 | 8270 | 18 |\n", + "| 12.019231 | 2.486508 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 5.5 | 0.3025 | 0.166375 | 0.09150625 | 3600 | 11 | 8270 | 18 |\n", + "| 13.461538 | 2.599837 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 20.5 | 4.2025 | 8.615125 | 17.66100625 | 3645 | 11 | 8190 | 18 |\n", + "| 16.346154 | 2.793993 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 25.0 | 6.2500 | 15.625000 | 39.06250000 | 110 | 1 | 7870 | 17 |\n", + "| 27.884615 | 3.328075 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 16.0 | 2.5600 | 4.096000 | 6.55360000 | 6355 | 19 | 770 | 4 |\n", + "| 21.600000 | 3.072693 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 27.0 | 7.2900 | 19.683000 | 53.14410000 | 6320 | 19 | 770 | 4 |\n", + "| 8.653846 | 2.158004 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 3.5 | 0.1225 | 0.042875 | 0.01500625 | 7410 | 20 | 570 | 3 |\n", + "| 19.230769 | 2.956512 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 6.0 | 0.3600 | 0.216000 | 0.12960000 | 7000 | 20 | 4690 | 9 |\n", + "| 13.186813 | 2.579217 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 8.0 | 0.6400 | 0.512000 | 0.40960000 | 4000 | 13 | 8680 | 20 |\n", + "| 10.683761 | 2.368725 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 6.5 | 0.4225 | 0.274625 | 0.17850625 | 4300 | 15 | 8590 | 19 |\n", + "| 
11.538462 | 2.445686 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 8.5 | 0.7225 | 0.614125 | 0.52200625 | 3600 | 11 | 8190 | 18 |\n", + "| 17.788462 | 2.878550 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 11.0 | 1.2100 | 1.331000 | 1.46410000 | 2310 | 8 | 7860 | 17 |\n", + "| 19.230769 | 2.956512 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 11.0 | 1.2100 | 1.331000 | 1.46410000 | 6200 | 19 | 770 | 4 |\n", + "| 16.304348 | 2.791432 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 24.5 | 6.0025 | 14.706125 | 36.03000625 | 20 | 1 | 5480 | 9 |\n", + "| 19.230769 | 2.956512 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 8.0 | 0.6400 | 0.512000 | 0.40960000 | 4800 | 16 | 7390 | 14 |\n", + "| 14.423077 | 2.668829 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5.0 | 0.2500 | 0.125000 | 0.06250000 | 430 | 1 | 7390 | 14 |\n", + "| 12.000000 | 2.484907 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 4.5 | 0.2025 | 0.091125 | 0.04100625 | 6330 | 19 | 770 | 4 |\n", + "| 16.826923 | 2.822980 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 8.0 | 0.6400 | 0.512000 | 0.40960000 | 5400 | 17 | 4870 | 9 |\n", + "| 19.670330 | 2.979111 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 26.0 | 6.7600 | 17.576000 | 45.69760000 | 6230 | 19 | 6570 | 11 |\n", + "| ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |\n", + "| 13.986014 | 2.638058 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 7.0 | 0.4900 | 0.343000 | 0.2401000 | 3820 | 12 | 9590 | 22 |\n", + "| 15.865385 | 2.764140 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 2.0 | 0.0400 | 0.008000 | 0.0016000 | 2200 | 8 | 7870 | 17 |\n", + "| 38.461538 | 3.649659 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 7.5 | 0.5625 | 0.421875 | 0.3164062 | 220 | 1 | 770 | 4 |\n", + "| 28.846154 | 3.361977 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 5.0 | 0.2500 | 0.125000 | 0.0625000 | 1360 | 4 | 770 | 4 |\n", + "| 24.475524 | 3.197674 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 24.0 | 5.7600 | 13.824000 | 33.1776000 | 2310 | 8 | 7860 | 17 |\n", + "| 27.884615 | 3.328075 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 28.5 | 8.1225 | 23.149125 | 65.9750063 | 8140 | 21 | 770 | 4 |\n", + "| 8.653846 | 2.158004 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 25.0 | 6.2500 | 15.625000 | 39.0625000 | 3930 | 12 | 7680 | 16 |\n", + "| 12.019231 | 2.486508 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 19.0 | 3.6100 | 6.859000 | 13.0321000 | 2750 | 9 | 8560 | 19 |\n", + "| 38.461538 | 3.649659 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 8.0 | 0.6400 | 0.512000 | 0.4096000 | 710 | 2 | 6870 | 12 |\n", + "| 12.500000 | 2.525729 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 12.0 | 1.4400 | 1.728000 | 2.0736000 | 4010 | 13 | 8680 | 20 |\n", + "| 35.256410 | 3.562647 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 6.0 | 0.3600 | 0.216000 | 0.1296000 | 2320 | 8 | 7860 | 17 |\n", + "| 48.076923 | 3.872802 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 25.0 | 6.2500 | 15.625000 | 39.0625000 | 3255 | 10 | 8170 | 18 |\n", + "| 9.615385 | 2.263364 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 20.0 | 4.0000 | 8.000000 | 16.0000000 | 4110 | 13 | 8680 | 20 |\n", + "| 12.019231 | 2.486508 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 17.0 | 2.8900 | 4.913000 | 8.3521000 | 9620 | 22 | 6290 | 10 |\n", + "| 12.019231 | 2.486508 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 31.5 | 9.9225 | 31.255875 | 98.4560063 | 5510 | 17 | 6380 | 10 |\n", + "| 12.980769 | 2.563469 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 15.0 | 2.2500 | 3.375000 | 5.0625000 | 2010 | 6 | 9370 | 22 |\n", + "| 26.442308 | 3.274965 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 34.0 | 11.5600 | 39.304000 | 133.6336000 | 
2310 | 8 | 7860 | 17 |\n", + "| 13.461538 | 2.599837 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 8.0 | 0.6400 | 0.512000 | 0.4096000 | 4720 | 16 | 8590 | 19 |\n", + "| 19.711538 | 2.981204 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 8.0 | 0.6400 | 0.512000 | 0.4096000 | 2000 | 6 | 8090 | 18 |\n", + "| 21.153846 | 3.051822 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 28.0 | 7.8400 | 21.952000 | 61.4656000 | 40 | 1 | 9170 | 21 |\n", + "| 45.546559 | 3.818735 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 5.0 | 0.2500 | 0.125000 | 0.0625000 | 3255 | 10 | 8190 | 18 |\n", + "| 22.596154 | 3.117780 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 15.0 | 2.2500 | 3.375000 | 5.0625000 | 9620 | 22 | 5390 | 9 |\n", + "| 16.826923 | 2.822980 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 11.0 | 1.2100 | 1.331000 | 1.4641000 | 7150 | 20 | 8770 | 21 |\n", + "| 24.038462 | 3.179655 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 17.0 | 2.8900 | 4.913000 | 8.3521000 | 2550 | 8 | 9480 | 22 |\n", + "| 13.846154 | 2.628007 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 10.0 | 1.0000 | 1.000000 | 1.0000000 | 800 | 2 | 770 | 4 |\n", + "| 14.769231 | 2.692546 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 9.0 | 0.8100 | 0.729000 | 0.6561000 | 4700 | 16 | 4970 | 9 |\n", + "| 23.076923 | 3.138833 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 12.0 | 1.4400 | 1.728000 | 2.0736000 | 4110 | 13 | 8680 | 20 |\n", + "| 38.461538 | 3.649659 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 11.0 | 1.2100 | 1.331000 | 1.4641000 | 1550 | 4 | 3680 | 6 |\n", + "| 32.967033 | 3.495508 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 10.0 | 1.0000 | 1.000000 | 1.0000000 | 2920 | 9 | 6570 | 11 |\n", + "| 17.307692 | 2.851151 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 14.0 | 1.9600 | 2.744000 | 3.8416000 | 1610 | 5 | 7460 | 14 |\n", + "\n" + ], "text/plain": [ " wage lwage sex shs hsg scl clg ad mw so we ne exp1 exp2 \n", "1 9.615385 2.263364 1 0 0 0 1 0 0 0 0 1 7.0 0.4900\n", @@ -293,12 +434,13 @@ "5150 2.744000 3.8416000 1610 5 7460 14 " ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "file = \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv\"\n", - "data = read.csv(file)\n", + "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv\"\n", + "data <- read.csv(file)\n", "str(data)\n", "data" ] @@ -307,24 +449,30 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "id": "GLjEqmK8hEU8" + "id": "GLjEqmK8hEU8", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "y = data$lwage\n", - "Z = subset(data, select = -c(lwage, wage))" + "y <- data$lwage\n", + "Z <- subset(data, select = -c(lwage, wage))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { - "id": "-IX2Sy1A-QCX" + "id": "-IX2Sy1A-QCX", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "center_colmeans <- function(x) {\n", - " xcenter = colMeans(x)\n", + " xcenter <- colMeans(x)\n", " x - rep(xcenter, rep.int(nrow(x), ncol(x)))\n", "}" ] @@ -333,13 +481,16 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "id": "kaygPMYdelFI" + "id": "kaygPMYdelFI", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# create the model matrix for the covariates\n", - "controls_formula <- '~ 0 + (shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we+exp1+exp2+exp3+exp4)**2'\n", - "Zcontrols <- model.matrix(as.formula(controls_formula), data=Z) # 979\n", + "controls_formula <- '~ 0 + (shs + hsg + scl + clg + 
C(occ2) + C(ind2) + mw + so + we + exp1 + exp2 + exp3 + exp4)**2'\n", + "Zcontrols <- model.matrix(as.formula(controls_formula), data = Z)\n", "Zcontrols <- center_colmeans(Zcontrols)" ] }, @@ -356,13 +507,16 @@ "cell_type": "code", "execution_count": 6, "metadata": { - "id": "qVF99n7dhyc-" + "id": "qVF99n7dhyc-", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# create the model matrix for the linear heterogeneity\n", - "linear_het_formula <- '~ -1 + (shs+hsg+scl+clg+mw+so+we+exp1+exp2+exp3+exp4)'\n", - "Zhet <- model.matrix(as.formula(linear_het_formula), data=Z) # 11\n", + "linear_het_formula <- '~ -1 + (shs + hsg + scl + clg + mw + so + we + exp1 + exp2 + exp3 + exp4)'\n", + "Zhet <- model.matrix(as.formula(linear_het_formula), data = Z)\n", "Zhet <- center_colmeans(Zhet)" ] }, @@ -379,15 +533,18 @@ "cell_type": "code", "execution_count": 7, "metadata": { - "id": "jPGR47mfhzBu" + "id": "jPGR47mfhzBu", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# create the model matrix for the higher order heterogeneity\n", "Zhet <- as.data.frame(cbind(Zhet, \"sex\" = Z$sex))\n", - "nonlin_het_formula <- '~-1 + sex + sex * (shs+hsg+scl+clg+mw+so+we+exp1+exp2+exp3+exp4)'\n", - "Zinteractions <- model.matrix(as.formula(nonlin_het_formula), data=Zhet) # 23\n", - "interaction_cols <- Zinteractions[ , grepl( \"sex\" , colnames(Zinteractions) ) ] #12" + "nonlin_het_formula <- '~ -1 + sex + sex * (shs + hsg + scl + clg + mw + so + we + exp1 + exp2 + exp3 + exp4)'\n", + "Zinteractions <- model.matrix(as.formula(nonlin_het_formula), data = Zhet)\n", + "interaction_cols <- Zinteractions[, grepl(\"sex\", colnames(Zinteractions))]" ] }, { @@ -403,7 +560,10 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "id": "niEXvfVSlk3v" + "id": "niEXvfVSlk3v", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -412,12 +572,12 @@ }, { "cell_type": "markdown", - "source": [ - "Get estimates and CIs" - ], "metadata": { "id": "hWtAmml_Kf2v" - } + }, + "source": [ + "Get estimates and CIs" + ] }, { "cell_type": "code", @@ -428,11 +588,13 @@ "height": 854 }, "id": "YO7Hmw5nllBK", - "outputId": "39873473-1933-4d81-d536-da42d661072e" + "outputId": "39873473-1933-4d81-d536-da42d661072e", + "vscode": { + "languageId": "r" + } }, "outputs": [ { - "output_type": "display_data", "data": { "text/html": [ "\n", @@ -456,8 +618,45 @@ "\n", "
\n" ], - "text/markdown": "\nA matrix: 12 × 4 of type dbl\n\n| | Estimate. | Std. Error | t value | Pr(>|t|) |\n|---|---|---|---|---|\n| sex | -0.067849974 | 0.015091077 | -4.4960327 | 6.923309e-06 |\n| sex:shs | -0.197841195 | 0.108831873 | -1.8178608 | 6.908541e-02 |\n| sex:hsg | 0.012340806 | 0.049116741 | 0.2512546 | 8.016173e-01 |\n| sex:scl | 0.021446184 | 0.046697210 | 0.4592605 | 6.460471e-01 |\n| sex:clg | 0.061623588 | 0.043983877 | 1.4010495 | 1.611993e-01 |\n| sex:mw | -0.108545899 | 0.041038649 | -2.6449677 | 8.169872e-03 |\n| sex:so | -0.072790206 | 0.039651133 | -1.8357661 | 6.639226e-02 |\n| sex:we | -0.050935968 | 0.041895027 | -1.2157999 | 2.240612e-01 |\n| sex:exp1 | 0.018014937 | 0.006981997 | 2.5801983 | 9.874358e-03 |\n| sex:exp2 | 0.023587076 | 0.049880609 | 0.4728706 | 6.363055e-01 |\n| sex:exp3 | -0.054853929 | 0.033450520 | -1.6398528 | 1.010358e-01 |\n| sex:exp4 | -0.007313037 | 0.002027920 | -3.6061767 | 3.107416e-04 |\n\n", - "text/latex": "A matrix: 12 × 4 of type dbl\n\\begin{tabular}{r|llll}\n & Estimate. & Std. Error & t value & Pr(>\\textbar{}t\\textbar{})\\\\\n\\hline\n\tsex & -0.067849974 & 0.015091077 & -4.4960327 & 6.923309e-06\\\\\n\tsex:shs & -0.197841195 & 0.108831873 & -1.8178608 & 6.908541e-02\\\\\n\tsex:hsg & 0.012340806 & 0.049116741 & 0.2512546 & 8.016173e-01\\\\\n\tsex:scl & 0.021446184 & 0.046697210 & 0.4592605 & 6.460471e-01\\\\\n\tsex:clg & 0.061623588 & 0.043983877 & 1.4010495 & 1.611993e-01\\\\\n\tsex:mw & -0.108545899 & 0.041038649 & -2.6449677 & 8.169872e-03\\\\\n\tsex:so & -0.072790206 & 0.039651133 & -1.8357661 & 6.639226e-02\\\\\n\tsex:we & -0.050935968 & 0.041895027 & -1.2157999 & 2.240612e-01\\\\\n\tsex:exp1 & 0.018014937 & 0.006981997 & 2.5801983 & 9.874358e-03\\\\\n\tsex:exp2 & 0.023587076 & 0.049880609 & 0.4728706 & 6.363055e-01\\\\\n\tsex:exp3 & -0.054853929 & 0.033450520 & -1.6398528 & 1.010358e-01\\\\\n\tsex:exp4 & -0.007313037 & 0.002027920 & -3.6061767 & 3.107416e-04\\\\\n\\end{tabular}\n", + "text/latex": [ + "A matrix: 12 × 4 of type dbl\n", + "\\begin{tabular}{r|llll}\n", + " & Estimate. & Std. Error & t value & Pr(>\\textbar{}t\\textbar{})\\\\\n", + "\\hline\n", + "\tsex & -0.067849974 & 0.015091077 & -4.4960327 & 6.923309e-06\\\\\n", + "\tsex:shs & -0.197841195 & 0.108831873 & -1.8178608 & 6.908541e-02\\\\\n", + "\tsex:hsg & 0.012340806 & 0.049116741 & 0.2512546 & 8.016173e-01\\\\\n", + "\tsex:scl & 0.021446184 & 0.046697210 & 0.4592605 & 6.460471e-01\\\\\n", + "\tsex:clg & 0.061623588 & 0.043983877 & 1.4010495 & 1.611993e-01\\\\\n", + "\tsex:mw & -0.108545899 & 0.041038649 & -2.6449677 & 8.169872e-03\\\\\n", + "\tsex:so & -0.072790206 & 0.039651133 & -1.8357661 & 6.639226e-02\\\\\n", + "\tsex:we & -0.050935968 & 0.041895027 & -1.2157999 & 2.240612e-01\\\\\n", + "\tsex:exp1 & 0.018014937 & 0.006981997 & 2.5801983 & 9.874358e-03\\\\\n", + "\tsex:exp2 & 0.023587076 & 0.049880609 & 0.4728706 & 6.363055e-01\\\\\n", + "\tsex:exp3 & -0.054853929 & 0.033450520 & -1.6398528 & 1.010358e-01\\\\\n", + "\tsex:exp4 & -0.007313037 & 0.002027920 & -3.6061767 & 3.107416e-04\\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A matrix: 12 × 4 of type dbl\n", + "\n", + "| | Estimate. | Std. 
Error | t value | Pr(>|t|) |\n", + "|---|---|---|---|---|\n", + "| sex | -0.067849974 | 0.015091077 | -4.4960327 | 6.923309e-06 |\n", + "| sex:shs | -0.197841195 | 0.108831873 | -1.8178608 | 6.908541e-02 |\n", + "| sex:hsg | 0.012340806 | 0.049116741 | 0.2512546 | 8.016173e-01 |\n", + "| sex:scl | 0.021446184 | 0.046697210 | 0.4592605 | 6.460471e-01 |\n", + "| sex:clg | 0.061623588 | 0.043983877 | 1.4010495 | 1.611993e-01 |\n", + "| sex:mw | -0.108545899 | 0.041038649 | -2.6449677 | 8.169872e-03 |\n", + "| sex:so | -0.072790206 | 0.039651133 | -1.8357661 | 6.639226e-02 |\n", + "| sex:we | -0.050935968 | 0.041895027 | -1.2157999 | 2.240612e-01 |\n", + "| sex:exp1 | 0.018014937 | 0.006981997 | 2.5801983 | 9.874358e-03 |\n", + "| sex:exp2 | 0.023587076 | 0.049880609 | 0.4728706 | 6.363055e-01 |\n", + "| sex:exp3 | -0.054853929 | 0.033450520 | -1.6398528 | 1.010358e-01 |\n", + "| sex:exp4 | -0.007313037 | 0.002027920 | -3.6061767 | 3.107416e-04 |\n", + "\n" + ], "text/plain": [ " Estimate. Std. Error t value Pr(>|t|) \n", "sex -0.067849974 0.015091077 -4.4960327 6.923309e-06\n", @@ -474,11 +673,12 @@ "sex:exp4 -0.007313037 0.002027920 -3.6061767 3.107416e-04" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "% latex table generated in R 4.3.2 by xtable 1.8-4 package\n", "% Tue Feb 13 18:38:15 2024\n", @@ -508,11 +708,11 @@ ], "source": [ "# this cell takes 30 minutes to run\n", - "index.gender <- grep(\"sex\", colnames(Zinteractions))\n", - "effects.female <- rlassoEffects(x = X, y = y, index = index.gender, post=FALSE)\n", - "result=summary(effects.female)\n", + "index_gender <- grep(\"sex\", colnames(Zinteractions))\n", + "effects_female <- rlassoEffects(x = X, y = y, index = index_gender, post = FALSE)\n", + "result <- summary(effects_female)\n", "result$coef\n", - "print(xtable(result$coef[,c(1,2,4)], type=\"latex\"), digits=3)" + "print(xtable(result$coef[, c(1, 2, 4)], type = \"latex\"), digits = 3)" ] }, { @@ -533,11 +733,13 @@ "height": 854 }, "id": "d88JnYGG_eRA", - "outputId": "a9221fd0-db3b-418b-8823-be9e79ba0b77" + "outputId": "a9221fd0-db3b-418b-8823-be9e79ba0b77", + "vscode": { + "languageId": "r" + } }, "outputs": [ { - "output_type": "display_data", "data": { "text/html": [ "\n", @@ -561,8 +763,45 @@ "\n", "
\n" ], - "text/markdown": "\nA matrix: 12 × 2 of type dbl\n\n| | 2.5 % | 97.5 % |\n|---|---|---|\n| sex | -0.097427941 | -0.038272007 |\n| sex:shs | -0.411147747 | 0.015465357 |\n| sex:hsg | -0.083926237 | 0.108607850 |\n| sex:scl | -0.070078666 | 0.112971034 |\n| sex:clg | -0.024583227 | 0.147830403 |\n| sex:mw | -0.188980172 | -0.028111625 |\n| sex:so | -0.150504999 | 0.004924587 |\n| sex:we | -0.133048712 | 0.031176776 |\n| sex:exp1 | 0.004330474 | 0.031699400 |\n| sex:exp2 | -0.074177122 | 0.121351274 |\n| sex:exp3 | -0.120415743 | 0.010707886 |\n| sex:exp4 | -0.011287686 | -0.003338387 |\n\n", - "text/latex": "A matrix: 12 × 2 of type dbl\n\\begin{tabular}{r|ll}\n & 2.5 \\% & 97.5 \\%\\\\\n\\hline\n\tsex & -0.097427941 & -0.038272007\\\\\n\tsex:shs & -0.411147747 & 0.015465357\\\\\n\tsex:hsg & -0.083926237 & 0.108607850\\\\\n\tsex:scl & -0.070078666 & 0.112971034\\\\\n\tsex:clg & -0.024583227 & 0.147830403\\\\\n\tsex:mw & -0.188980172 & -0.028111625\\\\\n\tsex:so & -0.150504999 & 0.004924587\\\\\n\tsex:we & -0.133048712 & 0.031176776\\\\\n\tsex:exp1 & 0.004330474 & 0.031699400\\\\\n\tsex:exp2 & -0.074177122 & 0.121351274\\\\\n\tsex:exp3 & -0.120415743 & 0.010707886\\\\\n\tsex:exp4 & -0.011287686 & -0.003338387\\\\\n\\end{tabular}\n", + "text/latex": [ + "A matrix: 12 × 2 of type dbl\n", + "\\begin{tabular}{r|ll}\n", + " & 2.5 \\% & 97.5 \\%\\\\\n", + "\\hline\n", + "\tsex & -0.097427941 & -0.038272007\\\\\n", + "\tsex:shs & -0.411147747 & 0.015465357\\\\\n", + "\tsex:hsg & -0.083926237 & 0.108607850\\\\\n", + "\tsex:scl & -0.070078666 & 0.112971034\\\\\n", + "\tsex:clg & -0.024583227 & 0.147830403\\\\\n", + "\tsex:mw & -0.188980172 & -0.028111625\\\\\n", + "\tsex:so & -0.150504999 & 0.004924587\\\\\n", + "\tsex:we & -0.133048712 & 0.031176776\\\\\n", + "\tsex:exp1 & 0.004330474 & 0.031699400\\\\\n", + "\tsex:exp2 & -0.074177122 & 0.121351274\\\\\n", + "\tsex:exp3 & -0.120415743 & 0.010707886\\\\\n", + "\tsex:exp4 & -0.011287686 & -0.003338387\\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A matrix: 12 × 2 of type dbl\n", + "\n", + "| | 2.5 % | 97.5 % |\n", + "|---|---|---|\n", + "| sex | -0.097427941 | -0.038272007 |\n", + "| sex:shs | -0.411147747 | 0.015465357 |\n", + "| sex:hsg | -0.083926237 | 0.108607850 |\n", + "| sex:scl | -0.070078666 | 0.112971034 |\n", + "| sex:clg | -0.024583227 | 0.147830403 |\n", + "| sex:mw | -0.188980172 | -0.028111625 |\n", + "| sex:so | -0.150504999 | 0.004924587 |\n", + "| sex:we | -0.133048712 | 0.031176776 |\n", + "| sex:exp1 | 0.004330474 | 0.031699400 |\n", + "| sex:exp2 | -0.074177122 | 0.121351274 |\n", + "| sex:exp3 | -0.120415743 | 0.010707886 |\n", + "| sex:exp4 | -0.011287686 | -0.003338387 |\n", + "\n" + ], "text/plain": [ " 2.5 % 97.5 % \n", "sex -0.097427941 -0.038272007\n", @@ -579,11 +818,12 @@ "sex:exp4 -0.011287686 -0.003338387" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "% latex table generated in R 4.3.2 by xtable 1.8-4 package\n", "% Tue Feb 13 18:41:01 2024\n", @@ -612,10 +852,9 @@ } ], "source": [ - "pointwise.CI <- confint(effects.female, level = 0.95)\n", - "pointwise.CI\n", - "# plot(effects.female, level=0.95) # plot of the effects\n", - "print(xtable(pointwise.CI), type=\"latex\")" + "pointwise_ci <- confint(effects_female, level = 0.95)\n", + "pointwise_ci\n", + "print(xtable(pointwise_ci), type = \"latex\")" ] }, { @@ -636,11 +875,13 @@ "height": 854 }, "id": "az7AJkhE_eRB", - "outputId": 
"ef8e3e87-3aac-483e-f5f5-4aacd5bf02be" + "outputId": "ef8e3e87-3aac-483e-f5f5-4aacd5bf02be", + "vscode": { + "languageId": "r" + } }, "outputs": [ { - "output_type": "display_data", "data": { "text/html": [ "\n", @@ -664,8 +905,45 @@ "\n", "
\n" ], - "text/markdown": "\nA matrix: 12 × 2 of type dbl\n\n| | 2.5 % | 97.5 % |\n|---|---|---|\n| sex | -0.11304841 | -0.022651543 |\n| sex:shs | -0.53156729 | 0.135884900 |\n| sex:hsg | -0.13716201 | 0.161843621 |\n| sex:scl | -0.12487424 | 0.167766609 |\n| sex:clg | -0.07664857 | 0.199895749 |\n| sex:mw | -0.22994084 | 0.012849045 |\n| sex:so | -0.18897851 | 0.043398103 |\n| sex:we | -0.17562318 | 0.073751241 |\n| sex:exp1 | -0.00200092 | 0.038030794 |\n| sex:exp2 | -0.11908721 | 0.166261366 |\n| sex:exp3 | -0.16472153 | 0.055013668 |\n| sex:exp4 | -0.01224237 | -0.002383705 |\n\n", - "text/latex": "A matrix: 12 × 2 of type dbl\n\\begin{tabular}{r|ll}\n & 2.5 \\% & 97.5 \\%\\\\\n\\hline\n\tsex & -0.11304841 & -0.022651543\\\\\n\tsex:shs & -0.53156729 & 0.135884900\\\\\n\tsex:hsg & -0.13716201 & 0.161843621\\\\\n\tsex:scl & -0.12487424 & 0.167766609\\\\\n\tsex:clg & -0.07664857 & 0.199895749\\\\\n\tsex:mw & -0.22994084 & 0.012849045\\\\\n\tsex:so & -0.18897851 & 0.043398103\\\\\n\tsex:we & -0.17562318 & 0.073751241\\\\\n\tsex:exp1 & -0.00200092 & 0.038030794\\\\\n\tsex:exp2 & -0.11908721 & 0.166261366\\\\\n\tsex:exp3 & -0.16472153 & 0.055013668\\\\\n\tsex:exp4 & -0.01224237 & -0.002383705\\\\\n\\end{tabular}\n", + "text/latex": [ + "A matrix: 12 × 2 of type dbl\n", + "\\begin{tabular}{r|ll}\n", + " & 2.5 \\% & 97.5 \\%\\\\\n", + "\\hline\n", + "\tsex & -0.11304841 & -0.022651543\\\\\n", + "\tsex:shs & -0.53156729 & 0.135884900\\\\\n", + "\tsex:hsg & -0.13716201 & 0.161843621\\\\\n", + "\tsex:scl & -0.12487424 & 0.167766609\\\\\n", + "\tsex:clg & -0.07664857 & 0.199895749\\\\\n", + "\tsex:mw & -0.22994084 & 0.012849045\\\\\n", + "\tsex:so & -0.18897851 & 0.043398103\\\\\n", + "\tsex:we & -0.17562318 & 0.073751241\\\\\n", + "\tsex:exp1 & -0.00200092 & 0.038030794\\\\\n", + "\tsex:exp2 & -0.11908721 & 0.166261366\\\\\n", + "\tsex:exp3 & -0.16472153 & 0.055013668\\\\\n", + "\tsex:exp4 & -0.01224237 & -0.002383705\\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A matrix: 12 × 2 of type dbl\n", + "\n", + "| | 2.5 % | 97.5 % |\n", + "|---|---|---|\n", + "| sex | -0.11304841 | -0.022651543 |\n", + "| sex:shs | -0.53156729 | 0.135884900 |\n", + "| sex:hsg | -0.13716201 | 0.161843621 |\n", + "| sex:scl | -0.12487424 | 0.167766609 |\n", + "| sex:clg | -0.07664857 | 0.199895749 |\n", + "| sex:mw | -0.22994084 | 0.012849045 |\n", + "| sex:so | -0.18897851 | 0.043398103 |\n", + "| sex:we | -0.17562318 | 0.073751241 |\n", + "| sex:exp1 | -0.00200092 | 0.038030794 |\n", + "| sex:exp2 | -0.11908721 | 0.166261366 |\n", + "| sex:exp3 | -0.16472153 | 0.055013668 |\n", + "| sex:exp4 | -0.01224237 | -0.002383705 |\n", + "\n" + ], "text/plain": [ " 2.5 % 97.5 % \n", "sex -0.11304841 -0.022651543\n", @@ -682,11 +960,12 @@ "sex:exp4 -0.01224237 -0.002383705" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "% latex table generated in R 4.3.2 by xtable 1.8-4 package\n", "% Tue Feb 13 18:41:16 2024\n", @@ -715,10 +994,9 @@ } ], "source": [ - "joint.CI <- confint(effects.female, level = 0.95, joint = TRUE)\n", - "joint.CI\n", - "# plot(effects.female, joint=TRUE, level=0.95) # plot of the effects\n", - "print(xtable(joint.CI), type=\"latex\")" + "joint_ci <- confint(effects_female, level = 0.95, joint = TRUE)\n", + "joint_ci\n", + "print(xtable(joint_ci), type = \"latex\")" ] } ], @@ -748,4 +1026,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} From 
e1f1683f34fd1fad0c17d837dc042bd14f77430c Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 05:52:27 -0700 Subject: [PATCH 065/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index e05069b8..390ed380 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -135,7 +135,14 @@ jobs: rm -rf r_scripts rm ${{ matrix.directory }}_r_scripts.zip + - name: Check if there are any changes + id: verify_diff + run: | + git pull + git diff --quiet ${{ matrix.directory }}/*.irnb ${{ matrix.directory }}/*.Rmd || echo "changed=true" >> $GITHUB_OUTPUT + - name: Commit and push stripped .irnb and .Rmd files + if: steps.verify_diff.outputs.changed == 'true' run: | git config --global user.name 'github-actions[bot]' git config --global user.email 'github-actions[bot]@users.noreply.github.com' From 76139a43d0f425b82875e3d13e1abeb5b80887b5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 15 Jul 2024 13:01:04 +0000 Subject: [PATCH 066/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in CM1 --- CM1/r-rct-penn-precision-adj.Rmd | 235 ++++++++ CM1/r-rct-penn-precision-adj.irnb | 889 +++++++++++++++--------------- CM1/r-rct-vaccines.Rmd | 253 +++++++++ CM1/r-rct-vaccines.irnb | 382 ++++++++++++- CM1/r-sim-precision-adj.Rmd | 114 ++++ CM1/r-sim-precision-adj.irnb | 215 +++++++- 6 files changed, 1637 insertions(+), 451 deletions(-) create mode 100644 CM1/r-rct-penn-precision-adj.Rmd create mode 100644 CM1/r-rct-vaccines.Rmd create mode 100644 CM1/r-sim-precision-adj.Rmd diff --git a/CM1/r-rct-penn-precision-adj.Rmd b/CM1/r-rct-penn-precision-adj.Rmd new file mode 100644 index 00000000..d34b6524 --- /dev/null +++ b/CM1/r-rct-penn-precision-adj.Rmd @@ -0,0 +1,235 @@ +--- +title: An R Markdown document converted from "CM1/r-rct-penn-precision-adj.irnb" +output: html_document +--- + +# Analyzing RCT data with Precision Adjustment + +```{r} +install.packages("sandwich") +install.packages("lmtest") +install.packages("xtable") +install.packages("hdm") +library(sandwich) +library(lmtest) +library(xtable) +library(hdm) +``` + +## Data + +In this lab, we analyze the Pennsylvania re-employment bonus experiment, which was previously studied in "Sequential testing of duration data: the case of the Pennsylvania ‘reemployment bonus’ experiment" (Bilias, 2000), among others. These experiments were conducted in the 1980s by the U.S. Department of Labor to test the incentive effects of alternative compensation schemes for unemployment insurance (UI). In these experiments, UI claimants were randomly assigned either to a control group or one of five treatment groups. Actually, there are six treatment groups in the experiments. Here we focus on treatment group 4, but feel free to explore other treatment groups. In the control group the current rules of the UI applied. Individuals in the treatment groups were offered a cash bonus if they found a job within some pre-specified period of time (qualification period), provided that the job was retained for a specified duration. The treatments differed in the level of the bonus, the length of the qualification period, and whether the bonus was declining over time in the qualification period; see http://qed.econ.queensu.ca/jae/2000-v15.6/bilias/readme.b.txt for further details on data. 
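
The raw file is a plain whitespace-delimited table with a header row (this is how it is read in below). As a minimal sketch, assuming the GitHub mirror used in the next chunk is reachable, one can peek at its first lines before loading it in full; the helper name `penn_url` is just for illustration.

```{r}
# Quick look at the first two lines of the raw file (header row + first record)
penn_url <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/penn_jae.dat"
readLines(penn_url, n = 2)
```
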
+ + +```{r} +## loading the data +file = "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/penn_jae.dat" +Penn <- as.data.frame(read.table(file, header=T)) + +n <- dim(Penn)[1] +p_1 <- dim(Penn)[2] +Penn<- subset(Penn, tg==4 | tg==0) +attach(Penn) +``` + +```{r} +T4<- (tg==4) +summary(T4) +``` + +```{r} +head(Penn) +``` + +### Model +To evaluate the impact of the treatments on unemployment duration, we consider the linear regression model: + +$$ +Y = D \beta_1 + W'\beta_2 + \varepsilon, \quad E \varepsilon (D,W')' = 0, +$$ + +where $Y$ is the log of duration of unemployment, $D$ is a treatment indicators, and $W$ is a set of controls including age group dummies, gender, race, number of dependents, quarter of the experiment, location within the state, existence of recall expectations, and type of occupation. Here $\beta_1$ is the ATE, if the RCT assumptions hold rigorously. + + +We also consider interactive regression model: + +$$ +Y = D \alpha_1 + D W' \alpha_2 + W'\beta_2 + \varepsilon, \quad E \varepsilon (D,W', DW')' = 0, +$$ +where $W$'s are demeaned (apart from the intercept), so that $\alpha_1$ is the ATE, if the RCT assumptions hold rigorously. + +Under RCT, the projection coefficient $\beta_1$ has +the interpretation of the causal effect of the treatment on +the average outcome. We thus refer to $\beta_1$ as the average +treatment effect (ATE). Note that the covariates, here are +independent of the treatment $D$, so we can identify $\beta_1$ by +just linear regression of $Y$ on $D$, without adding covariates. +However we do add covariates in an effort to improve the +precision of our estimates of the average treatment effect. + +### Analysis + +We consider + +* classical 2-sample approach, no adjustment (CL) +* classical linear regression adjustment (CRA) +* interactive regression adjusment (IRA) + +and carry out robust inference using the *estimatr* R packages. + +# Carry out covariate balance check + + +We first look at the coefficients individually with a $t$-test, and then we adjust the $p$-values to control for family-wise error. + +The regression below is done using "type='HC1'" which computes the correct Eicker-Huber-White standard errors, instead of the classical non-robust formula based on homoscedasticity. + +```{r} +data <- model.matrix(T4 ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 + agelt35 + agegt54 + durable + lusd + husd)^2) + +# individual t-tests +m <- lm(T4 ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 + agelt35 + agegt54 + durable + lusd + husd)^2, data = as.data.frame(data)) +coeftest(m, vcov = vcovHC(m, type="HC1")) +``` + + + +To test balance conditions, we employ the Holm-Bonferroni step-down method. With 100+ hypotheses, the family-wise type I error, or the probability of making at least one type I error treating all hypotheses independently, is close to 1. To control for this, we adjust p-values with the following procedure. + +First, set $\alpha=0.05$ and denote the list of $n$ p-values from the regression with the vector $p$. + +1. Sort $p$ from smallest to largest, so $p_{(1)} \leq p_{(2)} \leq \cdots \leq p_{(n)}$. Denote the corresponding hypothesis for $p_{(i)}$ as $H_{(i)}$. +2. For $i=1,\ldots, n$, +- If $$p_{(i)} > \frac{\alpha}{n-i+1} $$ Break the loop and do not reject any $H_{(j)}$ for $j \geq i$. +- Else reject $H_{(i)}$ if $$p_{(i)} \leq \frac{\alpha}{n-i+1} $$ Increment $i := i+1$. 
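Editor's aside, not part of the committed diff: the step-down rule described above (and implemented by the notebook in the chunk that follows) is easy to sanity-check on a handful of hypothetical p-values before it is applied to the 100+ balance-test p-values. A minimal sketch in base R, with made-up p-values used purely for illustration:

```{r}
# Editor's sketch with hypothetical p-values (not from the experiment).
p <- c(0.001, 0.012, 0.021, 0.30)
alpha <- 0.05
n <- length(p)
p_sorted <- sort(p)
thresholds <- alpha / (n - seq_len(n) + 1)     # alpha / (n - i + 1), as in step 2
first_fail <- which(p_sorted > thresholds)[1]  # first i where the rule breaks the loop
reject <- if (is.na(first_fail)) rep(TRUE, n) else seq_len(n) < first_fail
cbind(p_sorted, thresholds, reject)
# Should agree with the built-in adjustment used later in the notebook:
p.adjust(p_sorted, "holm") <= alpha
```

With these four made-up values the three smallest are rejected and the largest is not, matching `p.adjust(..., "holm")`.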
+ + + + +```{r} +holm_bonferroni <- function(p, alpha = 0.05) { + n <- length(p) + sig_beta <- c() + + for (i in 1:n) { + if (sort(p)[i] > alpha / (n - i + 1)) { + break + } else { + sig_beta <- c(sig_beta, order(p)[i]) + } + } + + return(sig_beta) +} + +p_values <- as.vector(coeftest(m, vcov = vcovHC(m, type="HC1"))[,4]) +significant_indices <- holm_bonferroni(p_values, alpha = 0.05) +print(paste("Significant Coefficients (Indices): ", significant_indices)) +``` + +There is also a built in R function to do this. + +```{r} +p_values <- as.vector(coeftest(m, vcov = vcovHC(m, type="HC1"))[,4]) +holm_reject <- p.adjust(sort(p_values), "holm") <= 0.05 +holm_reject +``` + +We see that that even though this is a randomized experiment, balance conditions are failed. + + +# Model Specification + +```{r} +# model specifications + + +# no adjustment (2-sample approach) +formula_cl <- log(inuidur1)~T4 + +# adding controls +formula_cra <- log(inuidur1)~T4+ (female+black+othrace+factor(dep)+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)^2 +# Omitted dummies: q1, nondurable, muld + + +ols.cl <- lm(formula_cl) +ols.cra <- lm(formula_cra) + + +ols.cl = coeftest(ols.cl, vcov = vcovHC(ols.cl, type="HC1")) +ols.cra = coeftest(ols.cra, vcov = vcovHC(ols.cra, type="HC1")) + +print(ols.cl) +print(ols.cra) + +``` + +The interactive specificaiton corresponds to the approach introduced in Lin (2013). + +```{r} + +#interactive regression model; + +X = model.matrix (~ (female+black+othrace+factor(dep)+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)^2)[,-1] +dim(X) +demean<- function(x){ x - mean(x)} +X = apply(X, 2, demean) + +ols.ira = lm(log(inuidur1) ~ T4*X) +ols.ira= coeftest(ols.ira, vcov = vcovHC(ols.ira, type="HC1")) +print(ols.ira) + + +``` + +Next we try out partialling out with lasso + +```{r} +T4 = demean(T4) + +DX = model.matrix(~T4*X)[,-1] + +rlasso.ira = summary(rlassoEffects(DX, log(inuidur1), index = 1)) + + +print(rlasso.ira) +``` + +### Results + +```{r} +str(ols.ira) +ols.ira[2,1] +``` + +```{r} +table<- matrix(0, 2, 4) +table[1,1]<- ols.cl[2,1] +table[1,2]<- ols.cra[2,1] +table[1,3]<- ols.ira[2,1] +table[1,4]<- rlasso.ira[[1]][1] + +table[2,1]<- ols.cl[2,2] +table[2,2]<- ols.cra[2,2] +table[2,3]<- ols.ira[2,2] +table[2,4]<- rlasso.ira[[1]][2] + + +colnames(table)<- c("CL","CRA","IRA", "IRA w Lasso") +rownames(table)<- c("estimate", "standard error") +tab<- xtable(table, digits=5) +tab + +print(tab, type="latex", digits=5) +``` + +Treatment group 4 experiences an average decrease of about $7.8\%$ in the length of unemployment spell. + + +Observe that regression estimators delivers estimates that are slighly more efficient (lower standard errors) than the simple 2 mean estimator, but essentially all methods have very similar standard errors. From IRA results we also see that there is not any statistically detectable heterogeneity. We also see the regression estimators offer slightly lower estimates -- these difference occur perhaps to due minor imbalance in the treatment allocation, which the regression estimators try to correct. 
+ + + diff --git a/CM1/r-rct-penn-precision-adj.irnb b/CM1/r-rct-penn-precision-adj.irnb index c4433ed2..30c98e05 100644 --- a/CM1/r-rct-penn-precision-adj.irnb +++ b/CM1/r-rct-penn-precision-adj.irnb @@ -1,452 +1,443 @@ { - "cells": [ - { - "metadata": { - "id": "hz35ETpNelkG" - }, - "cell_type": "markdown", - "source": [ - "# Analyzing RCT data with Precision Adjustment" - ] - }, - { - "cell_type": "code", - "source": [ - "install.packages(\"sandwich\")\n", - "install.packages(\"lmtest\")\n", - "install.packages(\"xtable\")\n", - "install.packages(\"hdm\")\n", - "library(sandwich)\n", - "library(lmtest)\n", - "library(xtable)\n", - "library(hdm)" - ], - "metadata": { - "id": "w7B1iDaqa2ZI" - }, - "execution_count": null, - "outputs": [] - }, - { - "metadata": { - "trusted": true, - "id": "v7-YfIbyelkV" - }, - "cell_type": "markdown", - "source": [ - "## Data\n", - "\n", - "In this lab, we analyze the Pennsylvania re-employment bonus experiment, which was previously studied in \"Sequential testing of duration data: the case of the Pennsylvania ‘reemployment bonus’ experiment\" (Bilias, 2000), among others. These experiments were conducted in the 1980s by the U.S. Department of Labor to test the incentive effects of alternative compensation schemes for unemployment insurance (UI). In these experiments, UI claimants were randomly assigned either to a control group or one of five treatment groups. Actually, there are six treatment groups in the experiments. Here we focus on treatment group 4, but feel free to explore other treatment groups. In the control group the current rules of the UI applied. Individuals in the treatment groups were offered a cash bonus if they found a job within some pre-specified period of time (qualification period), provided that the job was retained for a specified duration. 
The treatments differed in the level of the bonus, the length of the qualification period, and whether the bonus was declining over time in the qualification period; see http://qed.econ.queensu.ca/jae/2000-v15.6/bilias/readme.b.txt for further details on data.\n", - " " - ] - }, - { - "metadata": { - "trusted": true, - "id": "_whbk2z4elkY" - }, - "cell_type": "code", - "source": [ - "## loading the data\n", - "file = \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/penn_jae.dat\"\n", - "Penn <- as.data.frame(read.table(file, header=T))\n", - "\n", - "n <- dim(Penn)[1]\n", - "p_1 <- dim(Penn)[2]\n", - "Penn<- subset(Penn, tg==4 | tg==0)\n", - "attach(Penn)" - ], - "execution_count": null, - "outputs": [] - }, - { - "metadata": { - "trusted": true, - "id": "iTghGG4Kelkk" - }, - "cell_type": "code", - "source": [ - "T4<- (tg==4)\n", - "summary(T4)" - ], - "execution_count": null, - "outputs": [] - }, - { - "metadata": { - "trusted": true, - "id": "ZbWuqKExelkl" - }, - "cell_type": "code", - "source": [ - "head(Penn)" - ], - "execution_count": null, - "outputs": [] - }, - { - "metadata": { - "id": "72Rx7ECEelkn" - }, - "cell_type": "markdown", - "source": [ - "### Model\n", - "To evaluate the impact of the treatments on unemployment duration, we consider the linear regression model:\n", - "\n", - "$$\n", - "Y = D \\beta_1 + W'\\beta_2 + \\varepsilon, \\quad E \\varepsilon (D,W')' = 0,\n", - "$$\n", - "\n", - "where $Y$ is the log of duration of unemployment, $D$ is a treatment indicators, and $W$ is a set of controls including age group dummies, gender, race, number of dependents, quarter of the experiment, location within the state, existence of recall expectations, and type of occupation. Here $\\beta_1$ is the ATE, if the RCT assumptions hold rigorously.\n", - "\n", - "\n", - "We also consider interactive regression model:\n", - "\n", - "$$\n", - "Y = D \\alpha_1 + D W' \\alpha_2 + W'\\beta_2 + \\varepsilon, \\quad E \\varepsilon (D,W', DW')' = 0,\n", - "$$\n", - "where $W$'s are demeaned (apart from the intercept), so that $\\alpha_1$ is the ATE, if the RCT assumptions hold rigorously." - ] - }, - { - "metadata": { - "id": "uNtms5PHelko" - }, - "cell_type": "markdown", - "source": [ - "Under RCT, the projection coefficient $\\beta_1$ has\n", - "the interpretation of the causal effect of the treatment on\n", - "the average outcome. We thus refer to $\\beta_1$ as the average\n", - "treatment effect (ATE). Note that the covariates, here are\n", - "independent of the treatment $D$, so we can identify $\\beta_1$ by\n", - "just linear regression of $Y$ on $D$, without adding covariates.\n", - "However we do add covariates in an effort to improve the\n", - "precision of our estimates of the average treatment effect." - ] - }, - { - "metadata": { - "id": "cMy_2NxKelkq" - }, - "cell_type": "markdown", - "source": [ - "### Analysis\n", - "\n", - "We consider\n", - "\n", - "* classical 2-sample approach, no adjustment (CL)\n", - "* classical linear regression adjustment (CRA)\n", - "* interactive regression adjusment (IRA)\n", - "\n", - "and carry out robust inference using the *estimatr* R packages." - ] - }, - { - "metadata": { - "id": "lo-Cogv5elkx" - }, - "cell_type": "markdown", - "source": [ - "# Carry out covariate balance check\n", - "\n", - "\n", - "We first look at the coefficients individually with a $t$-test, and then we adjust the $p$-values to control for family-wise error." 
- ] - }, - { - "metadata": { - "id": "2FeCJjOselky" - }, - "cell_type": "markdown", - "source": [ - "The regression below is done using \"type='HC1'\" which computes the correct Eicker-Huber-White standard errors, instead of the classical non-robust formula based on homoscedasticity." - ] - }, - { - "cell_type": "code", - "source": [ - "data <- model.matrix(T4 ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 + agelt35 + agegt54 + durable + lusd + husd)^2)\n", - "\n", - "# individual t-tests\n", - "m <- lm(T4 ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 + agelt35 + agegt54 + durable + lusd + husd)^2, data = as.data.frame(data))\n", - "coeftest(m, vcov = vcovHC(m, type=\"HC1\"))" - ], - "metadata": { - "id": "GYOMYN_WRNTL" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "\n", - "\n", - "To test balance conditions, we employ the Holm-Bonferroni step-down method. With 100+ hypotheses, the family-wise type I error, or the probability of making at least one type I error treating all hypotheses independently, is close to 1. To control for this, we adjust p-values with the following procedure.\n", - "\n", - "First, set $\\alpha=0.05$ and denote the list of $n$ p-values from the regression with the vector $p$.\n", - "\n", - "1. Sort $p$ from smallest to largest, so $p_{(1)} \\leq p_{(2)} \\leq \\cdots \\leq p_{(n)}$. Denote the corresponding hypothesis for $p_{(i)}$ as $H_{(i)}$.\n", - "2. For $i=1,\\ldots, n$,\n", - "- If $$p_{(i)} > \\frac{\\alpha}{n-i+1} $$ Break the loop and do not reject any $H_{(j)}$ for $j \\geq i$.\n", - "- Else reject $H_{(i)}$ if $$p_{(i)} \\leq \\frac{\\alpha}{n-i+1} $$ Increment $i := i+1$.\n", - "\n", - "\n", - "\n" - ], - "metadata": { - "id": "hejmSGDuEj5U" - } - }, - { - "cell_type": "code", - "source": [ - "holm_bonferroni <- function(p, alpha = 0.05) {\n", - " n <- length(p)\n", - " sig_beta <- c()\n", - "\n", - " for (i in 1:n) {\n", - " if (sort(p)[i] > alpha / (n - i + 1)) {\n", - " break\n", - " } else {\n", - " sig_beta <- c(sig_beta, order(p)[i])\n", - " }\n", - " }\n", - "\n", - " return(sig_beta)\n", - "}\n", - "\n", - "p_values <- as.vector(coeftest(m, vcov = vcovHC(m, type=\"HC1\"))[,4])\n", - "significant_indices <- holm_bonferroni(p_values, alpha = 0.05)\n", - "print(paste(\"Significant Coefficients (Indices): \", significant_indices))\n" - ], - "metadata": { - "id": "T4AmK55IiiV1" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "There is also a built in R function to do this." 
- ], - "metadata": { - "id": "66zng98lpK1w" - } - }, - { - "cell_type": "code", - "source": [ - "p_values <- as.vector(coeftest(m, vcov = vcovHC(m, type=\"HC1\"))[,4])\n", - "holm_reject <- p.adjust(sort(p_values), \"holm\") <= 0.05\n", - "holm_reject" - ], - "metadata": { - "id": "iBMiaj3jAZuo" - }, - "execution_count": null, - "outputs": [] - }, - { - "metadata": { - "id": "tPtstQV7elk0" - }, - "cell_type": "markdown", - "source": [ - "We see that that even though this is a randomized experiment, balance conditions are failed.\n", - "" - ] - }, - { - "metadata": { - "id": "C-2Ii9rbelk1" - }, - "cell_type": "markdown", - "source": [ - "# Model Specification" - ] - }, - { - "metadata": { - "trusted": true, - "id": "43W-vaIzelk1" - }, - "cell_type": "code", - "source": [ - "# model specifications\n", - "\n", - "\n", - "# no adjustment (2-sample approach)\n", - "formula_cl <- log(inuidur1)~T4\n", - "\n", - "# adding controls\n", - "formula_cra <- log(inuidur1)~T4+ (female+black+othrace+factor(dep)+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)^2\n", - "# Omitted dummies: q1, nondurable, muld\n", - "\n", - "\n", - "ols.cl <- lm(formula_cl)\n", - "ols.cra <- lm(formula_cra)\n", - "\n", - "\n", - "ols.cl = coeftest(ols.cl, vcov = vcovHC(ols.cl, type=\"HC1\"))\n", - "ols.cra = coeftest(ols.cra, vcov = vcovHC(ols.cra, type=\"HC1\"))\n", - "\n", - "print(ols.cl)\n", - "print(ols.cra)\n", - "\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "metadata": { - "id": "K2nSjv8Aelk2" - }, - "cell_type": "markdown", - "source": [ - "The interactive specificaiton corresponds to the approach introduced in Lin (2013)." - ] - }, - { - "metadata": { - "trusted": true, - "id": "SGdP0kQ3elk2" - }, - "cell_type": "code", - "source": [ - "\n", - "#interactive regression model;\n", - "\n", - "X = model.matrix (~ (female+black+othrace+factor(dep)+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)^2)[,-1]\n", - "dim(X)\n", - "demean<- function(x){ x - mean(x)}\n", - "X = apply(X, 2, demean)\n", - "\n", - "ols.ira = lm(log(inuidur1) ~ T4*X)\n", - "ols.ira= coeftest(ols.ira, vcov = vcovHC(ols.ira, type=\"HC1\"))\n", - "print(ols.ira)\n", - "\n", - "\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "metadata": { - "id": "y7k740wbelk3" - }, - "cell_type": "markdown", - "source": [ - "Next we try out partialling out with lasso" - ] - }, - { - "metadata": { - "trusted": true, - "id": "O9AZ49XNelk3" - }, - "cell_type": "code", - "source": [ - "T4 = demean(T4)\n", - "\n", - "DX = model.matrix(~T4*X)[,-1]\n", - "\n", - "rlasso.ira = summary(rlassoEffects(DX, log(inuidur1), index = 1))\n", - "\n", - "\n", - "print(rlasso.ira)\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "metadata": { - "id": "UOzNgaLaellA" - }, - "cell_type": "markdown", - "source": [ - "### Results" - ] - }, - { - "metadata": { - "trusted": true, - "id": "UGh_LJouellB" - }, - "cell_type": "code", - "source": [ - "str(ols.ira)\n", - "ols.ira[2,1]" - ], - "execution_count": null, - "outputs": [] - }, - { - "metadata": { - "trusted": true, - "id": "wvxXEMUQellC" - }, - "cell_type": "code", - "source": [ - "table<- matrix(0, 2, 4)\n", - "table[1,1]<- ols.cl[2,1]\n", - "table[1,2]<- ols.cra[2,1]\n", - "table[1,3]<- ols.ira[2,1]\n", - "table[1,4]<- rlasso.ira[[1]][1]\n", - "\n", - "table[2,1]<- ols.cl[2,2]\n", - "table[2,2]<- ols.cra[2,2]\n", - "table[2,3]<- ols.ira[2,2]\n", - "table[2,4]<- rlasso.ira[[1]][2]\n", - "\n", - "\n", - "colnames(table)<- c(\"CL\",\"CRA\",\"IRA\", \"IRA w Lasso\")\n", - "rownames(table)<- 
c(\"estimate\", \"standard error\")\n", - "tab<- xtable(table, digits=5)\n", - "tab\n", - "\n", - "print(tab, type=\"latex\", digits=5)" - ], - "execution_count": null, - "outputs": [] - }, - { - "metadata": { - "id": "srdCKAK8ellD" - }, - "cell_type": "markdown", - "source": [ - "Treatment group 4 experiences an average decrease of about $7.8\\%$ in the length of unemployment spell.\n", - "\n", - "\n", - "Observe that regression estimators delivers estimates that are slighly more efficient (lower standard errors) than the simple 2 mean estimator, but essentially all methods have very similar standard errors. From IRA results we also see that there is not any statistically detectable heterogeneity. We also see the regression estimators offer slightly lower estimates -- these difference occur perhaps to due minor imbalance in the treatment allocation, which the regression estimators try to correct.\n", - "\n", - "\n" - ] - } - ], - "metadata": { - "kernelspec": { - "name": "ir", - "display_name": "R", - "language": "R" - }, - "language_info": { - "name": "R", - "codemirror_mode": "r", - "pygments_lexer": "r", - "mimetype": "text/x-r-source", - "file_extension": ".r", - "version": "3.6.3" - }, - "colab": { - "provenance": [] - } + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "hz35ETpNelkG" + }, + "source": [ + "# Analyzing RCT data with Precision Adjustment" + ] }, - "nbformat": 4, - "nbformat_minor": 0 + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "w7B1iDaqa2ZI" + }, + "outputs": [], + "source": [ + "install.packages(\"sandwich\")\n", + "install.packages(\"lmtest\")\n", + "install.packages(\"xtable\")\n", + "install.packages(\"hdm\")\n", + "library(sandwich)\n", + "library(lmtest)\n", + "library(xtable)\n", + "library(hdm)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v7-YfIbyelkV" + }, + "source": [ + "## Data\n", + "\n", + "In this lab, we analyze the Pennsylvania re-employment bonus experiment, which was previously studied in \"Sequential testing of duration data: the case of the Pennsylvania ‘reemployment bonus’ experiment\" (Bilias, 2000), among others. These experiments were conducted in the 1980s by the U.S. Department of Labor to test the incentive effects of alternative compensation schemes for unemployment insurance (UI). In these experiments, UI claimants were randomly assigned either to a control group or one of five treatment groups. Actually, there are six treatment groups in the experiments. Here we focus on treatment group 4, but feel free to explore other treatment groups. In the control group the current rules of the UI applied. Individuals in the treatment groups were offered a cash bonus if they found a job within some pre-specified period of time (qualification period), provided that the job was retained for a specified duration. 
The treatments differed in the level of the bonus, the length of the qualification period, and whether the bonus was declining over time in the qualification period; see http://qed.econ.queensu.ca/jae/2000-v15.6/bilias/readme.b.txt for further details on data.\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_whbk2z4elkY" + }, + "outputs": [], + "source": [ + "## loading the data\n", + "file = \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/penn_jae.dat\"\n", + "Penn <- as.data.frame(read.table(file, header=T))\n", + "\n", + "n <- dim(Penn)[1]\n", + "p_1 <- dim(Penn)[2]\n", + "Penn<- subset(Penn, tg==4 | tg==0)\n", + "attach(Penn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iTghGG4Kelkk" + }, + "outputs": [], + "source": [ + "T4<- (tg==4)\n", + "summary(T4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZbWuqKExelkl" + }, + "outputs": [], + "source": [ + "head(Penn)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "72Rx7ECEelkn" + }, + "source": [ + "### Model\n", + "To evaluate the impact of the treatments on unemployment duration, we consider the linear regression model:\n", + "\n", + "$$\n", + "Y = D \\beta_1 + W'\\beta_2 + \\varepsilon, \\quad E \\varepsilon (D,W')' = 0,\n", + "$$\n", + "\n", + "where $Y$ is the log of duration of unemployment, $D$ is a treatment indicators, and $W$ is a set of controls including age group dummies, gender, race, number of dependents, quarter of the experiment, location within the state, existence of recall expectations, and type of occupation. Here $\\beta_1$ is the ATE, if the RCT assumptions hold rigorously.\n", + "\n", + "\n", + "We also consider interactive regression model:\n", + "\n", + "$$\n", + "Y = D \\alpha_1 + D W' \\alpha_2 + W'\\beta_2 + \\varepsilon, \\quad E \\varepsilon (D,W', DW')' = 0,\n", + "$$\n", + "where $W$'s are demeaned (apart from the intercept), so that $\\alpha_1$ is the ATE, if the RCT assumptions hold rigorously." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uNtms5PHelko" + }, + "source": [ + "Under RCT, the projection coefficient $\\beta_1$ has\n", + "the interpretation of the causal effect of the treatment on\n", + "the average outcome. We thus refer to $\\beta_1$ as the average\n", + "treatment effect (ATE). Note that the covariates, here are\n", + "independent of the treatment $D$, so we can identify $\\beta_1$ by\n", + "just linear regression of $Y$ on $D$, without adding covariates.\n", + "However we do add covariates in an effort to improve the\n", + "precision of our estimates of the average treatment effect." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cMy_2NxKelkq" + }, + "source": [ + "### Analysis\n", + "\n", + "We consider\n", + "\n", + "* classical 2-sample approach, no adjustment (CL)\n", + "* classical linear regression adjustment (CRA)\n", + "* interactive regression adjusment (IRA)\n", + "\n", + "and carry out robust inference using the *estimatr* R packages." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lo-Cogv5elkx" + }, + "source": [ + "# Carry out covariate balance check\n", + "\n", + "\n", + "We first look at the coefficients individually with a $t$-test, and then we adjust the $p$-values to control for family-wise error." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2FeCJjOselky" + }, + "source": [ + "The regression below is done using \"type='HC1'\" which computes the correct Eicker-Huber-White standard errors, instead of the classical non-robust formula based on homoscedasticity." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GYOMYN_WRNTL" + }, + "outputs": [], + "source": [ + "data <- model.matrix(T4 ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 + agelt35 + agegt54 + durable + lusd + husd)^2)\n", + "\n", + "# individual t-tests\n", + "m <- lm(T4 ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 + agelt35 + agegt54 + durable + lusd + husd)^2, data = as.data.frame(data))\n", + "coeftest(m, vcov = vcovHC(m, type=\"HC1\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hejmSGDuEj5U" + }, + "source": [ + "\n", + "\n", + "To test balance conditions, we employ the Holm-Bonferroni step-down method. With 100+ hypotheses, the family-wise type I error, or the probability of making at least one type I error treating all hypotheses independently, is close to 1. To control for this, we adjust p-values with the following procedure.\n", + "\n", + "First, set $\\alpha=0.05$ and denote the list of $n$ p-values from the regression with the vector $p$.\n", + "\n", + "1. Sort $p$ from smallest to largest, so $p_{(1)} \\leq p_{(2)} \\leq \\cdots \\leq p_{(n)}$. Denote the corresponding hypothesis for $p_{(i)}$ as $H_{(i)}$.\n", + "2. For $i=1,\\ldots, n$,\n", + "- If $$p_{(i)} > \\frac{\\alpha}{n-i+1} $$ Break the loop and do not reject any $H_{(j)}$ for $j \\geq i$.\n", + "- Else reject $H_{(i)}$ if $$p_{(i)} \\leq \\frac{\\alpha}{n-i+1} $$ Increment $i := i+1$.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "T4AmK55IiiV1" + }, + "outputs": [], + "source": [ + "holm_bonferroni <- function(p, alpha = 0.05) {\n", + " n <- length(p)\n", + " sig_beta <- c()\n", + "\n", + " for (i in 1:n) {\n", + " if (sort(p)[i] > alpha / (n - i + 1)) {\n", + " break\n", + " } else {\n", + " sig_beta <- c(sig_beta, order(p)[i])\n", + " }\n", + " }\n", + "\n", + " return(sig_beta)\n", + "}\n", + "\n", + "p_values <- as.vector(coeftest(m, vcov = vcovHC(m, type=\"HC1\"))[,4])\n", + "significant_indices <- holm_bonferroni(p_values, alpha = 0.05)\n", + "print(paste(\"Significant Coefficients (Indices): \", significant_indices))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "66zng98lpK1w" + }, + "source": [ + "There is also a built in R function to do this." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iBMiaj3jAZuo" + }, + "outputs": [], + "source": [ + "p_values <- as.vector(coeftest(m, vcov = vcovHC(m, type=\"HC1\"))[,4])\n", + "holm_reject <- p.adjust(sort(p_values), \"holm\") <= 0.05\n", + "holm_reject" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tPtstQV7elk0" + }, + "source": [ + "We see that that even though this is a randomized experiment, balance conditions are failed.\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "C-2Ii9rbelk1" + }, + "source": [ + "# Model Specification" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "43W-vaIzelk1" + }, + "outputs": [], + "source": [ + "# model specifications\n", + "\n", + "\n", + "# no adjustment (2-sample approach)\n", + "formula_cl <- log(inuidur1)~T4\n", + "\n", + "# adding controls\n", + "formula_cra <- log(inuidur1)~T4+ (female+black+othrace+factor(dep)+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)^2\n", + "# Omitted dummies: q1, nondurable, muld\n", + "\n", + "\n", + "ols.cl <- lm(formula_cl)\n", + "ols.cra <- lm(formula_cra)\n", + "\n", + "\n", + "ols.cl = coeftest(ols.cl, vcov = vcovHC(ols.cl, type=\"HC1\"))\n", + "ols.cra = coeftest(ols.cra, vcov = vcovHC(ols.cra, type=\"HC1\"))\n", + "\n", + "print(ols.cl)\n", + "print(ols.cra)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "K2nSjv8Aelk2" + }, + "source": [ + "The interactive specificaiton corresponds to the approach introduced in Lin (2013)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SGdP0kQ3elk2" + }, + "outputs": [], + "source": [ + "\n", + "#interactive regression model;\n", + "\n", + "X = model.matrix (~ (female+black+othrace+factor(dep)+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)^2)[,-1]\n", + "dim(X)\n", + "demean<- function(x){ x - mean(x)}\n", + "X = apply(X, 2, demean)\n", + "\n", + "ols.ira = lm(log(inuidur1) ~ T4*X)\n", + "ols.ira= coeftest(ols.ira, vcov = vcovHC(ols.ira, type=\"HC1\"))\n", + "print(ols.ira)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y7k740wbelk3" + }, + "source": [ + "Next we try out partialling out with lasso" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "O9AZ49XNelk3" + }, + "outputs": [], + "source": [ + "T4 = demean(T4)\n", + "\n", + "DX = model.matrix(~T4*X)[,-1]\n", + "\n", + "rlasso.ira = summary(rlassoEffects(DX, log(inuidur1), index = 1))\n", + "\n", + "\n", + "print(rlasso.ira)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UOzNgaLaellA" + }, + "source": [ + "### Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UGh_LJouellB" + }, + "outputs": [], + "source": [ + "str(ols.ira)\n", + "ols.ira[2,1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wvxXEMUQellC" + }, + "outputs": [], + "source": [ + "table<- matrix(0, 2, 4)\n", + "table[1,1]<- ols.cl[2,1]\n", + "table[1,2]<- ols.cra[2,1]\n", + "table[1,3]<- ols.ira[2,1]\n", + "table[1,4]<- rlasso.ira[[1]][1]\n", + "\n", + "table[2,1]<- ols.cl[2,2]\n", + "table[2,2]<- ols.cra[2,2]\n", + "table[2,3]<- ols.ira[2,2]\n", + "table[2,4]<- rlasso.ira[[1]][2]\n", + "\n", + "\n", + "colnames(table)<- c(\"CL\",\"CRA\",\"IRA\", \"IRA w Lasso\")\n", + "rownames(table)<- c(\"estimate\", \"standard error\")\n", + "tab<- xtable(table, digits=5)\n", + "tab\n", + "\n", + 
"print(tab, type=\"latex\", digits=5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "srdCKAK8ellD" + }, + "source": [ + "Treatment group 4 experiences an average decrease of about $7.8\\%$ in the length of unemployment spell.\n", + "\n", + "\n", + "Observe that regression estimators delivers estimates that are slighly more efficient (lower standard errors) than the simple 2 mean estimator, but essentially all methods have very similar standard errors. From IRA results we also see that there is not any statistically detectable heterogeneity. We also see the regression estimators offer slightly lower estimates -- these difference occur perhaps to due minor imbalance in the treatment allocation, which the regression estimators try to correct.\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/CM1/r-rct-vaccines.Rmd b/CM1/r-rct-vaccines.Rmd new file mode 100644 index 00000000..f872025a --- /dev/null +++ b/CM1/r-rct-vaccines.Rmd @@ -0,0 +1,253 @@ +--- +title: An R Markdown document converted from "CM1/r-rct-vaccines.irnb" +output: html_document +--- + +This notebook contains some RCT examples for teaching. + +```{r} +install.packages("PropCIs") # Exact CI exploiting Bernoulli outcome using the Cornfield Procedure +library(PropCIs) +``` + + +# Polio RCT + +One of the earliest randomized experiments were the Polio vaccination trials conducted by the Public Health Service in 1954. The question was whether Salk vaccine prevented polio. Children in the study were randomly assigned either a treatment (polio vaccine shot) or a placebo (saline solution shot), without knowing which one they received. The doctors in the study, making the diagnosis, did not know whether a child received a vaccine or not. In other words, the trial was a double-blind, randomized controlled trial. The trial had to be large, because the rate at which Polio occured in the population was 50 per 100,000. The treatment group saw 33 polio cases per 200,745; the control group saw 115 cases per 201,229. The estimated average treatment effect is about +$$ +-40 +$$ +with the 95% confidence band (based on approximate normality of the two sample means and their differences): +$$[-52, -28].$$ +As this is an RCT, the confidence band suggests that the Polio vaccine **caused** the reduction in the risk of polio. + +The interesting thing here is that we don't need the underlying individual data to evaluate the effectivess of the vaccine. This is because the outcomes are Bernoulli random variables, and we have enough information to compute the estimate of ATE as well as the confidence intervals from the group case counts. + +We also compute the Vaccine Efficacy metric, which refers to the following measure according to the [CDC](https://www.cdc.gov/csels/dsepd/ss1978/lesson3/section6.html): +$$ +VE = \frac{\text{Risk for Unvaccinated - Risk for Vaccinated}}{\text{Risk for Unvaccinated}}. +$$ +It describes the relative reduction in risk caused by vaccination. + + +It is staightforward to get the VE estimate by just plugging-in the numbers, but how do we get the approximate variance estimate? 
I am too lazy to do calculations for the delta method, so I will just use a simulation (a form of approximate bootstrap) to obtain the confidence intervals. + + +```{r} +NV = 200745 # number of vaccinated (treated) +NU = 201229 # number of unvaccinated (control) +RV= 33/NV # average outcome for vaccinated +RU =115/NU # average outcome for unvaccinated +VE = (RU - RV)/RU; # vaccine efficacy + +# incidence per 100000 +Incidence.RV=RV*100000 +Incidence.RU=RU*100000 + +print(paste("Incidence per 100000 among treated:", round(Incidence.RV,4))) + +print(paste("Incidence per 100000 among controlled:", round(Incidence.RU,4))) + +# treatment effect - estimated reduction in incidence per 100000 people +delta.hat = 100000*(RV-RU) + +print(paste("Estimated ATE of occurances per 100,000 is", round(delta.hat,4))) + +# variance, standard deviation and confidence interval of ATE using that outcomes are Bernoulli +Var.RV = RV*(1-RV)/NV +Var.RU = RU*(1-RU)/NU +Var.delta.hat = 100000^2*(Var.RV + Var.RU) +Std.delta.hat = sqrt(Var.delta.hat) + +print(paste("Standard deviation for ATE is", round(Std.delta.hat,4))) + +CI.delta = c(delta.hat -1.96*sqrt(Var.delta.hat), + delta.hat +1.96*sqrt(Var.delta.hat)) + +print(paste("95% confidence interval of ATE is [", round(CI.delta[1],4), ",", + round(CI.delta[2],4), "]" )) + +print(paste("Overall VE is", round(VE,4) )) + +# we use an approximate bootstrap to find the confidence interval of vaccine efficacy +# via Monte Carlo draws +set.seed(1) +B = 10000 # number of bootstraps +RVs = RV + rnorm(B)*sqrt(Var.RV) +RUs = RU + rnorm(B)*sqrt(Var.RU) +VEs= (RUs - RVs)/RUs + +plot(density(VEs), col=2, main="Approximate Distribution of VE estimates") + +CI.VE = quantile(VEs, c(.025, .975)) # use the empirical quantiles from the bootstraps + +print(paste("95% confidence interval of VE is [", round(CI.VE[1],4), ",", + round(CI.VE[2],4), "]")) +``` + +# Pfizer/BNTX Covid-19 RCT + +Here is a link to the FDA [briefing](https://www.fda.gov/media/144245/download) and an interesting [discussion]( +https://garycornell.com/2020/12/09/statistics-in-the-pfizer-data-how-good-is-the-vaccine/?fbclid=IwAR282lS0Vl3tWmicQDDhIJAQCMO8NIsCXyWbUWwTtPuKcnuJ2v0VWXRDQac), as well as data. + +Pfizer/BNTX was the first vaccine approved for emergency use to reduce the risk of Covid-19 decease. In studies to assess vaccine efficacy, volunteers were randomly assigned to receive either a treatment (2-dose vaccination) or a placebo, without knowing which they received. The doctors making the diagnoses did not know now whether a given volunteer received a vaccination or not. The results of the study are given in the following table. + +![](https://lh6.googleusercontent.com/oiO6gYom1UZyrOhgpFx2iq8ike979u3805JHiVygP-Efh1Yaz2ttyPcgWKlT1AqHDM4v46th3EPIkOvRLyXA0fNUloPL-mL9eOFmSAzfbNOHyCZSQ0DyzMhcFUtQuZ520R5Qd2lj): + +Here we see both the overall effects and the effects by age group. The confidence intervals for the overall ATE are tight and suggest high effectiveness of the vaccine. The confidence intervals for the age groups 65-74 and 75+ are much wider due to the relatively small number of people in these groups. We could group 65-74 and >75 groups to evaluate the effectiveness of the vaccine for this broader age group and narrow the width of the confidence band. + +We use the same approach as that for the Polio example. This gives slightly different results than the FDA result, because the FDA used inversion of exact binomial tests to construct confidence intervals. 
We use asymptotic approches based on approximate normality, which is more crude, but delivers a rather similar result. + +```{r} +NV = 19965; # number vaccinated +NU = 20172; # number unvaccinated +RV = 9/NV; # average outcome for vaccinated +RU = 169/NU; # average outcome for unvaccinated +VE = (RU - RV)/RU; # vaccine efficacy + +# incidence per 100000 +Incidence.RV=RV*100000 +Incidence.RU=RU*100000 + +print(paste("Incidence per 100000 among vaccinated:", round(Incidence.RV,4))) + +print(paste("Incidence per 100000 among unvaccinated:", round(Incidence.RU,4))) + +# treatment effect - estimated reduction in incidence per 100000 people +delta.hat = 100000*(RV-RU) + +print(paste("Estimated ATE of occurances per 100,000 is", round(delta.hat,4))) + +# variance, standard deviation and confidence interval of ATE using that outcomes are Bernoulli +Var.RV = RV*(1-RV)/NV +Var.RU = RU*(1-RU)/NU +Var.delta.hat = 100000^2*(Var.RV + Var.RU) +Std.delta.hat = sqrt(Var.delta.hat) + +print(paste("Standard deviation for ATE is", round(Std.delta.hat,4))) + +CI.delta = c(delta.hat -1.96*sqrt(Var.delta.hat), + delta.hat +1.96*sqrt(Var.delta.hat)) + +print(paste("95% confidence interval of ATE is [", round(CI.delta[1],4), ",", + round(CI.delta[2],4), "]" )) + +print(paste("Overall VE is", round(VE,4) )) + +# we use an approximate bootstrap to find the VE confidence interval +# using Monte Carlo draws as before +set.seed(1) +B = 10000 +RVs = RV + rnorm(B)*sqrt(Var.RV) +RUs = RU + rnorm(B)*sqrt(Var.RU) +VEs= (RUs - RVs)/RUs + +plot(density(VEs), col=2, main="Approximate Distribution of VE estimates") + +CI.VE = quantile(VEs, c(.025, .975)) + +print(paste("95% confidence interval of VE is [", round(CI.VE[1],4), ",", + round(CI.VE[2],4), "]" )) +``` + +In the code cell below we calculate the effectiveness of the vaccine for the two groups that are 65 or older + +```{r} +# Here we calculate the overall effectiveness of the vaccine for the two groups that are 65 or older +NV = 3239+805; +NU = 3255+812; +RV = 1/NV; +RU = (14+5)/NU; +VE = (RU - RV)/RU; + +print(paste("Overall VE is", round(VE,4)) ) + +Var.RV = RV*(1-RV)/NV +Var.RU = RU*(1-RU)/NU + +# As before, we use an approximate bootstrap to find the confidence intervals +# using Monte Carlo draws + +set.seed(1) +B = 10000 + RVs = RV + rnorm(B)*sqrt(Var.RV)+ 10^(-10) + RUs = RU + rnorm(B)*sqrt(Var.RU)+ 10^(-10) + VEs= (RUs - RVs)/RUs + +plot(density(VEs), col=2, main="Approximate Distribution of VE estimates") + +CI.VE = quantile(VEs, c(.025, .975)) + +print(paste("two-sided 95 % confidence interval is [", CI.VE[1], ",", + CI.VE[2], "]" )) + +OneSidedCI.VE = quantile(VEs, c(.05)) + +print(paste("one-sided 95 % confidence interval is [", OneSidedCI.VE[1], ",", + 1, "]" )) +``` + +Let's try the parametric boostrap next, using the fact that the outcome is Bernouli. 
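Editor's aside, not part of the committed diff (the parametric bootstrap announced above follows in the next chunk): the variance that the Monte Carlo draws approximate can also be written down with the delta method that the text skips. For $VE = 1 - RV/RU$ with independent group rates, a first-order expansion gives $Var(VE) \approx Var(RV)/RU^2 + RV^2 \, Var(RU)/RU^4$. A minimal sketch using the overall Pfizer counts already used above:

```{r}
# Editor's sketch: closed-form (delta-method) standard error for VE = 1 - RV/RU,
# using the overall Pfizer counts from the table above. Not part of the original analysis.
NV <- 19965
NU <- 20172
RV <- 9 / NV
RU <- 169 / NU
VE <- (RU - RV) / RU
Var.RV <- RV * (1 - RV) / NV
Var.RU <- RU * (1 - RU) / NU
# gradient of 1 - RV/RU is (-1/RU, RV/RU^2), hence:
Var.VE <- Var.RV / RU^2 + RV^2 * Var.RU / RU^4
SE.VE <- sqrt(Var.VE)
c(VE = VE, lower = VE - 1.96 * SE.VE, upper = VE + 1.96 * SE.VE)
```

This should land close to the simulation-based interval for the overall trial; for the small 65-and-older counts, the normal approximation behind both approaches is less reliable, which is why the exact binomial inversion further below is the safer choice there.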
+ +```{r} +NV = 3239+805; +NU = 3255+812; +RV = 1/NV; +RU = (14+5)/NU; +VE = (RU - RV)/RU; + +print(paste("Overall VE is", VE) ) + +set.seed(1) +B = 10000 #number of simulation draw + RVs = rbinom(100000, size= NV, prob = RV) + RUs = rbinom(100000, size= NU, prob = RU) + VEs= (RUs - RVs)/RUs + +plot(density(VEs), col=2, main="Approximate Distribution of VE estimates") + +CI.VE = quantile(VEs, c(.025, .975)) + +print(paste("two-sided 95 % confidence interval is [", CI.VE[1], ",", + CI.VE[2], "]" )) + +OneSidedCI.VE = quantile(VEs, c(.05)) + +print(paste("one-sided 95 % confidence interval is [", OneSidedCI.VE[1], ",", 1, "]" )) +``` + +# Exact Binomial Test Inversion + +It is possible to use exact inference by inverting tests based on the exact binomial nature of the outcome variable. Here, we perform the Cornfield Procedure to find the exact confidence interval on the estimate of vaccine efficacy. + +```{r} +# Exact CI exploiting Bernoulli outcome using the Cornfield Procedure +NV = 19965; +NU = 20172; +RV = 9/NV; +RU = 169/NU; +VE = (RU - RV)/RU; + +# 1- Cornfieldexact.CI(9, NV, 169, NU, conf = 0.95, interval = c(1e-08, 1e+08)) +1-riskscoreci(9,NV,169,NU,0.95)$conf.int[2] +1-riskscoreci(9,NV,169,NU,0.95)$conf.int[1] +``` + +Note that this exactly recovers the result in the FDA table (first row). + +Next we repeat the cornfield procedure to find the exact confidence interval on vaccine effectiveness for the two groups that are 65 or older. Here we see a big discrepancy between various asymptotic approaches and the exact finite-sample inference. This occurs because the binomial counts are too low for central limit theorems to work successfully. + +```{r} +# Exact CI exploiting Bernoulli outcome for the two groups that are 65 or older +NV = 3239+805; +NU = 3255+812; +RV = 1/NV; +RU = (14+5)/NU; +VE = (RU - RV)/RU; + +# 1- Cornfieldexact.CI(1, NV, 19, NU, conf = 0.95, interval = c(1e-08, 1e+08)) + +1-riskscoreci(1,NV,19,NU,0.95)$conf.int[2] +1-riskscoreci(1,NV,19,NU,0.95)$conf.int[1] +``` + diff --git a/CM1/r-rct-vaccines.irnb b/CM1/r-rct-vaccines.irnb index aee8e00d..73bcf929 100644 --- a/CM1/r-rct-vaccines.irnb +++ b/CM1/r-rct-vaccines.irnb @@ -1 +1,381 @@ -{"metadata":{"kernelspec":{"name":"ir","display_name":"R","language":"R"},"language_info":{"name":"R","codemirror_mode":"r","pygments_lexer":"r","mimetype":"text/x-r-source","file_extension":".r","version":"4.0.5"},"colab":{"provenance":[]}},"nbformat_minor":0,"nbformat":4,"cells":[{"cell_type":"markdown","source":["This notebook contains some RCT examples for teaching."],"metadata":{"id":"Di8kfLCYerNJ"}},{"cell_type":"code","source":["install.packages(\"PropCIs\") # Exact CI exploiting Bernoulli outcome using the Cornfield Procedure\n","library(PropCIs)"],"metadata":{"id":"fW54aax9mE2G"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":[" \n","# Polio RCT\n","\n","One of the earliest randomized experiments were the Polio vaccination trials conducted by the Public Health Service in 1954. The question was whether Salk vaccine prevented polio. Children in the study were randomly assigned either a treatment (polio vaccine shot) or a placebo (saline solution shot), without knowing which one they received. The doctors in the study, making the diagnosis, did not know whether a child received a vaccine or not. In other words, the trial was a double-blind, randomized controlled trial. The trial had to be large, because the rate at which Polio occured in the population was 50 per 100,000. 
The treatment group saw 33 polio cases per 200,745; the control group saw 115 cases per 201,229. The estimated average treatment effect is about\n","$$\n","-40\n","$$\n","with the 95% confidence band (based on approximate normality of the two sample means and their differences):\n","$$[-52, -28].$$\n","As this is an RCT, the confidence band suggests that the Polio vaccine **caused** the reduction in the risk of polio.\n","\n","The interesting thing here is that we don't need the underlying individual data to evaluate the effectivess of the vaccine. This is because the outcomes are Bernoulli random variables, and we have enough information to compute the estimate of ATE as well as the confidence intervals from the group case counts.\n","\n","We also compute the Vaccine Efficacy metric, which refers to the following measure according to the [CDC](https://www.cdc.gov/csels/dsepd/ss1978/lesson3/section6.html):\n","$$\n","VE = \\frac{\\text{Risk for Unvaccinated - Risk for Vaccinated}}{\\text{Risk for Unvaccinated}}.\n","$$\n","It describes the relative reduction in risk caused by vaccination.\n","\n","\n","It is staightforward to get the VE estimate by just plugging-in the numbers, but how do we get the approximate variance estimate? I am too lazy to do calculations for the delta method, so I will just use a simulation (a form of approximate bootstrap) to obtain the confidence intervals.\n","\n"],"metadata":{"id":"Id8Df7PUerNX"}},{"cell_type":"code","source":["NV = 200745 # number of vaccinated (treated)\n","NU = 201229 # number of unvaccinated (control)\n","RV= 33/NV # average outcome for vaccinated\n","RU =115/NU # average outcome for unvaccinated\n","VE = (RU - RV)/RU; # vaccine efficacy\n","\n","# incidence per 100000\n","Incidence.RV=RV*100000\n","Incidence.RU=RU*100000\n","\n","print(paste(\"Incidence per 100000 among treated:\", round(Incidence.RV,4)))\n","\n","print(paste(\"Incidence per 100000 among controlled:\", round(Incidence.RU,4)))\n","\n","# treatment effect - estimated reduction in incidence per 100000 people\n","delta.hat = 100000*(RV-RU)\n","\n","print(paste(\"Estimated ATE of occurances per 100,000 is\", round(delta.hat,4)))\n","\n","# variance, standard deviation and confidence interval of ATE using that outcomes are Bernoulli\n","Var.RV = RV*(1-RV)/NV\n","Var.RU = RU*(1-RU)/NU\n","Var.delta.hat = 100000^2*(Var.RV + Var.RU)\n","Std.delta.hat = sqrt(Var.delta.hat)\n","\n","print(paste(\"Standard deviation for ATE is\", round(Std.delta.hat,4)))\n","\n","CI.delta = c(delta.hat -1.96*sqrt(Var.delta.hat),\n"," delta.hat +1.96*sqrt(Var.delta.hat))\n","\n","print(paste(\"95% confidence interval of ATE is [\", round(CI.delta[1],4), \",\",\n"," round(CI.delta[2],4), \"]\" ))\n","\n","print(paste(\"Overall VE is\", round(VE,4) ))\n","\n","# we use an approximate bootstrap to find the confidence interval of vaccine efficacy\n","# via Monte Carlo draws\n","set.seed(1)\n","B = 10000 # number of bootstraps\n","RVs = RV + rnorm(B)*sqrt(Var.RV)\n","RUs = RU + rnorm(B)*sqrt(Var.RU)\n","VEs= (RUs - RVs)/RUs\n","\n","plot(density(VEs), col=2, main=\"Approximate Distribution of VE estimates\")\n","\n","CI.VE = quantile(VEs, c(.025, .975)) # use the empirical quantiles from the bootstraps\n","\n","print(paste(\"95% confidence interval of VE is [\", round(CI.VE[1],4), \",\",\n"," round(CI.VE[2],4), 
\"]\"))"],"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2021-07-13T19:07:13.95576Z","iopub.execute_input":"2021-07-13T19:07:13.957741Z","iopub.status.idle":"2021-07-13T19:07:14.425992Z"},"trusted":true,"id":"SE8nvAWberNc"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Pfizer/BNTX Covid-19 RCT\n","\n","Here is a link to the FDA [briefing](https://www.fda.gov/media/144245/download) and an interesting [discussion](\n","https://garycornell.com/2020/12/09/statistics-in-the-pfizer-data-how-good-is-the-vaccine/?fbclid=IwAR282lS0Vl3tWmicQDDhIJAQCMO8NIsCXyWbUWwTtPuKcnuJ2v0VWXRDQac), as well as data.\n","\n","Pfizer/BNTX was the first vaccine approved for emergency use to reduce the risk of Covid-19 decease. In studies to assess vaccine efficacy, volunteers were randomly assigned to receive either a treatment (2-dose vaccination) or a placebo, without knowing which they received. The doctors making the diagnoses did not know now whether a given volunteer received a vaccination or not. The results of the study are given in the following table.\n","\n","![](https://lh6.googleusercontent.com/oiO6gYom1UZyrOhgpFx2iq8ike979u3805JHiVygP-Efh1Yaz2ttyPcgWKlT1AqHDM4v46th3EPIkOvRLyXA0fNUloPL-mL9eOFmSAzfbNOHyCZSQ0DyzMhcFUtQuZ520R5Qd2lj):\n","\n","Here we see both the overall effects and the effects by age group. The confidence intervals for the overall ATE are tight and suggest high effectiveness of the vaccine. The confidence intervals for the age groups 65-74 and 75+ are much wider due to the relatively small number of people in these groups. We could group 65-74 and >75 groups to evaluate the effectiveness of the vaccine for this broader age group and narrow the width of the confidence band.\n","\n","We use the same approach as that for the Polio example. This gives slightly different results than the FDA result, because the FDA used inversion of exact binomial tests to construct confidence intervals. 
We use asymptotic approches based on approximate normality, which is more crude, but delivers a rather similar result.\n"],"metadata":{"id":"ahcRWgyEerNk"}},{"cell_type":"code","source":["NV = 19965; # number vaccinated\n","NU = 20172; # number unvaccinated\n","RV = 9/NV; # average outcome for vaccinated\n","RU = 169/NU; # average outcome for unvaccinated\n","VE = (RU - RV)/RU; # vaccine efficacy\n","\n","# incidence per 100000\n","Incidence.RV=RV*100000\n","Incidence.RU=RU*100000\n","\n","print(paste(\"Incidence per 100000 among vaccinated:\", round(Incidence.RV,4)))\n","\n","print(paste(\"Incidence per 100000 among unvaccinated:\", round(Incidence.RU,4)))\n","\n","# treatment effect - estimated reduction in incidence per 100000 people\n","delta.hat = 100000*(RV-RU)\n","\n","print(paste(\"Estimated ATE of occurances per 100,000 is\", round(delta.hat,4)))\n","\n","# variance, standard deviation and confidence interval of ATE using that outcomes are Bernoulli\n","Var.RV = RV*(1-RV)/NV\n","Var.RU = RU*(1-RU)/NU\n","Var.delta.hat = 100000^2*(Var.RV + Var.RU)\n","Std.delta.hat = sqrt(Var.delta.hat)\n","\n","print(paste(\"Standard deviation for ATE is\", round(Std.delta.hat,4)))\n","\n","CI.delta = c(delta.hat -1.96*sqrt(Var.delta.hat),\n"," delta.hat +1.96*sqrt(Var.delta.hat))\n","\n","print(paste(\"95% confidence interval of ATE is [\", round(CI.delta[1],4), \",\",\n"," round(CI.delta[2],4), \"]\" ))\n","\n","print(paste(\"Overall VE is\", round(VE,4) ))\n","\n","# we use an approximate bootstrap to find the VE confidence interval\n","# using Monte Carlo draws as before\n","set.seed(1)\n","B = 10000\n","RVs = RV + rnorm(B)*sqrt(Var.RV)\n","RUs = RU + rnorm(B)*sqrt(Var.RU)\n","VEs= (RUs - RVs)/RUs\n","\n","plot(density(VEs), col=2, main=\"Approximate Distribution of VE estimates\")\n","\n","CI.VE = quantile(VEs, c(.025, .975))\n","\n","print(paste(\"95% confidence interval of VE is [\", round(CI.VE[1],4), \",\",\n"," round(CI.VE[2],4), \"]\" ))"],"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","execution":{"iopub.status.busy":"2021-07-13T19:07:14.428785Z","iopub.execute_input":"2021-07-13T19:07:14.460296Z","iopub.status.idle":"2021-07-13T19:07:14.631881Z"},"trusted":true,"id":"mdrjpK4XerNl"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["In the code cell below we calculate the effectiveness of the vaccine for the two groups that are 65 or older"],"metadata":{"id":"kqupG5u7erNn"}},{"cell_type":"code","source":["# Here we calculate the overall effectiveness of the vaccine for the two groups that are 65 or older\n","NV = 3239+805;\n","NU = 3255+812;\n","RV = 1/NV;\n","RU = (14+5)/NU;\n","VE = (RU - RV)/RU;\n","\n","print(paste(\"Overall VE is\", round(VE,4)) )\n","\n","Var.RV = RV*(1-RV)/NV\n","Var.RU = RU*(1-RU)/NU\n","\n","# As before, we use an approximate bootstrap to find the confidence intervals\n","# using Monte Carlo draws\n","\n","set.seed(1)\n","B = 10000\n"," RVs = RV + rnorm(B)*sqrt(Var.RV)+ 10^(-10)\n"," RUs = RU + rnorm(B)*sqrt(Var.RU)+ 10^(-10)\n"," VEs= (RUs - RVs)/RUs\n","\n","plot(density(VEs), col=2, main=\"Approximate Distribution of VE estimates\")\n","\n","CI.VE = quantile(VEs, c(.025, .975))\n","\n","print(paste(\"two-sided 95 % confidence interval is [\", CI.VE[1], \",\",\n"," CI.VE[2], \"]\" ))\n","\n","OneSidedCI.VE = quantile(VEs, c(.05))\n","\n","print(paste(\"one-sided 95 % confidence interval is [\", OneSidedCI.VE[1], \",\",\n"," 1, \"]\" 
))"],"metadata":{"execution":{"iopub.status.busy":"2021-07-13T19:07:14.636036Z","iopub.execute_input":"2021-07-13T19:07:14.637717Z","iopub.status.idle":"2021-07-13T19:07:14.764241Z"},"trusted":true,"id":"kOcfliFTerNo"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["Let's try the parametric boostrap next, using the fact that the outcome is Bernouli."],"metadata":{"id":"A8wEW0sQerNq"}},{"cell_type":"code","source":["NV = 3239+805;\n","NU = 3255+812;\n","RV = 1/NV;\n","RU = (14+5)/NU;\n","VE = (RU - RV)/RU;\n","\n","print(paste(\"Overall VE is\", VE) )\n","\n","set.seed(1)\n","B = 10000 #number of simulation draw\n"," RVs = rbinom(100000, size= NV, prob = RV)\n"," RUs = rbinom(100000, size= NU, prob = RU)\n"," VEs= (RUs - RVs)/RUs\n","\n","plot(density(VEs), col=2, main=\"Approximate Distribution of VE estimates\")\n","\n","CI.VE = quantile(VEs, c(.025, .975))\n","\n","print(paste(\"two-sided 95 % confidence interval is [\", CI.VE[1], \",\",\n"," CI.VE[2], \"]\" ))\n","\n","OneSidedCI.VE = quantile(VEs, c(.05))\n","\n","print(paste(\"one-sided 95 % confidence interval is [\", OneSidedCI.VE[1], \",\", 1, \"]\" ))"],"metadata":{"execution":{"iopub.status.busy":"2021-07-13T19:07:14.768208Z","iopub.execute_input":"2021-07-13T19:07:14.769906Z","iopub.status.idle":"2021-07-13T19:07:14.926769Z"},"trusted":true,"id":"3WTthWWeerNr"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Exact Binomial Test Inversion\n","\n","It is possible to use exact inference by inverting tests based on the exact binomial nature of the outcome variable. Here, we perform the Cornfield Procedure to find the exact confidence interval on the estimate of vaccine efficacy."],"metadata":{"id":"MjenLiQAerNw"}},{"cell_type":"code","source":["# Exact CI exploiting Bernoulli outcome using the Cornfield Procedure\n","NV = 19965;\n","NU = 20172;\n","RV = 9/NV;\n","RU = 169/NU;\n","VE = (RU - RV)/RU;\n","\n","# 1- Cornfieldexact.CI(9, NV, 169, NU, conf = 0.95, interval = c(1e-08, 1e+08))\n","1-riskscoreci(9,NV,169,NU,0.95)$conf.int[2]\n","1-riskscoreci(9,NV,169,NU,0.95)$conf.int[1]"],"metadata":{"execution":{"iopub.status.busy":"2021-07-13T19:07:14.930797Z","iopub.execute_input":"2021-07-13T19:07:14.932587Z","iopub.status.idle":"2021-07-13T19:07:30.725666Z"},"trusted":true,"id":"XwFMp7vLerNz"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["Note that this exactly recovers the result in the FDA table (first row)."],"metadata":{"id":"jQ64-QaxerN0"}},{"cell_type":"markdown","source":["Next we repeat the cornfield procedure to find the exact confidence interval on vaccine effectiveness for the two groups that are 65 or older. Here we see a big discrepancy between various asymptotic approaches and the exact finite-sample inference. 
This occurs because the binomial counts are too low for central limit theorems to work successfully."],"metadata":{"id":"iQ_lO8userN2"}},{"cell_type":"code","source":["# Exact CI exploiting Bernoulli outcome for the two groups that are 65 or older\n","NV = 3239+805;\n","NU = 3255+812;\n","RV = 1/NV;\n","RU = (14+5)/NU;\n","VE = (RU - RV)/RU;\n","\n","# 1- Cornfieldexact.CI(1, NV, 19, NU, conf = 0.95, interval = c(1e-08, 1e+08))\n","\n","1-riskscoreci(1,NV,19,NU,0.95)$conf.int[2]\n","1-riskscoreci(1,NV,19,NU,0.95)$conf.int[1]\n"],"metadata":{"execution":{"iopub.status.busy":"2021-07-13T19:07:30.727907Z","iopub.execute_input":"2021-07-13T19:07:30.72926Z","iopub.status.idle":"2021-07-13T19:07:30.759814Z"},"trusted":true,"id":"iP0ZCUw8erN3"},"execution_count":null,"outputs":[]}]} \ No newline at end of file +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Di8kfLCYerNJ" + }, + "source": [ + "This notebook contains some RCT examples for teaching." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fW54aax9mE2G" + }, + "outputs": [], + "source": [ + "install.packages(\"PropCIs\") # Exact CI exploiting Bernoulli outcome using the Cornfield Procedure\n", + "library(PropCIs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Id8Df7PUerNX" + }, + "source": [ + " \n", + "# Polio RCT\n", + "\n", + "One of the earliest randomized experiments were the Polio vaccination trials conducted by the Public Health Service in 1954. The question was whether Salk vaccine prevented polio. Children in the study were randomly assigned either a treatment (polio vaccine shot) or a placebo (saline solution shot), without knowing which one they received. The doctors in the study, making the diagnosis, did not know whether a child received a vaccine or not. In other words, the trial was a double-blind, randomized controlled trial. The trial had to be large, because the rate at which Polio occured in the population was 50 per 100,000. The treatment group saw 33 polio cases per 200,745; the control group saw 115 cases per 201,229. The estimated average treatment effect is about\n", + "$$\n", + "-40\n", + "$$\n", + "with the 95% confidence band (based on approximate normality of the two sample means and their differences):\n", + "$$[-52, -28].$$\n", + "As this is an RCT, the confidence band suggests that the Polio vaccine **caused** the reduction in the risk of polio.\n", + "\n", + "The interesting thing here is that we don't need the underlying individual data to evaluate the effectivess of the vaccine. This is because the outcomes are Bernoulli random variables, and we have enough information to compute the estimate of ATE as well as the confidence intervals from the group case counts.\n", + "\n", + "We also compute the Vaccine Efficacy metric, which refers to the following measure according to the [CDC](https://www.cdc.gov/csels/dsepd/ss1978/lesson3/section6.html):\n", + "$$\n", + "VE = \\frac{\\text{Risk for Unvaccinated - Risk for Vaccinated}}{\\text{Risk for Unvaccinated}}.\n", + "$$\n", + "It describes the relative reduction in risk caused by vaccination.\n", + "\n", + "\n", + "It is staightforward to get the VE estimate by just plugging-in the numbers, but how do we get the approximate variance estimate? 
I am too lazy to do calculations for the delta method, so I will just use a simulation (a form of approximate bootstrap) to obtain the confidence intervals.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", + "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", + "id": "SE8nvAWberNc" + }, + "outputs": [], + "source": [ + "NV = 200745 # number of vaccinated (treated)\n", + "NU = 201229 # number of unvaccinated (control)\n", + "RV= 33/NV # average outcome for vaccinated\n", + "RU =115/NU # average outcome for unvaccinated\n", + "VE = (RU - RV)/RU; # vaccine efficacy\n", + "\n", + "# incidence per 100000\n", + "Incidence.RV=RV*100000\n", + "Incidence.RU=RU*100000\n", + "\n", + "print(paste(\"Incidence per 100000 among treated:\", round(Incidence.RV,4)))\n", + "\n", + "print(paste(\"Incidence per 100000 among controlled:\", round(Incidence.RU,4)))\n", + "\n", + "# treatment effect - estimated reduction in incidence per 100000 people\n", + "delta.hat = 100000*(RV-RU)\n", + "\n", + "print(paste(\"Estimated ATE of occurances per 100,000 is\", round(delta.hat,4)))\n", + "\n", + "# variance, standard deviation and confidence interval of ATE using that outcomes are Bernoulli\n", + "Var.RV = RV*(1-RV)/NV\n", + "Var.RU = RU*(1-RU)/NU\n", + "Var.delta.hat = 100000^2*(Var.RV + Var.RU)\n", + "Std.delta.hat = sqrt(Var.delta.hat)\n", + "\n", + "print(paste(\"Standard deviation for ATE is\", round(Std.delta.hat,4)))\n", + "\n", + "CI.delta = c(delta.hat -1.96*sqrt(Var.delta.hat),\n", + " delta.hat +1.96*sqrt(Var.delta.hat))\n", + "\n", + "print(paste(\"95% confidence interval of ATE is [\", round(CI.delta[1],4), \",\",\n", + " round(CI.delta[2],4), \"]\" ))\n", + "\n", + "print(paste(\"Overall VE is\", round(VE,4) ))\n", + "\n", + "# we use an approximate bootstrap to find the confidence interval of vaccine efficacy\n", + "# via Monte Carlo draws\n", + "set.seed(1)\n", + "B = 10000 # number of bootstraps\n", + "RVs = RV + rnorm(B)*sqrt(Var.RV)\n", + "RUs = RU + rnorm(B)*sqrt(Var.RU)\n", + "VEs= (RUs - RVs)/RUs\n", + "\n", + "plot(density(VEs), col=2, main=\"Approximate Distribution of VE estimates\")\n", + "\n", + "CI.VE = quantile(VEs, c(.025, .975)) # use the empirical quantiles from the bootstraps\n", + "\n", + "print(paste(\"95% confidence interval of VE is [\", round(CI.VE[1],4), \",\",\n", + " round(CI.VE[2],4), \"]\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ahcRWgyEerNk" + }, + "source": [ + "# Pfizer/BNTX Covid-19 RCT\n", + "\n", + "Here is a link to the FDA [briefing](https://www.fda.gov/media/144245/download) and an interesting [discussion](\n", + "https://garycornell.com/2020/12/09/statistics-in-the-pfizer-data-how-good-is-the-vaccine/?fbclid=IwAR282lS0Vl3tWmicQDDhIJAQCMO8NIsCXyWbUWwTtPuKcnuJ2v0VWXRDQac), as well as data.\n", + "\n", + "Pfizer/BNTX was the first vaccine approved for emergency use to reduce the risk of Covid-19 decease. In studies to assess vaccine efficacy, volunteers were randomly assigned to receive either a treatment (2-dose vaccination) or a placebo, without knowing which they received. The doctors making the diagnoses did not know now whether a given volunteer received a vaccination or not. 
The results of the study are given in the following table.\n", + "\n", + "![](https://lh6.googleusercontent.com/oiO6gYom1UZyrOhgpFx2iq8ike979u3805JHiVygP-Efh1Yaz2ttyPcgWKlT1AqHDM4v46th3EPIkOvRLyXA0fNUloPL-mL9eOFmSAzfbNOHyCZSQ0DyzMhcFUtQuZ520R5Qd2lj):\n", + "\n", + "Here we see both the overall effects and the effects by age group. The confidence intervals for the overall ATE are tight and suggest high effectiveness of the vaccine. The confidence intervals for the age groups 65-74 and 75+ are much wider due to the relatively small number of people in these groups. We could group 65-74 and >75 groups to evaluate the effectiveness of the vaccine for this broader age group and narrow the width of the confidence band.\n", + "\n", + "We use the same approach as that for the Polio example. This gives slightly different results than the FDA result, because the FDA used inversion of exact binomial tests to construct confidence intervals. We use asymptotic approches based on approximate normality, which is more crude, but delivers a rather similar result.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", + "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a", + "id": "mdrjpK4XerNl" + }, + "outputs": [], + "source": [ + "NV = 19965; # number vaccinated\n", + "NU = 20172; # number unvaccinated\n", + "RV = 9/NV; # average outcome for vaccinated\n", + "RU = 169/NU; # average outcome for unvaccinated\n", + "VE = (RU - RV)/RU; # vaccine efficacy\n", + "\n", + "# incidence per 100000\n", + "Incidence.RV=RV*100000\n", + "Incidence.RU=RU*100000\n", + "\n", + "print(paste(\"Incidence per 100000 among vaccinated:\", round(Incidence.RV,4)))\n", + "\n", + "print(paste(\"Incidence per 100000 among unvaccinated:\", round(Incidence.RU,4)))\n", + "\n", + "# treatment effect - estimated reduction in incidence per 100000 people\n", + "delta.hat = 100000*(RV-RU)\n", + "\n", + "print(paste(\"Estimated ATE of occurances per 100,000 is\", round(delta.hat,4)))\n", + "\n", + "# variance, standard deviation and confidence interval of ATE using that outcomes are Bernoulli\n", + "Var.RV = RV*(1-RV)/NV\n", + "Var.RU = RU*(1-RU)/NU\n", + "Var.delta.hat = 100000^2*(Var.RV + Var.RU)\n", + "Std.delta.hat = sqrt(Var.delta.hat)\n", + "\n", + "print(paste(\"Standard deviation for ATE is\", round(Std.delta.hat,4)))\n", + "\n", + "CI.delta = c(delta.hat -1.96*sqrt(Var.delta.hat),\n", + " delta.hat +1.96*sqrt(Var.delta.hat))\n", + "\n", + "print(paste(\"95% confidence interval of ATE is [\", round(CI.delta[1],4), \",\",\n", + " round(CI.delta[2],4), \"]\" ))\n", + "\n", + "print(paste(\"Overall VE is\", round(VE,4) ))\n", + "\n", + "# we use an approximate bootstrap to find the VE confidence interval\n", + "# using Monte Carlo draws as before\n", + "set.seed(1)\n", + "B = 10000\n", + "RVs = RV + rnorm(B)*sqrt(Var.RV)\n", + "RUs = RU + rnorm(B)*sqrt(Var.RU)\n", + "VEs= (RUs - RVs)/RUs\n", + "\n", + "plot(density(VEs), col=2, main=\"Approximate Distribution of VE estimates\")\n", + "\n", + "CI.VE = quantile(VEs, c(.025, .975))\n", + "\n", + "print(paste(\"95% confidence interval of VE is [\", round(CI.VE[1],4), \",\",\n", + " round(CI.VE[2],4), \"]\" ))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kqupG5u7erNn" + }, + "source": [ + "In the code cell below we calculate the effectiveness of the vaccine for the two groups that are 65 or older" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": 
"kOcfliFTerNo" + }, + "outputs": [], + "source": [ + "# Here we calculate the overall effectiveness of the vaccine for the two groups that are 65 or older\n", + "NV = 3239+805;\n", + "NU = 3255+812;\n", + "RV = 1/NV;\n", + "RU = (14+5)/NU;\n", + "VE = (RU - RV)/RU;\n", + "\n", + "print(paste(\"Overall VE is\", round(VE,4)) )\n", + "\n", + "Var.RV = RV*(1-RV)/NV\n", + "Var.RU = RU*(1-RU)/NU\n", + "\n", + "# As before, we use an approximate bootstrap to find the confidence intervals\n", + "# using Monte Carlo draws\n", + "\n", + "set.seed(1)\n", + "B = 10000\n", + " RVs = RV + rnorm(B)*sqrt(Var.RV)+ 10^(-10)\n", + " RUs = RU + rnorm(B)*sqrt(Var.RU)+ 10^(-10)\n", + " VEs= (RUs - RVs)/RUs\n", + "\n", + "plot(density(VEs), col=2, main=\"Approximate Distribution of VE estimates\")\n", + "\n", + "CI.VE = quantile(VEs, c(.025, .975))\n", + "\n", + "print(paste(\"two-sided 95 % confidence interval is [\", CI.VE[1], \",\",\n", + " CI.VE[2], \"]\" ))\n", + "\n", + "OneSidedCI.VE = quantile(VEs, c(.05))\n", + "\n", + "print(paste(\"one-sided 95 % confidence interval is [\", OneSidedCI.VE[1], \",\",\n", + " 1, \"]\" ))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A8wEW0sQerNq" + }, + "source": [ + "Let's try the parametric boostrap next, using the fact that the outcome is Bernouli." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3WTthWWeerNr" + }, + "outputs": [], + "source": [ + "NV = 3239+805;\n", + "NU = 3255+812;\n", + "RV = 1/NV;\n", + "RU = (14+5)/NU;\n", + "VE = (RU - RV)/RU;\n", + "\n", + "print(paste(\"Overall VE is\", VE) )\n", + "\n", + "set.seed(1)\n", + "B = 10000 #number of simulation draw\n", + " RVs = rbinom(100000, size= NV, prob = RV)\n", + " RUs = rbinom(100000, size= NU, prob = RU)\n", + " VEs= (RUs - RVs)/RUs\n", + "\n", + "plot(density(VEs), col=2, main=\"Approximate Distribution of VE estimates\")\n", + "\n", + "CI.VE = quantile(VEs, c(.025, .975))\n", + "\n", + "print(paste(\"two-sided 95 % confidence interval is [\", CI.VE[1], \",\",\n", + " CI.VE[2], \"]\" ))\n", + "\n", + "OneSidedCI.VE = quantile(VEs, c(.05))\n", + "\n", + "print(paste(\"one-sided 95 % confidence interval is [\", OneSidedCI.VE[1], \",\", 1, \"]\" ))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MjenLiQAerNw" + }, + "source": [ + "# Exact Binomial Test Inversion\n", + "\n", + "It is possible to use exact inference by inverting tests based on the exact binomial nature of the outcome variable. Here, we perform the Cornfield Procedure to find the exact confidence interval on the estimate of vaccine efficacy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XwFMp7vLerNz" + }, + "outputs": [], + "source": [ + "# Exact CI exploiting Bernoulli outcome using the Cornfield Procedure\n", + "NV = 19965;\n", + "NU = 20172;\n", + "RV = 9/NV;\n", + "RU = 169/NU;\n", + "VE = (RU - RV)/RU;\n", + "\n", + "# 1- Cornfieldexact.CI(9, NV, 169, NU, conf = 0.95, interval = c(1e-08, 1e+08))\n", + "1-riskscoreci(9,NV,169,NU,0.95)$conf.int[2]\n", + "1-riskscoreci(9,NV,169,NU,0.95)$conf.int[1]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jQ64-QaxerN0" + }, + "source": [ + "Note that this exactly recovers the result in the FDA table (first row)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iQ_lO8userN2" + }, + "source": [ + "Next we repeat the cornfield procedure to find the exact confidence interval on vaccine effectiveness for the two groups that are 65 or older. 
Here we see a big discrepancy between various asymptotic approaches and the exact finite-sample inference. This occurs because the binomial counts are too low for central limit theorems to work successfully."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "iP0ZCUw8erN3"
+   },
+   "outputs": [],
+   "source": [
+    "# Exact CI exploiting Bernoulli outcome for the two groups that are 65 or older\n",
+    "NV = 3239+805;\n",
+    "NU = 3255+812;\n",
+    "RV = 1/NV;\n",
+    "RU = (14+5)/NU;\n",
+    "VE = (RU - RV)/RU;\n",
+    "\n",
+    "# 1- Cornfieldexact.CI(1, NV, 19, NU, conf = 0.95, interval = c(1e-08, 1e+08))\n",
+    "\n",
+    "1-riskscoreci(1,NV,19,NU,0.95)$conf.int[2]\n",
+    "1-riskscoreci(1,NV,19,NU,0.95)$conf.int[1]\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "R",
+   "language": "R",
+   "name": "ir"
+  },
+  "language_info": {
+   "codemirror_mode": "r",
+   "file_extension": ".r",
+   "mimetype": "text/x-r-source",
+   "name": "R",
+   "pygments_lexer": "r",
+   "version": "4.0.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/CM1/r-sim-precision-adj.Rmd b/CM1/r-sim-precision-adj.Rmd
new file mode 100644
index 00000000..8e4c1ce9
--- /dev/null
+++ b/CM1/r-sim-precision-adj.Rmd
@@ -0,0 +1,114 @@
+---
+title: An R Markdown document converted from "CM1/r-sim-precision-adj.irnb"
+output: html_document
+---
+
+# Analyzing RCT with Precision by Adjusting for Baseline Covariates
+
+```{r}
+install.packages("sandwich")
+install.packages("lmtest")
+library(sandwich) # heteroskedasticity robust standard errors
+library(lmtest) # coefficient testing
+```
+
+# Jonathan Roth's DGP
+
+Here we set up a DGP with heterogeneous effects. In this example, which is due to Jonathan Roth, we have
+$$
+E [Y(0) | Z] = - Z, \quad E [Y(1) |Z] = Z, \quad Z \sim N(0,1).
+$$
+The CATE is
+$$
+E [Y(1) - Y(0) | Z ]= 2 Z.
+$$
+and the ATE is
+$$
+2 E Z = 0.
+$$
+
+We would like to estimate the ATE as precisely as possible.
+
+An economic motivation for this example could be provided as follows: Let D be the treatment of going to college, and let $Z$ be academic skills. Suppose that academic skills cause lower earnings Y(0) in jobs that don't require a college degree, and cause higher earnings Y(1) in jobs that do require college degrees. This type of scenario is reflected in the DGP set-up above.
+
+
+```{r}
+# generate the simulated dataset
+set.seed(123) # set MC seed
+n = 1000 # sample size
+Z = rnorm(n) # generate Z
+Y0 = -Z + rnorm(n) # conditional average baseline response is -Z
+Y1 = Z + rnorm(n) # conditional average treatment effect is +Z
+D = (runif(n)<.2) # treatment indicator; only 20% get treated
+Y = Y1*D + Y0*(1-D) # observed Y
+D = D - mean(D) # demean D
+Z = Z-mean(Z) # demean Z
+```
+
+# Analyze the RCT data with Precision Adjustment
+
+Consider the following regression models:
+
+* classical 2-sample approach, no adjustment (CL)
+* classical linear regression adjustment (CRA)
+* interactive regression adjustment (IRA)
+
+We carry out inference using heteroskedasticity robust inference, using the sandwich formulas for variance (Eicker-Huber-White).
+
+We observe that the CRA delivers estimates that are less efficient than the CL (pointed out by Freedman), whereas the IRA delivers a more efficient approach (pointed out by Lin). In order for the CRA to be more efficient than the CL, we need the linear model to be a correct model of the conditional expectation function of Y given D and X, which is not the case here. 
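+
+As a quick sanity check, the CL estimate below should coincide with the simple two-sample difference in means, since regressing Y on a (demeaned) binary D recovers the difference in group means; with D demeaned, the treated units are exactly those with D > 0:
+
+```{r}
+# two-sample difference in means (D was demeaned above, so D > 0 flags the treated units)
+mean(Y[D > 0]) - mean(Y[D <= 0])
+```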
+ +```{r} +# implement each of the models on the simulated data +CL = lm(Y ~ D) +CRA = lm(Y ~ D+ Z) #classical +IRA = lm(Y ~ D+ Z+ Z*D) #interactive approach + +# we are interested in the coefficients on variable "D". +coeftest(CL, vcov = vcovHC(CL, type="HC1")) +coeftest(CRA, vcov = vcovHC(CRA, type="HC1")) +coeftest(IRA, vcov = vcovHC(IRA, type="HC1")) +``` + +# Using classical standard errors (non-robust) is misleading here. + +We don't teach non-robust standard errors in econometrics courses, but the default statistical inference for lm() procedure in R, summary.lm(), still uses 100-year old concepts, perhaps in part due to historical legacy. + +Here the non-robust standard errors suggest that there is not much difference between the different approaches, contrary to the conclusions reached using the robust standard errors. + +```{r} +summary(CL) +summary(CRA) +summary(IRA) +``` + +# Verify Asymptotic Approximations Hold in Finite-Sample Simulation Experiment + +```{r} +set.seed(123) +n = 1000 +B= 1000 + +CLs = rep(0, B) +CRAs = rep(0, B) +IRAs = rep(0, B) + +for ( i in 1:B){ + Z = rnorm(n) + Y0 = -Z + rnorm(n) + Y1 = Z + rnorm(n) + Z = Z - mean(Z) + D = (runif(n)<.1) + D = D- mean(D) + Y = Y1*D + Y0*(1-D) + CLs[i]= lm(Y ~ D)$coef[2] + CRAs[i] = lm(Y ~ D+ Z)$coef[2] + IRAs[i] = lm(Y ~ D+ Z+ Z*D)$coef[2] + } + +print("Standard deviations for estimators") + +sqrt(mean(CLs^2)) +sqrt(mean(CRAs^2)) +sqrt(mean(IRAs^2)) +``` + diff --git a/CM1/r-sim-precision-adj.irnb b/CM1/r-sim-precision-adj.irnb index eee38ff6..2f50dd80 100644 --- a/CM1/r-sim-precision-adj.irnb +++ b/CM1/r-sim-precision-adj.irnb @@ -1 +1,214 @@ -{"metadata":{"kernelspec":{"name":"ir","display_name":"R","language":"R"},"language_info":{"name":"R","codemirror_mode":"r","pygments_lexer":"r","mimetype":"text/x-r-source","file_extension":".r","version":"3.6.3"},"colab":{"provenance":[]}},"nbformat_minor":0,"nbformat":4,"cells":[{"cell_type":"markdown","source":["# Analyzing RCT with Precision by Adjusting for Baseline Covariates"],"metadata":{"id":"w8J4Q7djd8zU"}},{"cell_type":"code","source":["install.packages(\"sandwich\")\n","install.packages(\"lmtest\")\n","library(sandwich) # heterokedasticity robust standard errors\n","library(lmtest) # coefficient testing"],"metadata":{"id":"VzLzwihLjfEJ"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Jonathan Roth's DGP\n","\n","Here we set up a DGP with heterogenous effects. In this example, which is due to Jonathan Roth, we have\n","$$\n","E [Y(0) | Z] = - Z, \\quad E [Y(1) |Z] = Z, \\quad Z \\sim N(0,1).\n","$$\n","The CATE is\n","$$\n","E [Y(1) - Y(0) | Z ]= 2 Z.\n","$$\n","and the ATE is\n","$$\n","2 E Z = 0.\n","$$\n","\n","We would like to estimate the ATE as precisely as possible.\n","\n","An economic motivation for this example could be provided as follows: Let D be the treatment of going to college, and let $Z$ be academic skills. Suppose that academic skills cause lower earnings Y(0) in jobs that don't require a college degree, and cause higher earnings Y(1) in jobs that do require college degrees. 
This type of scenario is reflected in the DGP set-up above.\n","\n"],"metadata":{"id":"XxvsV64Dd8zg"}},{"cell_type":"code","source":["# generate the simulated dataset\n","set.seed(123) # set MC seed\n","n = 1000 # sample size\n","Z = rnorm(n) # generate Z\n","Y0 = -Z + rnorm(n) # conditional average baseline response is -Z\n","Y1 = Z + rnorm(n) # conditional average treatment effect is +Z\n","D = (runif(n)<.2) # treatment indicator; only 20% get treated\n","Y = Y1*D + Y0*(1-D) # observed Y\n","D = D - mean(D) # demean D\n","Z = Z-mean(Z) # demean Z"],"metadata":{"execution":{"iopub.status.busy":"2021-07-13T19:12:27.196748Z","iopub.execute_input":"2021-07-13T19:12:27.199147Z","iopub.status.idle":"2021-07-13T19:12:27.227259Z"},"trusted":true,"id":"GXWON7gHd8zj"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Analyze the RCT data with Precision Adjustment\n","\n","Consider the follow regression models:\n","\n","* classical 2-sample approach, no adjustment (CL)\n","* classical linear regression adjustment (CRA)\n","* interactive regression adjusment (IRA)\n","\n","We carry out inference using heteroskedasticity robust inference, using the sandwich formulas for variance (Eicker-Huber-White). \n","\n","We observe that the CRA delivers estimates that are less efficient than the CL (pointed out by Freedman), whereas the IRA delivers a more efficient approach (pointed out by Lin). In order for the CRA to be more efficient than the CL, we need the linear model to be a correct model of the conditional expectation function of Y given D and X, which is not the case here."],"metadata":{"id":"cdTsTrJxd8zr"}},{"cell_type":"code","source":["# implement each of the models on the simulated data\n","CL = lm(Y ~ D)\n","CRA = lm(Y ~ D+ Z) #classical\n","IRA = lm(Y ~ D+ Z+ Z*D) #interactive approach\n","\n","# we are interested in the coefficients on variable \"D\".\n","coeftest(CL, vcov = vcovHC(CL, type=\"HC1\"))\n","coeftest(CRA, vcov = vcovHC(CRA, type=\"HC1\"))\n","coeftest(IRA, vcov = vcovHC(IRA, type=\"HC1\"))"],"metadata":{"execution":{"iopub.status.busy":"2021-07-13T19:12:27.229611Z","iopub.execute_input":"2021-07-13T19:12:27.231383Z","iopub.status.idle":"2021-07-13T19:12:27.283914Z"},"trusted":true,"id":"mGfqEgHLd8zs"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Using classical standard errors (non-robust) is misleading here.\n","\n","We don't teach non-robust standard errors in econometrics courses, but the default statistical inference for lm() procedure in R, summary.lm(), still uses 100-year old concepts, perhaps in part due to historical legacy. 
\n","\n","Here the non-robust standard errors suggest that there is not much difference between the different approaches, contrary to the conclusions reached using the robust standard errors.\n"],"metadata":{"id":"Fczm-t8ld8zu"}},{"cell_type":"code","source":["summary(CL)\n","summary(CRA)\n","summary(IRA)"],"metadata":{"execution":{"iopub.status.busy":"2021-07-13T19:12:27.286308Z","iopub.execute_input":"2021-07-13T19:12:27.288083Z","iopub.status.idle":"2021-07-13T19:12:27.31659Z"},"trusted":true,"id":"rOTRDgBld8zw"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Verify Asymptotic Approximations Hold in Finite-Sample Simulation Experiment"],"metadata":{"id":"-6zymi0Md8z0"}},{"cell_type":"code","source":["set.seed(123)\n","n = 1000\n","B= 1000\n","\n","CLs = rep(0, B)\n","CRAs = rep(0, B)\n","IRAs = rep(0, B)\n","\n","for ( i in 1:B){\n"," Z = rnorm(n)\n"," Y0 = -Z + rnorm(n)\n"," Y1 = Z + rnorm(n)\n"," Z = Z - mean(Z)\n"," D = (runif(n)<.1)\n"," D = D- mean(D)\n"," Y = Y1*D + Y0*(1-D)\n"," CLs[i]= lm(Y ~ D)$coef[2]\n"," CRAs[i] = lm(Y ~ D+ Z)$coef[2]\n"," IRAs[i] = lm(Y ~ D+ Z+ Z*D)$coef[2]\n"," }\n","\n","print(\"Standard deviations for estimators\")\n","\n","sqrt(mean(CLs^2))\n","sqrt(mean(CRAs^2))\n","sqrt(mean(IRAs^2))"],"metadata":{"_uuid":"051d70d956493feee0c6d64651c6a088724dca2a","_execution_state":"idle","execution":{"iopub.status.busy":"2021-07-13T19:12:27.319053Z","iopub.execute_input":"2021-07-13T19:12:27.320929Z","iopub.status.idle":"2021-07-13T19:12:30.337708Z"},"trusted":true,"id":"bmtL0a9Nd8z2"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":[],"metadata":{"id":"NTFe4jwBlVNo"},"execution_count":null,"outputs":[]}]} \ No newline at end of file +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "w8J4Q7djd8zU" + }, + "source": [ + "# Analyzing RCT with Precision by Adjusting for Baseline Covariates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VzLzwihLjfEJ" + }, + "outputs": [], + "source": [ + "install.packages(\"sandwich\")\n", + "install.packages(\"lmtest\")\n", + "library(sandwich) # heterokedasticity robust standard errors\n", + "library(lmtest) # coefficient testing" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XxvsV64Dd8zg" + }, + "source": [ + "# Jonathan Roth's DGP\n", + "\n", + "Here we set up a DGP with heterogenous effects. In this example, which is due to Jonathan Roth, we have\n", + "$$\n", + "E [Y(0) | Z] = - Z, \\quad E [Y(1) |Z] = Z, \\quad Z \\sim N(0,1).\n", + "$$\n", + "The CATE is\n", + "$$\n", + "E [Y(1) - Y(0) | Z ]= 2 Z.\n", + "$$\n", + "and the ATE is\n", + "$$\n", + "2 E Z = 0.\n", + "$$\n", + "\n", + "We would like to estimate the ATE as precisely as possible.\n", + "\n", + "An economic motivation for this example could be provided as follows: Let D be the treatment of going to college, and let $Z$ be academic skills. Suppose that academic skills cause lower earnings Y(0) in jobs that don't require a college degree, and cause higher earnings Y(1) in jobs that do require college degrees. 
This type of scenario is reflected in the DGP set-up above.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GXWON7gHd8zj" + }, + "outputs": [], + "source": [ + "# generate the simulated dataset\n", + "set.seed(123) # set MC seed\n", + "n = 1000 # sample size\n", + "Z = rnorm(n) # generate Z\n", + "Y0 = -Z + rnorm(n) # conditional average baseline response is -Z\n", + "Y1 = Z + rnorm(n) # conditional average treatment effect is +Z\n", + "D = (runif(n)<.2) # treatment indicator; only 20% get treated\n", + "Y = Y1*D + Y0*(1-D) # observed Y\n", + "D = D - mean(D) # demean D\n", + "Z = Z-mean(Z) # demean Z" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cdTsTrJxd8zr" + }, + "source": [ + "# Analyze the RCT data with Precision Adjustment\n", + "\n", + "Consider the follow regression models:\n", + "\n", + "* classical 2-sample approach, no adjustment (CL)\n", + "* classical linear regression adjustment (CRA)\n", + "* interactive regression adjusment (IRA)\n", + "\n", + "We carry out inference using heteroskedasticity robust inference, using the sandwich formulas for variance (Eicker-Huber-White). \n", + "\n", + "We observe that the CRA delivers estimates that are less efficient than the CL (pointed out by Freedman), whereas the IRA delivers a more efficient approach (pointed out by Lin). In order for the CRA to be more efficient than the CL, we need the linear model to be a correct model of the conditional expectation function of Y given D and X, which is not the case here." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mGfqEgHLd8zs" + }, + "outputs": [], + "source": [ + "# implement each of the models on the simulated data\n", + "CL = lm(Y ~ D)\n", + "CRA = lm(Y ~ D+ Z) #classical\n", + "IRA = lm(Y ~ D+ Z+ Z*D) #interactive approach\n", + "\n", + "# we are interested in the coefficients on variable \"D\".\n", + "coeftest(CL, vcov = vcovHC(CL, type=\"HC1\"))\n", + "coeftest(CRA, vcov = vcovHC(CRA, type=\"HC1\"))\n", + "coeftest(IRA, vcov = vcovHC(IRA, type=\"HC1\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Fczm-t8ld8zu" + }, + "source": [ + "# Using classical standard errors (non-robust) is misleading here.\n", + "\n", + "We don't teach non-robust standard errors in econometrics courses, but the default statistical inference for lm() procedure in R, summary.lm(), still uses 100-year old concepts, perhaps in part due to historical legacy. 
\n", + "\n", + "Here the non-robust standard errors suggest that there is not much difference between the different approaches, contrary to the conclusions reached using the robust standard errors.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rOTRDgBld8zw" + }, + "outputs": [], + "source": [ + "summary(CL)\n", + "summary(CRA)\n", + "summary(IRA)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-6zymi0Md8z0" + }, + "source": [ + "# Verify Asymptotic Approximations Hold in Finite-Sample Simulation Experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "_execution_state": "idle", + "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", + "id": "bmtL0a9Nd8z2" + }, + "outputs": [], + "source": [ + "set.seed(123)\n", + "n = 1000\n", + "B= 1000\n", + "\n", + "CLs = rep(0, B)\n", + "CRAs = rep(0, B)\n", + "IRAs = rep(0, B)\n", + "\n", + "for ( i in 1:B){\n", + " Z = rnorm(n)\n", + " Y0 = -Z + rnorm(n)\n", + " Y1 = Z + rnorm(n)\n", + " Z = Z - mean(Z)\n", + " D = (runif(n)<.1)\n", + " D = D- mean(D)\n", + " Y = Y1*D + Y0*(1-D)\n", + " CLs[i]= lm(Y ~ D)$coef[2]\n", + " CRAs[i] = lm(Y ~ D+ Z)$coef[2]\n", + " IRAs[i] = lm(Y ~ D+ Z+ Z*D)$coef[2]\n", + " }\n", + "\n", + "print(\"Standard deviations for estimators\")\n", + "\n", + "sqrt(mean(CLs^2))\n", + "sqrt(mean(CRAs^2))\n", + "sqrt(mean(IRAs^2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NTFe4jwBlVNo" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 4ba388a32c2ae0aebb0e681d68968725c7b5e06b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 15 Jul 2024 13:01:18 +0000 Subject: [PATCH 067/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in PM5 --- PM5/Autoencoders.Rmd | 250 ++ PM5/Autoencoders.irnb | 798 ++-- ...leML_and_Feature_Engineering_with_BERT.Rmd | 873 +++++ ...eML_and_Feature_Engineering_with_BERT.irnb | 3415 +++++++---------- 4 files changed, 2977 insertions(+), 2359 deletions(-) create mode 100644 PM5/Autoencoders.Rmd create mode 100644 PM5/DoubleML_and_Feature_Engineering_with_BERT.Rmd diff --git a/PM5/Autoencoders.Rmd b/PM5/Autoencoders.Rmd new file mode 100644 index 00000000..6b6b7bc3 --- /dev/null +++ b/PM5/Autoencoders.Rmd @@ -0,0 +1,250 @@ +--- +title: An R Markdown document converted from "PM5/Autoencoders.irnb" +output: html_document +--- + +# Autoencoders + +In this notebook, we'll introduce and explore "autoencoders," which are a very successful family of models in modern deep learning. In particular we will: + + +1. Illustrate the connection between autoencoders and classical *Principal Component Analysis (PCA)* +3. Train a non-linear auto-encoder that uses a deep neural network + +### Overview +As explained in the text, autoencoders are a way of discovering *latent, low-dimensional structure* in a dataset. 
In particular, a random data vector $X \in \mathbb{R}^d$ can be said to have low-dimensional structure if we can find some functions $f: \mathbb{R}^d \to \mathbb{R}^k$ and $g: \mathbb{R}^k \to \mathbb{R}^d$, with $k \ll d$, such that $$g(f(X)) \approx X.$$
+In other words, $f(X)$ is a parsimonious, $k$-dimensional representation of $X$ that contains all of the information necessary to approximately reconstruct the full vector $X$. Traditionally, $f(X)$ is called an *encoding* of $X$.
+
+It turns out that this is meaningless unless we restrict what kinds of functions $f$ and $g$ are allowed to be, because it's possible to write down some (completely ugly) one-to-one function $\mathbb{R}^d \to \mathbb{R}^1$ for any $d$. This gives rise to the notion of *autoencoders* where, given some sets of reasonable functions $F$ and $G$, we aim to minimize
+$$\mathbb{E}[\mathrm{loss}(X, g(f(X)))]$$ over functions $f \in F$ and $g \in G$. As usual, this is done by minimizing the sample analog.
+
+
+
+## Linear Autoencoders and PCA: Practice
+
+It turns out that linear autoencoders are the same as PCA. Let's do a small sanity check to verify this. In particular, let's perform PCA two ways: first using a standard (linear algebra) toolkit, and second as a linear autoencoder using a neural network library.
+If all goes well, they should give you the same reconstructions!
+
+To make it a bit more fun, we will use the [*Labeled Faces in the Wild*](https://www.kaggle.com/jessicali9530/celeba-dataset) dataset which consists of standardized images of roughly 5,000 celebrities' faces. In this data, PCA amounts to looking for a small number of "proto-faces" such that a linear combination of them can accurately reconstruct any celebrity's face.
+
+```{r}
+install.packages("keras")
+```
+
+```{r}
+install.packages("reticulate")
+install.packages("abind")
+install.packages("grid")
+install.packages("gridExtra")
+install.packages("dplyr")
+install.packages("purrr")
+install.packages("reshape2")
+```
+
+```{r}
+library(reticulate)
+
+# Import Python's sklearn.datasets
+sklearn <- import("sklearn.datasets")
+
+# Fetch the dataset
+faces <- sklearn$fetch_lfw_people()
+
+# Access the images and reshape the data similar to Python's reshape method
+n_examples <- dim(faces$images)[1]
+height <- dim(faces$images)[2]
+width <- dim(faces$images)[3]
+design_matrix <- array_reshape(faces$images, c(n_examples, height * width))
+
+n_features <- dim(design_matrix)[2]
+
+# Print the dataset details
+cat(sprintf("Labeled Faces in the Wild Dataset:\n Number of examples: %d\n Number of features: %d\n Image height: %d\n Image width: %d",
+            n_examples, n_features, height, width))
+```
+
+```{r}
+library(ggplot2)
+library(gridExtra)
+library(grid)
+
+# Find indices where the label is 'Arnold Schwarzenegger'
+# faces$target uses python style indexing that starts at 0 rather than R style
+# indexing that starts at 1, so we subtract 1 so the indexing lines up
+arnold_labels <- which(faces$target_names == "Arnold Schwarzenegger")-1
+# Get indices of all images corresponding to Arnold
+arnold_pics <- which(faces$target %in% arnold_labels)
+
+plot_faces <- function(images, n_row=2, n_col=3, width, height) {
+  par(mfrow=c(n_row, n_col), mar=c(0.5, 0.5, 0.5, 0.5))
+  for (i in seq_len(n_row * n_col)) {
+    if (i <= length(images)) {
+      # image needs to be transposed and then flipped for correct orientation
+      # using R "image"
+      tmp <- t(images[[i]])
+      tmp <- tmp[,ncol(tmp):1]
+      image(tmp, col=gray.colors(256), axes=FALSE, xlab="", ylab="")
+    }
+  }
+} + +# Ensure arnold_images contains the right amount of data and is not NULL +arnold_images <- lapply(arnold_pics[1:min(6, length(arnold_pics))], function(idx) { + faces$images[idx, , ] +}) + +plot_faces(arnold_images, n_row = 2, n_col = 3, height = 62, width = 47) +``` + +```{r} +library(stats) + +# Perform PCA on the design matrix +pca <- prcomp(design_matrix, rank. = 128, retx = TRUE, center = TRUE, scale. = FALSE) + +# Extract the principal components (eigenfaces) +eigenfaces <- pca$rotation +``` + +```{r} +# 2. Plot the first 6 "eigenfaces," the six images whose linear span best explains the variation in our dataset +pca_images <- lapply(1:6, function(idx) { + array_reshape(eigenfaces[,idx], c(height,width)) +}) + +plot_faces(pca_images, height = height, width = width) +# we check the first six eigenvectors/projection axes, reshaped (the eigenvectors that captured the highest variation in our dataset of images) +# here, eigenvector1 orthog to eigenvector2 and all the others => decorrelation (there's no way to reconstruct eigenvector1 using a linear combination of all the other eigenvectors) +``` + +```{r} +reconstruct <- function(image_vector, n_components, eigenfaces) { + components <- eigenfaces[, 1:n_components, drop = FALSE] + compimage <- components %*% (t(components) %*% image_vector) + return(array_reshape(compimage, c(height,width))) +} + +# Select an Arnold image for reconstruction +face_vector <- t(design_matrix[arnold_pics[1], , drop = FALSE]) + +# Perform reconstructions with varying number of components +reconstructions <- lapply(c(1, 2, 8, 32, 64, 128), function(k) { + reconstruct(face_vector, k, eigenfaces) +}) + +# Plot the reconstructed faces +plot_faces(reconstructions, height = height, width = width) +``` + +```{r} +# 4. Train linear autoencoder with 64 neurons using Keras +# 5. 
Compare reconstructions of Arnold's face both using MSE and visually +``` + +```{r} +library(keras) +encoding_dimension <- 64 +input_image <- layer_input(shape = n_features) +encoded <- layer_dense(units = encoding_dimension, activation = 'linear')(input_image) +decoded <- layer_dense(units = n_features, activation = 'linear')(encoded) +autoencoder <- keras_model(inputs = input_image, outputs = decoded) +autoencoder %>% compile( + optimizer = 'adam', + loss = 'mse' +) +autoencoder %>% fit( + design_matrix, + design_matrix, + epochs = 50, + batch_size = 256, + shuffle = TRUE, + verbose = 0 +) +``` + +```{r} +autoencoder %>% fit( + design_matrix, + design_matrix, + epochs = 50, + batch_size = 256, + shuffle = TRUE, + verbose = 0 +) +``` + +```{r} +# Compute neural reconstruction +face_vector_flat <- as.numeric(face_vector) +reconstruction <- predict(autoencoder, matrix(face_vector_flat, nrow = 1)) +library(ggplot2) +library(gridExtra) +library(reshape2) + +# Do visual comparison +image_height <- 62 +image_width <- 47 +image1 <- matrix(reconstructions[[4]], nrow = image_height, ncol = image_width) +image2 <- t(matrix(reconstruction, nrow = image_width, ncol = image_height)) + +images <- list(image1, image2) +plot_faces(images, n_row = 1, n_col = 2, width = image_width, height = image_height) + + +# Do numeric comparison +# We also normalize the black/white gradient to take values in [0,1] (divide by 255) +img1 <- as.numeric(reconstructions[[4]]) / 255 +img2 <- as.numeric(reconstruction) / 255 +mse <- mean((img1 - img2)^2) +mse +``` + +## Neural Autoencoders + +Finally, let's train a nonlinear autoencoder for the same data where $F$ and $G$ are neural networks, and we restrict the dimension to be $k=64$. + +```{r} +# Use a nonlinear neural network + +library(tensorflow) +n_features <- 2914 +encoding_dimension <- 64 + +input_image <- layer_input(shape = n_features) +encoded <- input_image %>% + layer_dense(units = encoding_dimension, activation = 'relu') %>% + layer_dense(units = encoding_dimension, activation = 'relu') + +decoded <- encoded %>% + layer_dense(units = encoding_dimension, activation = 'relu') %>% + layer_dense(units = n_features, activation = 'relu') + +autoencoder <- keras_model(inputs = input_image, outputs = decoded) + +autoencoder %>% compile( + optimizer = 'adam', + loss = 'mse' +) +autoencoder %>% fit( + design_matrix, + design_matrix, + epochs = 50, + batch_size = 256, + shuffle = TRUE, + verbose = 0 +) + +# Compute neural reconstruction +reconstruction <- predict(autoencoder, matrix(face_vector, nrow = 1)) + +# Do visual comparison +plot_faces(list(reconstructions[[4]],t(matrix(reconstruction, nrow = image_width, ncol = image_height))), n_row = 1, n_col = 2, width = image_width, height = image_height) + +# Do numeric comparison +# We also normalize the black/white gradient to take values in [0,1] (divide by 255) +img1 <- as.numeric(reconstructions[[4]]) / 255 +img2 <- as.numeric(reconstruction) / 255 +mse <- mean((img1 - img2)^2) +mse +``` + diff --git a/PM5/Autoencoders.irnb b/PM5/Autoencoders.irnb index 5a1fb047..845b5f96 100644 --- a/PM5/Autoencoders.irnb +++ b/PM5/Autoencoders.irnb @@ -1,401 +1,401 @@ { - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Autoencoders\n", - "\n", - "In this notebook, we'll introduce and explore \"autoencoders,\" which are a very successful family of models in modern deep learning. In particular we will:\n", - "\n", - "\n", - "1. 
Illustrate the connection between autoencoders and classical *Principal Component Analysis (PCA)*\n", - "3. Train a non-linear auto-encoder that uses a deep neural network\n", - "\n", - "### Overview\n", - "As explained in the text, autoencoders are a way of discovering *latent, low-dimensional structure* in a dataset. In particular, a random data vector $X \\in \\mathbb{R}^d$ can be said to have low-dimensional structure if we can find some functions $f: \\mathbb{R}^d \\to \\mathbb{R}^k$ and $g: \\mathbb{R}^k \\to \\mathbb{R}^d$, with $k \\ll d$, such that $$g(f(X)) \\approx X.$$\n", - "In other words, $f(X)$ is a parsimonious, $k$-dimensional representation of $X$ that contains all of the information necessary to approximately reconstruct the full vector $X$. Traditionally, $f(X)$ is called an *encoding* of $X$.\n", - "\n", - "It turns out that this is meaningless unless we restrict what kinds of functions $f$ and $g$ are allowed to be, because it's possible to write down some (completely ugly) one-to-one function $\\mathbb{R}^d \\to \\mathbb{R}^1$ for any $d$. This gives rise to the notion of *autoencoders* where, given some sets of reasonable functions $F$ and $G$, we aim to minimize\n", - "$$\\mathbb{E}[\\mathrm{loss}(X, f(g(X))]$$ over functions $f \\in F$ and $g \\in G$. As usual, this is done by minimizing the sample analog.\n", - "\n", - "\n" - ], - "metadata": { - "id": "QkLbE3GXm1Jo" - }, - "id": "QkLbE3GXm1Jo" - }, - { - "cell_type": "markdown", - "source": [ - "## Linear Autoencoders and PCA: Practice\n", - "\n", - "It turns out that linear autoencoders are the same as PCA. Let's do a small sanity check to verify this. In particular, let's perform PCA two ways: first using a standard (linear algebra) toolkit, and second as a linear autoencoder using a neural network library.\n", - "If all goes well, they should give you the same reconstructions!\n", - "\n", - "To make it a bit more fun, we will use the [*Labeled Faces in the Wild*](https://www.kaggle.com/jessicali9530/celeba-dataset) dataset which consists of standardized images of roughly 5,000 celebrities' faces. In this data, PCA amounts to looking for a small number of \"proto-faces\" such that a linear combination of them can accurately reconstruct any celebrity's face." 
- ], - "metadata": { - "id": "lnwQdyzmm8UU" - }, - "id": "lnwQdyzmm8UU" - }, - { - "cell_type": "code", - "source": [ - "install.packages(\"keras\")" - ], - "metadata": { - "id": "nf4aybFuwTft" - }, - "id": "nf4aybFuwTft", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "install.packages(\"reticulate\")\n", - "install.packages(\"abind\")\n", - "install.packages(\"grid\")\n", - "install.packages(\"gridExtra\")\n", - "install.packages(\"dplyr\")\n", - "install.packages(\"purrr\")\n", - "install.packages(\"reshape2\")" - ], - "metadata": { - "id": "ID08-PSOeKRf" - }, - "id": "ID08-PSOeKRf", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "library(reticulate)\n", - "\n", - "# Import Python's sklearn.datasets\n", - "sklearn <- import(\"sklearn.datasets\")\n", - "\n", - "# Fetch the dataset\n", - "faces <- sklearn$fetch_lfw_people()\n", - "\n", - "# Access the images and reshape the data similar to Python's reshape method\n", - "n_examples <- dim(faces$images)[1]\n", - "height <- dim(faces$images)[2]\n", - "width <- dim(faces$images)[3]\n", - "design_matrix <- array_reshape(faces$images, c(n_examples, height * width))\n", - "\n", - "n_features <- dim(design_matrix)[2]\n", - "\n", - "# Print the dataset details\n", - "cat(sprintf(\"Labeled Faces in the Wild Dataset:\\n Number of examples: %d\\n Number of features: %d\\n Image height: %d\\n Image width: %d\",\n", - " n_examples, n_features, height, width))\n" - ], - "metadata": { - "id": "Z_ZpuBEBfCeH" - }, - "id": "Z_ZpuBEBfCeH", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "PX_E23v-5yZY" - }, - "source": [ - "library(ggplot2)\n", - "library(gridExtra)\n", - "library(grid)\n", - "\n", - "# Find indices where the label is 'Arnold Schwarzenegger'\n", - "# faces$target uses python style indexing that starts at 0 rather than R style\n", - "# indexing that starts at 1, so we subtract 1 so the indexing lines up\n", - "arnold_labels <- which(faces$target_names == \"Arnold Schwarzenegger\")-1\n", - "# Get indices of all images corresponding to Arnold\n", - "arnold_pics <- which(faces$target %in% arnold_labels)\n", - "\n", - "plot_faces <- function(images, n_row=2, n_col=3, width, height) {\n", - " par(mfrow=c(n_row, n_col), mar=c(0.5, 0.5, 0.5, 0.5))\n", - " for (i in seq_len(n_row * n_col)) {\n", - " if (i <= length(images)) {\n", - " # image needs to be transposed for and then flipped for correct orientation\n", - " # using R \"image\"\n", - " tmp <- t(images[[i]])\n", - " tmp <- tmp[,ncol(tmp):1]\n", - " image(tmp, col=gray.colors(256), axes=FALSE, xlab=\"\", ylab=\"\")\n", - " }\n", - " }\n", - "}\n", - "\n", - "# Ensure arnold_images contains the right amount of data and is not NULL\n", - "arnold_images <- lapply(arnold_pics[1:min(6, length(arnold_pics))], function(idx) {\n", - " faces$images[idx, , ]\n", - "})\n", - "\n", - "plot_faces(arnold_images, n_row = 2, n_col = 3, height = 62, width = 47)\n" - ], - "execution_count": null, - "outputs": [], - "id": "PX_E23v-5yZY" - }, - { - "cell_type": "code", - "source": [ - "library(stats)\n", - "\n", - "# Perform PCA on the design matrix\n", - "pca <- prcomp(design_matrix, rank. = 128, retx = TRUE, center = TRUE, scale. 
= FALSE)\n", - "\n", - "# Extract the principal components (eigenfaces)\n", - "eigenfaces <- pca$rotation\n" - ], - "metadata": { - "id": "imSXA7-jsGKl" - }, - "id": "imSXA7-jsGKl", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "sLi4k8t3DrHe" - }, - "source": [ - "# 2. Plot the first 6 \"eigenfaces,\" the six images whose linear span best explains the variation in our dataset\n", - "pca_images <- lapply(1:6, function(idx) {\n", - " array_reshape(eigenfaces[,idx], c(height,width))\n", - "})\n", - "\n", - "plot_faces(pca_images, height = height, width = width)\n", - "# we check the first six eigenvectors/projection axes, reshaped (the eigenvectors that captured the highest variation in our dataset of images)\n", - "# here, eigenvector1 orthog to eigenvector2 and all the others => decorrelation (there's no way to reconstruct eigenvector1 using a linear combination of all the other eigenvectors)" - ], - "execution_count": null, - "outputs": [], - "id": "sLi4k8t3DrHe" - }, - { - "cell_type": "code", - "metadata": { - "id": "Gmj2lpTfCXKC" - }, - "source": [ - "reconstruct <- function(image_vector, n_components, eigenfaces) {\n", - " components <- eigenfaces[, 1:n_components, drop = FALSE]\n", - " compimage <- components %*% (t(components) %*% image_vector)\n", - " return(array_reshape(compimage, c(height,width)))\n", - "}\n", - "\n", - "# Select an Arnold image for reconstruction\n", - "face_vector <- t(design_matrix[arnold_pics[1], , drop = FALSE])\n", - "\n", - "# Perform reconstructions with varying number of components\n", - "reconstructions <- lapply(c(1, 2, 8, 32, 64, 128), function(k) {\n", - " reconstruct(face_vector, k, eigenfaces)\n", - "})\n", - "\n", - "# Plot the reconstructed faces\n", - "plot_faces(reconstructions, height = height, width = width)\n" - ], - "execution_count": null, - "outputs": [], - "id": "Gmj2lpTfCXKC" - }, - { - "cell_type": "code", - "metadata": { - "id": "eoZ_BsXYDE7P" - }, - "source": [ - "# 4. Train linear autoencoder with 64 neurons using Keras\n", - "# 5. 
Compare reconstructions of Arnold's face both using MSE and visually" - ], - "execution_count": null, - "outputs": [], - "id": "eoZ_BsXYDE7P" - }, - { - "cell_type": "code", - "metadata": { - "id": "urlMaifVJCDc" - }, - "source": [ - "library(keras)\n", - "encoding_dimension <- 64\n", - "input_image <- layer_input(shape = n_features)\n", - "encoded <- layer_dense(units = encoding_dimension, activation = 'linear')(input_image)\n", - "decoded <- layer_dense(units = n_features, activation = 'linear')(encoded)\n", - "autoencoder <- keras_model(inputs = input_image, outputs = decoded)\n", - "autoencoder %>% compile(\n", - " optimizer = 'adam',\n", - " loss = 'mse'\n", - ")\n", - "autoencoder %>% fit(\n", - " design_matrix,\n", - " design_matrix,\n", - " epochs = 50,\n", - " batch_size = 256,\n", - " shuffle = TRUE,\n", - " verbose = 0\n", - ")" - ], - "execution_count": null, - "outputs": [], - "id": "urlMaifVJCDc" - }, - { - "cell_type": "code", - "metadata": { - "id": "5OTUbWg8NcIE" - }, - "source": [ - "autoencoder %>% fit(\n", - " design_matrix,\n", - " design_matrix,\n", - " epochs = 50,\n", - " batch_size = 256,\n", - " shuffle = TRUE,\n", - " verbose = 0\n", - ")" - ], - "execution_count": null, - "outputs": [], - "id": "5OTUbWg8NcIE" - }, - { - "cell_type": "code", - "source": [ - "# Compute neural reconstruction\n", - "face_vector_flat <- as.numeric(face_vector)\n", - "reconstruction <- predict(autoencoder, matrix(face_vector_flat, nrow = 1))\n", - "library(ggplot2)\n", - "library(gridExtra)\n", - "library(reshape2)\n", - "\n", - "# Do visual comparison\n", - "image_height <- 62\n", - "image_width <- 47\n", - "image1 <- matrix(reconstructions[[4]], nrow = image_height, ncol = image_width)\n", - "image2 <- t(matrix(reconstruction, nrow = image_width, ncol = image_height))\n", - "\n", - "images <- list(image1, image2)\n", - "plot_faces(images, n_row = 1, n_col = 2, width = image_width, height = image_height)\n", - "\n", - "\n", - "# Do numeric comparison\n", - "# We also normalize the black/white gradient to take values in [0,1] (divide by 255)\n", - "img1 <- as.numeric(reconstructions[[4]]) / 255\n", - "img2 <- as.numeric(reconstruction) / 255\n", - "mse <- mean((img1 - img2)^2)\n", - "mse\n" - ], - "metadata": { - "id": "90nSf8Y8yIsl" - }, - "id": "90nSf8Y8yIsl", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zrdD6A55yJML" - }, - "source": [ - "## Neural Autoencoders\n", - "\n", - "Finally, let's train a nonlinear autoencoder for the same data where $F$ and $G$ are neural networks, and we restrict the dimension to be $k=64$." 
- ], - "id": "zrdD6A55yJML" - }, - { - "cell_type": "code", - "source": [ - "# Use a nonlinear neural network\n", - "\n", - "library(tensorflow)\n", - "n_features <- 2914\n", - "encoding_dimension <- 64\n", - "\n", - "input_image <- layer_input(shape = n_features)\n", - "encoded <- input_image %>%\n", - " layer_dense(units = encoding_dimension, activation = 'relu') %>%\n", - " layer_dense(units = encoding_dimension, activation = 'relu')\n", - "\n", - "decoded <- encoded %>%\n", - " layer_dense(units = encoding_dimension, activation = 'relu') %>%\n", - " layer_dense(units = n_features, activation = 'relu')\n", - "\n", - "autoencoder <- keras_model(inputs = input_image, outputs = decoded)\n", - "\n", - "autoencoder %>% compile(\n", - " optimizer = 'adam',\n", - " loss = 'mse'\n", - ")\n", - "autoencoder %>% fit(\n", - " design_matrix,\n", - " design_matrix,\n", - " epochs = 50,\n", - " batch_size = 256,\n", - " shuffle = TRUE,\n", - " verbose = 0\n", - ")\n", - "\n", - "# Compute neural reconstruction\n", - "reconstruction <- predict(autoencoder, matrix(face_vector, nrow = 1))\n", - "\n", - "# Do visual comparison\n", - "plot_faces(list(reconstructions[[4]],t(matrix(reconstruction, nrow = image_width, ncol = image_height))), n_row = 1, n_col = 2, width = image_width, height = image_height)\n", - "\n", - "# Do numeric comparison\n", - "# We also normalize the black/white gradient to take values in [0,1] (divide by 255)\n", - "img1 <- as.numeric(reconstructions[[4]]) / 255\n", - "img2 <- as.numeric(reconstruction) / 255\n", - "mse <- mean((img1 - img2)^2)\n", - "mse\n" - ], - "metadata": { - "id": "KHPoFiS9fuhr" - }, - "id": "KHPoFiS9fuhr", - "execution_count": null, - "outputs": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.0.5" - }, - "papermill": { - "default_parameters": {}, - "duration": 427.936706, - "end_time": "2022-04-19T09:13:53.230849", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2022-04-19T09:06:45.294143", - "version": "2.3.4" - }, - "colab": { - "provenance": [] - } + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": { + "id": "QkLbE3GXm1Jo" + }, + "source": [ + "# Autoencoders\n", + "\n", + "In this notebook, we'll introduce and explore \"autoencoders,\" which are a very successful family of models in modern deep learning. In particular we will:\n", + "\n", + "\n", + "1. Illustrate the connection between autoencoders and classical *Principal Component Analysis (PCA)*\n", + "3. Train a non-linear auto-encoder that uses a deep neural network\n", + "\n", + "### Overview\n", + "As explained in the text, autoencoders are a way of discovering *latent, low-dimensional structure* in a dataset. In particular, a random data vector $X \\in \\mathbb{R}^d$ can be said to have low-dimensional structure if we can find some functions $f: \\mathbb{R}^d \\to \\mathbb{R}^k$ and $g: \\mathbb{R}^k \\to \\mathbb{R}^d$, with $k \\ll d$, such that $$g(f(X)) \\approx X.$$\n", + "In other words, $f(X)$ is a parsimonious, $k$-dimensional representation of $X$ that contains all of the information necessary to approximately reconstruct the full vector $X$. 
Traditionally, $f(X)$ is called an *encoding* of $X$.\n", + "\n", + "It turns out that this is meaningless unless we restrict what kinds of functions $f$ and $g$ are allowed to be, because it's possible to write down some (completely ugly) one-to-one function $\\mathbb{R}^d \\to \\mathbb{R}^1$ for any $d$. This gives rise to the notion of *autoencoders* where, given some sets of reasonable functions $F$ and $G$, we aim to minimize\n", + "$$\\mathbb{E}[\\mathrm{loss}(X, f(g(X))]$$ over functions $f \\in F$ and $g \\in G$. As usual, this is done by minimizing the sample analog.\n", + "\n", + "\n" + ] }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + { + "cell_type": "markdown", + "id": "1", + "metadata": { + "id": "lnwQdyzmm8UU" + }, + "source": [ + "## Linear Autoencoders and PCA: Practice\n", + "\n", + "It turns out that linear autoencoders are the same as PCA. Let's do a small sanity check to verify this. In particular, let's perform PCA two ways: first using a standard (linear algebra) toolkit, and second as a linear autoencoder using a neural network library.\n", + "If all goes well, they should give you the same reconstructions!\n", + "\n", + "To make it a bit more fun, we will use the [*Labeled Faces in the Wild*](https://www.kaggle.com/jessicali9530/celeba-dataset) dataset which consists of standardized images of roughly 5,000 celebrities' faces. In this data, PCA amounts to looking for a small number of \"proto-faces\" such that a linear combination of them can accurately reconstruct any celebrity's face." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": { + "id": "nf4aybFuwTft" + }, + "outputs": [], + "source": [ + "install.packages(\"keras\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3", + "metadata": { + "id": "ID08-PSOeKRf" + }, + "outputs": [], + "source": [ + "install.packages(\"reticulate\")\n", + "install.packages(\"abind\")\n", + "install.packages(\"grid\")\n", + "install.packages(\"gridExtra\")\n", + "install.packages(\"dplyr\")\n", + "install.packages(\"purrr\")\n", + "install.packages(\"reshape2\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4", + "metadata": { + "id": "Z_ZpuBEBfCeH" + }, + "outputs": [], + "source": [ + "library(reticulate)\n", + "\n", + "# Import Python's sklearn.datasets\n", + "sklearn <- import(\"sklearn.datasets\")\n", + "\n", + "# Fetch the dataset\n", + "faces <- sklearn$fetch_lfw_people()\n", + "\n", + "# Access the images and reshape the data similar to Python's reshape method\n", + "n_examples <- dim(faces$images)[1]\n", + "height <- dim(faces$images)[2]\n", + "width <- dim(faces$images)[3]\n", + "design_matrix <- array_reshape(faces$images, c(n_examples, height * width))\n", + "\n", + "n_features <- dim(design_matrix)[2]\n", + "\n", + "# Print the dataset details\n", + "cat(sprintf(\"Labeled Faces in the Wild Dataset:\\n Number of examples: %d\\n Number of features: %d\\n Image height: %d\\n Image width: %d\",\n", + " n_examples, n_features, height, width))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5", + "metadata": { + "id": "PX_E23v-5yZY" + }, + "outputs": [], + "source": [ + "library(ggplot2)\n", + "library(gridExtra)\n", + "library(grid)\n", + "\n", + "# Find indices where the label is 'Arnold Schwarzenegger'\n", + "# faces$target uses python style indexing that starts at 0 rather than R style\n", + "# indexing that starts at 1, so we subtract 1 so the indexing lines up\n", + 
"arnold_labels <- which(faces$target_names == \"Arnold Schwarzenegger\")-1\n", + "# Get indices of all images corresponding to Arnold\n", + "arnold_pics <- which(faces$target %in% arnold_labels)\n", + "\n", + "plot_faces <- function(images, n_row=2, n_col=3, width, height) {\n", + " par(mfrow=c(n_row, n_col), mar=c(0.5, 0.5, 0.5, 0.5))\n", + " for (i in seq_len(n_row * n_col)) {\n", + " if (i <= length(images)) {\n", + " # image needs to be transposed for and then flipped for correct orientation\n", + " # using R \"image\"\n", + " tmp <- t(images[[i]])\n", + " tmp <- tmp[,ncol(tmp):1]\n", + " image(tmp, col=gray.colors(256), axes=FALSE, xlab=\"\", ylab=\"\")\n", + " }\n", + " }\n", + "}\n", + "\n", + "# Ensure arnold_images contains the right amount of data and is not NULL\n", + "arnold_images <- lapply(arnold_pics[1:min(6, length(arnold_pics))], function(idx) {\n", + " faces$images[idx, , ]\n", + "})\n", + "\n", + "plot_faces(arnold_images, n_row = 2, n_col = 3, height = 62, width = 47)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6", + "metadata": { + "id": "imSXA7-jsGKl" + }, + "outputs": [], + "source": [ + "library(stats)\n", + "\n", + "# Perform PCA on the design matrix\n", + "pca <- prcomp(design_matrix, rank. = 128, retx = TRUE, center = TRUE, scale. = FALSE)\n", + "\n", + "# Extract the principal components (eigenfaces)\n", + "eigenfaces <- pca$rotation\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7", + "metadata": { + "id": "sLi4k8t3DrHe" + }, + "outputs": [], + "source": [ + "# 2. Plot the first 6 \"eigenfaces,\" the six images whose linear span best explains the variation in our dataset\n", + "pca_images <- lapply(1:6, function(idx) {\n", + " array_reshape(eigenfaces[,idx], c(height,width))\n", + "})\n", + "\n", + "plot_faces(pca_images, height = height, width = width)\n", + "# we check the first six eigenvectors/projection axes, reshaped (the eigenvectors that captured the highest variation in our dataset of images)\n", + "# here, eigenvector1 orthog to eigenvector2 and all the others => decorrelation (there's no way to reconstruct eigenvector1 using a linear combination of all the other eigenvectors)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8", + "metadata": { + "id": "Gmj2lpTfCXKC" + }, + "outputs": [], + "source": [ + "reconstruct <- function(image_vector, n_components, eigenfaces) {\n", + " components <- eigenfaces[, 1:n_components, drop = FALSE]\n", + " compimage <- components %*% (t(components) %*% image_vector)\n", + " return(array_reshape(compimage, c(height,width)))\n", + "}\n", + "\n", + "# Select an Arnold image for reconstruction\n", + "face_vector <- t(design_matrix[arnold_pics[1], , drop = FALSE])\n", + "\n", + "# Perform reconstructions with varying number of components\n", + "reconstructions <- lapply(c(1, 2, 8, 32, 64, 128), function(k) {\n", + " reconstruct(face_vector, k, eigenfaces)\n", + "})\n", + "\n", + "# Plot the reconstructed faces\n", + "plot_faces(reconstructions, height = height, width = width)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9", + "metadata": { + "id": "eoZ_BsXYDE7P" + }, + "outputs": [], + "source": [ + "# 4. Train linear autoencoder with 64 neurons using Keras\n", + "# 5. 
Compare reconstructions of Arnold's face both using MSE and visually" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10", + "metadata": { + "id": "urlMaifVJCDc" + }, + "outputs": [], + "source": [ + "library(keras)\n", + "encoding_dimension <- 64\n", + "input_image <- layer_input(shape = n_features)\n", + "encoded <- layer_dense(units = encoding_dimension, activation = 'linear')(input_image)\n", + "decoded <- layer_dense(units = n_features, activation = 'linear')(encoded)\n", + "autoencoder <- keras_model(inputs = input_image, outputs = decoded)\n", + "autoencoder %>% compile(\n", + " optimizer = 'adam',\n", + " loss = 'mse'\n", + ")\n", + "autoencoder %>% fit(\n", + " design_matrix,\n", + " design_matrix,\n", + " epochs = 50,\n", + " batch_size = 256,\n", + " shuffle = TRUE,\n", + " verbose = 0\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11", + "metadata": { + "id": "5OTUbWg8NcIE" + }, + "outputs": [], + "source": [ + "autoencoder %>% fit(\n", + " design_matrix,\n", + " design_matrix,\n", + " epochs = 50,\n", + " batch_size = 256,\n", + " shuffle = TRUE,\n", + " verbose = 0\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12", + "metadata": { + "id": "90nSf8Y8yIsl" + }, + "outputs": [], + "source": [ + "# Compute neural reconstruction\n", + "face_vector_flat <- as.numeric(face_vector)\n", + "reconstruction <- predict(autoencoder, matrix(face_vector_flat, nrow = 1))\n", + "library(ggplot2)\n", + "library(gridExtra)\n", + "library(reshape2)\n", + "\n", + "# Do visual comparison\n", + "image_height <- 62\n", + "image_width <- 47\n", + "image1 <- matrix(reconstructions[[4]], nrow = image_height, ncol = image_width)\n", + "image2 <- t(matrix(reconstruction, nrow = image_width, ncol = image_height))\n", + "\n", + "images <- list(image1, image2)\n", + "plot_faces(images, n_row = 1, n_col = 2, width = image_width, height = image_height)\n", + "\n", + "\n", + "# Do numeric comparison\n", + "# We also normalize the black/white gradient to take values in [0,1] (divide by 255)\n", + "img1 <- as.numeric(reconstructions[[4]]) / 255\n", + "img2 <- as.numeric(reconstruction) / 255\n", + "mse <- mean((img1 - img2)^2)\n", + "mse\n" + ] + }, + { + "cell_type": "markdown", + "id": "13", + "metadata": { + "id": "zrdD6A55yJML" + }, + "source": [ + "## Neural Autoencoders\n", + "\n", + "Finally, let's train a nonlinear autoencoder for the same data where $F$ and $G$ are neural networks, and we restrict the dimension to be $k=64$." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14", + "metadata": { + "id": "KHPoFiS9fuhr" + }, + "outputs": [], + "source": [ + "# Use a nonlinear neural network\n", + "\n", + "library(tensorflow)\n", + "n_features <- 2914\n", + "encoding_dimension <- 64\n", + "\n", + "input_image <- layer_input(shape = n_features)\n", + "encoded <- input_image %>%\n", + " layer_dense(units = encoding_dimension, activation = 'relu') %>%\n", + " layer_dense(units = encoding_dimension, activation = 'relu')\n", + "\n", + "decoded <- encoded %>%\n", + " layer_dense(units = encoding_dimension, activation = 'relu') %>%\n", + " layer_dense(units = n_features, activation = 'relu')\n", + "\n", + "autoencoder <- keras_model(inputs = input_image, outputs = decoded)\n", + "\n", + "autoencoder %>% compile(\n", + " optimizer = 'adam',\n", + " loss = 'mse'\n", + ")\n", + "autoencoder %>% fit(\n", + " design_matrix,\n", + " design_matrix,\n", + " epochs = 50,\n", + " batch_size = 256,\n", + " shuffle = TRUE,\n", + " verbose = 0\n", + ")\n", + "\n", + "# Compute neural reconstruction\n", + "reconstruction <- predict(autoencoder, matrix(face_vector, nrow = 1))\n", + "\n", + "# Do visual comparison\n", + "plot_faces(list(reconstructions[[4]],t(matrix(reconstruction, nrow = image_width, ncol = image_height))), n_row = 1, n_col = 2, width = image_width, height = image_height)\n", + "\n", + "# Do numeric comparison\n", + "# We also normalize the black/white gradient to take values in [0,1] (divide by 255)\n", + "img1 <- as.numeric(reconstructions[[4]]) / 255\n", + "img2 <- as.numeric(reconstruction) / 255\n", + "mse <- mean((img1 - img2)^2)\n", + "mse\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.0.5" + }, + "papermill": { + "default_parameters": {}, + "duration": 427.936706, + "end_time": "2022-04-19T09:13:53.230849", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2022-04-19T09:06:45.294143", + "version": "2.3.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/PM5/DoubleML_and_Feature_Engineering_with_BERT.Rmd b/PM5/DoubleML_and_Feature_Engineering_with_BERT.Rmd new file mode 100644 index 00000000..edf2972a --- /dev/null +++ b/PM5/DoubleML_and_Feature_Engineering_with_BERT.Rmd @@ -0,0 +1,873 @@ +--- +title: An R Markdown document converted from "PM5/DoubleML_and_Feature_Engineering_with_BERT.irnb" +output: html_document +--- + +# BERT + +**Bidirectional Encoder Representations from Transformers.** + +_ | _ +- | - +![alt](https://pytorch.org/assets/images/bert1.png) | ![alt](https://pytorch.org/assets/images/bert2.png) + + +### **Overview** + +BERT was released together with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin *et al.* The model is based on the Transformer architecture introduced in [Attention Is All You Need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani *et al.* and has led to significant improvements in a wide range of natural language tasks. + +At the highest level, BERT maps from a block of text to a numeric vector which summarizes the relevant information in the text. 
+ +What is remarkable is that numeric summary is sufficiently informative that, for example, the numeric summary of a paragraph followed by a reading comprehension question contains all the information necessary to satisfactorily answer the question. + +#### **Transfer Learning** + +BERT is a great example of a paradigm called *transfer learning*, which has proved very effective in recent years. In the first step, a network is trained on an unsupervised task using massive amounts of data. In the case of BERT, it was trained to predict missing words and to detect when pairs of sentences are presented in reversed order, using all of Wikipedia. This was initially done by Google, using intense computational resources. + +Once this network has been trained, it is then used to perform many other supervised tasks using only limited data and computational resources: for example, sentiment classification in tweets or question answering. The network is re-trained to perform these other tasks in such a way that only the final, output parts of the network are allowed to adjust by very much, so that most of the "information" originally learned by the network is preserved. This process is called *fine tuning*. + +## Getting to know BERT + +BERT, and many of its variants, are made available to the public by the open source [Huggingface Transformers](https://huggingface.co/transformers/) project. This is an amazing resource, giving researchers and practitioners easy-to-use access to this technology. + +In order to use BERT for modeling, we simply need to download the pre-trained neural network and fine tune it on our dataset, which is illustrated below. + +```{r} +install.packages("remotes") +remotes::install_github("rstudio/tensorflow") +install.packages("dplyr") +install.packages("DBI") +install.packages("ggplot2") +install.packages("reticulate") +install.packages("readr") +install.packages("stringr") +install.packages("tidyr") +install.packages("purrr") +install.packages("glmnet") +install.packages("caret") +install.packages("keras") +```
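The Python side of this pipeline is accessed through `reticulate`, so the Python packages used in later chunks (`tensorflow` and `transformers`) must also be present in the active Python environment. Depending on your setup they may already be installed; if not, one way to add them from R is the sketch below, an illustrative addition that assumes `pip` is available in that environment:

```{r}
# Optional: install the Python dependencies used via reticulate in later chunks.
# Skip this chunk if tensorflow and transformers are already importable.
reticulate::py_install(c("tensorflow", "transformers"), pip = TRUE)
```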
+ +```{r} +library(reticulate) +library(ggplot2) +library(DBI) +library(dplyr) +theme_set(theme_bw()) +``` + +```{r} +use_python("/usr/bin/python3", required = TRUE) # Adjust the path as needed +``` + +```{r} +py_run_string(' +import tensorflow as tf +import numpy as np +import pandas as pd +from transformers import BertTokenizer, TFBertModel +import warnings +warnings.simplefilter("ignore") +') +``` + +```{r} +# Check GPU availability +# py_run_string(' +# device_name = tf.test.gpu_device_name() +# if device_name != "/device:GPU:0": +# raise SystemError("GPU device not found") +# print("Found GPU at:", device_name) +# ') +``` + +```{r} +ssq <- function(x) sum(x * x) + +get_r2 <- function(y, yhat) { + resids <- yhat - y + flucs <- y - mean(y) + rss <- ssq(resids) + tss <- ssq(flucs) + cat(sprintf("RSS: %f, TSS + MEAN^2: %f, TSS: %f, R^2: %f", rss, tss + mean(y)^2, tss, 1 - rss/tss)) +} +``` + +```{r} +py_run_string(' +tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") +bert = TFBertModel.from_pretrained("bert-base-uncased") +') +``` + +### Tokenization + +The first step in using BERT (or any similar text embedding tool) is to *tokenize* the data. This step standardizes blocks of text, so that meaningless differences in text presentation don't affect the behavior of our algorithm. + +Typically the text is transformed into a sequence of 'tokens,' each of which corresponds to a numeric code. + +```{r} +py_run_string(' +s = "What happens to this string?" +tensors = tokenizer.encode_plus(s, add_special_tokens = True, return_tensors = "tf") +output = bert(tensors) +') +``` + +```{r} +# Let's try it out! +s <- "What happens to this string?" +py_run_string +input_ids <- py$tensors$input_ids +attention_mask <- py$tensors$attention_mask +token_type_ids <- py$tensors$token_type_ids + +print(sprintf('Original String: "%s"', s)) +print("Numeric encoding: ") +print(list( + input_ids = input_ids, + attention_mask = attention_mask, + token_type_ids = token_type_ids +)) +# What does this mean? +py_run_string('tokens = tokenizer.convert_ids_to_tokens(tensors["input_ids"].numpy().flatten().tolist())') +tokens <- py$tokens +print("Actual tokens:") +print(tokens) +``` + +### BERT in a nutshell + +Once we have our numeric tokens, we can simply plug them into the BERT network and get a numeric vector summary. Note that in applications, the BERT summary will be "fine tuned" to a particular task, which hasn't happened yet. + +```{r} +# Load the reticulate library +library(reticulate) + +input_text <- "What happens to this string?" + + +cat(sprintf("Input: \"%s\"\n\n", input_text)) + +py_run_string(sprintf(' +tensors_tf = tokenizer("%s", return_tensors="tf") +output = bert(tensors_tf) +', input_text)) + +output <- py$output + +py_run_string(' +from pprint import pformat +output_type = str(type(output["pooler_output"])) +output_shape = output["pooler_output"].shape +output_preview = pformat(output["pooler_output"].numpy()) +') + +output_type <- py$output_type +output_shape <- py$output_shape +output_preview <- py$output_preview + +cat(sprintf( +"Output type: %s\n\nOutput shape: %s\n\nOutput preview: %s\n", +output_type, +paste(output_shape, collapse=", "), +output_preview +)) +``` + +# A practical introduction to BERT + +In the next part of the notebook, we are going to explore how a tool like BERT may be useful for causal inference. + +In particular, we are going to apply BERT to a subset of data from the Amazon marketplace consisting of roughly 10,000 listings for products in the toy category. 
Each product comes with a text description, a price, and a number of times reviewed (which we'll use as a proxy for demand / market share). + +For more information on the dataset, checkout the [Dataset README](https://github.com/CausalAIBook/MetricsMLNotebooks/blob/main/data/amazon_toys.md). + +**For thought**: +What are some issues you may anticipate when using number of reviews as a proxy for demand or market share? + +### Getting to know the data + +First, we'll download and clean up the data, and do some preliminary inspection. + +```{r} +library(readr) +library(stringr) +library(tidyr) +library(purrr) +data_url <- "https://github.com/CausalAIBook/MetricsMLNotebooks/raw/main/data/amazon_toys.csv" +data <- read_csv(data_url, show_col_types = FALSE) +problems(data) + +data <- data %>% + mutate( + number_of_reviews = as.numeric(str_replace_all(number_of_reviews, ",", "")) + ) +``` + +```{r} +data <- data %>% + mutate( + number_of_reviews = as.numeric(str_replace_all(number_of_reviews, "\\D+", "")), + price = as.numeric(str_extract(price, "\\d+\\.?\\d*")) + ) %>% + filter(number_of_reviews > 0) %>% + mutate( + ln_p = log(price), + ln_q = log(number_of_reviews / sum(number_of_reviews)), + text = str_c(product_name, manufacturer, product_description, sep = " | ") + ) %>% + select(text, ln_p, ln_q, amazon_category_and_sub_category) %>% + drop_na() +print(head(data)) +data$text_num_words <- str_split(data$text, "\\s+") %>% map_int(length) +print(quantile(data$text_num_words, 0.99, na.rm = TRUE)) +``` + +```{r} +ggplot(data, aes(x = text_num_words)) + + geom_density() + + labs(title = "Density Plot of Text Lengths in Words") +``` + +Let's make a two-way scatter plot of prices and (proxied) market shares. + +```{r} +p1 <- ggplot(data, aes(x = ln_p, y = ln_q)) + + geom_point() + + geom_smooth(method = "lm", color = "red") + + labs(title = "Scatter Plot with Regression Line") +print(p1) +``` + +```{r} +p2 <- ggplot(data, aes(x = ln_p, y = ln_q)) + + geom_smooth(method = "lm", color = "red") + + labs(title = "Regression Line Only") +print(p2) +``` + +```{r} +model <- lm(ln_q ~ ln_p, data = data) +elasticity <- coef(model)["ln_p"] +se <- summary(model)$coefficients["ln_p", "Std. Error"] +r_squared_adj <- summary(model)$adj.r.squared +cat(sprintf("Elasticity: %f, SE: %f, R2: %f\n\n", elasticity, se, r_squared_adj)) +conf_intervals <- confint(model, c("(Intercept)", "ln_p"), level = 0.95) +print(conf_intervals) +``` + +Let's begin with a simple prediction task. We will discover how well can we explain the price of these products using their textual descriptions. 
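Before bringing in BERT, it helps to have a naive text baseline for comparison. The chunk below is an illustrative addition rather than part of the original analysis: it regresses `ln_p` on a simple bag-of-words representation of `data$text` with a cross-validated lasso (`glmnet`, installed earlier, plus the `Matrix` package that ships with R). The 2,000-word vocabulary cap is arbitrary, and the R^2 is computed in-sample, so the number is optimistic and only meant for orientation.

```{r}
library(Matrix)
library(glmnet)
library(stringr)

set.seed(123)

# Tokenize the descriptions into lower-case words and keep the 2,000 most common
tokens <- str_split(str_to_lower(data$text), "[^a-z]+")
vocab <- head(setdiff(names(sort(table(unlist(tokens)), decreasing = TRUE)), ""), 2000)

# Build a sparse document-term matrix of word counts
doc_id <- rep(seq_along(tokens), lengths(tokens))
word_id <- match(unlist(tokens), vocab)
keep <- !is.na(word_id)
dtm <- sparseMatrix(i = doc_id[keep], j = word_id[keep], x = 1,
                    dims = c(length(tokens), length(vocab)))

# Cross-validated lasso of log price on word counts
cv_lasso <- cv.glmnet(dtm, data$ln_p, alpha = 1, nfolds = 5)
ln_p_hat_bow <- as.numeric(predict(cv_lasso, dtm, s = "lambda.min"))
cat(sprintf("Bag-of-words lasso, in-sample R^2: %.3f\n",
            1 - mean((data$ln_p - ln_p_hat_bow)^2) / var(data$ln_p)))
```

A fine-tuned BERT model should be judged against this kind of simple benchmark on held-out data, which is what the train/validation/holdout split constructed next makes possible.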
+ +```{r} +install.packages("caTools") +install.packages("base") +library(caTools) +``` + +```{r} +library(caTools) +set.seed(124) +split <- sample.split(Y = data$ln_p, SplitRatio = 0.8) +train_main <- data[split, ] +holdout <- data[!split, ] +split_main <- sample.split(Y = train_main$ln_p, SplitRatio = 0.75) +train <- train_main[split_main, ] +val <- train_main[!split_main, ] +``` + +```{r} +library(reticulate) +use_python("/usr/bin/python3", required = TRUE) +py_run_string('import tensorflow as tf') + +py$train_texts <- train$text +train_tensors <- py_run_string(" +tensors = tokenizer( + list(train_texts), + padding=True, + truncation=True, + max_length=128, + return_tensors='tf' +)") +train_tensors <- py$tensors + +py$val_texts <- val$text +val_tensors <- py_run_string(" +val_tensors = tokenizer( + list(val_texts), + padding=True, + truncation=True, + max_length=128, + return_tensors='tf' +)") +val_tensors <- py$val_tensors + +py$holdout_texts <- holdout$text +tensors_holdout <- py_run_string(" +tensors_holdout = tokenizer( + list(holdout_texts), + padding=True, + truncation=True, + max_length=128, + return_tensors='tf' +)") +tensors_holdout <- py$tensors_holdout +ln_p <- train$ln_p +ln_q <- train$ln_q +val_ln_p <- val$ln_p +val_ln_q <- val$ln_q +``` + +```{r} +ln_p <- train$ln_p +ln_q <- train$ln_q +val_ln_p <- val$ln_p +val_ln_q <- val$ln_q +``` + +# Using BERT as Feature Extractor + +```{r} +library(reticulate) +#Sys.setenv(RETICULATE_PYTHON = "/usr/bin/python") +library(keras) +#install_keras() +``` + +```{r} +library(caTools) +library(dplyr) +library(readr) +library(reticulate) +library(keras) +library(caret) +library(glmnet) +library(stringr) + +use_python("/usr/bin/python3", required = TRUE) +py_run_string('import tensorflow as tf') +py_run_string('from transformers import BertTokenizer, TFBertModel') +py_run_string(' +tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") +bert_model = TFBertModel.from_pretrained("bert-base-uncased") +') + +py$train_texts <- train$text +train_tensors <- py_run_string(" +tensors = tokenizer( + list(train_texts), + padding=True, + truncation=True, + max_length=128, + return_tensors='tf' +)") +train_tensors <- py$tensors + +py$val_texts <- val$text +val_tensors <- py_run_string(" +val_tensors = tokenizer( + list(val_texts), + padding=True, + truncation=True, + max_length=128, + return_tensors='tf' +)") +val_tensors <- py$val_tensors + +py$holdout_texts <- holdout$text +tensors_holdout <- py_run_string(" +tensors_holdout = tokenizer( + list(holdout_texts), + padding=True, + truncation=True, + max_length=128, + return_tensors='tf' +)") +tensors_holdout <- py$tensors_holdout + +ln_p <- train$ln_p +val_ln_p <- val$ln_p +holdout_ln_p <- holdout$ln_p + +py_run_string(' +import tensorflow as tf +from transformers import TFBertModel + +# Define the input layers +input_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name="input_ids") +token_type_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name="token_type_ids") +attention_mask = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name="attention_mask") + +# Load the pre-trained BERT model +bert_model = TFBertModel.from_pretrained("bert-base-uncased") +outputs = bert_model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) + +# Define the embedding model +embedding_model = tf.keras.models.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=outputs.last_hidden_state[:, 0, :]) +') + +py_run_string(' +import numpy as np +embeddings = 
embedding_model.predict({ + "input_ids": tf.convert_to_tensor(tensors["input_ids"]), + "token_type_ids": tf.convert_to_tensor(tensors["token_type_ids"]), + "attention_mask": tf.convert_to_tensor(tensors["attention_mask"]) +}) +') + +embeddings <- py$embeddings + +py$ln_p <- ln_p +py_run_string(' +from sklearn.linear_model import LassoCV +from sklearn.model_selection import KFold +from sklearn.preprocessing import StandardScaler +from sklearn.pipeline import make_pipeline + +lcv = make_pipeline(StandardScaler(), LassoCV(cv=KFold(n_splits=5, shuffle=True, random_state=123), random_state=123)) +lcv.fit(embeddings, ln_p) +') + +py_run_string(' +embeddings_val = embedding_model.predict({ + "input_ids": tf.convert_to_tensor(val_tensors["input_ids"]), + "token_type_ids": tf.convert_to_tensor(val_tensors["token_type_ids"]), + "attention_mask": tf.convert_to_tensor(val_tensors["attention_mask"]) +}) +val_predictions = lcv.predict(embeddings_val) +') + +val_predictions <- py$val_predictions + +r2_val <- caret::R2(val_predictions, val_ln_p) + +py_run_string(' +embeddings_holdout = embedding_model.predict({ + "input_ids": tf.convert_to_tensor(tensors_holdout["input_ids"]), + "token_type_ids": tf.convert_to_tensor(tensors_holdout["token_type_ids"]), + "attention_mask": tf.convert_to_tensor(tensors_holdout["attention_mask"]) +}) +holdout_predictions = lcv.predict(embeddings_holdout) +') + +holdout_predictions <- py$holdout_predictions + +r2_holdout <- caret::R2(holdout_predictions, holdout_ln_p) + +print(r2_val) +print(r2_holdout) +ln_p_hat_holdout <- holdout_predictions +``` + +# Linear Probing: Training Only Final Layer after BERT + +```{r} +### Now let's prepare our model + +from tensorflow.keras import Model, Input +from tensorflow.keras.layers import Dense, Dropout, Concatenate +import tensorflow_addons as tfa +from tensorflow.keras import regularizers + +tf.keras.utils.set_random_seed(123) + +input_ids = Input(shape=(128,), dtype=tf.int32) +token_type_ids = Input(shape=(128,), dtype=tf.int32) +attention_mask = Input(shape=(128,), dtype=tf.int32) + +# # First we compute the text embedding +Z = bert(input_ids, token_type_ids, attention_mask) + +for layer in bert.layers: + layer.trainable=False + for w in layer.weights: w._trainable=False + +# # We want the "pooled / summary" embedding, not individual word embeddings +Z = Z[1] + +# # Then we do a regular regression +# Z = Dropout(0.2)(Z) +ln_p_hat = Dense(1, activation='linear', + kernel_regularizer=regularizers.L2(1e-3))(Z) + +PricePredictionNetwork = Model([ + input_ids, + token_type_ids, + attention_mask, + ], ln_p_hat) +PricePredictionNetwork.compile( + optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), + loss=tf.keras.losses.MeanSquaredError(), + metrics=tfa.metrics.RSquare(), +) +PricePredictionNetwork.summary() +``` + +```{r} +from livelossplot import PlotLossesKeras + +tf.keras.utils.set_random_seed(123) +earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True) +modelcheckpoint = tf.keras.callbacks.ModelCheckpoint("/content/gdrive/MyDrive/pweights.hdf5", monitor='val_loss', save_best_only=True, save_weights_only=True) + +PricePredictionNetwork.fit( + x= [tensors['input_ids'], + tensors['token_type_ids'], + tensors['attention_mask'],], + y=ln_p, + validation_data = ( + [val_tensors['input_ids'], + val_tensors['token_type_ids'], + val_tensors['attention_mask']], val_ln_p + ), + epochs=5, + callbacks = [earlystopping, modelcheckpoint, + PlotLossesKeras(groups = {'train_loss': ['loss'], 
'train_rsq':['r_square'], 'val_loss': ['val_loss'], 'val_rsq': ['val_r_square']})], + batch_size=16, + shuffle=True) +``` + +# Fine Tuning starting from the Linear Probing Trained Weights + +Now we train the whole network, initializing the weights based on the result of the linear probing phase in the previous section. + +```{r} +### Now let's prepare our model + +from tensorflow.keras import Model, Input +from tensorflow.keras.layers import Dense, Dropout, Concatenate +import tensorflow_addons as tfa +from tensorflow.keras import regularizers + +tf.keras.utils.set_random_seed(123) + +input_ids = Input(shape=(128,), dtype=tf.int32) +token_type_ids = Input(shape=(128,), dtype=tf.int32) +attention_mask = Input(shape=(128,), dtype=tf.int32) + +# # First we compute the text embedding +Z = bert(input_ids, token_type_ids, attention_mask) + +for layer in bert.layers: + layer.trainable=True + for w in layer.weights: w._trainable=True + +# # We want the "pooled / summary" embedding, not individual word embeddings +Z = Z[1] + +# # Then we do a regularized linear regression +ln_p_hat = Dense(1, activation='linear', + kernel_regularizer=regularizers.L2(1e-3))(Z) + +PricePredictionNetwork = Model([ + input_ids, + token_type_ids, + attention_mask, + ], ln_p_hat) +PricePredictionNetwork.compile( + optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5), + loss=tf.keras.losses.MeanSquaredError(), + metrics=tfa.metrics.RSquare(), +) +PricePredictionNetwork.summary() +``` + +```{r} +PricePredictionNetwork.load_weights("/content/gdrive/MyDrive/pweights.hdf5") +``` + +```{r} +from livelossplot import PlotLossesKeras + +tf.keras.utils.set_random_seed(123) + +earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True) +modelcheckpoint = tf.keras.callbacks.ModelCheckpoint("/content/gdrive/MyDrive/pweights.hdf5", monitor='val_loss', save_best_only=True, save_weights_only=True) + +PricePredictionNetwork.fit( + x= [tensors['input_ids'], + tensors['token_type_ids'], + tensors['attention_mask'],], + y=ln_p, + validation_data = ( + [val_tensors['input_ids'], + val_tensors['token_type_ids'], + val_tensors['attention_mask']], val_ln_p + ), + epochs=10, + callbacks = [earlystopping, modelcheckpoint, + PlotLossesKeras(groups = {'train_loss': ['loss'], 'train_rsq':['r_square'], 'val_loss': ['val_loss'], 'val_rsq': ['val_r_square']})], + batch_size=16, + shuffle=True) +``` + +```{r} +PricePredictionNetwork.load_weights("/content/gdrive/MyDrive/pweights.hdf5") +``` + +```{r} +# Compute predictions +ln_p_hat_holdout = PricePredictionNetwork.predict([ + tensors_holdout['input_ids'], + tensors_holdout['token_type_ids'], + tensors_holdout['attention_mask'], + ]) +``` + +```{r} +print('Neural Net R^2, Price Prediction:') +get_r2(holdout['ln_p'], ln_p_hat_holdout) +``` + +```{r} +import matplotlib.pyplot as plt +plt.hist(ln_p_hat_holdout) +plt.show() +``` + +Now, let's go one step further and construct a DML estimator of the average price elasticity. In particular, we will model market share $q_i$ as +$$\ln q_i = \alpha + \beta \ln p_i + \psi(d_i) + \epsilon_i,$$ where $d_i$ denotes the description of product $i$ and $\psi$ is the composition of text embedding and a linear layer. 
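Equivalently, once $\ln p$ and $\ln q$ have been predicted from the text and residualized, $\beta$ is obtained by regressing the quantity residual on the price residual (the partialling-out / Frisch-Waugh step of DML). Because the residuals are formed on a holdout sample that the networks never saw, this is a single-split, sample-split estimator. The sketch below is an illustrative R version of that final step; it assumes residual vectors `r_p` and `r_q` have already been computed on the holdout sample, and it uses the standard influence-function-based standard error (the Python code further below uses a slightly simpler variance expression).

```{r}
# Partialling-out (DML) estimate of the price elasticity from residualized
# log price (r_p) and log quantity (r_q), computed on the holdout sample.
dml_elasticity <- function(r_p, r_q) {
  beta <- mean(r_p * r_q) / mean(r_p^2)           # OLS of r_q on r_p, no intercept
  psi <- r_p * (r_q - beta * r_p) / mean(r_p^2)   # influence function
  se <- sqrt(mean(psi^2) / length(r_p))
  c(elasticity = beta, se = se,
    ci_lower = beta - 1.96 * se, ci_upper = beta + 1.96 * se)
}

# Hypothetical usage once r_p and r_q exist as R vectors:
# dml_elasticity(r_p, r_q)
```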
+ +```{r} +## Build the quantity prediction network + +tf.keras.utils.set_random_seed(123) + +# Initialize new BERT model from original +bert2 = TFBertModel.from_pretrained("bert-base-uncased") + +# for layer in bert2.layers: +# layer.trainable=False +# for w in layer.weights: w._trainable=False + +# Define inputs +input_ids = Input(shape=(128,), dtype=tf.int32) +token_type_ids = Input(shape=(128,), dtype=tf.int32) +attention_mask = Input(shape=(128,), dtype=tf.int32) + +# First we compute the text embedding +Z = bert2(input_ids, token_type_ids, attention_mask) + +# We want the "pooled / summary" embedding, not individual word embeddings +Z = Z[1] + +ln_q_hat = Dense(1, activation='linear', kernel_regularizer=regularizers.L2(1e-3))(Z) + +# Compile model and optimization routine +QuantityPredictionNetwork = Model([ + input_ids, + token_type_ids, + attention_mask, + ], ln_q_hat) +QuantityPredictionNetwork.compile( + optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5), + loss=tf.keras.losses.MeanSquaredError(), + metrics=tfa.metrics.RSquare(), +) +QuantityPredictionNetwork.summary() +``` + +```{r} +## Fit the quantity prediction network in the main sample +tf.keras.utils.set_random_seed(123) + +earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True) +modelcheckpoint = tf.keras.callbacks.ModelCheckpoint("/content/gdrive/MyDrive/qweights.hdf5", monitor='val_loss', save_best_only=True, save_weights_only=True) + +QuantityPredictionNetwork.fit( + [ + tensors['input_ids'], + tensors['token_type_ids'], + tensors['attention_mask'], + ], + ln_q, + validation_data = ( + [val_tensors['input_ids'], + val_tensors['token_type_ids'], + val_tensors['attention_mask']], val_ln_q + ), + epochs=10, + callbacks = [earlystopping, modelcheckpoint, + PlotLossesKeras(groups = {'train_loss': ['loss'], 'train_rsq':['r_square'], 'val_loss': ['val_loss'], 'val_rsq': ['val_r_square']})], + batch_size=16, + shuffle=True) +``` + +```{r} +QuantityPredictionNetwork.load_weights("/content/gdrive/MyDrive/qweights.hdf5") +``` + +```{r} +## Predict in the holdout sample, residualize and regress + +ln_q_hat_holdout = QuantityPredictionNetwork.predict([ + tensors_holdout['input_ids'], + tensors_holdout['token_type_ids'], + tensors_holdout['attention_mask'], + ]) +``` + +```{r} +print('Neural Net R^2, Quantity Prediction:') +get_r2(holdout['ln_q'], ln_q_hat_holdout) +``` + +```{r} +# Compute residuals +r_p = holdout["ln_p"] - ln_p_hat_holdout.reshape((-1,)) +r_q = holdout["ln_q"] - ln_q_hat_holdout.reshape((-1,)) + +# Regress to obtain elasticity estimate +beta = np.mean(r_p * r_q) / np.mean(r_p * r_p) + +# standard error on elastiticy estimate +se = np.sqrt(np.mean( (r_p* r_q)**2)/(np.mean(r_p*r_p)**2)/holdout["ln_p"].size) + +print('Elasticity of Demand with Respect to Price: {}'.format(beta)) +print('Standard Error: {}'.format(se)) +``` + +# Heterogeneous Elasticities within Major Product Categories + +We now look at the major product categories that have many products and we investigate whether the "within group" price elasticities + +```{r} +holdout['category'] = holdout['amazon_category_and_sub_category'].str.split('>').apply(lambda x: x[0]) +``` + +```{r} +# Elasticity within the main product categories +sql.run(""" + SELECT category, COUNT(*) + FROM holdout + GROUP BY 1 + HAVING COUNT(*)>=100 + ORDER BY 2 desc +""") +``` + +```{r} +main_cats = sql.run(""" + SELECT category + FROM holdout + GROUP BY 1 + HAVING COUNT(*)>=100 +""")['category'] + +dfs = [] +for cat in 
main_cats: + r_p = holdout[holdout['category'] == cat]["ln_p"] - ln_p_hat_holdout.reshape((-1,))[holdout['category'] == cat] + r_q = holdout[holdout['category'] == cat]["ln_q"] - ln_q_hat_holdout.reshape((-1,))[holdout['category'] == cat] + # Regress to obtain elasticity estimate + beta = np.mean(r_p * r_q) / np.mean(r_p * r_p) + + # standard error on elastiticy estimate + se = np.sqrt(np.mean( (r_p* r_q)**2)/(np.mean(r_p*r_p)**2)/holdout["ln_p"].size) + + df = pd.DataFrame({'point': beta, 'se': se, 'lower': beta - 1.96 * se, 'upper': beta + 1.96 * se}, index=[0]) + df['category'] = cat + df['N'] = holdout[holdout['category'] == cat].shape[0] + dfs.append(df) + +df = pd.concat(dfs) +df +``` + +## Clustering Products + +In this final part of the notebook, we'll illustrate how the BERT text embeddings can be used to cluster products based on their descriptions. + +Intiuitively, our neural network has now learned which aspects of the text description are relevant to predict prices and market shares. +We can therefore use the embeddings produced by our network to cluster products, and we might expect that the clusters reflect market-relevant information. + +In the following block of cells, we compute embeddings using our learned models and cluster them using $k$-means clustering with $k=10$. Finally, we will explore how the estimated price elasticity differs across clusters. + +### Overview of **$k$-means clustering** +The $k$-means clustering algorithm seeks to divide $n$ data vectors into $k$ groups, each of which contain points that are "close together." + +In particular, let $C_1, \ldots, C_k$ be a partitioning of the data into $k$ disjoint, nonempty subsets (clusters), and define +$$\bar{C_i}=\frac{1}{\#C_i}\sum_{x \in C_i} x$$ +to be the *centroid* of the cluster $C_i$. The $k$-means clustering score $\mathrm{sc}(C_1 \ldots C_k)$ is defined to be +$$\mathrm{sc}(C_1 \ldots C_k) = \sum_{i=1}^k \sum_{x \in C_i} \left(x - \bar{C_i}\right)^2.$$ + +The $k$-means clustering is then defined to be any partitioning $C^*_1 \ldots C^*_k$ that minimizes the score $\mathrm{sc}(-)$. + +```{r} +## STEP 1: Compute embeddings + +input_ids = Input(shape=(128,), dtype=tf.int32) +token_type_ids = Input(shape=(128,), dtype=tf.int32) +attention_mask = Input(shape=(128,), dtype=tf.int32) + +Y1 = bert(input_ids, token_type_ids, attention_mask)[1] +Y2 = bert2(input_ids, token_type_ids, attention_mask)[1] +Y = Concatenate()([Y1,Y2]) + +embedding_model = Model([input_ids, token_type_ids, attention_mask], Y) + +embeddings = embedding_model.predict([tensors_holdout['input_ids'], + tensors_holdout['token_type_ids'], + tensors_holdout['attention_mask']]) +``` + +### Dimension reduction and the **Johnson-Lindenstrauss transform** + +Our learned embeddings have dimension in the $1000$s, and $k$-means clustering is often an expensive operation. To improve the situation, we will use a neat trick that is used extensively in machine learning applications: the *Johnson-Lindenstrauss transform*. + +This trick involves finding a low-dimensional linear projection of the embeddings that approximately preserves pairwise distances. + +In fact, Johnson and Lindenstrauss proved a much more interesting statement: a Gaussian random matrix will *almost always* approximately preserve pairwise distances. 
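To make the projection-plus-clustering pipeline concrete, the chunk below is an illustrative R sketch that mirrors the two steps: multiply an embedding matrix by a scaled Gaussian random matrix to reduce its dimension, then run $k$-means with $k = 10$ on the projected points (base R's `kmeans` targets the within-cluster sum-of-squares score defined above). It assumes `embeddings` is available as an ordinary R matrix (for example, the feature-extractor embeddings pulled into R earlier via `py$embeddings`); the target dimension of 256 is an arbitrary choice, whereas the Python code that follows derives the dimension from the Johnson-Lindenstrauss bound with `eps = .25`.

```{r}
set.seed(123)

# Gaussian random projection: X %*% R / sqrt(k) approximately preserves
# pairwise Euclidean distances with high probability.
gaussian_random_projection <- function(X, k) {
  R <- matrix(rnorm(ncol(X) * k), nrow = ncol(X), ncol = k)
  X %*% R / sqrt(k)
}

embeddings_lowdim_r <- gaussian_random_projection(embeddings, k = 256)

# k-means clustering (k = 10) on the projected embeddings
km <- kmeans(embeddings_lowdim_r, centers = 10, nstart = 10)
cluster_ids_r <- km$cluster
table(cluster_ids_r)
```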
+ + +```{r} +# STEP 2 Make low-dimensional projections +from sklearn.random_projection import GaussianRandomProjection + +jl = GaussianRandomProjection(eps=.25) +embeddings_lowdim = jl.fit_transform(embeddings) +``` + +```{r} +# STEP 3 Compute clusters +from sklearn.cluster import KMeans + +k_means = KMeans(n_clusters=10) +k_means.fit(embeddings_lowdim) +cluster_ids = k_means.labels_ +``` + +```{r} +# STEP 4 Regress within each cluster + +betas = np.zeros(10) +ses = np.zeros(10) + +r_p = holdout["ln_p"] - ln_p_hat_holdout.reshape((-1,)) +r_q = holdout["ln_q"] - ln_q_hat_holdout.reshape((-1,)) + +for c in range(10): + + r_p_c = r_p[cluster_ids == c] + r_q_c = r_q[cluster_ids == c] + + # Regress to obtain elasticity estimate + betas[c] = np.mean(r_p_c * r_q_c) / np.mean(r_p_c * r_p_c) + + # standard error on elastiticy estimate + ses[c] = np.sqrt(np.mean( (r_p_c * r_q_c)**2)/(np.mean(r_p_c*r_p_c)**2)/r_p_c.size) +``` + +```{r} +# STEP 5 Plot +from matplotlib import pyplot as plt + +plt.bar(range(10), betas, yerr = 1.96 * ses) +``` + diff --git a/PM5/DoubleML_and_Feature_Engineering_with_BERT.irnb b/PM5/DoubleML_and_Feature_Engineering_with_BERT.irnb index 06819c02..9bd6d4ed 100644 --- a/PM5/DoubleML_and_Feature_Engineering_with_BERT.irnb +++ b/PM5/DoubleML_and_Feature_Engineering_with_BERT.irnb @@ -1,1963 +1,1458 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "dYwg9btt1wJH" - }, - "source": [ - "# BERT\n", - "\n", - "**Bidirectional Encoder Representations from Transformers.**\n", - "\n", - "_ | _\n", - "- | -\n", - "![alt](https://pytorch.org/assets/images/bert1.png) | ![alt](https://pytorch.org/assets/images/bert2.png)\n", - "\n", - "\n", - "### **Overview**\n", - "\n", - "BERT was released together with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin *et al.* The model is based on the Transformer architecture introduced in [Attention Is All You Need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani *et al.* and has led to significant improvements in a wide range of natural language tasks.\n", - "\n", - "At the highest level, BERT maps from a block of text to a numeric vector which summarizes the relevant information in the text.\n", - "\n", - "What is remarkable is that numeric summary is sufficiently informative that, for example, the numeric summary of a paragraph followed by a reading comprehension question contains all the information necessary to satisfactorily answer the question.\n", - "\n", - "#### **Transfer Learning**\n", - "\n", - "BERT is a great example of a paradigm called *transfer learning*, which has proved very effective in recent years. In the first step, a network is trained on an unsupervised task using massive amounts of data. In the case of BERT, it was trained to predict missing words and to detect when pairs of sentences are presented in reversed order using all of Wikipedia. This was initially done by Google, using intense computational resources.\n", - "\n", - "Once this network has been trained, it is then used to perform many other supervised tasks using only limited data and computational resources: for example, sentiment classification in tweets or quesiton answering. The network is re-trained to perform these other tasks in such a way that only the final, output parts of the network are allowed to adjust by very much, so that most of the \"information'' originally learned the network is preserved. This process is called *fine tuning*." 
- ], - "id": "dYwg9btt1wJH" - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wNjNs3ViKTiO" - }, - "source": [ - "##Getting to know BERT\n", - "\n", - "BERT, and many of its variants, are made avialable to the public by the open source [Huggingface Transformers](https://huggingface.co/transformers/) project. This is an amazing resource, giving researchers and practitioners easy-to-use access to this technology.\n", - "\n", - "In order to use BERT for modeling, we simply need to download the pre-trained neural network and fine tune it on our dataset, which is illustrated below." - ], - "id": "wNjNs3ViKTiO" - }, - { - "cell_type": "code", - "source": [ - "install.packages(\"remotes\")\n", - "remotes::install_github(\"rstudio/tensorflow\")\n", - "install.packages(\"dplyr\")\n", - "install.packages(\"DBI\")\n", - "install.packages(\"ggplot2\")\n", - "install.packages(\"reticulate\")\n", - "install.packages(\"readr\")\n", - "install.packages(\"stringr\")\n", - "install.packages(\"tidyr\")\n", - "install.packages(\"purrr\")\n", - "install.packages(\"glmnet\")\n", - "install.packages(\"caret\")\n", - "install.packages(\"keras\")" - ], - "metadata": { - "id": "9rooQWVdri1m", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "e8d7baf0-1420-4405-d5d5-c63466fbbcdd", - "collapsed": true - }, - "id": "9rooQWVdri1m", - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Downloading GitHub repo rstudio/tensorflow@HEAD\n", - "\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "rlang (1.1.3 -> 1.1.4 ) [CRAN]\n", - "png (NA -> 0.1-8 ) [CRAN]\n", - "here (NA -> 1.0.1 ) [CRAN]\n", - "RcppTOML (NA -> 0.2.2 ) [CRAN]\n", - "Rcpp (NA -> 1.0.12) [CRAN]\n", - "cli (3.6.2 -> 3.6.3 ) [CRAN]\n", - "backports (1.4.1 -> 1.5.0 ) [CRAN]\n", - "reticulate (NA -> 1.38.0) [CRAN]\n", - "rstudioapi (0.15.0 -> 0.16.0) [CRAN]\n", - "tidyselect (1.2.0 -> 1.2.1 ) [CRAN]\n", - "whisker (0.4 -> 0.4.1 ) [CRAN]\n", - "config (NA -> 0.3.2 ) [CRAN]\n", - "tfautograph (NA -> 0.3.2 ) [CRAN]\n", - "tfruns (NA -> 1.5.3 ) [CRAN]\n", - "processx (3.8.3 -> 3.8.4 ) [CRAN]\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "Installing 15 packages: rlang, png, here, RcppTOML, Rcpp, cli, backports, reticulate, rstudioapi, tidyselect, whisker, config, tfautograph, tfruns, processx\n", - "\n", - "Installing packages into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\u001b[36m──\u001b[39m \u001b[36mR CMD build\u001b[39m \u001b[36m─────────────────────────────────────────────────────────────────\u001b[39m\n", - "* checking for file ‘/tmp/Rtmpa90Uuv/remotesb13c88dc59/rstudio-tensorflow-b68aa4a/DESCRIPTION’ ... OK\n", - "* preparing ‘tensorflow’:\n", - "* checking DESCRIPTION meta-information ... 
OK\n", - "* checking for LF line-endings in source and make files and shell scripts\n", - "* checking for empty or unneeded directories\n", - "* building ‘tensorflow_2.16.0.9000.tar.gz’\n", - "\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "also installing the dependencies ‘iterators’, ‘foreach’, ‘shape’, ‘RcppEigen’\n", - "\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "also installing the dependencies ‘listenv’, ‘parallelly’, ‘future’, ‘globals’, ‘future.apply’, ‘numDeriv’, ‘progressr’, ‘SQUAREM’, ‘diagram’, ‘lava’, ‘prodlim’, ‘proxy’, ‘clock’, ‘gower’, ‘hardhat’, ‘ipred’, ‘timeDate’, ‘e1071’, ‘ModelMetrics’, ‘plyr’, ‘pROC’, ‘recipes’, ‘reshape2’\n", - "\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "also installing the dependency ‘zeallot’\n", - "\n", - "\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TgWpXdSIl5KL" - }, - "source": [ - "##Getting to know BERT\n", - "\n", - "BERT, and many of its variants, are made avialable to the public by the open source [Huggingface Transformers](https://huggingface.co/transformers/) project. This is an amazing resource, giving researchers and practitioners easy-to-use access to this technology.\n", - "\n", - "In order to use BERT for modeling, we simply need to download the pre-trained neural network and fine tune it on our dataset, which is illustrated below." 
- ], - "id": "TgWpXdSIl5KL" - }, - { - "cell_type": "code", - "source": [ - "library(reticulate)\n", - "library(ggplot2)\n", - "library(DBI)\n", - "library(dplyr)\n", - "theme_set(theme_bw())" - ], - "metadata": { - "id": "ppJlcoIatlAw", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "3a5cbcea-04c9-4c67-ccb1-1b45e691ccf8", - "collapsed": true - }, - "id": "ppJlcoIatlAw", - "execution_count": 3, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "\n", - "Attaching package: ‘dplyr’\n", - "\n", - "\n", - "The following objects are masked from ‘package:stats’:\n", - "\n", - " filter, lag\n", - "\n", - "\n", - "The following objects are masked from ‘package:base’:\n", - "\n", - " intersect, setdiff, setequal, union\n", - "\n", - "\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "use_python(\"/usr/bin/python3\", required = TRUE) # Adjust the path as needed" - ], - "metadata": { - "id": "GmOhRKEG4jEy" - }, - "id": "GmOhRKEG4jEy", - "execution_count": 4, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "py_run_string('\n", - "import tensorflow as tf\n", - "import numpy as np\n", - "import pandas as pd\n", - "from transformers import BertTokenizer, TFBertModel\n", - "import warnings\n", - "warnings.simplefilter(\"ignore\")\n", - "')" - ], - "metadata": { - "id": "bUEb1TDIs4TK" - }, - "id": "bUEb1TDIs4TK", - "execution_count": 5, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Check GPU availability\n", - "# py_run_string('\n", - "# device_name = tf.test.gpu_device_name()\n", - "# if device_name != \"/device:GPU:0\":\n", - "# raise SystemError(\"GPU device not found\")\n", - "# print(\"Found GPU at:\", device_name)\n", - "# ')" - ], - "metadata": { - "id": "A7HTpjkA4u54", - "collapsed": true - }, - "id": "A7HTpjkA4u54", - "execution_count": 6, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "ssq <- function(x) sum(x * x)\n", - "\n", - "get_r2 <- function(y, yhat) {\n", - " resids <- yhat - y\n", - " flucs <- y - mean(y)\n", - " rss <- ssq(resids)\n", - " tss <- ssq(flucs)\n", - " cat(sprintf(\"RSS: %f, TSS + MEAN^2: %f, TSS: %f, R^2: %f\", rss, tss + mean(y)^2, tss, 1 - rss/tss))\n", - "}" - ], - "metadata": { - "id": "hZaltj7Fv5Gh" - }, - "id": "hZaltj7Fv5Gh", - "execution_count": 7, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "py_run_string('\n", - "tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n", - "bert = TFBertModel.from_pretrained(\"bert-base-uncased\")\n", - "')" - ], - "metadata": { - "id": "CB3ur5xF41o-" - }, - "id": "CB3ur5xF41o-", - "execution_count": 8, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "26mRwUFwardQ" - }, - "source": [ - "### Tokenization\n", - "\n", - "The first step in using BERT (or any similar text embedding tool) is to *tokenize* the data. This step standardizes blocks of text, so that meaningless differences in text presentation don't affect the behavior of our algorithm.\n", - "\n", - "Typically the text is transformed into a sequence of 'tokens,' each of which corresponds to a numeric code." 
- ], - "id": "26mRwUFwardQ" - }, - { - "cell_type": "code", - "source": [ - "py_run_string('\n", - "s = \"What happens to this string?\"\n", - "tensors = tokenizer.encode_plus(s, add_special_tokens = True, return_tensors = \"tf\")\n", - "output = bert(tensors)\n", - "')" - ], - "metadata": { - "id": "cER5mL4fMSCr" - }, - "id": "cER5mL4fMSCr", - "execution_count": 9, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Let's try it out!\n", - "s <- \"What happens to this string?\"\n", - "py_run_string\n", - "input_ids <- py$tensors$input_ids\n", - "attention_mask <- py$tensors$attention_mask\n", - "token_type_ids <- py$tensors$token_type_ids\n", - "\n", - "print(sprintf('Original String: \"%s\"', s))\n", - "print(\"Numeric encoding: \")\n", - "print(list(\n", - " input_ids = input_ids,\n", - " attention_mask = attention_mask,\n", - " token_type_ids = token_type_ids\n", - "))\n", - "# What does this mean?\n", - "py_run_string('tokens = tokenizer.convert_ids_to_tokens(tensors[\"input_ids\"].numpy().flatten().tolist())')\n", - "tokens <- py$tokens\n", - "print(\"Actual tokens:\")\n", - "print(tokens)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 329 - }, - "id": "KVETer7w5euE", - "outputId": "3b7b87f7-69f4-4727-bccf-3bd8d6162e00" - }, - "id": "KVETer7w5euE", - "execution_count": 10, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "
function (code, local = FALSE, convert = TRUE) \n",
-              "{\n",
-              "    invisible(py_run_string_impl(code, local, convert))\n",
-              "}
" - ], - "text/markdown": "```r\nfunction (code, local = FALSE, convert = TRUE) \n{\n invisible(py_run_string_impl(code, local, convert))\n}\n```", - "text/latex": "\\begin{minted}{r}\nfunction (code, local = FALSE, convert = TRUE) \n\\{\n invisible(py\\_run\\_string\\_impl(code, local, convert))\n\\}\n\\end{minted}", - "text/plain": [ - "function (code, local = FALSE, convert = TRUE) \n", - "{\n", - " invisible(py_run_string_impl(code, local, convert))\n", - "}\n", - "\n", - "" - ] - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[1] \"Original String: \\\"What happens to this string?\\\"\"\n", - "[1] \"Numeric encoding: \"\n", - "$input_ids\n", - "\n", - "\n", - "$attention_mask\n", - "\n", - "\n", - "$token_type_ids\n", - "\n", - "\n", - "[1] \"Actual tokens:\"\n", - "[1] \"[CLS]\" \"what\" \"happens\" \"to\" \"this\" \"string\" \"?\" \n", - "[8] \"[SEP]\" \n" - ] - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JJaz6eEocefa" - }, - "source": [ - "### BERT in a nutshell\n", - "\n", - "Once we have our numeric tokens, we can simply plug them into the BERT network and get a numeric vector summary. Note that in applications, the BERT summary will be \"fine tuned\" to a particular task, which hasn't happened yet." - ], - "id": "JJaz6eEocefa" - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "Q1ODAgBMa3Zg", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "7ccd4481-6489-4830-c86b-a3c40cb1c37c", - "collapsed": true - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Input: \"What happens to this string?\"\n", - "\n", - "Output type: \n", - "\n", - "Output shape: (1, 768)\n", - "\n", - "Output preview: array([[-0.969214 , -0.5247471 , -0.921133 , 0.91463196, 0.51964986,\n", - " -0.32456928, 0.9706122 , 0.49300137, -0.82831514, -0.99999857,\n", - " -0.3816222 , 0.9569837 , 0.9913891 , 0.70681006, 0.98294646,\n", - " -0.9001979 , -0.6150293 , -0.7787723 , 0.46308163, -0.82709247,\n", - " 0.8457982 , 0.9999907 , 0.08221605, 0.5372562 , 0.6345672 ,\n", - " 0.9896697 , -0.87372965, 0.9743809 , 0.98041123, 0.8377358 ,\n", - " -0.87578595, 0.34426346, -0.9932326 , -0.32497373, -0.9177454 ,\n", - " -0.99635667, 0.5514211 , -0.8553031 , -0.14537612, -0.15756267,\n", - " -0.94222736, 0.3940587 , 0.9999956 , -0.15938273, 0.56559354,\n", - " -0.49784598, -1. , 0.45159677, -0.94606155, 0.9138866 ,\n", - " 0.894105 , 0.8145405 , 0.34917742, 0.6689711 , 0.63060194,\n", - " -0.44194648, 0.07209369, 0.25390887, -0.41195402, -0.73743373,\n", - " -0.75841326, 0.607392 , -0.8874512 , -0.9479397 , 0.9202822 ,\n", - " 0.8419094 , -0.27363837, -0.4571576 , -0.30537918, 0.13075443,\n", - " 0.966184 , 0.33864886, -0.19612612, -0.9207082 , 0.7088074 ,\n", - " 0.45681113, -0.77244145, 1. , -0.79896045, -0.9872875 ,\n", - " 0.776911 , 0.8458453 , 0.71652466, -0.41961768, 0.6352247 ,\n", - " -1. , 0.67543024, -0.20674996, -0.99473673, 0.45124725,\n", - " 0.56537026, -0.35305175, 0.5736773 , 0.7470167 , -0.7259696 ,\n", - " -0.6088721 , -0.47434014, -0.8264615 , -0.36119705, -0.46513397,\n", - " 0.17759693, -0.42618233, -0.5134769 , -0.5112346 , 0.507512 ,\n", - " -0.6803123 , -0.78912073, 0.56096 , 0.31418112, 0.8256635 ,\n", - " 0.6137928 , -0.48744348, 0.63172615, -0.97583294, 0.804665 ,\n", - " -0.5379689 , -0.9944425 , -0.7640108 , -0.9945089 , 0.79497546,\n", - " -0.5527532 , -0.38338676, 0.9836922 , -0.21038826, 0.5786874 ,\n", - " -0.4175546 , -0.90507156, -1. 
, -0.80496746, -0.67468506,\n", - " -0.33667699, -0.44921163, -0.9891523 , -0.9734636 , 0.7879726 ,\n", - " 0.98081094, 0.34198916, 0.99997675, -0.47837928, 0.96807194,\n", - " -0.31762508, -0.6842363 , 0.6482304 , -0.62070376, 0.851923 ,\n", - " 0.5820076 , -0.8776717 , 0.3753403 , -0.43739372, 0.38933575,\n", - " -0.84821516, -0.42670748, -0.8712478 , -0.9629567 , -0.5227983 ,\n", - " 0.9735279 , -0.58774203, -0.9501498 , -0.04851773, -0.4030671 ,\n", - " -0.6616436 , 0.937181 , 0.8399379 , 0.5786255 , -0.53851986,\n", - " 0.608611 , 0.5894956 , 0.6932158 , -0.9471889 , -0.08071883,\n", - " 0.61745256, -0.48077726, -0.85866666, -0.9854245 , -0.54912895,\n", - " 0.70028615, 0.99461323, 0.8898859 , 0.47220692, 0.83854306,\n", - " -0.50452465, 0.7651799 , -0.9775896 , 0.98937917, -0.292829 ,\n", - " 0.3849252 , -0.320667 , 0.42379427, -0.93404233, 0.20117493,\n", - " 0.9306183 , -0.5900559 , -0.8919762 , -0.2591962 , -0.56046534,\n", - " -0.6167952 , -0.7926062 , 0.67214435, -0.43549895, -0.51297027,\n", - " -0.20104748, 0.9576271 , 0.9956705 , 0.87649155, 0.2149536 ,\n", - " 0.81554365, -0.9258257 , -0.7148402 , 0.3126772 , 0.4378628 ,\n", - " 0.21117692, 0.9972139 , -0.6545783 , -0.34187165, -0.97159296,\n", - " -0.9917849 , 0.13874346, -0.95807856, -0.24090737, -0.8362934 ,\n", - " 0.83134925, -0.02485377, 0.5902031 , 0.66127837, -0.9960211 ,\n", - " -0.9039419 , 0.57700354, -0.6185345 , 0.6124113 , -0.4798829 ,\n", - " 0.7660803 , 0.9616995 , -0.7682784 , 0.9197671 , 0.9530273 ,\n", - " -0.86191773, -0.8475627 , 0.94052964, -0.4518577 , 0.9570647 ,\n", - " -0.7576784 , 0.99691117, 0.9099429 , 0.8641262 , -0.96159774,\n", - " -0.75002694, -0.9515814 , -0.8360319 , -0.13117565, 0.2420047 ,\n", - " 0.9326485 , 0.7831623 , 0.55954635, 0.29953393, -0.7969798 ,\n", - " 0.999857 , -0.8307854 , -0.9768555 , 0.11506277, -0.3768013 ,\n", - " -0.9939753 , 0.91130793, 0.42434266, 0.27517143, -0.6154874 ,\n", - " -0.86753005, -0.98156863, 0.9588888 , 0.36419028, 0.99798036,\n", - " -0.5174248 , -0.9825296 , -0.76033354, -0.96041787, 0.01664599,\n", - " -0.38303357, -0.28204 , -0.07684401, -0.97971845, 0.67197174,\n", - " 0.7270586 , 0.6505339 , -0.863773 , 0.99991965, 1. ,\n", - " 0.9831044 , 0.95112723, 0.9671731 , -0.9999366 , -0.47617579,\n", - " 0.9999995 , -0.9957464 , -1. , -0.9702093 , -0.7724236 ,\n", - " 0.5686776 , -1. , -0.44194952, -0.19867025, -0.9511127 ,\n", - " 0.7548529 , 0.98874015, 0.99917936, -1. , 0.9133393 ,\n", - " 0.9748863 , -0.78451705, 0.9694428 , -0.60926914, 0.98597527,\n", - " 0.73211765, 0.6097775 , -0.41699415, 0.5658006 , -0.9509543 ,\n", - " -0.94915485, -0.69839555, -0.7952085 , 0.99812067, 0.37889785,\n", - " -0.9252992 , -0.95140046, 0.60411584, -0.2677717 , -0.19147323,\n", - " -0.97972316, -0.28978664, 0.7013556 , 0.8766078 , 0.25837412,\n", - " 0.47869623, -0.86088717, 0.36461812, 0.08905052, 0.5959429 ,\n", - " 0.80502397, -0.9765572 , -0.82025707, -0.05646016, -0.20963307,\n", - " -0.7015158 , -0.9814151 , 0.981326 , -0.58565354, 0.88929856,\n", - " 1. , 0.45330504, -0.94868374, 0.7073493 , 0.45062408,\n", - " -0.5440919 , 1. 
[... remaining pooler_output values omitted ...] -0.7552874 , -0.82799286, 0.96749127]], dtype=float32)\n" - ] - } - ],
 - "source": [ - "# Load the reticulate library\n", - "library(reticulate)\n", - "\n", - "input_text <- \"What happens to this string?\"\n", - "\n", - "\n", - "cat(sprintf(\"Input: \\\"%s\\\"\\n\\n\", input_text))\n", - "\n",
 - "py_run_string(sprintf('\n", - "tensors_tf = tokenizer(\"%s\", return_tensors=\"tf\")\n", - "output = bert(tensors_tf)\n", - "', input_text))\n", - "\n", - "output <- py$output\n", - "\n",
 - "py_run_string('\n", - "from pprint import pformat\n", - "output_type = str(type(output[\"pooler_output\"]))\n", - "output_shape = output[\"pooler_output\"].shape\n", - "output_preview = pformat(output[\"pooler_output\"].numpy())\n", - "')\n", - "\n",
 - "output_type <- py$output_type\n", - "output_shape <- py$output_shape\n", - "output_preview <- py$output_preview\n", - "\n",
 - "cat(sprintf(\n", - "\"Output type: %s\\n\\nOutput shape: %s\\n\\nOutput preview: %s\\n\",\n", - "output_type,\n", - "paste(output_shape, collapse=\", \"),\n", - "output_preview\n", - "))\n" - ], - "id": "Q1ODAgBMa3Zg" - },
 - { - "cell_type": "markdown", - "metadata": { - "id": "y_CnEClsl_1p" - }, - "source": [ - "# A practical introduction to BERT\n", - "\n",
 - "In the next part of the notebook, we are going to explore how a tool like BERT may be useful for causal inference.\n", - "\n",
 - "In particular, we are going to apply BERT to a subset of data from the Amazon marketplace consisting of roughly 10,000 listings for products in the toy category. Each product comes with a text description, a price, and a number of times reviewed (which we'll use as a proxy for demand / market share).\n", - "\n",
 - "For more information on the dataset, check out the [Dataset README](https://github.com/CausalAIBook/MetricsMLNotebooks/blob/main/data/amazon_toys.md).\n", - "\n",
 - "**For thought**:\n", - "What are some issues you may anticipate when using number of reviews as a proxy for demand or market share?\n", - "\n",
 - "### Getting to know the data\n", - "\n", - "First, we'll download and clean up the data, and do some preliminary inspection."
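A side note on the read step in the next cell: a couple of raw number_of_reviews values are stored with thousands separators ("1,040" and "1,399"), which is exactly what the problems() output below flags as failing to parse as doubles. A minimal alternative sketch (not part of the original cell; the object name toys is illustrative) handles this at read time with readr's col_number(), which drops grouping marks:

    library(readr)

    data_url <- "https://github.com/CausalAIBook/MetricsMLNotebooks/raw/main/data/amazon_toys.csv"
    # col_number() strips grouping marks, so "1,040" is parsed as 1040 directly
    toys <- read_csv(data_url,
                     col_types = cols(number_of_reviews = col_number()),
                     show_col_types = FALSE)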
- ], - "id": "y_CnEClsl_1p" - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "_d5eA3xyzdtb" - }, - "id": "_d5eA3xyzdtb", - "execution_count": 12, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "library(readr)\n", - "library(stringr)\n", - "library(tidyr)\n", - "library(purrr)\n", - "data_url <- \"https://github.com/CausalAIBook/MetricsMLNotebooks/raw/main/data/amazon_toys.csv\"\n", - "data <- read_csv(data_url, show_col_types = FALSE)\n", - "problems(data)\n", - "\n", - "data <- data %>%\n", - " mutate(\n", - " number_of_reviews = as.numeric(str_replace_all(number_of_reviews, \",\", \"\"))\n", - " )\n" - ], - "metadata": { - "id": "5kzXygwH0BKw", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 247 - }, - "outputId": "f368dd28-317f-4815-d05b-a0c2ce7f05d6" - }, - "execution_count": 45, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "Warning message:\n", - "“\u001b[1m\u001b[22mOne or more parsing issues, call `problems()` on your data frame for details,\n", - "e.g.:\n", - " dat <- vroom(...)\n", - " problems(dat)”\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\n", - "
A tibble: 2 × 5
row    col    expected    actual    file
<int>  <int>  <chr>       <chr>     <chr>
8874   6      a double    1,040
9317   6      a double    1,399
\n" - ], - "text/markdown": "\nA tibble: 2 × 5\n\n| row <int> | col <int> | expected <chr> | actual <chr> | file <chr> |\n|---|---|---|---|---|\n| 8874 | 6 | a double | 1,040 | |\n| 9317 | 6 | a double | 1,399 | |\n\n", - "text/latex": "A tibble: 2 × 5\n\\begin{tabular}{lllll}\n row & col & expected & actual & file\\\\\n & & & & \\\\\n\\hline\n\t 8874 & 6 & a double & 1,040 & \\\\\n\t 9317 & 6 & a double & 1,399 & \\\\\n\\end{tabular}\n", - "text/plain": [ - " row col expected actual file\n", - "1 8874 6 a double 1,040 \n", - "2 9317 6 a double 1,399 " - ] - }, - "metadata": {} - } - ], - "id": "5kzXygwH0BKw" - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": { - "id": "1Su5vOGhD3Df", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "577bbbd9-1a7d-4f7e-d9bd-decd46830dc0" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\u001b[90m# A tibble: 6 × 4\u001b[39m\n", - " text ln_p ln_q amazon_category_and_…¹\n", - " \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \n", - "\u001b[90m1\u001b[39m Hornby 2014 Catalogue | Hornby | Product … 1.23 -\u001b[31m8\u001b[39m\u001b[31m.\u001b[39m\u001b[31m69\u001b[39m Hobbies > Model Train…\n", - "\u001b[90m2\u001b[39m FunkyBuys® Large Christmas Holiday Expres… 2.83 -\u001b[31m10\u001b[39m\u001b[31m.\u001b[39m\u001b[31m7\u001b[39m Hobbies > Model Train…\n", - "\u001b[90m3\u001b[39m CLASSIC TOY TRAIN SET TRACK CARRIAGES LIG… 2.30 -\u001b[31m8\u001b[39m\u001b[31m.\u001b[39m\u001b[31m56\u001b[39m Hobbies > Model Train…\n", - "\u001b[90m4\u001b[39m HORNBY Coach R4410A BR Hawksworth Corrido… 3.69 -\u001b[31m11\u001b[39m\u001b[31m.\u001b[39m\u001b[31m4\u001b[39m Hobbies > Model Train…\n", - "\u001b[90m5\u001b[39m Hornby 00 Gauge 0-4-0 Gildenlow Salt Co. 
… 3.47 -\u001b[31m10\u001b[39m\u001b[31m.\u001b[39m\u001b[31m3\u001b[39m Hobbies > Model Train…\n", - "\u001b[90m6\u001b[39m 20pcs Model Garden Light Double Heads Lam… 1.94 -\u001b[31m10\u001b[39m\u001b[31m.\u001b[39m\u001b[31m7\u001b[39m Hobbies > Model Train…\n", - "\u001b[90m# ℹ abbreviated name: ¹​amazon_category_and_sub_category\u001b[39m\n", - "99% \n", - "316 \n" - ] - } - ], - "source": [ - "data <- data %>%\n", - " mutate(\n", - " number_of_reviews = as.numeric(str_replace_all(number_of_reviews, \"\\\\D+\", \"\")),\n", - " price = as.numeric(str_extract(price, \"\\\\d+\\\\.?\\\\d*\"))\n", - " ) %>%\n", - " filter(number_of_reviews > 0) %>%\n", - " mutate(\n", - " ln_p = log(price),\n", - " ln_q = log(number_of_reviews / sum(number_of_reviews)),\n", - " text = str_c(product_name, manufacturer, product_description, sep = \" | \")\n", - " ) %>%\n", - " select(text, ln_p, ln_q, amazon_category_and_sub_category) %>%\n", - " drop_na()\n", - "print(head(data))\n", - "data$text_num_words <- str_split(data$text, \"\\\\s+\") %>% map_int(length)\n", - "print(quantile(data$text_num_words, 0.99, na.rm = TRUE))" - ], - "id": "1Su5vOGhD3Df" - }, - { - "cell_type": "code", - "source": [ - "ggplot(data, aes(x = text_num_words)) +\n", - " geom_density() +\n", - " labs(title = \"Density Plot of Text Lengths in Words\")" - ], - "metadata": { - "id": "lovFEHaWp4lC", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 437 - }, - "outputId": "2c191aaa-d7ed-4b62-ef28-c513fabfab05" - }, - "execution_count": 47, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - "plot without title" - ], - "image/png": "iVBORw0KGgoAAAANSUhEUgAAA0gAAANICAMAAADKOT/pAAADAFBMVEUAAAABAQECAgIDAwME\nBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUW\nFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJyco\nKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6\nOjo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tM\nTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1e\nXl5fX19gYGBhYWFiYmJjY2NkZGRlZWVmZmZnZ2doaGhpaWlqampra2tsbGxtbW1ubm5vb29w\ncHBxcXFycnJzc3N0dHR1dXV2dnZ3d3d4eHh5eXl6enp7e3t8fHx9fX1+fn5/f3+AgICBgYGC\ngoKDg4OEhISFhYWGhoaHh4eIiIiJiYmKioqLi4uMjIyNjY2Ojo6Pj4+QkJCRkZGSkpKTk5OU\nlJSVlZWWlpaXl5eYmJiZmZmampqbm5ucnJydnZ2enp6fn5+goKChoaGioqKjo6OkpKSlpaWm\npqanp6eoqKipqamqqqqrq6usrKytra2urq6vr6+wsLCxsbGysrKzs7O0tLS1tbW2tra3t7e4\nuLi5ubm6urq7u7u8vLy9vb2+vr6/v7/AwMDBwcHCwsLDw8PExMTFxcXGxsbHx8fIyMjJycnK\nysrLy8vMzMzNzc3Ozs7Pz8/Q0NDR0dHS0tLT09PU1NTV1dXW1tbX19fY2NjZ2dna2trb29vc\n3Nzd3d3e3t7f39/g4ODh4eHi4uLj4+Pk5OTl5eXm5ubn5+fo6Ojp6enq6urr6+vs7Ozt7e3u\n7u7v7+/w8PDx8fHy8vLz8/P09PT19fX29vb39/f4+Pj5+fn6+vr7+/v8/Pz9/f3+/v7////i\nsF19AAAACXBIWXMAABJ0AAASdAHeZh94AAAgAElEQVR4nO2deYBN5f/HPzMMwlgKyZKkEhE1\nfRUViUrZE4oQUqIiKopCiiylKEmlfaONSr5CRV/RYikRWQvFNWMWM5Yx8/zuucvcc5dzzznP\n82Ge8Xu//rjnnOc5n/fz3OU1c9dzSAAAlKHCngAApwIQCQAGIBIADEAkABiASAAwAJEAYAAi\nAcCALiKNoJec7jqKZjjcc+1/ksrslJyQBBNpinWniytozVpqoR4CTgQnRKS1ZFCmdpfXs52W\nPF3rHe/lW/Ot4yipetflxma0SLHLhGhALR/Y71sbQyHKW0zBlLKW6jiduKk2rkj+KxjJ1fS6\nf+V6GulfuZsetg6BSNpygkQq271797b1Eqj6YleFVftbx3W/riYlPC9iiRS7TByhcrmB1fc7\nemlClY1FD/vBXYvkq40rUmwmUTffMqcUNfK3nEPLrXeHSNpygkTyPw53DqCk/7qo20YWIvnj\n8mZQib9iiGRRJg5SrbDtT6mVw8HdiuSvlRDpd6p43FgupLMT/jFWNtHpx613h0jackJFEmIy\n1cjxLvJfuaJsqQtHH/Kujqb5GzpVLtnoPaN7XsuKSWe1WSj8LyG6GE+8rrySvvCVfkNXRsVd\nS7OCIh177rKyJesM3iMCZcGxQ+0dfU/k/gxNKyhSaDYrEs8/7G3IrJa0JiwlTCSrye+6rdJp\nl32cSk2CM5hIz/7etkKpRh+EXzU/vtdIYfU+zqXvjcX9
[... base64-encoded PNG data for the "Density Plot of Text Lengths in Words" figure omitted ...]
P5p32cD\nsnWeYv6AGdlH3umayTrHkyuSp8M2r/+d1p3UQR2SOeEvIfa33xKco55znfharx+EzlMcsNS3\n0HmK6e03CZHWfjPrHE+uSCu7GF8xvlffj2k2dUwLzlHLua6887BXJI2neKD90vtuGb5J5ykK\n8dC0zMPv3XmUdY4nV6RFdxiXo2af1EFdkDno9YI56jjXrN5rhVckjae4uf0jf2fOvjVd4yl6\nXw0Pbt++91bem/Eki9TXuNTsZg3x910z8wvmqONcn3tO+ETSd4qb23ufIR2/bYnGUxS5Q2ak\nZ8/rmcY6x5Mr0ir/P9F5J3VQx6zr8bkIzVHDua7tnekTSeMpetr/6b0cPE/jKYo1HYxvqfVb\nwDrHkytSagfvzZzRccNJHdQpv9/2s7EIzlHDuU7u0qNHjw7dJmg8xbze3j9GR7st13iK4pf2\n2d7L3gtY53iS3/6eOHT77rHDuH/UzMLRAe97vBwumKN+c800Jnj74gyNpyjm9Vzrmd5b51tR\nZPeekXX04y57Wed4kkXKnta754S0kzumQ9a19/FFwRw1nav3qZ3OU8x7s1fnkX9pPUWxc1zP\nWx/+lXeO+IoQAAxAJAAYgEgAMACRAGAAIgHAAEQCgAGIBAADEAkABiDS/2+6lynsGZwiQCRN\nWGtxT1i1MwGRmIBImjDd4p6wamcCIjEBkfTgBiJKEeLb1smnXfKaEAtoqrfxfZoRaC/g6qvW\nXJtc+dZ9QjRqZGx3PMNoW/6fUtUmHxtRrWyrbaZ9q3X2XrxFdwpDx61i4dVlS130TL4QV179\neY2mIn9cjZIN5hki7b3z7JJn3rzpJF7bUxCIpAdbOtJPG8WSYs0/XzzQkKhPmV0i46yW+f72\nEK1q/ufrfR8V62MWqVWNlr/83Zlaj9v9Xbm2pn37VPJa07fS+d7Vm+uITxPafLZkGD0kxLUX\nX/jiF2IS9fz6wwZ1vSJdUfXVZe82rJJ9Mq/vKQdE0oT+xj1xyXnGo7lD8mGRXrODGJy8M9Ae\nohV9b1xWCxOJ1gmxgpp5N3uan6m9S78Lcc4I2iPyKw0SF55tnNmwU9IB7/6fCJFfrYF3c29S\nGZFBI71rWyfsOSnX81QFImmCIcw+GnLYyyz6UYjFNCrxFREtUmnjsk9imEiGPVvpQe/lg2Q6\nyea+hJfEDtpY6T3xK83fQwONttfoC9GqxDEhdtH9xnbTMuLYGbWWaHW4rCIJRNIEQ5i1FMD7\nH0MMoNbBdhOtagUbTSIZbTtoovdyBJnPLdu4h5hTRXS6S0xPyvyRxhtNC2m27/+ZWO3f7uKV\n8PvadEaXd3NP7PU71YFImuAXqd8PPjxC5F2VUCtTqIn0UA3Rq6t4tq64+RrxE40zmr6kV/37\nr/KL1Mn4b3Z86YP16bKcE3sFT3EgkiYYbqRSn4LtqcU/LzdAxBHpEuM1jrg8nkhf0/YaL3oV\n2ltpoviH7jKaZtMi//7baLCx3Tj4omomvcF/pf4fAZE04U7yPrdqUt7w4M1RueKP00aIF7wP\nen97iJBI1xrvye07LZ5IR04bRRvE8eQxtEaIBtWMY+e0KZ3h3z+vUh3vC6PNCWXEz92Nw15v\npSkn42qeskAkTXicxn0kvk26+M3/jk66Qxy/ok6OyGta/aC/PURIpOdo4r9rWl4UTyRxfaXK\nxmXlKl7nvky8fv5X9xh7+TMeo5s/fqlWShnxT/LFr339QbNy2p9fT2sgkib8fUlSXSFWXJec\ndMHkXDGJjENob0jqHWgvICTS0WHVSzb6fHByPJGm0i3eyyfpdmNj8VVlSl4ypyDj+MiqJRp+\nem8JIdZ3rpJUrfOaE34VT2kgEgAMQCQAGIBIADAAkfTnKyrgJd6dARsQSX+yfivA/nCgrnYG\nbEAkABiASAAwAJEAYAAiAcAARAKAAYgEAAMQCQAG/g8Z0WDeJQEdigAAAABJRU5ErkJggg==" - }, - "metadata": { - "image/png": { - "width": 420, - "height": 420 - } - } - } - ], - "id": "lovFEHaWp4lC" - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CDlHPQZfcv7I" - }, - "source": [ - "Let's make a two-way scatter plot of prices and (proxied) market shares." 
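As a companion to the scatter plot drawn in the next cell, a minimal sketch (not part of the original notebook): the straight line that geom_smooth(method = "lm") overlays is the OLS fit of ln_q on ln_p, so its slope (the purely descriptive log price / log market-share association) can be read off directly:

    # Descriptive OLS fit of log (proxied) market share on log price;
    # this is the same line geom_smooth(method = "lm") draws in the plot.
    ols_fit <- lm(ln_q ~ ln_p, data = data)
    coef(summary(ols_fit))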
- ], - "id": "CDlHPQZfcv7I" - }, - { - "cell_type": "code", - "source": [ - "p1 <- ggplot(data, aes(x = ln_p, y = ln_q)) +\n", - " geom_point() +\n", - " geom_smooth(method = \"lm\", color = \"red\") +\n", - " labs(title = \"Scatter Plot with Regression Line\")\n", - "print(p1)" - ], - "metadata": { - "id": "dNujhir1q_0N", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 454 - }, - "outputId": "06ba2da1-4ffc-4dc4-d762-f0bd6cdc039c" - }, - "execution_count": 16, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "\u001b[1m\u001b[22m`geom_smooth()` using formula = 'y ~ x'\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "plot without title" - ], - "image/png": "iVBORw0KGgoAAAANSUhEUgAAA0gAAANICAIAAAByhViMAAAACXBIWXMAABJ0AAASdAHeZh94\nAAAgAElEQVR4nOzdZ0BTV9gH8BMCIWGDIA4cKC5wCw7EPRBtq1gVARUVW9FqVcSqVaugIFZE\nrNZRFbeideDeigMcqCAi4BYcqOw9QpL3w33fNG8SIECSm4T/71Py5Nx7ntwbkoc7zmEIBAIC\nAAAAAOpPi+4EAAAAAEA+UNgBAAAAaAgUdgAAAAAaAoUdAAAAgIZAYQcAAACgIVDYAQAAAGgI\nFHYAAAAAGgKFnSZYvnw5g8HYsmULvWksWbKEwWBs3769sgbBwcEMBiMkJESZWamjarekMqlU\nMgAAUDX1LuwSExNnz57drVu3Ro0a6ejoGBoaduzYcdasWcnJyYro7sCBA2fOnKk6Ii/x8fEM\nCSwWy8rKasKECXfu3Kl7F3JP3tTUtEWLFoaGhvLtQuqmYDAYenp67du3/+WXX16/fl3HLlSN\n5JZUEGrb2tjYqEIyAABQdwz1nXni0KFD06ZN43K5Xbp06datm56e3rdv32JiYj5//qyrq3vq\n1CkXFxf59ti4ceNRo0bt2rWrioi8xMfHd+vWzcDAYNSoUcJgdnZ2SkrKhw8fGAxGWFjYr7/+\nSsWXL18eGBi4efPmOXPmyN6F4pKvrIvg4OClS5euX7/ez89P9pVI3RQCgeDr168JCQk5OTkc\nDufMmTNDhw6Vc/b1ALVtW7durXnFMQBA/aRNdwK1lJeXN3PmTD6ff/z48R9//FEY5/P5a9as\nWblypY+Pz5s3b7S15fYG3759++XLl6ojcmdpaRkRESEa4fP5W7dunTt37qJFi1xdXZs1a1a7\nNSshefl2IbkpCCHFxcV+fn7btm3z9vZ+9+6dlpZ6H4EGAACoI3X9IYyLiysqKnJwcBCt6ggh\nWlpaf/zxh5eX17hx4zIyMoRxqh5ycHAwMDAwNDQcMmTI7du3RRfMz89funRphw4dOByOrq5u\nmzZtFi1alJ+fT706bty41q1bE0J2797NYDCcnJwkI1RLgUCwa9euPn36GBoacjicDh06rFix\noqioSNjRsmXLGAzGmTNntm7d2rRpUxMTkxq9cS0trTlz5gwePLi8vPzChQuVNeNyuZs2bXJw\ncDA0NGSz2TY2NnPmzPn8+XNlb0dyDU2bNuVwOOXl5cJIbGwsdQ7006dPwmBGRoaWlpaDgwP5\n/xdjVdEFk8lMSkr67rvvTE1NORxO165djx49WqONIKSnp7dlyxYjI6O0tLRnz54J49XuBUJI\nWlqah4eHhYWFnp6eg4PDyZMns7OzGQxGr169qAaV7SlZVn78+PHBgwebmZmxWKwmTZq4uLhc\nvHhR9gaSl7VVvUMJIStWrKCyff78uaura8OGDdlsdteuXY8cOVK7bVtZMjJ2JMtWAgAAuVPX\nI3bGxsaEkOzsbD6fL3mcZu/evWIRNze348eP29raenl55eXlnT59esCAAfv37588eTIhhMvl\nfvfdd3fu3OnRo8ecOXO4XO6lS5dCQkJu3bp17949JpM5bdo0Q0PDvXv39u7d283NrWnTpnp6\nemIRqqMpU6YcPHiwcePGM2fO1NXVvXHjxpo1a86dO3f79m3qKiUWi0UIuXXr1vbt20ePHm1g\nYFCLt9+2bdsbN258+/ZN6qt8Pn/06NEXL15s3769t7e3kZHRo0eP/v7775MnT967d69FixaS\nb0dyJcOGDdu3b9/Dhw+FNdnNmzdZLFZ5efmtW7c8PDyoYFRUlEAgcHZ2Flu8ii4+ffrUt2/f\nnj17ent7v3r16syZM+7u7ubm5kOGDKnFptDS0mrYsGF+fn5ZWZkwWO1eyMrKcnJy+vDhQ9++\nfYcOHfrx40dPT8/ly5cTQthsNrWSyvZUtSvfuXPnzz//bGFhMWHChIYNG3769CkyMnLUqFH7\n9u2jPm/VNhBT7Q4VZhsXFzdlypRevXpNmjTp5cuX58+f9/DwsLS0HDx4cC22rVQydlTtVgIA\nAIUQqCcul9u+fXtCyPDhw588eVJ1Y+pYgouLS0VFBRVJSUnR09PT19cvKCgQCAQnTpwghPTu\n3VvYoKysjFr/mTNnqMi///5LCPH29hauVjJCHXnq0aNHfn4+FeHz+dR1b0uWLKEiQUFBhBBj\nY+PLly9XkXNcXBwhpHXr1lJfHThwICEkPDycerps2TJCyObNm6mn//zzDyGkT58+paWlwkWo\nqmXChAmVJS/m4MGDhJDAwEBhxNnZuU+fPi1atPj555+FwdmzZxNCbt++LRAIFi9eTAjZtm1b\nZV2sXbuWEMJisQ4cOCAM/vbbb4SQKVOm1G5TvHjxQktLS0dHJzc3l4rIsheoLTZ+/HjheqKj\nozkcDiFkwIABVETqnpJl5Z06dSKEvH79WrjUhw8fDA0Ne/fuLWMDsS0pyw6Vum2paxm9vLxq\nt22lJiNLR7JsJQAAUAR1PRWrra196tQpW1vbK1eudO/evWXLlpMmTdq6davo+TihPXv2EEJ+\n//13JpNJRdq1axcYGOjj40Md9OrevfvJkyc3b94sbMBisUaPHk0ISUhIkD2rnTt3EkLWrl0r\nPCbBYDBWr16to6Ozb98+YYQQ0qFDh+HDh9fijQsEgh07dkRFRenr63/33XdS21B9rVixQldX\nVxhctGgRi8WKjIwsKSmRpaOhQ4cyGAzhCWsul3v37t0+ffrY29vfunVL2OzmzZuGhoZ9+vS
R\n/S307t170qRJwqeurq6EkNTUVNnXQPn69eupU6dGjRrF5/N//vln6iAukW0vnD17lhCyaNEi\n4docHR0nTpwoun6pe0qWlefm5jIYDH19feFSVlZWmZmZ9+7dk7GBGNl3qIODg+i2HT9+PCHk\n5cuX0rdgHVTdkSxbCQAAFEFdCztCSPv27RMSEo4ePTphwoSysrJDhw798ssvnTt3bt68eWBg\noOiJuejoaEJIjx49RBefP39+SEhIq1atCCEtW7Z0dXW1t7cnhBQUFHz58uXLly96enqEEBnL\nIMr9+/cJIY6OjqJBExOTjh07pqenp6WlCYMyVkJfv36dKGLkyJGtW7f28fHR0dHZtWuXhYWF\n5CICgeDx48eSaRgZGbVr1668vPz58+eydG1padm5c+eYmBgej0cIefjwYVFRkaOjY//+/V+8\nePH161dCyLdv35KTk4cMGVKjm1R69+4t+tTMzIwQkpeXV/VSb968ERvupFGjRmPHjn39+rW3\nt3doaKiwZbV7gc/np6SkaGlpde3aVbSN6F23QmJ7SpZd/P333wsEgkGDBoWHhwtvH6HOYFKq\nbSCqRjtUbNuampqSGn6GZVR1R7L/IQAAgHyp6zV2FCaTOWHChAkTJhBC3r59Gx0dfe7cufPn\nzy9fvvzMmTN37txhsVhFRUVFRUVsNps60VaZyMjIkJCQx48fl5aW1i6ZkpKSwsJCQkhll819\n+vSpefPm1GOpNZmkwsJC0RsLtLW1LS0tPT09/fz8xIoS0UVKS0tZLJbwCJYQ1WlmZqYsXRNC\nhg0bFhISEhcXZ29vf/PmTQaDMWDAgA8fPhBCbt26NWHChKioKEKI5AV2VRN779QlkoLqht0R\nG+7k06dPd+/ebd++/cmTJzt06CCMy7IXTExMysvLjY2NdXR0RF+iLlarIlsZd3FYWBiPxwsP\nD/f29iaE2Nrafvfddz4+PtbW1lSzahuIqtEObdSokWgD6qBjtdu2FqroqEZ/CAAAIF/qXdiJ\natWqVatWrSZPnvz169ehQ4c+fPgwPDzcx8eHqhu4XK5AIKB+fiT9888/M2fONDQ09PHx6dmz\np7GxsZaWVmRk5I4dO2RPgFo5g8H4448/pDYQ/S0UKykqU4sBxqr4Lefz+cIGsqAKu9u3b9vb\n29+4ccPOzs7c3NzMzMzIyIgq7G7evEkIqd055ZoSG+6kpKSkY8eOKSkpqampooWdLHuB2jiS\n20HqlhHdUzLuYh0dne3bt69cufLMmTMXL168cePGn3/+GRYWduDAAeqfkGobSGYllx2qHDX6\nQwAAAPlS48IuKyvLzMxM8lfN0tJy9uzZs2fPjo2N9fHx4XA4hoaGBQUFWVlZ5ubmUlcVEBBA\nCDl37lz//v2FwcoueKoMm802NjbOy8v75ZdfZDwgpwgGBgZ6enrFxcW5ubliY6lQ47/Inlv/\n/v3ZbPbt27d/+eWXe/fuzZgxgxCipaXl6OhIXWYXFRXVpk0b6nS2knE4nL///tvFxWXmzJmJ\niYnCa7lk2Qs8Ho/JZBYUFFAPhHHqYGQVarSLqRtCZ86cWVpaunfv3rlz586cOXP06NHC6+Sq\nbUCR4w5VDhX5QwAAqJ/U9Rq7vn37mpubX7p0Seqr1C0RwnErqIvnrl27Jtpm7dq1Q4cOjYmJ\nKSsr+/Tpk4GBgWhVJxAIKlt5Fagh0MRGyCOEZGdn13RVdUG9X+rKQtEcXrx4weFw7OzsZFwP\nm812cnK6e/dudHR0aWkpdSsuIaRfv35JSUlJSUkpKSk1PQ8rRyNGjBg/fnxaWhp126ZQtXuB\nyWRaW1vzeLyUlBTRBrLscVl2cWpqanp6uvApm8328fFxdHTMzc19+/atLA3EyGuHKo2K/CEA\nANRD6lrYjRw5khDi5eV19epV0bhAIDhx4sSff/5J/u9OPaoZISQkJEQ4Pur79+/Xr19/7969\nDh066OrqmpmZFRYWCg/YCASCgIAA6hLv3NxcKkiViVlZWcK+JCPUJVOrVq0SHRv5zp07lpaW\nwmSUgEojKChIdHjhoKCgiooKT09P6oCQZPJSDRs2LCsrixpkeMCAAVSwX79+AoGA2shVFHYy\ndlEXYWFhRkZG27dvF60hZNkLVNqbN28WNnj48OHhw4er7bHalT99+pS6R1t04xcUFLx9+5bJ\nZDZs2LDaBpV1WvUOVSkq8ocAAFAPqeup2CVLliQnJx86dGj48OHW1tZdu3bV09PLycl59uzZ\nhw8ftLS0AgMDhUeYJk+efPz48XPnztnZ2bm4uBQVFUVGRhYUFOzcuZO6m2/q1KmhoaFDhgyh\nSsBz587l5OTs27fP2dk5IiKiWbNmnp6eHTp0YDAY58+f9/b2ZrFY27Ztk4xMmDAhMjLyyJEj\n3bp1c3NzMzQ0TExMPHPmDIfDER1ZQ9EmT5588uTJ06dP9+jRw8XFRUdH58GDB9evX2/btm1w\ncDDVRjJ5qasaPnz44sWLjx8/Tl1gRwV79uypq6t75MgRFosl3MiSZOyiLpo0abJ69ep58+Z5\ne3snJCRQ98fIshf8/PwOHjy4Y8eO1NTUnj17pqamnjx5cvny5WIH/yRVu/IuXbp4eHgcPny4\nQ4cOLi4uDRo0yMzMPH/+/MePH+fNm9egQYMGDRpU3UCyU1l2aF2kp6dLHTrHw8NDOBJ1jajI\nHwIAQH2k/KHz5Ojy5cuenp42NjYcDkdLS8vIyKhz586//PJLQkKCWEsul7thw4bOnTtzOBx9\nff3+/fvfuHFD+GpJScmyZctat26tq6vbrFmz2bNnZ2ZmCgSCqVOn6uvrN2rUiFphcHCwubm5\nrq5u9+7dqQUlIzweb+fOndRMStra2lZWVlOmTElOThb2RY3vun79+qrfmiwjxwqJDVBMvd+w\nsLDu3bvr6enp6uq2b99+6dKlOTk5oktJJi+Jz+dTx5DmzJkjGqemoxg4cKBoUGwkW8kupL73\nV69eEUK6dOlSWQ5Vbwoej0cNZLNw4ULRYNV7gVrtsGHDDA0NjYyMBgwYcOPGDWoQROGbqmxP\nVbtyHo/3999/Ozo6mpubM5lMY2Pjfv36hYeH8/l8GRtIbslqd2hdtm1lVq5cKZmMjB3JsgsA\nAEDuGAIFDIUAoI4ePHjQu3fvUaNGnTt3ju5cAAAAakNdr7EDqIuvX79euHBBbFoR6vCV1MHk\nAAAA1AIKO6iPrl69OmrUqFmzZnG5XCqSl5e3YcMGQkhlE7UBAACoPpyKhfqovLx8yJAhd+/e\ntbOzGzlyZHFx8enTpz9+/Ojq6nry5Em6swMAAKglFHZQTxUUFGzcuPHYsWOpqak8Hq9du3ae\nnp7z58+v0by3AAAAKgWFHQAAAICGwDV2AAAAABoChR0AAACAhkBhBwAAAKAhUNgBAAAAaAgU\ndgAAAAAaAoUdAAAAgIZAYQcAAACgIVDYAQAAAGgI9Rtkv7Cw8LfffquiQUVFBYPBYDKZSktJ\n
+fh8PiFES0uT63IejycQCDR7HgiBQMDn8zX+s0q9RwaDQXcuCsTj8TR7PwoEAh6Pp6Wlpdlf\nO3w+n8FgaPZnFT+RGsDc3DwgIKCyV9XvV5PL5WZmZm7cuLGyBjk5OUwm08jISJlZKVlZWRmf\nz+dwOHQnokCFhYVcLtfExESDv2R5PF5xcbGhoSHdiShQaWlpSUmJgYGBjo4O3bkoUG5uromJ\nCd1ZKBCPx8vPz9fV1dXT06M7FwUqKirS1dXV7P8nc3NzGQyGsbEx3YkokGb/RFZUVPz6669V\nNFDLj6+Ojk7Tpk0re5X6s9TsL9nS0lI+n6/Z37D5+fnl5eUNGjTQ7MKusLBQs79hS0pKioqK\njIyMWCwW3bkoEIfDMTMzozsLBaqoqNDT0+NwOPr6+nTnokAFBQVsNluz/wlhs9laWlqmpqZ0\nJ6JAmv0TyeVyq26gsQcqAQAAAOobFHYAAAAAGgKFHQAAAICGQGEHAAAAoCFQ2AEAAABoCBR2\nAAAAABoChR0AAACAhkBhBwAAAKAhUNgBAAAAaAgUdgAAAAAaAoUdAAAAgIZAYQcAAACgIVDY\nAQAAAGgIFHYAAAAAGgKFHQAAAICGQGEHAAAAoCFQ2AEAAABoCBR2AAAAABoChR0AAACAhkBh\nBwAAAKAhUNgBAAAAaAgUdgAAAAAaAoUdAAAAgIZAYQcAAACgIVDYAQAAAGgIFHYAAAAAGgKF\nHQAAAICG0KY7AQAAVfH69etbt26Vl5f37t27W7dudKcDAFBjKOwAAAghJCgoKCAgoKysjHo6\nffr0Xbt2MRgMerMCAKgRnIoFACBXrlxZtmyZsKojhISHh//11180pgQAUAso7AAAyJ49eySD\nu3fvVn4mAAB1gcIOAIBkZGTIGAQAUGUo7AAASJs2bSSDbdu2VX4mAAB1gcIOAIAsXLjQ0NBQ\nLLhy5UpakgEAqDUUdgAAxMbG5uzZs+3bt6eeNm7c+MiRI4MHD6Y3KwCAmsJwJwAAhBAyYMCA\n5OTkjx8/lpWVWVtba2nh/14AUD8o7AAA/mNlZUV3CgAAtYd/SQEAAAA0BAo7AAAAAA2Bwg4A\nAABAQ6CwAwAAANAQKOwAAAAANAQKOwAA9ZaRkVFUVER3FgCgElDYAQCoq6NHj7Zo0aJhw4ZG\nRkaDBw9OSkqiOyMAoBkKOwAAtXTx4sWJEyempaURQvh8/s2bN4cPH56ZmUl3XgBAJxR2AABq\n6ffffxeLfPr0acuWLbQkAwAqAoUdAIBaSk5OlgzibCxAPYfCDgBALZmamkoGzczMlJ8JAKgO\nFHYAAGpp0qRJkkFPT0/lZwIAqgOFHQCAkqSmpnp7e3fq1MnBwWH58uWFhYV1Wdvq1auHDBki\nfMpisYKDg/v161fnNAFAjWnTnQAAQL3w/v37bt265ebmUk8fPXp08eLFmJgYXV3d2q2QzWZf\nu3bt8uXLsbGxBgYGzs7OHTp0kF++AKCWUNgBACiDr6+vsKqjPHnyZPPmzX5+fnVZrbOzs7Oz\nc91SAwDNgVOxAADKEB0dLRm8c+eO8jMBAA2Gwg4AQBmYTKZkUFsbp00AQJ5Q2AEAKMOwYcMk\ng8OHD1d+JgCgwVDYAQAoQ0hIiJWVlWhk6NChP/30E135AIBGwlkAAABlsLCwePbs2caNG+/d\nu6enp+fs7PzTTz9paeG/awCQJxR2AKBycnJy1qxZExUVxePxnJycVqxYYWlpSXdScmBiYuLv\n7093FgCgyVDYAYBqKSgo6NWr16tXr6inT58+PXXqVHx8vIWFBb2JAQCoPpwFAADVsnbtWmFV\nR/n8+fPvv/9OVz4aTCAQpKSkXL169f3793TnAgDyoZZH7AQCQUVFRV0aqDs+n8/n8zX7PQoE\nAkJIRUUFg8GgOxdF4fP59eGzSgjh8Xiyv02pQ7vduXNHxTeUiqcn6d27d1OnTo2JiaGeurq6\n7tixw9TUVGpjHo9HCKkPXzs8Hk+Dv3Mo9eFrR4M/q9W+L/Ur7AQCAZ/PLykpqaJNtQ3UHY/H\nEwgEVOmjqagfktLSUroTUSBZPszqjtqP5eXldfyS1dLSUuUNJRAIVDk9SVwud/z48U+fPhVG\nTp06JRAI9u/fL7W98B8t9XqbNVVRUSEQCLhcLt2JKBC1KzV7P1JfO5r6HrlcLvUPc2XUr7Bj\nMBhMJtPQ0LCyBmVlZVU30AClpaV8Pl9PT4/uRBQoPz+/vLzcwMBAg/975vF4hYWFmv1ZLSkp\nqaio4HA4LBZLxkVGjBhx9+5dsaCzs7Mqb6js7GxVTk/S5cuXRas6SmRkZGZmprW1tWT7ioqK\n8vJyFoulr6+vlATpUVBQwGazdXR06E5EgcrLy7W0tNTr41pTmv0TyeVyq76bHtfYAYBq8fPz\n69Gjh2ikbdu2AQEBiutRsw9+S5WamlqjOACoCxR2AKBadHV17969u379ehcXl+HDh69evfrJ\nkyeKOMDA5XJDQkJatWqlra3dunXr0NBQTb0oR1LTpk2lxsWGUAYAtaN+p2IBQOOx2Ww/Pz8/\nPz+F9rJ48eKNGzdSj9++fbtw4cL09PT169crtFMVMWTIEFtb26SkJNGgi4uLjY0NXSkBgFzg\niB0A1Edv374VVnVCISEh9WTgDzab/e+//3bq1EkYGTRo0J49e2hMCQDkAkfsAKA+SkhIkBqP\nj49v2bKlcnOhh62t7ZMnTx4+fJiWltauXbtu3brRnREAyAEKOwCojyq7u9PAwEDJmdBIW1vb\n0dHR0dGR7kQAQG5wKhYA6iNHR0fJ+WcbN26MKgcA1BoKOwCoj/T19Q8cOCB63M7AwODAgQOa\nOvYVANQTOBULAPXUsGHDUlJS9u/f//r1axsbGy8vr8oGAQEAUBco7ACg/rKysvr999+raCAQ\nCMLCwk6cOFFaWjpw4MDAwEBdXV2lpQcAUFMo7AAApBMIBHZ2dsnJydTTx48f79q16/379yYm\nJvQmBgBQGVxjBwAg3cKFC4VVHSUvL8/Z2ZmufAAAqoXCDgBAun379kkGHz16pPxMAABkhMIO\nAEC6oqIiySCfz1d+JgAAMkJhBwAgHYfDkQxqaeFrEwBUF76hAACk8/T0lAx26dJF+ZkAAMgI\nhR0AgHTBwcFNmjQRjejq6h49epSufAAAqoXCDgBAOgMDgydPnvz888/Nmze3sLAYO3ZsXFxc\nmzZt6M4LAKBSGMcOAKBSlpaWO3bsoDsLAABZ4YgdAAAAgIZAYQcAAACgIVDYAQAAAGgIFHYA\nAAAAGgKFHQAAAICGQGEHAAAAoCFQ2AEAAABoCBR2AAAAABoChR0AAACAhkBhBwAAAKAhUNgB\nAAAAaAjMFQsAUH+Vl5efOXPm5cuXVlZW33//vampKd0ZAUCdoLADAKin3r596+Li8vLlS+qp\nhYXFsWPHBg4cSGtSAFAnOBULAFBPeXp6Cqs6QkhGRoa7u3tubi6NKQFAHaGwAwCoj16+fHn/\n/n2
x4JcvXy5fvkxLPgAgFyjsAADqo6ysrBrFAUAtoLADAKiPbGxsmEymZLx9+/bKTwYA5AWF\nHQBAfWRhYTFv3jyx4LBhw3DzBIBaQ2EHAFBPrV27dunSpRwOhxDCZDInT558+PBhLS38LgCo\nMfwBAwDUUywWKygoKD8//+3bt4WFhfv37zc3N6c7KQCoE4xjBwBQr2lra1tbW9OdBQDIBwo7\nAACQybt37y5fvpyRkWFnZ+fm5sZisSTb8Pl8nMwFoBH+/AAAoHoREREODg5+fn7r1q2bMmVK\np06dPn36JHy1oKBg0aJFTZo0YbFYdnZ2Bw8epDFVgPoMhR0AAFTjzZs3v/32W2lpqTDy8uVL\nb29v6rFAIPDw8AgJCUlPT+fxeElJSZMnT/7nn39oShagXkNhBwAA1Thz5kxJSYlY8MqVK9nZ\n2dSDc+fOib26aNGisrIyJeUHAP8HhR0AAFQjLy9PMigQCKj406dPJV/Nz89/8+aNwjMDgP8P\nhR0AAFTD1tZWMmhiYmJlZUUI0dfXl7qUgYGBYtMCAAko7AAAoBqurq729vZiwaCgIB0dHUKI\ni4sLNcqxKAcHh+bNmyspPwD4PyjsAADEZWZmXr58+cKFC1+/fqU7F5Wgo6Ozf/9+d3d3NptN\nCGnatOn27dtnzZpFvdqqVau//vpLdPQTS0vL/fv305MrQP2GcewAAP6fbdu2/fbbb4WFhYQQ\nDofj7++/aNEiupOin4WFxc6dOw8cOJCfn29qair26owZMxwdHY8ePZqenm5raztt2jRjY2Na\n8gSo51DYAQD858aNG7NnzxY+LSkp+e2332xsbFxdXWnMSnUwmUzJqo5ia2vr7++v5HwAQAxO\nxQIA/GfLli2Swb/++kv5mQAA1AIKOwCA/4jOpiD08eNH5WcCAFALKOwAAP4j9UbOli1bKj0R\nUD8YkBlUAQo7AID/zJ8/XzK4cOFC5WcCauT06dOdOnXS19c3MTGZMmXKly9f6M4I6i8UdgAA\n/+nbt+++ffvMzMyop0ZGRlu3bh0xYgS9WYEqO3fu3JgxYxITE3k8Xl5e3oEDB0aMGCE6ry6A\nMqGwAwD4f6ZMmZKamnr37t1bt259+PBBOFobgFS+vr5ikadPn+7Zs4eWZAAw3AkAgDgDA4O+\nffvSnQWogZKSklevXknGpc6fC6AEOGIHAABQSywWS3TKDSEjIyPlJwNAUNgBAADUGpPJHDt2\nrGRcahBACVDYAQAA1N7mzZs7dOggGgkMDOzduzdd+UA9h2vsAAAUq7i4ODIy8s2bN82bNx8z\nZgwmUdUw5ubm8fHxhw8fjouLMzU1/eGHH7p37053UlB/obADAFCgxMTEUaNGpcI9de0AACAA\nSURBVKWlUU8tLS1Pnjzp6OhIb1YgXywWa+rUqVOnTqU7EQCcigUAUBgej+fu7i6s6gghX79+\ndXd3LyoqojErANBgOGIHAKAocXFxiYmJYsG0tLSoqKhRo0YptOtHjx7duHGjoqLC0dFx4MCB\nCu0LAFQHCjsAAEXJzs6uUVxeFixYEBYWJnw6bty4iIgIJpOp0E4BQBXgVCwAgKK0a9dOalzs\nJkq5KCgoWL58ed++fW1sbESrOkLI8ePHN2zYIPceAUAFobADAFCUFi1azJw5Uyw4duxYe3t7\n+XZUXFzcp0+fwMDAmJiYN2/eSDY4dOiQfHsEANWEwg4AQIE2btzo5+fH4XAIISwWy8fHRxGz\niP7555/Pnz+vokFOTo7cO1VBBQUFdKcAQDMUdgAACsThcNavX5+fn5+amlpYWLht2zZFTDZ1\n+/btqhvY2trKvVPVUVZW5u/vb25ubmRk1KBBg2XLlpWUlNCdFAA9cPMEAIDCaWtrN2/eXO6r\nLS0t3bVr1+PHj6XOQy/EZrNXr14t995Vh5+f35YtW6jH2dnZQUFBX7582b17N71ZAdAChR0A\ngFrKycnp06fPixcvqm5mZ2cXGhrq4OCgnKyU7/3798KqTig8PNzX19fOzo6WlABohMIOAEAt\n+fr6VlHVWVtb3759W19f39TUVJlZKV9lFxc+e/YMhR3UQyjsAADU0pkzZySDjRs3bteuXd++\nfRctWlRPJqWt7JpFExMTJWcCoApQ2AEAqCWp9wf0798/IiJC+cnQqFevXtbW1u/evRMNNm3a\ntF+/fnSlBEAj3BULAKCWevToIRmU+wh5qo/FYh0+fNjc3FwYMTU1PXTokL6+Po1ZAdAFR+wA\nANTSxo0b+/XrV1paKozY2dn98ssvNKZEl969e7948SIiIuL169fW1tbu7u6idR5AvYLCDgBA\nLdnb20dFRS1btiw2NtbQ0HDkyJFr1qyhRkKuh8zMzGbPnk13FgD0Q2EHALQpKCg4fvz4u3fv\nmjdvPm7cOFztXlO9evW6du0a3VkAgApBYQcASpKZmfnw4cPi4mJ7e/uWLVvGxcV99913nz9/\npl79/fffIyMjHR0d6U0SlOnjx48bNmx4/vy5ubm5m5vb6NGj6c4IQO2hsAMAZdi7d++8efPy\n8/MJISwW65dffjl//rywqiOEZGRkuLu7Jycn6+np0ZcmKE9CQkLfvn0LCwupp0eOHJk/f/7G\njRvpzQpA3eGuWABQuEePHs2aNYuq6ggh5eXlGzdufPnypViztLS0auc8BY0xffp0YVVHCQsL\ni46OpisfAM2Awg4AFG737t2iN29WIScnR9HJgCrIzs5+/PixZPzKlSvKTwZAk6CwAwCFEz3l\nWjXMAVVPVFRU1CiuRlJSUry9vfv06TNmzJijR4/SnQ7UO7jGDgAUrkWLFpJBc3PzzMxM0YiH\nh0fnzp2VlRTQqWHDhm3atHn16pVY3MnJiZZ85OXu3btDhw4tKyujnp4+fTomJmbTpk30ZgX1\nCo7YAYDCzZ49W/KWiI0bNy5ZssTAwIAQwuFw5s+fv2PHDjqyA3r8888/YpGhQ4e6uLjQkoxc\nCASC6dOnC6s6yl9//XX//n26UoJ6CIUdAChc+/btjx49amVlRT3V19ffsGHDpEmT1q5dm5eX\n9/nz54KCgo0bN1JFHtQTGRkZYpGEhIQvX77QkoxcfPz4UfIYJCHkxo0byk8G6i0UdgCgDN99\n993r168fP34cHR39+fNnX19fKq6lpdW4cWMmk0lveqBM3759W7Vq1YwZMyTjf//9Ny0pyYVA\nIKhRHEARcI0dACiJrq5u9+7d6c5Co8TFxa1bt+758+eNGjXy8PDw8vLS0lL1f9cTExP79euX\nm5sr9VXJQXDUSLNmzVq1avX27Vux+MCBA+lIB+opFHYAoMkePnx45cqVwsLCnj17jhkzRvXr\nHtldu3Zt2LBh1OPExMRr1649ePBg+/bt9GZVLS8vr8qqOkKIhYWFMpORLwaDsXv37kGDBokG\nZ86c2bdvX7pSgnoIhR0AaKylS5cGBwcLnzo5OV25coXD4dCYkrwIBALJU5k7duyYPHmyKpcR\n6enpT5
48qaLBlClTlJaMIgwcOPDp06fr1q1LTEykDqOq+zsCtYPCDgA00+XLl0WrOkLI3bt3\nly1bFhoaSldKcpSampqamioZv337tioXdlWMU83hcDZs2NCzZ09l5qMInTt3PnToEN1ZQP2l\nOWclAABEHTt2TDIYERGh/EwUgcFgSI3Tda5ZIBDIMmtI8+bNGzVqJBn39fV98eLFrFmzFJAa\nQP2Cwg4ANFNeXp6MQXXUvHnzNm3aSMYHDx6s5Ezy8/Pnzp1rZGRkZmbWsGHDdevW8Xi8yhoz\nmcwtW7aIBV1dXTds2NCsWTMFZwpQL6CwAwDN1LFjR8mgxsxswWAwwsPDdXV1RYO+vr4ODg7K\nTEMgEEyZMmXLli2FhYWEkIyMjCVLlvj7+1exyI8//njp0qV+/fqZmZl16NBh9erVhw8fVla+\nAJoP19gBgGaaN29eeHj4hw8fRIPr1q2jKx+5c3Jyevr06YYNGxITExs3buzu7j5u3Dgl5xAd\nHX369Gmx4Nq1a+fNm9egQYPKlnJ2dnZ2dlZwagD1lAoVdhcuXDh16lRWVlbTpk2nTJmi5P87\nAUDDmJqaXr9+3dfX98qVK1wut1OnTsHBwf3796c7L3lq166d5MRcyvT8+XPJYEVFRUpKiirf\nwwGgwVSlsLt+/frRo0fnzp3bvHnze/fu7dy5087OTnJySQCAquXm5h49evTdu3fW1tYTJkw4\ne/ZsRUUFl8tVkVFOvn79euLEiQ8fPtjY2Li5uan7LGrGxsZS4yYmJkrOBAAoqlLYHT161MvL\ny97enhAyevTo0aNH050RAKifBw8efP/998JJSJcvX3769GlHR0dtbZX4rrty5cqECROEN3Cs\nXLny0qVLUq8FVBfDhg1r0KBBVlaWaLBLly62trZ0pQRQz6nEzRNZWVnUxM+//vrr+PHj/fz8\nUlJS6E4KANRMWVmZu7u76NTymZmZ7u7uJSUlNGYllJubO3nyZNHbcj99+uTu7s7n82nMqo4a\nNGiwf/9+Q0NDYcTKyurw4cOVjcYCAIqmEv/FUv/tXbt27bfffjM2No6IiPD399++fbvwIH9E\nRER8fDz1WFtbm8fjFRQUVLHCahuoO2o0gSrGFNAAFRUVhBDqVjtNJRAI6slntaSkpKysTNF9\nRUdHv3v3TiyYlpZ29epVsVme5E4gEFS7H8+dO/ft2zexYGJi4r1791T/Xl1qGvvy8nLJMrRf\nv36PHz8+ffr0x48f27RpM27cOD09PTX9VFdUVBQXF2vSvHOSBAIBn89X0x0kI83+ieRyuVX/\nN0hPYXf37t2QkBDq8dq1a5lMJiHEzc3NysqKEDJ9+vSbN28+evRoyJAhVBtqGkTqsbGxsbm5\nedU/EgKBQAm/IrSjSh/NVh/2Y314j1wuVwm9iJ0QFMrMzFTCRq62i+zsbKlx5aQnFzweT+qP\npamp6dSpU4VP1eXtSKWp1YAo/ESqtWq/Tukp7Lp3775p0ybqcaNGjah/HfT19akIk8k0MzMT\nHcTc19dXOCJ5QUFBQECAqalpZSvPyclhMplGRkaKyl4FlJWV8fl8FbkYXEEKCwu5XK6JiYkG\nn9Ph8XjFxcWip7E0T2lpaUlJiYGBgY6OjqL7quxW+p49e1bxjSEXubm51d4uQF1DLIbJZCoh\nvbrj8Xj5+flsNluTvnZKSkrCwsJu3rxZUVHRp08fPz8/Foulq6urIldkKkhubi6DwajsrhfN\noNk/kVwut+qDyvR8fPX09Fq0aCF8qqOjY2pqmpKSYmNjQwgpLy/PyMiwtLQUNjAzMxM+zsnJ\nYTAY1EG+ylTbQN1RO1Wz3yNVzzGZTA0u7Ei9+axqaWkp4W22bdt25syZO3bsEA1Onz5dCXcn\nyLIfHR0df/zxxxMnTogG/fz8GjdurMjU5IM6FatJH9eysrKBAwc+fvyYenr79u0jR47cvn27\nSZMmGvMeK6NJ+1Eqzf6JrPaqXJW4kkBLS+v777+nLqTLzMzcsWMHm83GOHYAUFNhYWG///47\ndTTCyMho8eLFkhNY0WjPnj2//vorNcSJmZnZmjVrVq9eTXdS9dTGjRuFVR0lLS1t1apVNKUD\nIDeqcsB57NixxcXFoaGhhYWF7dq1W7NmDZvNpjspAFAzbDY7MDAwMDAwIyPDwsKC7nTEGRoa\nbtq0KSwsLCsry9zcnO501Mnt27dDQ0NfvXrVrFkzb2/v8ePH13GFN2/elAzeunWrjqsFoJ2q\nFHZaWlpTpkyZMmUK3YkAgCZQwapOiMFgoKqrkcOHD3t6elKPk5KSLl++vGLFioCAgLqskzq5\nLEsQQL2oxKlYAAAAqYqLi2fPni0WXL169cuXL+uy2gEDBkgG7e3t582b5+DgMGjQoODg4NLS\n0rp0AUALVTliBwAAICkhIUF0VGehu3fvtm3bttarXbhw4bFjxxISEoQRS0vL69evCwdkiIqK\nOnfuXFRUlGbfJAuaB0fsAABAdVV2X3wdhxFms9kxMTErV650cnJycHDw9fV1cHAQHWaLEBId\nHb19+/a69AKgfCjsAABUTkFBwbNnz8TqjPqpa9eukpck6urqSj2XWiP6+vqrVq26c+fOw4cP\nN2zYcP/+fck2UVFRdewFQMlQ2AEAqJDi4uJZs2aZmpp27tzZzMxs3LhxX79+pTspOunq6u7a\ntUssuHr1amtra/l2JPXQoGaPowkaCZcOAACokF9//XX37t3CpydOnMjKyrp+/bpmT2BatdGj\nRz958mTTpk0vX75s1qzZjBkzhg0bJvdeBg0adOzYMbHg4MGD5d4RgEKhsAMAUBWfPn0Sreoo\nUVFRt27dGjRoEC0pqYhu3brt3btXoV2EhobeuHEjMzNTGOnfv//PP/+s0E4B5A6FHQCAqnjz\n5o3U+KtXr+p5YacETZs2ffDgwebNm2NjYzkcjouLy5w5czR1WirQYCjsAADo9+jRo8DAwLi4\nOKmvis6dDYpjYWHx559/6ujo0J0IQO2hsAMA+E9eXl5oaOi9e/fYbPbQoUNnzZqlhJ/5mzdv\nVnEtV8uWLYcOHaroHGiRmZm5fv36R48eGRsbu7i4TJ8+HUfIAOoIhR0AwP/Kzs7u3r17amoq\n9fTs2bPHjx+/ceOGooeoreJCrubNmx87dkxfX1+hCdDi48eP3bt3z8jIoJ6eOnVq+/btDg4O\nDAZj4MCBEyZMwB2pALVQf2+zAgAQs3jxYmFVR7lz587ff/+t0E6/fv36+vVryfjw4cNPnTqV\nnJzs4OCg0AToMnfuXGFVR3ny5MmOHTu2b98+ceJEFxcXHo9HV24A6guFHQDA/7p27Zpk8OrV\nqwrttLKTj3369BkzZoyenp4sK8nPz79y5cqRI0eSk5Plmp0CSd3aQpcvXw4LC1NaMgAaA4Ud\nAMD/knqISNHHjczNzbt37y4ZHz58uIxruHDhQrt27ZydnT08PGxtbT09PcvLy+Wao0JUu2Ej\nIyOVkwmAJ
kFhBwDwv5ycnCSD/fr1U3S/4eHhYlfRzZ8/39HRUZZlU1NTPTw8vnz5IowcPnx4\n+fLlck5RAaRubVHFxcXKyQRAk6CwAwCNUl5efvbs2b/++uvMmTNlZWU1Wnb9+vVi05J26tTJ\n19dXrglK0aVLl+Tk5Pnz5w8ZMmTixImRkZEbN26UcdmDBw/m5eWJBbdv3676F6ht3rzZwMCg\nigb29vZKSwZAY+CuWADQHMnJyWPGjHn58iX11MbG5tSpUx07dpRx8aZNm8bHxwcEBNy/f5/F\nYg0fPnzx4sVsNlth+f6nWbNmshdzotLT0yWDBQUFBQUFJiYmdc5Lgdq1axcfH7969epHjx6x\nWKyUlJSSkhLhqxYWFitXrqQxPQA1hcIOADRERUWFm5ubsKojhLx+/drNzS0uLo7FYsm4kqZN\nm+7YsUMxCSpEy5YtJYNmZmbGxsZKz6XGWrduLZwoLCUlZcmSJbdv3yaEDBo0KDg4uEmTJnQm\nB6CecCoWADREbGzss2fPxIJJSUkxMTG05KMcU6ZMadSokVjwt99+U7tB4Nq3bx8ZGZmdnZ2d\nnX3ixIk2bdrQnRGAWkJhBwAaQmxQNKFv374pORNlatiwYWRkZIcOHainOjo6ixYtWrRoEb1Z\nAQBdcCoWADSEjY2N1Hjbtm2VnEmtZWdnHzhw4M2bN82aNfPw8GjatKksS/Xq1evZs2cpKSnZ\n2dl2dnZmZmaKzhMAVBYKOwDQELa2tuPHj//3339Fg6NHj+7atStdKdXIo0ePRowYkZWVRT31\n9/c/evToqFGjZFmWyWTa2dkpMjsAUA84FQsAclNWVlbTEUbka+fOndOmTdPS0iKEMBiMKVOm\n7Nmzh8Z8ZFdRUeHu7i6s6gghRUVFXl5e2dnZNGYFAGoHhR0AyEFsbGzfvn0NDAwMDAycnJwe\nPnxISxrGxsbh4eG5ubkJCQm5ubn79u0zNTWlJZOaio+Pl5wxNisr68aNG7TkAwBqCoUdANTV\n69evhwwZEhMTU1FRUVFRER0dPXTo0FevXim637Nnz44fP75v377e3t6JiYnCuKGhYadOnYyM\njBSdgBwVFBTUKA4AIBWusQOAuvL39xerPwoKClatWnXo0CHFdbpq1Sp/f3/qcUxMzMGDB8+e\nPSv7/Kqqxs7OjslkSk4X0aVLFzn2kpCQcPv27YqKCicnJ8zrAKCRcMQOAOpK9GiZkOSQcnKU\nlJQkrOoo5eXl06ZNq6ioUFynCtWwYcMlS5aIBSdPnty9e3d5deHn59elS5e5c+cuWLDAwcHh\np59+EggE8lq5WhAIBPv27XN2du7SpYu7u3t8fDzdGQHIHwo7AKgrqZMcKHTmg1u3bkkGP3/+\nnJycrLhOFW3VqlUhISFWVlaEEAsLi2XLlslxDoxjx45t2LBBNLJr1y71mmOj7ubMmTN16tQr\nV64kJCRERET07t372rVrdCcFIGco7ACgrtzc3CSDEydOrKw9n89/+/ZtXcYN5vP5UuNqfQhK\nW1t74cKFHz58KC4u/vbt25o1azgcjrxWvm/fPhmDmur+/ftbt24VjZSVlXl7e1f2WQJQUyjs\nAKCufHx8xMo4Nze32bNnS228d+/eRo0atW7d2tLSsnv37rGxsbXosV+/fpLBhg0bCidgUGty\nrOeERAdSEcrMzJR7RyqLmoVWTFpa2rt375SfDIDioLADgLpiMBhHjhy5du3aihUrli9ffu3a\ntYiICKlzlZ49e3batGnCub/i4uJcXFw+ffpU0x47d+68cOFCseA///yjo6NTi/zVV3p6+rx5\n8/r27evs7Lxp0yYul1tZy3bt2kkG27dvr8jsVEtlk+eq3aS6AFXDXbEAIB9DhgwZMmRI1W1W\nrlwpFsnKytq0adOff/5Z0+7Wr1/frVu3ffv2ffr0qUOHDosWLerVq1dNV6LWUlNTu3fvLhzB\n+MqVK+fPn7906RI1PrOYpUuXnjhxoqioSBhhs9mSu0PuMjMzb968mZ2d3blz5z59+ii6uyoM\nGjRIMtiqVStra2vlJwOgODhiBwDK8/LlSxmD1WIwGJ6enleuXHn+/Pnx48frW1VHCJk7d67Y\nvBRXr17du3ev1Mbt27c/f/68cNoxGxubEydOKHrEk5MnT7Zt23bChAk+Pj6Ojo4jR44sLi5W\naI9VsLe3X7BggWhEV1c3PDxcpY7YVXHMFUBGKOwAQHksLCxkDKqj0tLSmJiYM2fOvH//Xgnd\nRUVFSQarmKliwIABiYmJX79+/fz586tXr0aOHKnA5Ah5+/atl5dXTk6OMHLx4kU/Pz+Fdlq1\n0NDQo0ePjh49ulevXtOmTYuLixswYACN+Qjx+fy///67VatWurq6jRs3XrZsGY0VMKg7FHYA\noDxTp06VDHp5eSk9Efm7c+eOra1t3759R48ebW1tPW3atPLycoX2WLtbgBs2bNi4cWO5JyPp\nyJEjhYWFYsG9e/fSe1BqwoQJkZGR9+/fDw8PV51bbYKDg+fMmfPu3TuBQPDly5egoCBvb2+6\nkwJ1hcIOAJRn2bJl48aNEz5ls9kbN250cnJSUHfZ2dlJSUmKLrAIIRkZGT/++KPo/ZV79+5d\nvny5QjsdOHCgjEFaSB3OpqSkJD8/X/nJVOvp06eurq52dnY9e/ZctWqVMg+Y5eTkiI22TQiJ\niIi4d++e0nIATYKbJwBAebS1tf/999/79+/HxsZyOJyhQ4e2bNlSLmtOT0+/fv16Tk5Ot27d\nnJycUlNTfXx8Ll26RAhhs9kLFy709/dnMply6UvS8ePHhbf6Cm3dujUwMFBxN+r+9ddfd+/e\nzc3NFUYGDRo0bdo0BXVXU9RIy2IaNGhgamqq/GSq9ujRo379+pWWlhJCPnz4EB8ff+vWrWvX\nrinuAyMqOTlZ6v8eT58+pfd2E1BTKOwAQNl69+7du3dvOa7wwIEDs2fPFp74c3Z2Tk9PT0hI\noJ6WlpYGBgZqa2uvWrVKjp2KkjpiS1FRUXZ2tqWlpTDC5XJ37doVFRXF5/P79evn4+PDYrFq\n3am1tfWzZ8/Wrl374MEDY2NjFxeXX3/9VTm1SLViYmJCQ0Ml48uXL5d60y69Zs+eTVV1QlFR\nUQcPHlTORQIGBgZS44aGhkroHTSQQN1kZ2d7eHhU0SAjIyMnJ0dp+dCipKSkqKiI7iwUKy8v\nLyMjg8/n052IAlVUVOTm5tKdhWIVFxdnZGSUlZUprovnz5/LMqIvm82u9V/N+fPne/XqxWaz\nmzVrNn/+fMm9FhgYKNmjgYFBeXm5sE1ZWZnYAZhu3boVFxfX/p3LrKysLC4u7vHjx7Xujsvl\nZmRkFBYWytI4Ly9P8nCdlpZWQECACv5Fl5eXS70xdubMmcpJgMfjSQ4oaGxs/PXrV0V0l5mZ\nmZ2drYg1qw7N/oksLy//4Ycfqmigcv85AQDUyOHDh0tKSqptVlpampaWVov1nz17dtSoUQ8e\nPCgtLf3w4UNY
WNgPP/zA4/FE24wbN070yBxlxIgR69ev37lz5+fPnwkhISEhYldNxcXFrV69\nuhYp1cjJkydbtmzZrVu3Hj16WFlZVTYeihxdvXr148ePYkE+n+/m5qZSY4tQtLS0pB7mVNpg\n11paWocPH27QoIEwwuFwdu/e3bBhQ+UkABoGhR0AqDfJi9ukYjAYtRhXRSAQzJ07Vyx4+/bt\nY8eOiUbMzc1PnjzZpk0b0cjx48eXLVv2888/t2vX7t9//6Uu+BNz8eLFmqZUI48fP/b09ExP\nT6eeZmdnT5s27ZqCZ76vbI/UZXZgxWEymcOGDZOMjxgxQmk5dOvW7eXLl6GhobNmzQoKCkpK\nSvrxxx+V1jtoGBR2AKDeRMspIckLuUaPHi16UERMSUmJv79/q1at2Gx2165dDx8+TMWzs7NT\nU1Ml2z9+/Fgs4ujomJiYGBsbe/78+REjRohOw1pYWDh9+nTJsT+I4gekDQ0NFbt6jBASHBys\n0E4r2yNS46pg27ZtYkX/9OnTR40aJRrJysp6+PDh169fFZSDmZnZggULtm7dunTpUnndUQT1\nEwo7AFBv06dPb9asmVhw7ty5ZmZmwqe9evXauXNn1StZtWrVu3fvysrKnj596unpuW3bNkII\nm82WerG/np6eZJDFYtnb2zs5OV25ckXspcLCQqnXwiv6tkepM9wretr7gQMHSg786+PjI3m2\nWkW0aNEiOTl5xYoVI0aMcHd3//fff3fv3i18NT8/38vLy8LColevXo0aNRozZoziyjuAukNh\nBwDqzczM7Pz588LbbI2MjEJDQ11cXGS/nOvu3bsRERFiwUWLFhUXF+vr6w8dOlRykR9++KGy\nteXm5vL5fMl4jx49xApQS0vLNWvWyJhk7TRq1EgyqOgBiplMZkREhPBkora29rx58zZs2KDQ\nTuuoQYMGAQEBx44d27dvn+hQi4QQHx+f/fv3C/5vOOjTp0+7u7tL3cUAqgCFHQCovU6dOt27\nd+/Lly9JSUmZmZnu7u4eHh5ZWVnCBg8ePPj5558rW/zJkyeSwaKiohcvXhBCdu7c2bRpU9GX\n1qxZU8Usq40bNzYyMpKMd+/enUrDxsamdevW06ZNe/TokaIPYvn4+EgGZ8+erdBOCSGNGjU6\nfvx4bm5uYmJifn5+WFgYm82uxXqys7OLiorkm1txcbHoZ6Nqb9++PXLkiFjw5s2b0dHR8s0K\nQF5Q2AGAhrC0tOzQoYOOjs7p06ezs7PFXo2MjBS97k2U1POqwnjz5s1TUlJCQ0OnTJni5+cX\nExOzbNkyqkFmZua1a9du3rxZUFAgXEpHR0dywLyuXbuOHz++cePGO3bsePXq1evXr8PDw6UO\n4Stfw4cP37Bhg3A4GF1d3eXLl3t4eCi6X4qxsbGdnZ0sg9FIOn/+fLt27Ro0aGBoaOjk5BQf\nH1/3fBITEwcNGmRoaGhubm5jY3Pq1KlqF3nz5k2N4gC0wwDFAKBppN6VKRAIMjMzzc3NJV8a\nPny4np6e2CxStra2bdu2pR4bGBgsWLBAbKnQ0NAVK1ZQSxkbG4eFhQlnwp0/f35FRUVQUFBu\nbq6Wltb333+/efNmXV1dsTXcvXs3OjqayWQOHjy4e/futXmrMvD19Z04cWJMTAyPx+vdu3eL\nFi3qvs6ysrKTJ0++ePGiadOmY8aMqcXtxlWLjo7+7rvvqMcCgSA6OnrYsGHx8fFih05rJCMj\nY/jw4cIbhN+8eTN27NirV69KPdUuJPVcNlH86WyA2lPKcHryhAGKBZo++iIFAxRrBiUMUCxJ\n8oI5Ut0Axbt37xadBMLU1PTJkydVdCH1YM/du3dF2/B4vNTUVKmd8ng8scNm8+fPr/X7FZWR\nkbFixYrRo0fPmDHjwoULclmn4P8PUPzmzRsbGxth5iYmJhcvXpRXRxSpP0jwRQAAIABJREFU\nM97OmzevLusUHmcV5eDgINomPz9fdExpgUDA5/MdHR3FlmrXrl1JSUldkqERBihWd9UOUIzC\nTi1p9qeWgsJOM9BS2JWWlnbp0kXsx/iPP/6oeqlnz54tWbJk8uTJQUFBGRkZVTeWvOuTEDJh\nwgQZMwwLC5NcPCIiQsbFK/Py5UvRe4EJIX5+ftRLx48fHzBgQLNmzfr163f48OGq/7L4fH5e\nXp5oRLSwk7yT19zc/PPnzyEhIQ4ODi1atBg5cmR0dHRd3ojUSw+HDRtWl3WOHTtWcp16enqi\nbSQLO4FA8O7du65duwoXadOmzdOnT+uSCb1Q2Kk7FHaaSbM/tRQUdpqBlsJOIBC8e/dOOMAs\nm81etmwZl8uV4/pbtWolWSX07t1bxsWl3nsxatQoGRcvLS2Ni4t7+PCh2Bxf/fv3l1zttm3b\nJGc8CwgIkLrmrKysGTNmUBcXNmnSZPPmzdTfoLCwe/36tWQXhJBevXqJRepyGE9yii1CiJub\nW61XKBAIpk+fLrlOKysr0TZSCzuBQFBRUXH9+vXt27dfunRJ+R9m+UJhp+6qLexwjR0AaKCW\nLVtevHgxOzs7PT29TZs2oqdZ5cLKyurt27diwebNm8u4eE5OjmRQ8oYPqU6fPj179mxqmjIT\nE5Pg4OCZM2cSQgoLC+/cuSPZftasWZLBgIAALy8vsYT5fP64ceNu3rxJPf38+fPcuXPLy8t9\nfX2rzpwQ8uDBA7HIzJkz379/L3XQGS6Xe+PGjffv37dq1WrQoEHa2v/9Er1+/frevXsdO3ZM\nSUkRW8rT01Nq1zLy8PAIDw8XC06aNEmWZanrIAcPHlyXBACURGk1przgiJ1A0/8doeCInWag\n64idop08eVLy61TsGrsqiM1qQPH29q52wfj4eMmbTM+ePSsQCGSsC4WOHTsmtvKzZ89KNtPX\n1y8uLhYescvJyZF9EtXU1FTJt5CUlCR6QM7Ozu7ly5fUS0uWLBGW4GLjQi9ZskTGbVsFf39/\n0XUOHz68tLRUtEFlR+w0CY7Yqbtqj9hhuBMAUGmFhYWBgYFjxoyZOHHinj17VGRgWFdX15CQ\nEOE4KcbGxnv27Onbt6+Mi/v7+4uN62ZkZLR06dJqFwwLCyspKRELrlu3jhBiamoqvI1XFpJH\nMZ8/fy7ZrKioSPTYpImJyeLFi8XatG7dWsYuuFyum5ub6NG458+fu7m58Xi8Q4cOBQcHl5eX\nU3FqR3t6ev7555/x8fFr166V6V1V6Y8//nj27FlISEhAQMClS5cuX74seasygLrDqVgAUF1Z\nWVkODg7CKbCOHj16/Pjxs2fPSp3mS8kWLlzo5eUVFxfHZDJbtWpVo/k9e/Tocfr06fnz5ycn\nJxNC7O3tN2/eXFl5JOr9+/eSQWHhtW3btiFDhsiSgIGBgZOTE4/HYzKZwqCxsbHUxiYmJqJP\nV65cyWKxQkJC8vPzdXR0PD09J02aJDloSJcuXSTHCrl3796zZ8/EgnFxcbGxsVLnfMvJyVm0\naJEs70hGHTt27Ni
xoxxXCKBq6P9yBACozKJFi8QmNr1w4YLklVJ0MTc3HzZs2ODBg6VONVG1\n4cOHJyUlZWVl5ebmxsbGCqdEq5rU4dOEo7sNHjw4Ojp6xIgR1Y4JPHr0aAcHBx0dnRYtWqxZ\nsyYlJaWkpGTUqFEGBgZiLfv37y82epy2tvaKFStyc3M/fvxYVFS0Z8+eIUOGiB1uNDAw2Lt3\nr2S/3759k5rP5cuXJa/SI4RgVlaAmkJhBwByxuPxEhMTo6Kivnz5UsdVXb58WTJ46dKlOq5W\ndZiZmVV2nEwq6j4JMaK3Rzg6Om7dulX0dgQhe3t7Z2fnmTNnTps27dChQ+/evRMIBGlpaStW\nrOjQoYOBgcHSpUtDQ0NFi0Jra+t9+/ZJzYTBYDRt2lR4vV1QUNCFCxe8vLxGjBjh5+eXkpIi\nOkSIkNS7iQkhq1atKi0tlYyLDpgHALLAqVgAkKe4uDgvLy/h6bYZM2YEBATUem1cLlfGYD0x\nYMCAzZs3L168mJrxgsViLViwYNq0aaJt/vjjD9Epzig2Njb37t3T1tb+/Pmz1Lt3+Xz+oUOH\nkpOTAwIC3r17p6enZ2dnN3HiRNnneHVxcXFxcam6jUAgMDQ0FEvP2Ng4Ly9PsrGOjk7Xrl3T\n0tJkv90YAFDYAYDc5ObmjhkzJi0tTRjZtWsXh8MJCQmp3Qr79Olz5swZsaDkTAD1ypw5c8aP\nHx8dHc3lcnv27GltbS3W4OnTp5JLGRkZUYfxEhISeDxeZSt/8uTJkydPCCFdu3b18fGRvaqT\nRU5Ojqurq1hVZ2tr++LFC6ntuVzu0qVLV61atWzZshUrVsgxE6ny8/OPHDlCjY/j6upa2QzC\nACoOhR0AyM2xY8dEqzrKzp0716xZU7uR5EJDQ2/evClaCnTq1GnevHl1ylI2t27dunTpUklJ\nSc+ePd3c3ERvMqCdpaWl1HkUKJLXyYkGpb4qKT4+3t3d/d69e3J84xERER8+fBALvn//nsPh\nFBYWVrZUWVnZH3/8YWtr++OPP8orE0kPHjwYM2aM8OKBZs2anTt3rnPnzorrEUBBcI0dAMhN\namqqZLC0tLSyS+ar1bp16ydPnnh6erZq1apjx44LFy68ffu2fA8jSTV37tyBAwcGBwdv2rTJ\n09PT0dGxqKhI0Z3Ki9SaT1gVOTg4NGvWTJb1xMbGPnr0SI6JSf14FBcXC+cIqcL27dvlmImY\nkpKSiRMnil4S+uHDh4kTJ1ZUVCiuUwAFQWEHAHLTpEkTySCLxTI3N6/1Om1sbA4ePPjmzRtq\nBDKxoTcU4fTp01u2bBGNPHz40M7ObvDgwUuWLMnKylJ0AnW0YMECsWvdfvjhhzlz5lCPdXV1\nDx48KONtvOnp6XJMTOrHg81mb9682dbWtuplqZk2FOTOnTuS48gkJyfHxsYqrlMABUFhBwBy\nM2HChIYNG4oFPT091etypRMnTkgGU1NTb968uW7duk6dOtX9bl+FYjKZ58+fP3ny5Lx58+bP\nn3/69OnTp0+LjvzXv3//Fy9erFu3zs3Nreqa29fXd9CgQTt27CgoKAgJCRk0aFDnzp29vb2l\nDqdXLTc3NwsLC7HgjBkzGjVqFBcXt3v37lmzZi1ZsmTAgAGSy8oyyF+tVTZpR00n8wBQCcqZ\nAUOOMKWYQNPnS6FgSjE1devWrRYtWgi/YVxdXdPS0tRrSjFXV9eqvzYnTpwotkhWVhYtqdZd\neXn5yZMng4KCqr6eTKwgMzIyevXqVS26i4qKEr3Fdfz48cXFxWJtYmJiJBO4c+eOPN6udHFx\ncVLf9Zs3bxTXKV0wpZi6w5RiAKBU/fv3T0lJuXHjRkRExPPnzw8dOlTtYLlyV1ZWVpfFu3Tp\nUnWDa9eu1WX9qqO4uNjf39/X19ff318gEDg4OFTWMiMjQ/Rpfn7+r7/+WoseBwwYkJKScv36\n9aNHjyYlJR07dkzy49GnT5+IiAhLS0vqacOGDQ8fPuzk5FSL7mTUtWvX8ePHiwVnzJhR2ah7\nAKoMd8UCgJyx2exBgwZRjyUnNlWc9PT0RYsWnT17tqioyM7Obs2aNd9//30t1jN//vygoCDh\npKWSVGS+2rrz9PSMjIykHlNDDwYFBbVv376KW26F7ty5U7tOORzO4MGDq27j5ubm6uqalJQk\nEAhsbW2VMKPrrl27LCwsdu3aVV5ezmazZ8+evXr1akV3CqAIOGIHAJqgtLTU2dn50KFD+fn5\nPB4vISHhhx9+uHDhQi1WZWxs7OHhUUWD/v371zZNFXLt2jVhVSf0xx9/SJ0xQpKix39hsVhd\nu3bt1q2bEqo6QoiRkdHff/+dnp7+5s2bgoKCDRs2VHFh6NevX6XOkwGgClDYAYAm2L17t+Ts\n8gsWLKjd2kJDQ0WvFBRlamoaFhZWu9WqjpycnOvXr0vGKyoqLl++XO09qoSQYcOGKSAvmuno\n6DRr1kzqhGyU8PDwJk2aNGrUyMDAYOTIkW/evFFmegCyQGEHAJogISFBMvjy5cvanQs2NTV9\n/PjxggULevTo0a1bN0dHR3t7+y5duvz0009Pnz6trOZTnJKSktevX1dxdlh2sbGxDg4OZmZm\nwcHBUhu8e/duz549YoMFtmzZUvSpubn52rVr656MVBUVFdRhMwWtv9YiIiK8vb2pIWB4PN7F\nixdHjBihgnlCPYfCDgA0gaGhoWSQzWbX+kRegwYNQkNDHz169OTJk+jo6NjY2Pj4+H/++UfG\n0X3lJTs728vLy8DAoE2bNoaGhr6+vnW5bPHDhw8jRoyoethhKysrOzu78+fPz5w5s1+/fuPH\njz9x4kRycvKff/45YsSI5s2bM5nMzMzMdu3aTZo0KTMzs9bJSOLxeAEBASYmJjY2NsbGxmPG\njPn48aMc119HS5cuFYu8fv169+7dtCQDUBkUdgCgCaRe7z927FjR8dvUjkAgmDx58v79/8Pe\necZFsTQLf8ggIEmSgIKAAgZQQTCimBAUFQzgUYIo5pwjHrOeY/agPh4DKgYMSFBRgmJAgqgg\nSFJyznmBXXbeD3PfuXOnZ2cDC8o+/f/gz+2p7q6ZHWZqq7uqbmHhGu3t7WfOnOF9fbm6ujo/\nPx9FUbzl9OnTXHOzbdq0SUFBYfLkyTdv3hw9evTt27ednZ0lJSW9vLwMDAwKCgqwUrNsNjsg\nIMDNzY0YStLY2JiTk0NTixakvb39x48fmLV67NgxX19frMgHiqLBwcFz5swRip+y8zAYDMrs\nfd+/f+92XSAQOnrwIw8CgUBwxowZQwpjlJOT8/b2/lX6CIWPHz+C8R//+c9/wIqrJJKSkqys\nrPr06aOvr9+3b9/bt29j7RkZGVwnxc2ytra2kydP7ty5E/tYWVl55coVknBkZGRMTAyCIPn5\n+TNnzuzdu7ehoaGysvLRo0e5xg4zGIxNmzYpKipizsglS5YcPXoUPJGgoCCuOncDMjIylOEU\nqqqq3a8MBEIDNOwgEI
iIMHz4cOJHBoOxcOHCLi1F1dVkZWWBjSiKZmZm0vQqLS2dOHEivt5a\nVlbm7u4eFhaGIIiamhoor6SkRDPahQsXsCR2ubm5lLZaZmZma2urg4PDs2fPsJampqY9e/b8\n9ddfNMMiCLJ58+azZ89iDrmOjo47d+5QrjJzOtmmpibhLgTTIy4uDsZKy8rKurq6dpsOEAgv\nQMMOAoGICL6+vqSWqqqqc+fOdX7k9vZ2bH2wi2Cz2fX19WA7pR2GAHUgSCxbtqypqYnUuGfP\nHgRBlixZAsrTZ+7o6OjIzs5GEERFRYVSoE+fPvfv3wdXJH19fWkyRRcVFYH+P07jk1qSk5PH\njx/fu3dvdXV1Q0PDbnPpnT59evTo0fhHGRmZ06dP85gdBgLpNqBhB4FARARK/xa9c4srqamp\nkydPlpeXV1RUNDc3j4iI6MxoIBUVFZ6engoKCsrKyrq6upcvXyYetbOzIxbgwhgxYgRN+S8U\nRaOjo8H29PR0BEGmT59+4MAB0iGuhTowO9LY2HjUqFGkQzo6OtOmTQsODgZ7tbW10ZSUxSqS\n0c+LIIiysvKcOXOILWVlZdOmTXv//j3WPScnx9nZOTo6uhu24ikqKn748CE0NPTPP/88f/58\namrqqlWrOAn/JlsDIf+FQMMOAoGICJT+Lfoi9/SUlZVNnjw5OjqaxWKhKIolPY6Pj++Ejv8H\nFos1d+5cf39/bAmyuLh41apVfn5+uIC8vPz9+/e1tbXxFmNj43v37omJiXEas6GhgdIDh6cv\n8fX1xbx3PDJu3DhjY2Ps/9euXRs0aBB+SFNT8969e7179+YUqEtjMnJyRhKreKmoqNy+fbtv\n375EgTNnzlRUVJB62dvby8jI9OvX76+//mKxWLQn1CnExMRmzpy5f//+devWGRkZUcoEBASY\nmJjIycmpqamtWbOGa7QKBCJcoGEHgUBEBA8PD7DR3d1d4AH//vtvkg3R2toK5rwQmKdPn4IF\n73fv3k109lhaWq5du1ZbW1tKSqp///7bt2/HzSxK5OXlpaSkwPYRI0bg/6+treVRQzMzszt3\n7uAfDQwMvn379uTJk+PHjwcEBAQFBZ04cUJVVfXDhw9gXzExMdDdiDN06NCRI0eSGvv16/fl\ny5ewsLCTJ0/6+/tnZWXNnDmTJEMZ/8FkMhEEKSws3L59uxC/IAHw9/dfvHhxZmYmm82uqanx\n8/NzdnbmK0wYAuksaE+jpqZm0aJFNAKVlZW1tbXdps8vgcFgNDc3/2otupb6+vrKyko2m/2r\nFelCWCxWXV3dr9aia2lpaamsrGxra+MqyWKx/Pz8rKys+vbta2dn9+zZM37nam9vnzt3Lv5w\nk5GR+fvvvwXS+n+wt7cHn5nq6uqgZHV1tQDjg6uiGD9+/MBlVq9eTTp6/Phx+mG9vLxIXSQk\nJFJSUnABSo+djIyMv7//yZMnAwMDAwICTpw48fTp0/b2dqwLk8msrKxsamrCB8nKylJQUKB5\nuTg4ONDrmZmZSTRStbW1Y2NjuV40SvOdiLi4eEFBAddxKGloaMBPWQCYTCalhzgwMFDgMYVO\nVVVVTU3Nr9aiaxHtV2R7e7uTkxONAMfCKRAIBNKdbNiw4Z9//sH+X1JSEh0dfePGDU9PT95H\nkJKSevLkSWxsbFxcHJaJzdDQsDMqUYaLKisrd2ZMruOLiYnhU6SmphJXZjH279+/bNkyTkuZ\nCILs2bPnzp07mBMLHzMsLMzU1BQrluXs7Hzs2DFSiOuMGTP48m7u2LEDDNHAGT58+M2bN+lH\nGDhwYGpqalhYWHZ2dv/+/R0dHSmzTJP4448//P39aQTYbHZKSorAeaRTUlJOnTqVnp6uqam5\naNGiJUuW0Cx8kygpKaEM1E1OTp4/f75g+kAg/AINOwgE8uv5+vUrbtXhrF+/fsGCBTS12CkZ\nM2bMmDFjhKKVq6vrgwcPSI1ubm5CGRxBECcnpz179rS0tBAbp06dihttnz9/Bnu1t7enpKRM\nmjSJ07B//fUX0apDEITFYu3evTs2NjYkJERMTCwuLg5MXMJv6hBK3UxMTJYsWTJ06FAHBwcJ\nCQmug0hLS1NmlqZh6tSpvr6+f/75J40MLwYigiAsFuvatWtRUVHt7e02Njbr1q2LiYlxcXHB\nl8JfvnwZHx8P3pmcUFBQEBMTQ4GgEB71gUCEQ3f5DoUGXIpFRd3PjAGXYkUDHpdiL126RPmA\nio+P7x49OUEq8zB9+nTKpTrBlmJRFL116xax6JmhoWFRURF+NDAwkPKyxMXF0Yxpbm7O6YF/\n//59FEUpk57IyMjQ/LmBS7EmJibgII6OjoJdB3759u3b6dOn9+3bByZh6devX2trK9cRWCyW\nnZ0dsaOxsbGuri54UrwsEOOQxkQQRFZWNj09vRPnKmTgUmxPh+tSLAyegEAgvx5paWm+2ruN\n06dPx8fHHzlyZM+ePc+fPw8PD6cMTRCY2tpaYuhoU1MT0YE3ceLE3r17k7rIyMgMGTKEZkxs\nvZWSN2/eIAhCeQpSUlK8rzkiCOLk5AQ2zp49G0EQFov16NEjX19fPz+/goIC3sfkHVVVVTk5\nOTExsdWrVxO3+ikrKwcEBPBSIPjSpUukvDDZ2dmUpWmxi8Yj169fJ4aMSEtLnz59mtIIhkC6\nCLgUC4FAfj12dnaysrKkPB16enr0Fkz3MGrUKH19/ffv39fW1mZlZQ0cOFBYI6enp+/YsYPY\nUl5e7unpiQeZqqurm5mZxcXFEWWwSl+c1iJLS0tpbCmscq69vf3169dJh2bMmMGX8gcOHHj9\n+nViYiLeMnfu3GXLllVUVEyZMuXbt29Y47Zt265evQrWbOgMQUFBHh4ejY2N2EdDQ8N169bV\n19cPGDBgyZIlGhoavAzy8uVLHqfjq9xw//7909PTb926lZqaqqGh4eLiMnjwYN67QyBCoNuc\nh8ICLsWiou5nxoBLsaIB71Gx58+fJz6aZGVlo6Oju0FDrly7do3oE1q9ejV4Wwq2FMup6FZZ\nWRkmkJeXR+lFGzx4MKcx6e2zR48eYWKkQlg6OjqlpaU0qoJLsVjjtWvXvL29V6xY8fjxY+yy\nYE47IvLy8sQ4305SXFwMBp3Y2dnxO8706dPB60Pp7Pzl+wGEC1yK7enAqFgIBNIzWLdu3ciR\nI/39/YuKigYNGrR27VpirtpfRWJi4po1a4iuRD8/P2Nj440bN3Z+cFLYBA5evuzSpUsoVXkG\nrHgrSHV1dXh4OKfpnJ2dXVxcsP/fvXt39uzZYWFhjY2NVlZW69ato68YS4mkpOTSpUuXLl2K\ntzQ0NISGhpLEmpubg4KCtm7dyu/4lISGhoLl16Kjo0tKSkipjOkZM2YM6LQbO3ZsXFwccXF8\n48aNYL0NCOR3Bhp2EAjkd0GIAa3C4t9//wULOVy6dEkoht3w4cPBxj59+uCbtMrLyyk7ampq\nUrZjDmCw3cjIaNu2bd7e3
niLmJiYq6ur0AvY19fXg/G2CIIIsfoCp+zKtbW1fBl2W7ZsuXfv\nHjHdsYqKyqVLl1gs1pkzZ75//66lpeXm5rZw4cLOagyBdC8weAICgUA4UlZWBjaWlpZSCpeX\nl/v4+BgaGurp6c2fP5+ydi2RmTNngguCZ8+exRcEx44dS9lx7969lO26urqUmTW2bNni4+PD\nS/6RTqKtrU2Z50+I+8zMzMzARjk5OQMDA77GkZeX//Dhw8aNG4cMGWJkZOTh4fH582ddXV0T\nE5Pr16/HxcU9ffoUWnWQHkk3rQkLD7jHDhX1DQQYcI+daMD7Hrvfkw0bNoCPTXNzc5JYdXV1\nfX09KR9y7969f/78ST9+Q0PD1q1b9fT0pKSkhg8fju+Bw2AymcRCsRgDBw6kGfD06dOgwtra\n2hUVFYJdAYyCgoJVq1ZNmjTJ3t5+ypQpkyZNcnd3f/PmDSh58eJF0uwjR44U4g3AYrHGjRtH\nmuLYsWNCGbyTlSd6BHCPXU+H6x47aNj1SET7rsWAhp1o0NMNu+zsbNAH9uDBA5JYdXU1pRfN\n2dm5kwrk5+fj7i4xMbGxY8fS3zNsNtvGxgbUZPXq1bjMt2/fjh07tm3btoCAACaTyVWH9PR0\nTjvwzpw5Aypw7tw5dXV1BEGkpKTmz59fXFws2Llzory8fPHixVjSFlVV1ZMnT3Z0dAhlZGjY\niQai/YqEhp1oItp3LQY07ESDnm7YoSgaERGBh3H07t377NmzoEx1dfW0adNAu0dHR0coOrS2\ntn758oXFYvEiPGzYMFCTkSNHYkdPnz5NzA44dOhQriG948ePp7TqEASRkZHJzc2l7FVSUtKl\n33t7e7vQTUZo2IkGov2KhAmKIRAIpFNMmTIlMzMzLS0tISGhpKSEcnEWQRDKpLiysrJC0UFG\nRsbCwoLHTXI0mnz+/Hnz5s14ySwEQb59+7Z27Vqa0VpaWt6/f8/paFtbW3BwMOUhbW3tLs0v\nLSUlxVe0RDfT1NSEFRH51YpA/uuAhh0EAoFwQVJS0szMzMrKSl5enpOMo6Mjj41dDY0mjx49\nAg89fvyYVFuWCOY1p5lu48aN+vr6QUFB/Gsqmvz8+XP69OmKiop6enrq6urgpkMIpEuBhh0E\nAhFB4uLiHB0d+/fvb2lpefLkSaKPqotYvny5g4MDsWXIkCFHjhzp6nlBdu7caW1tTWwZN24c\nlkauoaEBlG9vb2cwGJxGU1BQoCk+i5Gfn+/s7BwTEyOQvnxTV1e3ZcuWYcOGGRoaLlq0KDs7\nu3vm5YWmpiYHB4dXr15hH6urq9etW3ft2rVfqxXkvwqYxw4CgYgakZGRU6dOxf5fUFCQlJQU\nFxf35MkTSmE2mx0QEPD+/XsxMbFJkyYtWLCAr5KpOOLi4qGhoXfv3o2MjGxtbR07dqyPjw8v\nRUsTExMfPnxYVlY2ZMgQHx8fynQhfCEjI/P+/fvr16/HxMRgJ+Xh4YGlUKEs0aasrLx27Vod\nHR13d3dTU1NQ4PLlyxMnTiSm7aVk+fLlEyZMkJWVnTZtGmUlWaHQ2tpqa2ubkpKCfczJyQkL\nC0tKSjI2Nu6iGfni1q1bYJqbbdu2JScnNzU1WVlZLV26lJe7AgIRnG7a7Cc8YPAEKuo7QzFg\n8IRo0G3BE1lZWffu3QsODq6oqCClHcEIDQ0Fe7W3t9va2hLF7O3teYxRICJYSTEUyE6ioaGR\nmZkp2FC8wGAwhg4dyul1ICMjc/v2bcqOnz9/dnJyokySRwn9U5oXWlpaIiIi/P39ExMTURRt\nbW2NjIz09/dft24dOJ2joyObzf748aO/v39kZKTA91vngyco1SNiamr6a2MXYPBETwdGxYom\non3XYkDDrsfBZDLBZGndYNix2Wzi25ST8bFr1y6w7+HDh0HJ06dP86uDYIZdWloaGF1hY2Mj\nwOytra08ChcVFbm6usrJySEIgmUMIaKgoEAZaspkMi9fvkxvspC4desWvyeC8/HjR319fXyo\nUaNG0deXU1ZWJobuGhsbf/78WYB5O2/YHThwgOuV8fLy6swUnQQadj0d0awVi6IozVZfXgR6\nOh0dHWw2W7TPEUVRBEGYTKZg62I9AsxsFYHvsby8fPv27Y8fP25vb1dXV9+5c+fatWuxL66j\nowP7t+tO8+LFixcuXMA/NjY2UopJSkqCOjx9+hSUDAoKog8UBRHsewwNDQXrlcXFxRUWFmpp\nafEyQlBQ0M6dO3NzcyUkJCZNmnTmzJlBgwbRd9HQ0Lh161ZHR0dSUhKY6bepqenly5eLFy8m\ntXd0dNBUoeWkm2Aly+rq6ubNm1dcXIy3JCQk0HdhMBjv3r3DP2ZnZ7u4uHz+/Jkm2IUSNpvN\nYrH46kLC2dn5xIkTNHsWEQR5+vTplStXOjNLJxGNxw4NWF5DUT1HLPckjUDPM+xQFGWz2eDT\nkAhXgZ4OdteK/DkiCMJ1W0+Phpeb+feHxWK5uLjgr97KysotW7ZEUWPzAAAgAElEQVQwmczV\nq1cj//97bG9v7+T7kgYePUm2trbgpW5ubgYlm5qaBPhSBOhCGcqAIEh9fT0vO+2ioqLwmlcd\nHR2RkZFjxowZOnSotLT0uHHj1q5d26tXL6I8iqKPHj16/PhxVVWVqanp5MmTOc0OnguKovTG\nCohglxFBkKCgIKJVxwvggyI3NzcsLGz27Nl8jdPR0dHJe1VfX//UqVNbtmyhuVwMBuMX/tVj\nNkFPf+zQgz126K2fnosIGnZiYmISEhI0Wz3a2troBUSA1tZWNptNemqLGNiaiIKCggh77Do6\nOpqamnr6vfro0SPQoXLkyJGNGzfKyMgwGAwWiyUnJ9eZlGZsNvv58+dpaWnq6uoODg4kb1Z5\neTnYRVJSkvh63rZtG6UdY2VllZ6eTmq0trbm90upqanh1CU9Pf3169cMBmPMmDGjR48mHhoz\nZgwor6GhYWpqykvKuoMHD5JaGhsbY2NjEQR58+ZNcHBwfHw8tuSKsWbNGj8/P+z/CQkJd+/e\nlZaWBuOFx40bB54Li8UyNzd/+fIlV61wbGxsBLu36+rq+JLX1tamrN5bW1vLrwKNjY2ysrLg\nCjVfrFq1ytHRMTw8vKqqKicnBwyJHTly5C/8q29vbxcXF+/pjx16RPsVyWQyxcVpU5p02Spw\nVwH32KGivoEAA+6x6ymA5gVGVlYWKow9dlVVVZaWlviwioqKgYGBRAELCwtw9qVLl+7YscPB\nwcHDwyMsLIzT4Pn5+SoqKsSOGhoapaWl/CrJaY/dgQMHiBatq6srMTKDzWaTMqQgCHLv3j0e\nJ+UaXLlv3z5c+PXr1/TCGFOmTKGci8lk5uTkGBgYcOpIetMYGBgIfG8/ePCAq55iYmKzZ892\ncnI6cuRIdnY25c8/mu+dE0KvPNHc3GxmZkbUSlZWNiEhQYhT8AvcY9fTgcEToolo37UY0LDr\nKZw/f57yvYsFUnTesJs/fz5p
cAUFBWIZq3v37oECvMeWpqenz549W0VFRVVVdd68ednZ2QIo\nSWnYvXjxArwyx48fJ8o0Njbu3LnTwMBAVlbWysrq6dOnvE/K1SEhLS0dEBBQW1vr7e3N5Sf+\n/0dFRYXSrmUymZWVlVlZWe7u7ljmFBKjR4/G6sMiCCIlJbVjxw6B67eeOHGCq8Ny2bJlxC6e\nnp4kgZEjRwpw13VFSbGysjJvb29tbW0FBQU7O7uPHz8Kd3x+gYZdTwcadqKJaN+1GNCw6ynk\n5eUpKCiQXqvTp0/HjnbSsGtsbKR8x//1119EsdOnT+M66Ovrv3z5srNnxSeUhp2bmxuouamp\nqbAmpTSwQPT09HgRw7l8+TI4F2bYNTU1oSh67NgxsBcxiBXj5MmTApwUvlhMxMXFhRjn4enp\niWmC09jY6O7ujgtMmjSJUwVbeoiGXWlp6YYNG8aNGzd9+vQLFy5gG5tEAGjY9XREMyoWAoH8\nPvTv3//q1ave3t4tLS1Yi4mJibBS7dfV1WH7oElUV1cjCNLU1BQREVFSUjJs2LDi4uL09HQ5\nOTlTU9NO7pEiUVtbGxUVVVlZOXjw4AkTJvDeEVOSRFVVlVC0Sk1N5XGPf2FhIV8jU6pNZNu2\nbfHx8cSA4hkzZoDuyYMHD65fv57renFSUtKnT5+UlJQmTJigpaXl6+sLykyePHnVqlU5OTnF\nxcWDBg3S0NAgCSgoKPj7+588eTIrK0tXV5dmyZhH8vPzR4wYUVNTg318+fJlWFjY8+fPeXR8\nQiC/EGjYQSCQzuLq6jp27Njg4ODy8vLBgwe7uLgIy7TS1NRUUlKqr68ntZuYmLx//97V1RUP\nn7SysgoJCeExSwjvvHjxwt3dHbfGbG1tnz59ymNxiEGDBuGlpXBMTEyEohhY3kBYcNVQQkIi\nKCgoKirq48ePUlJSkydPDgkJAQ27pqam4uJimvxzLBZr8eLF+I46eXn5I0eOVFZWgpKZmZkI\nggwYMIA+m52mpqampia98jyybt063KrDePny5a1bt8A1Xwjkt6PbnIfCAi7FoqLuZ8aAS7Gi\nQef32JFqMyAIMnTo0NLS0r59+5La7e3thag5iqLFxcWk0AoEQf744w9QElyK9ff3B1cnEQSJ\njo4Wim5v3rwR2muAwKhRoyg3mRGXYkHOnj0LDiUuLk5/e+/fv5/UhVNE6qFDh4Ry0biCL8WC\nuwsQBFmyZEn3qNGlwKXYng7XpVjoVYZAIL81GzduPH78eO/evREEERMTc3R0DAkJiY6OLikp\nIUmGh4fzu+xIz+PHj2tra0mN9+/f55R/DufmzZseHh55eXnExn79+gUGBlZXV69Zs2bVqlX3\n7t1js9kC6zZ69OiBAweSGrW1tXnpiwfqmpmZeXh4YFl8xcXFXVxcgoKCBPC2Tp06FVyjHD9+\nvJKSUk5Ozv79+z09PQ8ePEjKTvfvv/+SurS2toLl4Hr16rVgwQJ+VeokKFWeMMpGCOS3o5ss\nTOEBPXaoqP8cwYAeO9FAWCXFWCxWTk5OQ0MD9vHMmTOUD7TFixd3WuX/Ze/evZSz5OTkkCSJ\nHjsWi9WnTx+w17179xwdHYktdnZ2nYnB/PLlCzEwon///l+/fuUaUmpgYJCamlpcXFxWVoaN\nw2Qyf/78yckbh8vQeOwoU95YWloGBwcTE+kpKCgQHZaUwR9z584dO3YssUtAQACPF6S6uvrT\np0+VlZXgofb29tTU1KysLPpCwLjHDkxDgyDI1atXedTkdwZ67Ho6MCpWNBHtuxYDGnaiQRfV\nig0JCeFkuERGRgprlhs3boDjy8vLg4VZiYZdQUEBpWJTp04FGw8fPtwZDZubmx88eHD06NEH\nDx7gz4SkpCSSZWloaHj9+vWjR48GBgYyGAwBJqI37ObNmweemqysrKqqKqmxb9++uAKUm/n2\n7NnDZrNfvnx58uTJf//9l7JwLUh9fb2Hhweezc7Z2bm8vBw/eufOHTzeon///s+ePeM0Dm7Y\n/fjxQ0lJiajYpEmT6I3CngI07Ho60LATTUT7rsWAhp1o0EWGXXt7+6hRoyjtp7Vr1wprlubm\nZtD4OHjwIChJNOwoK2EgCEK58d/S0jIoKGjhwoWTJk1au3atYEk6QGpqarZv325tbW1jY7Nr\n1y7ibZaQkODp6Wlra+vp6RkfH8/jgPSGnZeXF3hqnEJMYmJisF4BAQGkQ3369CkqKhLgfMGi\ntFOmTMES6UVFRZEOycnJpaSkUI5DTHdSUFCwYsWK4cOH29ranjx5ErTmeyi4YSfYndAjEO1X\nJDTsRBPRvmsxoGEnGnSRYYeiaE5ODmW9AS8vLyHOkpWVZWtri40sIyOze/duSrcN0bALCwuj\nNGhIFQgw1NTUSDZHl5YluH37NkmBGzdu/Pjx4/nz5ykpKTQphekNu+fPn4OnZm9vT3kdnj9/\njnc8f/487hgbNmxYXFycACeVnZ1NOdGHDx9QFJ0yZQp4iFMYRFckKP7dwAw78E64efPmr1ZN\naIj2KxIGT0AgENHEwMDA3NwcbB85cqQQZzE2Nn7z5k1paWlycnJtbe2RI0e4FkXIycmhbKdc\neSQljWMwGB4eHgJrS09dXd3q1atJjcuWLTMyMnJwcBg2bNjo0aM5GUn0zJgxY+vWrcQWa2tr\nyjzGEhISxBJw69atq6ioSE1NzcvLS05Otra2FmB2Thf8x48fCIL8/PkTPETZ+N9DfX09eCes\nXbuWawpDSI8AGnYQCKSnAmbZsLCwWLZsmdAn0tLSGjZsGDEOgAYwfS7Gzp078aJbGLKysqBY\nenp6UVGRAEpyJS4urrGxkdRIzP+ckJDg4uLS1tbG78goig4fPnzMmDF9+/Y1MzM7ceKEl5fX\nzp07wbSCe/bsIYXuSktLDx48uH///vxOisPpgmOzU2a2E3q+w57Fp0+fwDuhqakpNjb2l+gD\nES4wQTEEAuk+2traUlJSamtrhw4dymNuDhpsbW2jo6P37dv3+fNnJSUlJyenw4cPcy110NXM\nmDFDT0+PlHhl+vTpVlZW79+/37lz5+vXr1EUHT9+PIPBAHeAIQjCNZ2KYFDW8CDx7du3V69e\nzZo1CzzU1taWlpZWWFgoISFhYmIydOhQPKx1/fr1Fy9exP5fUlKya9cuYiYXMTExCQkJQ0PD\nDRs2+Pj4CONU/g/m5ubW1tbx8fHERhMTE2wNffny5XFxcaQuy5cvpxyqtbU1NTW1sbFRKPdn\n19Hc3JySktLc3GxhYUEZgk0PpzuBlzsE0gPoniVhIQL32KGivoEAA+6xEw2Ie+zevHmD13qS\nlJRcv369aIQZokCC4o8fP/br1w9/zI4aNaq0tBTsxWnVNSsrqyuULC8v58XqvXjxItg3PDxc\nR0eHKDZ48OCkpCTsZLmOaW1t3RVnhPPz589hw4bh0xkbGycnJ+NHN23ahB+SkZE5fvw45SCv\nX7/GvzVJSckNGzbQbDr8hTx9+hT
3OMrKyh44cICv7lVVVZmZmeCdIC0tXVJS0kU6dzOi/YqE\ntWIhEMhvQXFxsYuLC76Jh8VinT9/XkNDY8+ePb9Wsa7AxsYmPT09Ojq6qKjIxMRkwoQJlDVG\nx44d6+/vT2qUlJTEfUWpqannzp3Lzs7W09NbunTppEmTOqOVhobG8ePHiVYOJcTceBj5+fkL\nFy4kFXZLS0ubO3ducnIyLzUw4uPjm5ubsUzInaGysvLvv/9OSkpSUlJycHCQk5N7+vRpdXW1\nubl5aGhoVlZWdna2vr6+nZ0d0XA5ffr0ypUrY2NjpaSkxo0bR7nsW1RU5OLigpcRY7FY586d\n09TU3LVrVyd1Fi7fv39ftGgRXpe5tbX1wIED2O3B+yDq6urgnXD06NHf2UkJ4YNuszGFBfTY\noaL+cwQDeuxEA9xjd+TIEfD5o6qqKhpfMVhSDEXRpqamr1+/0uTvqKmpIbnBEARZuXIldjQs\nLAwvEYFx+vTpzqsaEhIybdo0Y2PjadOm2djYkGY3NTUFE93t27eP0xvk6tWrlEESIJwialtb\nW1NSUnJycrjeCfn5+aRNikR69+79/ft3gS/LoUOHwDHV1NQEHrCLWLt2LainmZkZ7yPg6U6I\nd8LTp0+7TOVfgGi/ImFULAQC+S2gLPZVU1PT1NTU/cp0NR0dHbt27VJVVbWwsNDV1Z0wYUJW\nVhYopqKi8vjxY2NjY7xlwYIFp06dQhCkvb196dKl7e3tRPldu3bl5uZ2UrdZs2a9fPkyKyvr\n5cuXYWFhxKQkFhYWjx49AkM6aAq15efn4+lgaBg1ahSlu+7cuXMaGhrDhg0bMGDA0KFD6Vd1\n161bV1lZyeloQ0NDZ+JmKM+xurq6ublZ4DG7Ako9OeXEpod4J8yePbvTqkF+F+BSLAQC6Q5A\n1xSCICoqKpTV1jkRHh4eERHR0tJiY2Pzxx9/UNak+h04fPjw8ePH8Y/v3r1zcnJKSkoCjRtr\na+vU1NSkpKTy8vLBgwfjRl5ycnJFRQVJuK2tjbhPsfOoqam9ePEiLS0tKytLV1d3xIgRlMlc\nKL87DD09vdGjR69aterSpUt4o7i4ODF4olevXv/5z39iYmKePXtWX18/cuRIDw8PGRmZO3fu\nbNy4ERdLS0ubNWvW169fdXV1KeeKiIigP52PHz8uWbLE2NjY3d1dX1+fXpgE5Tmqqqr26tWL\nr3G6Gko9wdVzyH813eY8FBZwKRYVdT8zBlyKFQ3wpdiCggKwFIGvry/vQ5F2EVlaWtKXN+1O\niEuxra2tlN6pa9eu8T4gJ98VX+VKhRWb8vPnT0VFRVAZHR0d7MTZbPatW7fs7e0tLCwWL178\n9etXPz+/KVOmjBgxwtvb+8ePH9u2bSN2NDExqa6uHjRoEDjm7t27KXVgs9m8xzvLyckFBwfz\ndY75+fmkGmIIgvz5559CuHxCJSUlBUy7c/nyZd5HgCXFejqw8oRoItp3LQY07EQDYlTsq1ev\niM4YHx8fJpPJ4zgPHjwA398bNmzoSt35gGjY5eXlUVobu3bt4n3A5ubm3r17g4PwuI0sNjZ2\n/PjxMjIyioqKc+fO/fHjB9+n9H8JCQkhpYszNjbmsVAEpadt8eLFlA7X+fPncxrHzs6O8sJS\noqqqyq/58vLly759++IjrFix4veM2g4MDMRTnEhLS+/cuZOv7tCw6+nAqFgIBPK7MHXq1Kys\nrMTExNraWgsLC75y0gYFBYGNT548AXMU/3LU1NQkJSVZLBapna+kuL169bp48aK7uzuxccOG\nDaamplz7pqSkTJ48mcFgIAjS1tYWFBSUkJCQnJxMKl/GFzNmzPj06dP3799zc3MlJSXNzMws\nLS1JsR2cePr0KdgYFBSkoaFRUlJCaqe5ShcuXBg1ahSPm95qampiYmLmzJnDizDGtGnTvnz5\nkpqa2tTUZG5u3pmcyV3K/Pnz7e3tExMTGQzG8OHDicYoBILAPXYQCKQ7kZOTmzBhggAdKV/n\nv9vGdgwFBQVXV9c7d+4QG1VUVObNm8fXOEuWLNHW1vby8sILUfj7+9vY2IAF70ns2LEDs+pw\niouLT548eeLECb4UICEnJzdu3Ljp06fz25Hya2IwGFu2bDl48CCxUVZW1tPTk9M4ZmZmX758\nOXToUFJSkoKCgqOjo7S0dGhoaGFhIWVIgQBxOXJycuPHj5eSkuK3YzejqKjIl/8S8l8FNOwg\nEEgPYPjw4aGhoaTGESNG/BJluHLx4sWioiI8wZu6uvqtW7cE8KzExcURy4vV1dUtXbp0yJAh\nQ4YMoemVnJwMNn79+pXf2YVCR0cHycrEMDEx2bdvX1ZW1v3797EWeXn58+fP03+nxsbGt27d\nIrbs3LmzvLxcR0cHrJrA4+2RmZl5/vz5mJiYurq6jo4OZWXliRMnrl+/nhfnaF5e3vnz5zMy\nMvr27bt48eKJEyfyMiME0rV026qwsIB77FBR30CAAffYiQbEPXadoa6ujhTnKCcn9+XLF8FG\na21tFe6tRZnH7t27d35+fo8ePRL4W6YMgdy8eTNNFxaLZWRkBPZydnbma2pSNjsmk1lZWUkf\nrULq0tbW1t7ePnnyZMpXj7S0dEREBIqicXFxV65cuXfvHi9lDzg92/fv308af9WqVdxPEkXD\nw8MpYzJkZGRCQkLo+8bGxpLiGDjVtPitgHvsejoweEI0Ee27FgMadqKBsAw7FEXz8/NdXV1V\nVFTk5OQmTpz48eNHAQYJDQ0dOnSohISEgoLCwoULCwsLO68YysGw6yRsNpsy+YirqyulfGpq\n6rRp02RkZCirXAQEBPA47/Xr142MjMTExJSVlVetWoUZATSGXX19/YYNG1RVVREEMTAwuHz5\n8uvXr62srCQlJenXNOXk5BQVFcXFxc3MzB4/fkyjUmVl5ejRo7HzkpCQmD17NumOYrFYFy9e\nNDY2lpCQMDAwOH78OC+3XFtbG82WPmlp6ZSUFE592Ww2aEDLysp2Jkly9wANu54ONOxEE9G+\nazGgYScaCNGwwxH4rnj58iXpTWxiYiKUnCldYdihKGpoaAgaHHv27AEli4qKaIrBe3l58Tjj\nf/7zH1LfSZMmffr06dGjR69fv25sbCTJs9lsR0dHUhfB8gtyKn5QWFioqalJEh4/fjxJrKOj\n49OnT0+ePCFWiaUnPj6eXiV1dfWqqirKvtnZ2ZRdzp8/z+Psvwpo2PV0YOUJCAQiaoiJiQnW\nccuWLaSWjIyMK1eudFqjrgIsVKqqqrpixQpQ8ujRo1VVVaTGAQMGbNu2LSIi4vr167xMx2Qy\nt2/fTmp8/fq1paXlvHnzJk2aNHXqVFKRg6ioqGfPnpG6gBHBvAAWsUVRdMuWLfr6+uXl5aRD\n7969y8jIwD/m5OSMHj3a0tLS2dnZ3Nzczs6urKyM64ykwh4glZWVWCEQ3vtyHRMC6Wpg8AQE\nAvm9YDAYd+7c+fbtm5aW1ty5c3nZw8
4LHR0d379/B9tTUlKEMn5X4O3tXVFRcfjwYazo+8CB\nA//9919imYGOjo4HDx4kJSWBkSUIgqAoevLkSd6nKywsrKuroxGIi4tzc3N79+4dvtorxKuX\nm5vb0NCAZ+/78ePHpk2bwsLCOMmHh4ebmJggCMJisRYsWJCUlIQfev369eLFiyMjI+lnNDc3\n79WrF3ZtOZGSklJUVHT37t2ioiJDQ8MlS5Zgi87GxsZ9+vQBjekxY8bQTwqBdDXQsINAIL8R\nhYWFEyZMwHP8Hjx48Ny5c5Q+Kn6RkJDo1asXmAKDMg/w78OuXbvWrl37/fv33r17GxsbE1c5\nm5qasHVSTn35PTXK8hIkYmNjExMTra2tee/CI1JSUngswp07d5YvX97a2kojjydMfv/+PdGq\nw4iKikpNTaUPH1ZUVDx9+vTKlStpZJqamkxMTPCMLYcPH37x4oWlpaWUlJSfn9+CBQuIwt7e\n3qNHj6YZDQLpBuBSLAQC+Y1YunQpsXJDW1vbpk2biItunWH+/PlgI7/p5boNNpt97dq1GTNm\nWFpaLly4cMGCBYsXLyaWGtu+fTuNVYcgCMns4Iq6ujovCTuISeNmzJjBS7VfXV3dAQMGjB49\n2sLCQklJidIcnDt3LhZsUVBQsHLlSnqrTlpaGv/iiBlhOOnJiRUrVoSEhIwdO1ZeXp4yVOXL\nly/EPHxVVVVubm7YWvP8+fMjIyOnTp2qo6NjZWV14cKFy5cvc50RAulyummzn/CAwROoqO8M\nxYDBE6IBX8ETNTU1lPvnTpw4IRRlamtrzc3NiSMfOHBAKCN3PngCvNWXLFlC+dC+e/cuJqCu\nrk7zbLe3t+dUsY3mzyo3N7dfv370b42EhARil7t378rKyuJHtbW1LS0tifKrV6/GJNvb28eO\nHUs5pqmpaWVlJSZ26dIlegXExcWJQb4xMTGUYmlpaXx9BQwGw8bGhjiCg4MDzRXooU8nGDzR\n04ElxSAQSI8BC1AF2xsaGoQyvrKy8qdPn+7fv5+UlKSkpDRz5kySCdL9NDQ0HDhw4N69e1VV\nVaampnv37sXcbK9evbp9+zZll1WrVjk5OcnLy3O6LDo6On5+frNmzQKt5JiYmN27dyclJcnL\nyzs4OBw/fpyUKk9fXz89PR3b46itrR0cHJyQkEAUsLW1HTlyJLHFzc3N2to6MDCwuLjYzMxs\nyZIlvXr1evLkSVxcnKys7PTp08ePH//06dP169cXFRWB36+4uDibzc7IyBgzZszjx4+HDh3a\n2NhIc8WUlZU/f/5sYGCAt4wdO9ba2poU4mpvb29mZkYzDoisrGx4ePiLFy8+ffokKytrb2//\n8+fP58+fg5K+vr6JiYkNDQ0WFhaHDh2aNm0aXxNBIF1LN1mYwgN67FBR/zmCAT12ogFfHjsW\ni0WZsyMwMFBgBaqqqjIzMzn5roSFYB67jo6OqVOnkk7W398fRVEwHpbI27dvGxoaKDPrIggi\nJibW2toKThcbG0t0rSEIYmRklJycTFPqvri4mJhh2NbWFjPO+OLJkyc8vo9kZWVramrCw8Np\nZNasWQNOkZeXN378eFzG3t6+vLycXz1RFG1oaGhvb8c/fv78mfLyklpevXolwFy/Cuix6+nA\ndCcQCKTHICEh8ddff5Eax48f7+zsLMBo6enpEyZM6NOnz6BBg9TU1DjlrfiFhIWFRUREkBq9\nvLxu3rzJte+FCxfa2tr4mm7z5s2kjWs/fvwwNzfX1NS8evUqZZe+fftGRkZ+//49ODj4/fv3\nz549oyyGQQOKouvXr+dRuLW1ddmyZdOmTZsxYwalgLq6+u7du8H2/v37x8TEpKamhoWFZWRk\nvHjxAg+t6AzDhw8HF8RRwOm4YcOGzs8FgQgLaNhBIJDfCE9Pz1u3bg0cOBCrfODj4xMUFES5\nq52e+vr6mTNnvnv3DvvY0NCwdevW321vO2X9Vjab7eXl1atXL069pKSkRowYQVP7dezYsZTO\nPE5dqqurfXx8Hj58iH1sbm7++PFjZGQknsvD1NTUwcFh0KBBNOfCierqak7BDZSkpKSIiYnd\nu3dv/fr1WGIRWVlZCQkJGRkZe3v7N2/ecCq5KyYmNnjwYEdHR8H05MSVK1f27NmDbWfU1dUl\nbcLDSE9Ppw/1gEC6E2jYQSCQ34slS5ZkZmY2NzfX1tZeuXJFTU1NgEH8/f1zcnJIjb6+vqC7\n5RciLy/P6VBwcDCn4Akmk1ldXc2pr7y8PCf7lWY6BEGwcqvBwcFGRkZjxoyZOnWqnp7ewYMH\nabrwgpycHGWJMxp5BEGUlJTOnTtXXV3d0tLCYDAYDEZTU9OLFy/43TbXeeTk5A4fPlxRUdHS\n0lJYWEiZzURGRoa+eBoE0p1Aww4CgXQhRUVFISEhDx8+JCYx4QVSeXV+oaz4VFFRQZOANzs7\n+969e48fPwbrHHQRjo6OpE1vOFlZWTdv3uQU95qVlUW5PD1o0KC///7769evmZmZ4FH6Fe3s\n7Oy///574cKFeM2G1tZWX1/fGzducDkNWuTl5ceNGwe2i4uLUyY98fDwIH7EbgMpKSk2mx0Z\nGXnz5s3379+z2ezOqCQYmCZz5swBD82ePVsApzKCIPX19c+ePfP3909OTu6sfhAITrds9RMm\nMHgCFfWdoRgweEIEOHHiBG64yMjI7N+/v9umPnDgAPi4k5OTo4yiYLPZxG1S8vLyly5d4ms6\ngdOduLi4UD6ZDQ0NURS1srKiPIqVQ12zZg2xUU9Pj5hSbuXKlR0dHcS56urqhg4dyuldwKlQ\n2+DBg1EUZTKZlZWVAtTV3bZtG+jNkpSUvHv3bnZ2trS0NLHdxsaGcpDk5OSBAwfiYqNGjRIg\nhoMXSMETlOzZs4eos7GxcUVFhQBzPXv2jLgRcO7cuS0tLQJpzR8weKKnwzV4Ahp2PRLRvmsx\noGHX0wFLiCIIcu/eve6ZPSMjA9ym5uPjQyns5+cHqvru3TtcIDc39/r16//8809iYiLlCIIZ\ndmlpaZyWKY8cOYKi6N27d8FD+vr6+N9FVFTUtm3bVqxY4SUkxbwAACAASURBVOvrCzr/Tp06\nRZqxvb39+vXrfAWjKCoqolSGHYvFev78+ZkzZx4+fNjQ0EB5gteuXQMHnDlzZlZWFibQ0NDg\n7u5uampqbW3t5+dHOUhLSwvRqsOYOHGiABecK7wYdiiKfvjwYceOHT4+PleuXKEMQOZKfn6+\nkpIS6aTWrVsnwFD8Ag27ng407EQT0b5rMaBh19OZNWsW+FK3tbXtNgUCAgKIHiw7OztODqdh\nw4aBqi5ZsgQ7eubMGaLN9Mcff4D5QQQz7Nzc3ChtqdmzZ+NTkJYmVVRUsrOzwaEo62IZGRlx\nmtrPz4/HxW4zMzMUMOwKCgqIF01bW5toB+NQehwXLFjA11Wi/IWAIMj379/5GocXeDTsOs/x\n48fBM5KTk+uG2aFh19OBCYohEMivAd+qR
aS0tJSTfFNT07NnzwoLCwcMGODo6MgpSRvvLFq0\naNKkSRERETU1NcOHD7e1tRVA1ffv32/atInYHhAQYGZmRpl0g19KSkoo29etW4fv2bp586aD\ng4Ovr29DQ8OIESPu379PGQPB79VetWqVo6NjVFRUY2Pj2bNnc3NzOUlS5vJwd3dPSUkhTrRw\n4cK0tDRlZWUBtIqOjk5OTlZWVp42bRqYToXTWZSWlpqamnJSm56kpKSPHz9KSUnZ2tqamJgI\nNkhnoLwyDAajrq6OvqAIBMIVaNhBIJAuwcDAIDExkdQ4YMAASuHExERnZ2c8L4aRkVFYWFjn\n81Zoa2u7u7vzompFRQWp0dDQEEEQypRy169fF4phZ2xsTFkOi1j3bOPGjefPn0dRFEGQkpIS\nLS2txMRE0BYhVmLA4XS1Mfr16+fl5YUgyKtXrygNO2lp6S1btvj4+JDac3Nz37x5Q2osKSkJ\nDw93dXUlaQUWbMUuLEZzc7OTk1N0dDT2UV5e3s/Pj/SVcToL4ji8g6LosmXLrl+/jn2UkZHZ\nvn1754N/+YXy+1JWVsYyvEAgnaKbXIfCAy7FoqLuZ8aAS7E9nYSEBHDX1+vXr0HJlpYW8D1n\nYWFB2vvfdTx9+pQ0u4yMzOHDh3/+/Onk5AQ+NrFtZ1lZWb6+vn/88QdWYEqAecvKysDAAmIA\nAZi+GEEQbW1t8O/i58+fvXv3JknysqOxpqYGDDSRlpY+depUcXExLkZciiUV78I5d+4caXCw\nHpecnBwW+YEBriDLycmRaryyWCywwiz9W4CGCxcugJoHBwdjR7ttKba6ulpPT4+kxrFjx7ph\nargU29OBe+xEE9G+azGgYScCPHz4UEtLC3tp9enT59atW5RiL168oDQUPn361G2q+vn5gZvZ\nZWRkiIWqcEaOHLl//35i3IO4uLhgMb9hYWHEjYDm5ubEP+3p06dTXpmvX7+CQ0VGRuJOLEVF\nxTNnznCd/dWrV+DCn5aWFljDjWjYVVZWUmb3CA8PB6e4evWqiooKJqCrqxsSEoIf6ujooMzD\nvGfPHtIgRUVF9vb2uMDixYvr6+u5nh0lpEK3GHPmzMGOdpthh6Lo58+fLSwsMAWkpaV37tzZ\nPb9koGHX04GGnWgi2nctBjTsRIO6uro3b97Ex8fTBA9yqnb/4sULkmRSUtLNmzefPXvGKQyz\nMzQ3N2/evBlUg2h4Yezdu5dSYdzxwxcdHR1v3ry5fPlyTk4O6RClFYIgSEREBCZQW1v7zz//\nrFy58uzZs5WVlUwmMy0tLSEhgZe8JBUVFWBlXltbW8rCvqTgCbBK2Lhx4zgV5G1pafn06VNK\nSgrJZmpqaqI8uzlz5ty4cePly5ekp1xBQcH79+8FKwKL079/f3DGcePGYUeJhl17e3tkZOSN\nGzdiYmK6yOTq6OjIzs6OjY3tzucANOx6OtCwE01E+67FgIadaNDS0lJZWUlpK+BwWtrLzc3F\nZZqammbOnIkf0tLSevnypdC1pUzzNmXKFLxdXV39xo0blFlqEQShf9oKwOLFiyknKigoQFH0\nyZMnxMVuKSmpGzdu8D74v//+C44sJiZWVlYGCpMMOwaDsWnTJnwd2dnZuaSkRIATpK88279/\n/9jYWAGGpWHq1KngRN7e3thR3LBLS0sjRmaMHDkyLy9PuJr8KqBh19PhatjByhMQCOQXY2Vl\nRTTaMJYuXaqvr49/3Lx5c1hYGP6xrKzMzc2NU1SpwFRWVoKNKIqmpKQUFRVlZWWVlpZ6enpS\ninHq3hmOHz9OSuGLIIi9vb2enl5OTo6rqyuxRCmTyVy+fDmPNQxQFE1NTaVs53oWubm5b9++\n9fb2/vjx49WrV799+/b48WNtbW1e5iXx559/0hzNz8+fP39+bW0tL0OhKJqWlvbq1av8/Hwa\nMV9fX1KLgoLC9u3biS3t7e0LFixIT0/HW5KSkjCHAi+a9ETa2tqSkpKio6OFfg9DfgHdYV4K\nFeixQ0X95wgG9NiJBrx47FAUra6u9vDwwHatSUlJrVu3jriYyGAwKLOfgAl4OwnljrpVq1aR\nxDw9PSkfpx4eHsLVB0XRgIAAScn/TV+goaFRWFiIoignk2jt2rVcx8zOzqYsZo8giIyMTGNj\nI9gF89gVFxdTJjd2dXUVeHH87NmzpCQpJG7evMl1kKysLGtra7zL/PnzaTbhPXnypF+/fpik\nmZnZmzdv8EOYxy4yMpJSky9fvgh2jr8VoMcuOjoaj16SkpLavn17T3/wivYrEi7Fiiaifddi\nQMNONODRsMNobm5OT08Hd+MVFxdTvmi3bt1KFKuvry8oKOjMPQNGoSoqKoIJgdPS0sCAX1lZ\nWVI4J1daW1tzcnJoduszmUwwx+/s2bNRFF21ahXlNZkxY0ZpaSn9pJQJmTFsbGwod0Nihh1N\nyQp3d3e+zp0Ii8XKzs5+9+4d5chcY0VbW1vBNXQnJyf8whYWFoJ/aHl5eeDyMWbYcdr0+fz5\nc4HP8feBZNgVFBTg0S04Qv/J1M2I9isSLsVCIJAeQ69evUxMTEDnnLq6OmXBeDwINDMzc/Lk\nyUpKSv369dPQ0Lhy5YpgCkyZMiUgIACP5DUxMQkLCzMyMiKJmZmZhYaG4mIIgmhoaISGhpqZ\nmfE4UW1trbe3t7y8/IABAxQVFbdu3cpgMECxt2/fgrkAg4ODs7OzKROhIQjy4sULbW1tIyMj\nTgUbIiIiiLmFScTFxSkoKCxduhRcAC0qKnry5Amnjrdv3y4vL+d0lB4JCQkjI6MhQ4YQfZM4\nXPPVvXr16tu3b6TGkJAQeXn5qVOnampq6unpKSsrT5gwgbj63L9/f07Lx8LNnPebc+3aNfC7\n/vvvv3+JMhDh0G02prCAHjtU1H+OYECPnWjAl8eOBjDXmr6+Pnb16urqQCuHU2oVXsBiFbEA\nBXpycnJev36dk5PDV0kxNpsN1ltbuXIlKElZaxVBkIiIiIqKCvoVTFlZ2bi4OHDMixcv8vJq\ncHBwIP71MZlM4h5HShISEni/CJSAwbZDhgxhMBj0vShT01Gio6NTWVlJMxTmsevo6Jg0aRKp\nr4uLSyfP7jeB5LHDklSDdP5v9hci2q9I6LGDQCCiwN69e7ds2YKHYVpaWoaEhGCZ565duwYW\nTtizZw8vw8bExNjZ2amrqw8cOHD37t1YAg5xcXEjIyMwfyyIgYHBxIkTOTnPKCkrK5szZ05o\naCip/cqVK2CRhsbGRspBJCUl1dXVX716RfQakmhtbQWtYQRBdHV1edHz+fPnsbGxxBb68Agx\nMTHKENe4uLhp06ZpaGgYGxtv3bq1vr6eeDQpKcnBwUFDQ8PQ0HD9+vU7d+5csWIFniRv/Pjx\nQUFB4Ko3BoPBOHjwoKmpKe9VQIqLiy9dukR5qL6+fuvWrRYWFjo6Ovb29ps3byYG9EyYMKGo\nqEhNTW3w4MFHjx5ta2vjccbfH8pvTUNDA4za
gfQYus3GFBbQY4eK+s8RDOixEw2E5bHDqK2t\njY2N/fHjBzGvGFjzCoNrLjdwR52dnZ1gGct49Ng1NDSAC7s4eHY6nKtXr1JKRkZGYgIsFuvr\n16/Xrl0jViHD0dfXB3VoaWnhsVbbv//+i/fC9tg5ODhwEp4/fz4414cPH0hiNjY2+Na3pKQk\nktFmbm7OYDCqqqo+fPhATHZDCaekM/RQvj6YTOaYMWNIku/evSsqKnr//j1oCy5YsIBet98Z\nksfux48f4D6Hw4cP/0INO49ovyKhxw4CgYgOysrKo0ePNjQ0JFZ9UFNTAyXl5eXl5OSampqO\nHz8+b948Ly8vrJoCUWb16tWkXtHR0YGBgV2hOcapU6d+/PjB6Sh4IpqampSShw8fXrly5du3\nbzMzM69fvx4cHEzp2wPzDyMIIicn9+jRoyFDhnDVFuzu5+dHmepvxowZlPsa165dS2qJi4vz\n9/fH/r9hwwZiuhYEQZKTky9fvqympjZmzBhishuQ8PBwsBAcL2A2Danx9u3bJPckgiBr1qzR\n0dGxtrbet28f6VBgYGBUVJQAs/+GGBoa3r17l+j6Xbly5c6dO3+hSpDO0k0WpvCAHjtU1H+O\nYECPnWggXI8dJSkpKeBq3cqVKysqKkhlBoiPDk7Z0TZt2iSADjx67Gg8XpS1cRsbG8GSX0Qo\nS3vhXLhwgZMmTCbz48ePgYGB8fHxUVFRYLiArq4uMYMJ5rFbuHAhUUZTUxPLY0c5RXt7u5iY\nGKjV8uXLMQHKFDZubm68XMmDBw9SnjKlLUtiwYIFpAcLpxBjBoORmZlJeah76rp2BZQJipua\nmqKioh4/fszVUdojEO1XJFePHUUIEgQCgXQDTCbz9evXubm5BgYGkyZNwvfPcSUvLy82NpbF\nYo0ePdrY2Hjo0KEXLlxYv349Hlg6ceLEv//+28vLi5Sr9u7du7Nnz16wYAGCINLS0uLi4mw2\nmzQ4aCPW19e/fv26srJy8ODBxAW7Dx8+fP/+XUNDA9xozwlO28W0tLTmzJkTHh7e0dFRWlpq\nbGxsa2srLi4uISHBYrFoBuzo6OB0yNPTc82aNZyOSkpK2tjY4Nnsnj9/7ujo+PPnT+yjpqbm\nvXv3SCt0oaGhDx48ILaUl5dHRUUtW7YMQZCCgoLY2Njv379LS0ubmZmZm5t/+fKFUn/8IsjK\nyoKb1ThdIhKURiGCILdu3Vq+fDmeH0dOTg4MNw4MDNTX19fX129sbFRUVDQwMKCMxpWUlJSU\nlOSkT11dHS969hTk5eXt7Ox+tRYQIdFtNqawgB47VNR/jmBAj51owMljl5GRQSzZZGpqmpGR\nwcuAx44dw1/qMjIyeCq7vLy8K1euHD9+/NWrV2w2Oyoqirhci0PMITxlyhRQ4P3798TpXr58\nqaGhgR+dPHlyfX19bW0t0ZjT0tJ68uQJL8pTRrn26tULbLSysioqKnr79i3Pz/L/YdWqVadO\nnRIgQLW1tTUwMPDw4cO3b98Gs/symcxFixaB0ykqKqIoeurUKZIBROmrwzA1NcUy7f3xxx/g\nUR6vJGXGlhEjRqAo2tjYGBAQcOjQofv377e0tNDHDmNQBpTMmjULRdG8vDzKLjt37uT3Cv8m\nwJJiPR2YoFg0Ee27FgMadqIBpWHHZDLBzf7Dhg1jMpn0BQyeP38OvmLBAqmlpaWcluRcXV1x\nsZycHNImNtLbuqSkRFVVlTTCkiVLQBNHVVWVPi0wBpvNJm35p4k9tLOz41QCgYbg4GCuavAO\nk8lsaWlBUbSwsNDFxQWcTkZGBozw5Yq9vT2KohUVFaSAYk9PTx4Vy8nJAZ12bm5u4P3Tu3dv\nXlQiJenV0dEpLCxsbGzMysqilBexpVgRQ7RfkdCwE01E+67FgIadaEBp2L1//57yZamgoIAg\niLq6+uHDhynrMcydOxfsZWNjgwvk5uY6OztTrqxhnD17ljhgbW3tkSNHFixYsHLlyqioKNJ0\nlAnSJCUlKXe2Xbx4kZcLwmaz79+/7+3t7ebmxmlrF05iYiKnNUdKJCUli4uLef1uaPnx44eT\nkxPXnBeUblFe2L17N4vFampqOnXq1MKFC5cvX/706VPe1Tty5Ag4JuYmVFZW3rFjB/6EnDp1\nKo8qHTlyZOnSpfPmzdu/f7+npyd2N+rq6mL/IfH69WuhXOfuBxp2PR1o2Ikmon3XYkDDTjSg\nNOxoChjgbNmyBRwNzEmBIIiBgQF2tLGx0djYmGZMU1NTynpZnNi7dy+PNgGCIHv37uX34ty4\ncYN+zHfv3p06dYp3HQ4dOsSvDpRQ5nwWOvv27RNYQzCVMYklS5ZgkmlpaZSL3SC3bt1qaGho\nbW3lutuM/h30mwMNu54ODJ6AQCC/HZxKNhE5c+bMpk2bSNlTDQ0NwbQUeGa4K1euZGdncxpQ\nXFx8+PDhq1evHjNmjIeHh6SkZFRUVFBQUF1dnYWFxYoVK8BsXpR69urVC0VRcEs+TYI6TtBf\nBzExMUNDw3HjxjEYjH/++ae2tlZNTc3JySk9Pf379+/a2tpubm7q6upXrlzJy8szMDBYu3Yt\n5a41EkVFRVeuXPn586eurq6Hh8fgwYNJAikpKWvWrAFzPuOoq6uzWCxOYcW8c/jw4bKyMnd3\n93HjxnEVTktL8/f3LyoqMjQ09PHx4XoL3b59e/PmzRYWFmZmZgkJCfv27UtMTBQTEwOzQONg\n3+CLFy+io6NJh+Tk5EaNGpWent63b99FixZt2LCBh/ODQH4R3WViCg3osUNF/ecIBvTYiQac\ngifAmlog4eHhpF7JyclycnIksejoaOyop6cnj889GxubzZs3E1t0dXWLiopI0zU1NQ0cOJDU\n19fXF/TkGRsbc82HDNLR0TFx4kROSi5btgxF0TNnzhAbVVRUUlJS+J0I5+3bt/Ly8vhoMjIy\npNpr165d47r86uzsvHXrVnoZvuCaDvfOnTtEreTl5cPCwvr160c/LFhWrqOjgzJiBkGQqVOn\ndnR0NDQ0HD58mFLgw4cPAl/23wrosevpwKVY0US071oMaNiJBpwMu+rqaldXV2xTFKcISspS\np6GhofgbXUND486dO/ghTstz9MnecBwdHcHpMjIyxo4diwlIS0tv376dxWIxmcytW7fi+Vms\nra2PHDlC0ra5ufnr1695eXn016e0tHT27NnYOPh1EBcX9/HxaW5uzsjIANNtmJubc73szc3N\nX758Ic3OZDJBY0hBQaGkpAQTKCgoAO1mEC8vrx07dtDL4KufsrKyGzdupCyMQeTRo0dMJrOu\nri4pKYkUhlJWVgbuctPT00tKSrK0tKQZMyQkBLwy5eXlzs7OJEkXF5fy8nIURRsaGs6fP085\n2rx587he9h4BNOx6OtCwE01E+67FgIadaECfoLiqqio+Pj4vLw+seWpsbMxkMil7sVisjIyM\nb9++kQIsKGMy5syZQ+MVIyIhIcFJz4KCgk+fPjU
2NhIbGxoa4uLiiLaFoqJiWFgYm80+dOgQ\nbtmMGDHiy5cv9FepvLw8ISGhpqamrKwsMTERf4KdPXuWUtX8/HxOQ7HZ7AMHDuD2maWlZXJy\nMnbo+vXrlKNhxnFiYiIv5XERHgxlKSmpWbNm4YHJ/fr1u3v3LinsFKRXr154KMb06dMLCgow\nte/fv08pHx8fj6JoXl5eQkICuA6upaVFE2FdUVGRkJDw48ePhISEiooK4nf68+dPypSKSkpK\n9F9iTwEadj0daNiJJqJ912JAw0404LHyREREhJKSEv4S1dDQSExMFGA60jqapaVlfX097nLj\nCr/LqWDEpZSUFBiwqaOjU1VVJcDpHDt2jFJPmpx/pKVbBEH09PSqq6uLioqIV5jI1atXy8vL\nQdtaiMjJyZ0/fx7cxUiDlZUVZrhTJv9DEOTcuXP4WScmJhLTDSorK+PldPmioaGhvb2dMkZH\nRkZGgAF/Q6Bh19OBwRMQCOR3oby8/OLFi+np6RoaGosWLSJumZ8yZUpWVtb9+/fz8vKMjY3d\n3Nx4ySsLsmfPnpkzZz5//ry2ttbS0tLFxUVCQsLS0hKsRg9iZmZG3H/GlZaWFjDPHJPJPHTo\nEKmxuLj4+vXr27Zt431wDMqlRjU1NU6hAyhVHpDCwkJ/f/+ampr6+nrKXlZWVv/++29ZWRm/\n6vEOg8GIi4vLyMjw9/ffu3cvWPADJDExMTw8fNasWZxW6o8cOfLhw4fRo0evWLHC0tIyMzPz\n3r172dnZ+vr6rq6uRDuPX+bNmwfG6FhZWQk84G/Ihw8fAgICKioqTE1N16xZ06VmPaS76S4T\nU2hAjx0q6j9HMKDHTjTAPXbJyckkj9Hx48e7R4fKysq+ffsSp1ZRURk5ciTpYYgHYfDI58+f\neX/S+vj4CKY8mBY4ICCAkzCnSNXVq1e7ublRHlq5ciWKokuXLuX9XATD2tq6sLCQ9EXQc/r0\naZSzxw5n4MCBwnJBYR47BoMxdOhQ4hSysrKfPn0SyhS/nKqqqj///JN4dr179+a6W6BnIdqv\nSK4eOwFzS0IgEAhfuLu7kzxGBw4cSE1N5XccNptdWlrKi8sHp0+fPh8+fJg7d662tra6urqL\ni8vHjx8jIyM3b948YMAAJSWliRMnxsTE8F7yFYOTk4MyPbLAHpHbt2//+eefAwcOVFRUtLGx\nCQ4OpizthaGoqEiZs01LS6uxsRFsl5KSwmIFSBU4MGjKggmAlpaWj49PSUkJX10QBAHryZLI\nysoSwBtKg6ysbFRU1MqVK/v376+srDx9+vS3b9+Sfgm0t7eTfJxtbW1d6vUUFpmZmUePHiW2\nNDQ0uLu7/yp9IMKn22xMYQE9dqio/xzBgB470QDz2HFKivbXX3/xPlRzc/OWLVuwsAA5ObnN\nmzfz8lfAZDIPHjyI1ZWSlpb29vaurq7uxAn9L+/evaM8KQsLC1KLpKQkqQRt17Fu3TrS7AoK\nCj9//iSVMsOQlpbGen3//h2Mh9XW1ubhHcIrgwYN4kteV1cX++s4fvw4V2EtLS2hXD3MY0cv\nU1RU5OLigpnv6urq58+fz8/Pnz17Ntaiqal56dIloSjTRXDK58I1grsHIdqvSOixg0Agvx5O\nTheuzhgia9euPXXqFJYZmMFgnD59evXq1Vx7+fr67t+/v6GhAUGQ9vb2a9euubm58eXw4wQn\n5bOzs01MTIgtLBZrw4YN7e3tnZ+UKydOnJg5cyb+UVVV9datWwMGDKCsi4W9BhAEMTU1vXbt\nGnGtfN68eerq6kJULDMzk16A6CDU09MLDAzE9ME0pKe1tbWT6vFIW1vbrFmzHj9+zGKxEASp\nrKxcv3796NGjg4ODsZby8vJVq1ZxXT7+hXC6D7vtGkK6Ghg8AYFAupx+/fppamqWl5eT2q2t\nrXkcITMzEyzA5e/vP3ToUDExsUGDBtnb24NpOGpra0+ePElqfPXqVXR0NKdEtbxjYWEhLS0N\nviabm5tBD2VSUtLDhw95qQzRSeTk5EJDQxMTE79+/aqqqjpp0iRVVVUEQUaNGnXnzh2S8KhR\no3Bzys3NbcqUKW/evKmrqxs5cuSIESN8fHxSUlK4zjh//vy8vLzExMTOqD1o0KA///xTU1Mz\nIyNDR0dn8uTJ+JryqFGjuHbHb6S2trbnz5/n5ub269fPwcGBx2JivBMYGPjlyxdSI7i+vHv3\nbi8vL4EL6XYpI0aMABvV1dUNDQ27XxlIl9AdfkOhApdiUVH3M2PApVjRAA+eePjwIenhw1fG\n16CgIPpH2bBhw8DSEfHx8ZTCxEwZneGvv/7i/WG7e/duoUwqGK2trcOHDyfqIycnl5SURNOl\ntLSUFFsKWirC3YdHeUtcu3aNPnOevLz89+/fURTNyMggFgvW09PjN+KB61Ls7t27eTwXLOPx\nb0hVVdXcuXNJ2j548OBX6yVMRPsVCZdiIRDIb8G8efPCw8NtbW379OkzZMiQo0ePgg4kGrim\nt01JSfHw8OCxF+bE6jxbtmzR1dUF2ynNHWFNKhgyMjJRUVEbN240MjLS1NScOXPmhw8fKJ03\nOFpaWvHx8X/88Yeenl6/fv2GDx8OLmGj/3edVE1NTVNTk3dHLIlHjx5dvXqV2IIVru3o6CA2\nOjo6rl69esCAAdra2nPmzImLizM1NWWz2a6ursRiwYWFhQsXLgSr+nYGHrPwSEpK8pW0r5vx\n8/M7duzYkCFD+vTpM2HChBcvXixYsOBXKwURHt1lYgoN6LFDRf3nCAb02IkGPCYoRlG0qanp\n6NGjM2fOdHFx8fPzI5WdaG1t5WWpCNwADhoZffr0qaysFNYJUtaDB3OeycvL//z5U1iTCsDX\nr1+9vLwMDQ319PQmT5585coVFouFouiNGzfU1NSkpKQUFBSGDRs2bdo0Dw+PqKgoUncmk9m/\nf3+u1793797Tp09ft24dpb3LC2ZmZsR5t2/fTjkLeIKfPn2iHPDFixe8XyWuHrsfP36Ay7ug\nQ3HhwoW8T9rNwATFPR1YeUI0Ee27FgMadqIBj4ZdXV0dcRENQRBbW1uSbZeQkMA1aQhYryI7\nO5uYzldZWZmvNz09QUFBxOL0GNu2bSsrKxsyZAje0qtXL5rkc93Aw4cPQT2nTp26cuVKTleS\nlGWQyWRyKlxBiZycnJqaGv6RFLpBM5SOjg5xXk4J9lpbW0nnGB4eTil569Yt3i8UL1Gxt2/f\nJtp2w4YNO3fuHDGmeMSIEcKKvO4KoGHX04GVJyAQSA9gz549xEU0BEFiYmIuXry4ceNGvMXK\nyiozM/Px48e5ubmNjY1gHVVJSUnQq2dkZJSWlvbkyZOMjAxdXd25c+cKK9izpaXF29ubFDwh\nKyu7efNmTU3NL1++BAcHp6SkaGlpzZw5k8cyrF1BY2Pj8uXLwSCPiIiIiIgITr18fX3nzJlD\nzFEyYMAAMG6AEwwGY8CAASdPnszLyxswYICLi8vPnz
8jIyOxgl3jxo0LCQlZuHAhFklKhOQX\nHDhwIOX4ISEh8+fPJ7aQfhjg8JtmhSuLFy+eMGFCWFhYeXm5ubm5k5OTpKSkk5PTs2fPKisr\nLSwsZs2axbWcLgTShXSbjSksoMcOFfWfIxjQYyca7XeCaQAAIABJREFU8Oixo3wrT58+HTv6\n6tWr8ePHq6qqDhw4cP/+/c3NzUwmEyzouXXrVky+vb391KlTgwcPVlZWHjVqVGBgYFecWkxM\nDOVDVSjTtbS0HDhwYNCgQaqqquPGjQsPD+elV0ZGhrOzs6ampp6enoeHBxZNAtY945F//vmH\nzWbv2LFDQUFBTExMSkqK3xE+fPgwYcIEaWlpcXFxRUVFKysrQ0NDVVVVW1vbyMjIHTt2kOTF\nxcXPnDljY2OjoqJiamp69OjRkpISyg2L3t7e2CmzWKxLly6Zm5srKSkRfYQYM2fO5OsZwovH\njpK3b9/a2dmpqakZGRlt3769vr4eP1RaWurt7d2/f38NDY3Zs2d/+/ZNgPEFhsFgHD582MTE\nREVFZcyYMSEhIdBj19OBS7GiiWjftRjQsBMNeDTs9PX1wZf35MmTURQNDg4mtTs4OLDZ7OLi\nYmdnZ6xFRkZm+/bt+CzLly8ndfHz8xP6qUVFRVFaM3fv3u3kyGw228nJiTTs48eP6Xvl5OSQ\nljj79etXU1PDaY2SK+fO/T/2zjOsiSxs2JMQSCBApCNgQaWJIAguNhAVFXWxIPau2BAVxba6\n1rXv2hZdy67rYsFVsa+iIqBiL4ANAQVE6b2TEJL5fpzrnW/emTOTIQSUvHP/8IJTnzkzzjyc\n85QDM2fOJBTi1SwDAwOqHTUAfe7dGzduzJw5E/O01dHRmTdvHqHNpEmToArljBkzwFVDtUPw\n7/Tp0xt7JKqcYhcbG0uQwdPTE5gwVlZWdunShbAmHz58aOwUSkP2ijh69Cir2LVqWMVOPVHv\npxbAKnatAolEQv8oMlTsoAHeNm7cKJfLoWb4V65cAR0rKiqSk5PB+DU1NfX19a9evSK319HR\nqaqqUtVVA0pLS8nZGhAEycjIwNpUVFTIZLLGjnzjxg3ysKampkBXoGLChAnkXitWrCgqKuLz\n+eQqhcTExEB3y5YvX3748OHc3Fwwb25u7smTJ8nN6LU6BEE6dOggl8ulUunFixdfvnwpkUig\nXsyEtK2AQ4cO1dXVpaenk6t4PN6DBw+qq6sbu+yosopd165dyWKEh4ejKLpp0yZy1fDhw5WQ\nTQnIGieCIAYGBt9tKBZVod6fSDbcCQsLS7Pw5s2bAQMGCIVCPT09Nze3uLi4poy2a9cuwjma\nnZ3dypUr8/LysrOzye2xcLj6+voODg4xMTFOTk66urpCoRCa9bK2tlaJvLT0GBgYkOPYbdy4\n0draGkGQY8eOtW/fXiQS6erqzpw5s7CwkPnIUAfPwsJCExOT7du3S6VSaC9oiOAXL14YGxtD\ns3IJBAIaGYKCgsC+Drlq7969CxcutLa2Xrp0aVVVVdu2badNmzZx4kRCM4XOFllZWcuWLRMK\nhWPHjnV3d7ewsCgrKyM3e/v2LaFET09vyZIlurq6w4YNI7dvaGioqqpSqFaqipqamuTkZHI5\nuB1UN6XZxUIQhOJBKisro0rxx6IesIodCwtLo8nNzR00aNC9e/caGhrkcnlCQsKIESMSEhKU\nHtDS0jIxMXH27Nl2dnZOTk7Lli178uSJUCikUj7w5Q8fPhw+fPi7d+9QFJVKpe/fv4d2ge6u\nNZFFixZdu3bNx8fHysqqf//+ERERGzduRBDk2LFj8+fP//r1K4IgdXV14eHho0aNolLIyFBd\ndVlZ2bp169atW8e8FygMCQm5fPlynz59hEKhtrZ227ZteTweNIWUjo5O3759jx49GhYWRh+J\nTSKR/P7771jswJMnT+7fv9/Dw8Pa2trPz+/JkycK4/ZxOJwDBw5gXh0lJSX07bW1tbt27aql\npVVVVSWTyWQyWVpaGlVL+qFUiKamJtRVAqw8zU1pAah2altMAJZvQwttHaoO9igWVfd9ZgB7\nFPs9Q840j+B8HfAwj2NHBdlJAkEQfMqEfv36KXzRWVtb059jMkEsFn/9+hX6TOJtuaRSKVSn\nOXv2LNXIFRUVz58/x359/fo1zbVwuVxwDFpbW5uTk4Oi6Nu3b/Pz86Eh3w4ePEiY68uXL9Al\nxXjw4IFEIvn69Wt1dbXChUUQ5OHDh4QlAj9TKaD4C2EyPp7evXszafb69Wvmt7W8vByLa0hz\nFCuXy79+/Qp9kqEbh/fu3UNRlJwHD0GQoKAg5uI1hZSUFLIO17VrV9bGrlXD2tipJ+r91AJY\nxe57ZuDAgeTPFSECGaDpil1KSgrhlFZbW/vXX3/Fng2FyQCEQiFe+VCCvLy88ePH83g8BEHa\ntGmza9cuguUcXrH78uULVIx169aRR05ISMCMCDkcjp+fHxh5+/btNFd05swZPz8/gmIkEokI\n0V6GDRuGl/PIkSNMcqe6ubmBK2WYO+Hw4cMoimZnZ48dOxbsXRkaGu7Zs6e2tpbGkZYq6IxK\n8qteunSJyW19+vSpu7s76GJra3vjxg2oYieTyXbs2AFOlnk83sSJE/Py8vANvnz50rZtW7wA\nmIO2XC7HXHwAjo6OeJ/Z5mbfvn342UUi0b1791jFrlXDKnbqiXo/tQBWsfueIeeaBF8scsum\nK3Yoiubn51taWhKm++2330AtNCPCiBEjli1bFhAQsG7dOnIO2UYhlUp79epFGJ8Qvxev2JWV\nlUF9DrZv304YGWoKhr2ywREzeRwEQfAhl/FoaGhs3rx56tSps2fPPnPmDP6/T1RUFLRL0zl7\n9qxEInFzcyOU79u3Dx+lGcPS0nLr1q2lpaXQVercufPGjRv9/PyaIlJsbKzC25qZmUmwAhQI\nBLGxsWTFjqxk9+7dmxA9u6KiYufOnRMmTFiwYEF0dDS+Si6Xnz9/PjAwcPLkyQcPHiSHVm5u\nnj59umTJknHjxm3cuDE/P58Nd9LaYRU79US9n1oAq9h9z5w7d478Nf3ll1/ILVWi2EGn09XV\nBd9IcsALBEEiIyOrqqoIX19AY98PFy9eJI+vo6NTW1uLtcErdvX19WDHS+H6QNORcTgcbEen\nvLzc1NSU0IA+/YaXlxc2fk1NDbbyTBKyKYGRkVFxcXFERAS5Sl9ff8uWLeRy7EgaaumFHeh7\ne3srJ1KHDh3q6uoU3lZo1o0BAwYQFLvi4mLoNqfC6DPfLaxi19phvWJZWFhUz/jx4xctWoQv\nGT58+Jo1a5ppug8fPpALq6urgXfC5s2bBwwYgK/icrkBAQF6eno6Ojrjxo0DZ6PV1dWhoaEi\nkcjAwIDevZTAgwcPyIW1tbVZWVnQ9nl5eeSECgiCrF+/vk2bNqtXr66trQUlSUlJ5GYoimIR\nW0Qi0Z9//ok/0NTQ0OjRoweNtMCf4O7duy4uLsBN2MfH5
927d1QHxHgamy9BKBSePHnSyMgI\neoMqKysnTZo0YsQIfGFQUBDmPyuTyci9YmNj165dm5ub27ZtW+iWHgEfHx/8Sb2xsfHZs2eZ\nOAekpKSQC/HeGLdv33Z2djYxMcHuFx7oJbOwfA+wKcVYWFiU4eDBg1OmTLl7965EIunXr5+v\nr2/zzQX1ReBwOCDsGZ/Pj4mJ+e+//+7fv//PP/+UlJTI5XLQRiqVRkZGvn///uXLl4GBgdjO\nX3Fx8bp168rLy3fv3k0/dXFx8T///MNcKgRBRCIRl8vFZMBTUVGxe/fu3NzcU6dOIQhCzpQA\naN++PfbzH3/8gVdAZTLZnTt36AW+ffv26NGjgdNrQ0NDTEwMODqk6YUNrrCNtra2p6cnh8Px\n8fGZMmUKsC2DLgWXyzUyMvrvv/9u3br18OFDPp8/aNAgvN+GlpYWWSqpVLpjx47w8PDc3Fx6\nSebPnz9hwoQBAwaUlpaeO3fu06dPXbp0mThxIjQYHhloM+xCnjx5gq0hFIU+vyws34yW2jtU\nGexRLKru+8wA9ihWPVDJUWxWVhbZkH/EiBGEZuSochgrVqwgF2LupTSsXbsWOiDBBZiQ4WDU\nqFH0L97ExEQURZ88eULelDIzM8PGoQp4Bj3qxYCm8VAh165dI6xSZmamrq4uoRn9aRGKok1J\nqNq9e3clIj/jgZ6wb926FRzF9u/fn2Z2kUjURMPNbwh7FNvaUc1RrKamplAo1GWAkv9HWVhY\nWGDI5fJjx46NHj0a+d/+kt26dfvrr78Ijaki2CEUkVrlcnloaKi9vb2ZmdmQIUMePXpEbgMd\nk8vl/v333zRiHzt2zNnZmaYBCLrbq1evkJAQvG4nEAjwG3JUV2RmZkYTqi0vL49m6qZDiBhc\nXV197NgxfX19/IU4OTkdO3YMQZDk5GR/f39LS8vOnTsvXLgQC9RcW1vLZIOQinfv3nXp0gWa\nWYEh/v7+BHV/woQJwcHB2PhUHfX09P755x+yNw8Ly3cCo6PYefPm3bt3Ly0tDQQHl8vlnz9/\nfv36dffu3e3t7VFYaHIWFhaWprN27dpdu3bhS3x8fBYsWDBq1CjyrhVNqgOq47mzZ8+CH6Kj\no6Ojo2NjYwnmetAxLS0tLSwsaMQ2NTV99erVtWvXbt269eeff5IbYCFa9u7dO2XKlH379mVn\nZ//www9btmzB24dRXZG1tfWjR4+cnJyqqqrItVpaWhKJhEa8JoKPLyOXy/38/O7du4dvsGXL\nlp9++onH46WlpXl4eGDx8I4cORIbG/vq1StdXV2BQMDhcBr1+Vi8eDEIy4cgiEwmy8zMHDx4\n8P3795kEMoTy66+/TpkyJSYmRiqV9u3b19PTE1tPkUhEDpjs5OQUHBzs5+dHCG7CwvJ9wWTf\n78KFC05OTiASJsaHDx/s7OyuX7+u/H6iUrBHsai67zMD2KNY9aApR7GfPn0iv7I0NTWpMl1C\nt9wAcXFxZPdSMuAvVTzQtK2rVq0iNKNKNi+VSm1sbAjd27ZtW1lZyWQFysvLoSchIA/pkiVL\noFfRTA6wAF1d3S9fvmASnjlzhtzGwcEB1EKjlmzatAnUNio/BJfLhQZ5ad++PZOVZAgWxw56\nBE8+g26NsEexrR3VhDtxdHS8cOECufzw4cPdu3dXUjRlYRU7VN2fWgCr2KkHzBU7sVi8f//+\nCRMmTJs27Z9//pHJZNAoJwiCREVFUQ1CDjmmoaGxa9cuFEXv3LmD3wCjyiWalpa2bt26MWPG\nzJ8/Py4uDkVRQkaHfv36nT9/fs6cOf7+/tu2bQOhSagUOxRFX758aWZmhnVv06ZNTEwMVvv6\n9evFixePGjUqNDT048ePKIrK5fKIiIiZM2cGBAT8+uuvV69eJexNjh07FvStqakhZ2JYtWqV\nk5MT9NKUQCgU4jc7NTU1XV1dHR0de/TosXDhwg0bNtjb20M77tu3LzAwEJrSavjw4SiKxsfH\nN9bGjip2Mc0L/9mzZ0FBQaNGjVq9ejVeH6UCU+zEYvGgQYPws3h4eIwePXrBggXx8fEKx/me\nYRW71o5qFDstLS1CxEXA+fPn+Xy+kqIpC6vYoer+1AJYxU49YKjYVVVVEYLZDh069PLly9AP\nOdC3qEhKSiJoGw4ODkD9ys/P//3330NDQ48ePRoSEgJVHfT19fElmzdvJlhTEdQLCwuL7Oxs\nGsUORdHy8vIjR46EhoaGhYUVFhZi5adOndLS0sKGEggEN2/eHDduHH58wkajlpbWnTt3sBFk\nMtmlS5cCAgK8vb0XLVoE8pL17NkTum4M6d+//+DBg/v167d69eomDgXF39+fxs0FvyAEqFxG\nOnXqVFxcTF72gwcP4pvp6OgoTECCzzwhl8uvX7/+008/LVq0iBDKDvyp0EphFbvWjmoUOwsL\ni8mTJxM+sQ0NDcDUoEkCNh5WsUPV/akFsIqdesBQsYOqWbt37yaoWQiCmJqa4iMDkwE2+wTI\n2Tmh0enI0zFh5MiR9IodlMLCQvIxKxMBzM3N6bMXbNq0idyL+bnnkydPaMZpOtu2baOJMzd4\n8GBoedu2bb28vKh6TZs2jbAImZmZ5FkUpgyGphQj588QCATv379v7B3/TmAVu9aOarxiAwMD\nIyIibG1tg4KCNm3atGnTpuDgYGdn5+vXr8+cOZPJCArJzs7+5Zdfpk6dOnHixJ9++ik5OVkl\nw7KwsLQKbt68SS6Mi4s7duwYYU/rn3/+oddRIiMjyYVkUzlPT8/Q0FB8iYmJSWVlZSOE/h9u\n3boFjVpHT3x8POZVgMFEgPz8/ISEBOzXx48fr1q1auvWrenp6aBk/PjxBHM0T0/PS5cuMRQM\n7BG+efPm9OnTDLswZ/To0SBlCLS2f//+CxcuJNvS8Xi8q1evnj9/nir2wo0bN1AUffny5b//\n/vv48WMQvY88S2ZmZmMDC2dlZZE9ZMVi8e3btwmFqampFy5ciImJIdzWioqKO3fuREZGZmRk\nNGpqlZCYmPjvv//Gx8czDMfNog4wUQ9lMtn27dsJfkAmJiYbN26k/+uHIXK5fO7cuWFhYTU1\nNWKx+PTp0+PGjaMyLmZ37FB1/3MEwO7YqQcMd+yg+V4HDhyIoui7d++WLVs2evTolStXAis0\nKuRyeUhICNQSy9zcHNrl9u3b8+fPHzt27NatWwlRPJjD4XDy8/MbuzLQNFwMAYfRMpkMf1TK\n4XDmzp07b948cnvw9z3e1I+Gly9f/vjjj0rLRoWxsXFYWJhcLqc5h8Vazpo1y87OTl9f39jY\neNSoUVjQuKqqKmtra3IXHR0dT09P7FdnZ+cNGzZAB3/58iXNTSHv2OFzUeDZsWMH1qa+vn7a\ntGlYVdu2bW/dugWqIiMjjY2NsaqFCxc2Mfwec4qLi318fLCp7e3tk5KSUHbHrvWjylyxcrk8\nKyvr2bNnT58+TU9PJz+dhw8fVu5xKS8v9/Pz+/DhA/i1tLTU
z88vNTUV2phV7FB1f2oBrGKn\nHjBU7MaPH0/+dq5du7ZRc/3+++9UusLo0aMVdpfJZPRBTGhQwl/y48eP5HGY+BNwuVzwips0\naRJzCbds2TJmzBiFzTQ0NLCUXyqnT58+MpkMeghOQCgUUn0C9uzZQ25PzgMBdQ3W09OjP8cn\nK3ZSqdTExIQ8FN7Qk5xMTyQSZWVlJScnk/PM7ty5s7GPinKA6I94unTpUl1dzSp2rR1VKnYK\nQRDk7du3yvVduXLlvn37Kisr6+rqIiIiAgMDqb4ErGKHqvtTC2AVO/WAoWKXmZmJj46GIEin\nTp2AxwNzbG1tIToCgujr63/69InJCFeuXCH0/fHHH6GunQQUZlmAQvC3BV99skUXmeTkZBRF\nmQiGYWpq+unTJyY2fFTOpyrhxYsXKIpOmTJFYcvVq1dDF62uro4Q/JnKYo+c5u7vv/+mvyNQ\nG7vz588Txpk0aRJWK5PJoKv6yy+/QPOdWFlZKfGoNBaQRpnMhQsXWMWutaNQsftecsWuWbNm\nw4YN4H+7gYHBhg0b8IY1J06cwFLraGlpyWSyiooKmtEUNmjtAHVHvW0mQFR65WyeWgsoiqr9\nsxoXF7d///5Pnz6Zm5tPnjx55syZ0E0pAwODmJiYbdu2PXv2TFNTc8CAAT/99BOKoo1anJyc\nHHKhubn59evXjY2NmQzl7e199erVPXv2fPjwwcTEZOzYscHBwdnZ2TNmzEhNTZXL5UKhsLa2\ntqGhgdAxLy9Pifu4Zs0ac3PzPXv2FBQUcLlcOzs7FxeXCRMm7Nq1686dO3V1daampq9fvyZ3\n7N+/f48ePerr65nPVVxcvHLlSpFIBF79hoaGXbt2vXXrFrkl1F5QJBL169evsLAwKSmpoaGB\nw+HweLxGCQCYNGlSbW1t586dp06devbsWZrkE58/f6Za0qtXr+7atSs6Orq2trZHjx6+vr6L\nFy8mN/P19fX29o6IiMjNze3SpcvSpUuHDRuGjXn16tWjR49+/vy5ffv2c+bMCQgI4HA4MplM\nJpMR8rwNGTJk06ZN+/fvr6ys5PP5AwcO3Lt3LzZOVVUV9DWVmZkJDR+dm5tbXl5OTiWnWqhO\nkNPT0729vdX+taPen0ipVEpv1PttFLuHDx/+9ttv4OcdO3bY2Nhs2bLF3t5+27ZtmpqaN2/e\n3LhxY1hYGBY/KT09/fnz5+BnkUhkbGxMf8PU+I7iUcJeu9Xxf+E+qvE1Xr58GTP5ysnJefXq\n1evXr6FHaQiCtG/f/ujRo/iSxq5Mu3btUlJSCIUjR47s2LEj86H69OmDT1QPBIuLi0MQZPbs\n2devX4f2at++vXL3MT4+HtNH37596+vre/bs2V9++eWXX35BEOThw4fQ89OioiKy8T49crkc\nHz7G2NiY6tvP4/HImuuOHTv4fP6cOXPAr2DboFECAEDQ6dzc3Pj4+Hbt2lFtLCEIYmVlRbWk\nurq62BIhCJKVlQVt1r59e09PT0xmBPdEHT58GDPCy83Nffr06adPn8AGG/m9evXqVcxBuK6u\n7saNG+CICZTw+XwDA4OysjKy/FCFz8rKiry8Ksfc3Bya2AOL3aPGrx0Mdf1EKr53KtweRBgf\nxdbU1Hz+H8RicUJCwsiRI+vq6rAGs2fPxtus1NbWVvwPX758AbFXqABHsTQN1ACwz/ytpWhe\nwFGsTCb71oI0I1KptLy8/FtL0VyIxWKy5ROCIE+fPm2mGU+cOEGYS1dXNyUlRSWDR0VFUb1F\nBQLB3bt3lRgzOjqaPBr48IMG9fX1Hh4eCl/0KsTV1XXhwoWEQhsbm4KCAujdbAo0QU8MDAw+\nf/7MfCX9/f0JI3h4eNTX10MbFxQUkKPl8Xi8z58/V1ZWSiQSfGOJRGJkZESW8PHjx1ibbdu2\nEWpNTU3z8vIyMjLISeEOHTrU+CdFGaZPn06Y2tnZua6uDhzFtowM3wr1/kRKJJLv8ShWR0cH\n7wQHRJHjlGvCHzTa2tpYgAOwda9wH7u5N7q/B/6PXKMaXya4NHW9wLS0tNLSUnL5kydPmqis\nfPz48cOHD23btnVxcdHU1MTKZ86cmZubu3Xr1rq6OgRB2rVrd/ToUTs7u6bMhfH06VNoubGx\n8cGDB11dXcF9BH+plpSUdO/evX379kqMmZ2d/e7du6KiIhRFe/Tocf78+RkzZhCSsTYTXC53\n4MCBw4cPr6ur++eff0Chm5tbeHh4QUEB9G7q6+vT2Et4eHg8e/aMqlYsFgcHB584caKmpgZB\nEGyHqVOnTsePH4c6SlPx119/IQiChXTx9vYODw/HPxt4EhISyNuNDQ0Nz58/9/X1JbxzkpKS\nyEljEQR58uQJlvZjzZo1JSUlYWFhYCvFzs7uxIkT5ubmCIJcvHgxMDDw8+fPCIIIBIJVq1YF\nBQUxv66mcPDgQalUimVD7tOnT3h4uEAgwFa7ZcT4hqjrNSq+LiV25qhAlHWeqKmpmT59elhY\nWFVVlUQiuXjx4tixY3Nzc6GNWecJVN0tQwGs80Rrhypg2KFDh5Qes7q6Gp+YwcHB4dWrV4Q2\nZWVlDx48ePnyJX0U38aydetW6OVkZWWh/5NSLC4uDq+OzJs3TyqV0oy5Y8cO6JhYtDahULh/\n/34URT9+/EjegAE0h69Dly5dHj9+HBMT8/79exAAgepu7tmz58WLF2TfT8Dr16/T09Ojo6Op\nzt8fPXpUXl4eHx//4sWL3Nzce/fuAa1LuXuUlZUVHR2dlpZG3+zu3btQYa5cuUJwnvj3338J\nPj0YYWFhhGGLiori4uKSkpIIN72+vj4hIeHevXvfxGXh69evd+/eBfvWoIR1nmjttBqv2M+f\nP2/evHnKlCkTJ05ctWrVmzdvqFqyih2q7k8tgFXsWjsymYwcaZbP51OFsWBCYGAgYcAOHTq0\nzBpiZr54evToAWpLSkpycnLIZ3Y///wzzZjkQ08o169fR1H08ePHTBqrCjs7O7wvc319PTSA\nHPDPHTlyJLlKU1MT6/7lyxeoD+/ixYub5W7RUlVVhU+AC9DV1S0uLsYrdomJiVSHxXw+H4vP\n1epgFbvWTqtR7JjDKnaouj+1AFaxUwPi4+MJWSJ2796t9Gjl5eXQbKEnTpxQnch0EMKV6enp\nvX79GlSVlJTs3LmTLJu+vj5NQFo9PT2o3kDAx8cHtCeEz2jWuCQIgmBRdlEUlUqlN2/eJNxN\nLEhvbW2tUCgkdI+IiMBf7IQJE8hTCAQC+sByzcTFixcJZnanTp1C/3e4E2ioZ0CLxaJrDljF\nrrXTasKdsLCwqB/9+vV79erVgQMHMjMz27VrN23atP79+ys9Wl5eHtSdkMazUiEoil65ciU+\nPp7D4Xh5eY0aNYqm8Y4dO7y8vM6dO1dYWOjk5LR06VJ8QGOoGJWVlRUVFeT9ITA1OaUYlPT0\n9G3btn39+rVz585nzpw5efLkp0+fRCIRNBIKFJFIVF1dDY0tMmH
ChMjISGgVuKLIyMgjR44U\nFhZ27NjxzJkz0dHRwEe4T58+WFjpu3fvzp0798qVKyADh7m5+blz54AlpVwuv3DhwuPHj4Gd\nGQGxWFxUVIQ3RqyoqDhx4kRqaqq5ufmECRPs7e0ZXmOj8Pf3f/Xq1eHDhzMyMjp27Dhv3jxX\nV1dCm+zsbHJHLS2t27dve3t7N4dU3xYURS9evPjo0SMNDQ1vb+/myD7C0kKoUItE2B27lkK9\n/xwBsDt26gHDAMVMoNqxCw8PV27AhoaGESNG4Ify8/NTLuNTSUnJrl27yLKJRCKaAZmEC0b+\n985co4ISY4wZMwaaT0xXVxdFUWiSBgRBbt26RVgfLpeLN6fT1tY+f/782LFj8W2GDBmCGZlJ\nJBJ6VV4gEODjISQnJ+Pl5PP5LbYdC8Dv2M2fP58scPfu3VtSnuYAumMnlUoHDx6Mv9KAgIDW\n+/pV70+kwh07Ve7kR0VFdezYUYUDsrCwsGCIRKLZs2cTCq2trcmpkxhy4MCBGzdu4EuuX79O\nk5eMnmnTpuGzggKoctcCyJknoOAjBkgkEiVki4qKmjFjBrkcGPmtXr2aXOXg4JCRkUFYH7lc\nXltbi/1aV1c3Y8aMixcv4tvcuXMHywa7devW+/fv0wgWFBSEt2ObNm1aQUEB9qtEIlm0aBF0\nq68FCAoKIhw9IwiybNmybyJMc7N7925C8J2EsPcwAAAgAElEQVTIyEhCXEmWVgMT9bCgoGDG\njBkWFhbQN5SKdFCmsDt2qLr/OQJgd+zUAxXu2KEoWlNTg0+Q2q1bt8TERKVH69u3L/mF1q9f\nPyWGAl6x9+/fx/uLLFy4kN4rVi6XE+IPGxkZ7d69Gwt+RhPpjQoHBwdo+W+//TZ16lR8oIRR\no0Zh/7+WL1+Of73b2tp++vRp0KBBjZ0d4ObmBoZ1dHSkaTZz5ky85zKVAtcUN+rGQvCKvXDh\ngqmpKRBDIBBs2bKlxSRpPqA7dm5ubuSVx4w7Wx3q/YlUjY1dcHDw5cuX+/fvP3jwYOhRCAsL\nC4sKef/+/YEDB9LS0qysrGbPnj1w4EAEQb58+fLbb7/l5+ePGDHCyckpICCge/fuPB7vxYsX\nf/zxR2ZmprW1NQgSBn7t2LFjUFDQDz/8gCCIRCI5ePBgXFycTCbz9PRcunRpYmJicnIyeerU\n1NRBgwYZGhqOHj168uTJhJBRmZmZe/bsSU5ONjU1nTRpEsEmz8vLKzk5+fXr18XFxc7OzlZW\nVvSXyeFwLl26lJWVdeTIkfz8/ICAAHD0OW/evISEBLlcnpqaumjRIvpBhELhxYsXKysrZTLZ\n8+fPk5KSoKFJqqqqTp06tW3btkuXLjU0NIwZMwacwGJLPWbMmI4dOxobG0skknfv3s2bNw+6\nPkxISkqaOXNmWVlZRkYGudbNzW3z5s3dunXT09NbvXr1lStXamtr27VrB82xgSAIQ0tEJaiv\nr//jjz/u3LmTlpbG4XBsbW0HDBgQFBSEBcALCAjw9fVNTEysq6tzdXU1MTFpJkm+OdBFbr6V\nZ2lemKiHRkZGV65cUZGu2VTYHTtU3f8cAbA7duqBEjt2UVFRBI/F3377LSkpieB3GRwcjKLo\nyZMn6V9xJ06cEIvF7u7u+EIQPFYh06dPxwv2/PlzwtncihUrQBXYsVM5SUlJCoUEW4wSiYQ+\n7PP58+eZLHXPnj2ZrExTCAkJQVE0NzeXrCdBNw7u3LnTHGtbX19PyB0H8PT0pN9nbe1Ad+ww\nPxg8CxYs+CYSNh31/kSqJtyJjo5OYWGhikRqKqxih6r7UwtgFTv1oLGKXX19PVnr4vP53bp1\nI394bty4oTBiiFAoXLt2LX0bGv777z9MNujB4rNnz9BmU+xQFCWbFeIRCARAAKpYxxjkiHrQ\npW5uzM3NCwoKUApNgoyfn18zvQQwQ0AyICi0ugJV7D5+/Ej4r2RsbJyTk/NNJGw66v2JVI3z\nhJub2/v375m0ZGFhYWkKb9++zc/PJxSCw0Fy49OnT1dVVdEPWFNTc+3aNaXlwSzKc3Nzoa9B\naL5XFXLo0KHNmzdDnWF1dHRu374NzpqpsilgkBtAlxoKML/T1NT09/efMmUKI7n/NxwOR1dX\nd9SoUQ8ePABWa1Tr5u3tbW9vz+Vyzc3NV65ceebMmWZKDEWzYs19T79DunTpcv/+fR8fH21t\nbaFQOGLEiAcPHuCj+bC0IhgZzO3bty8oKGj//v1YajwWFhaW5gAaUI0KhhHs8F6lNGC5SvFc\nu3atS5cu8+fPpxIsOzt7zpw5qampnTt3njdvnlgsPnHiRHZ2tp2d3dKlS7t27YpvLBaLw8LC\n7t+/L5fL+/bty+fzHzx4kJGRgaKoqalpz549V6xYgVnrAwQCQY8ePfT09Mj+sB07dvTy8gI/\nK1y3rKyskpISfG4M5kutqampo6NjZ2e3bdu258+fnzlzhmFHDKFQ+PnzZyaza2pquri4mJmZ\nOTo6zp8/H9tG2rFjx7Fjx8rLyy0sLHbv3g2yh9XU1Hh4eISGhlIl/qKB5vKpquRyeXh4+I0b\nN8rLy93c3FasWKFOVneurq7R0dEymYzD4TR37GuW5oXJvl/fvn3btWuHIIiOjk4HEqrYWWwE\n7FEsqu77zAD2KFY9aOxRbG1tLfQjDc1nhSCIhoYG/StOS0tr8eLFCt+EPB6vR48eVLX9+/ev\nr69vbDgnPp8fFRWFXRowwKfvYmRklJmZiV8QmnPk+fPnY81+/vlnhfKYmZllZ2fjl7qxznAa\nGho3b95sVBf87PijveHDhyvsIhAIHj16hKIoIcQaAUtLSxAYuVGsX7+easBt27ZBu+DzFCOw\nm9UqYDNPtHZUY2Pn6ek5iBoVicoUVrFD1f2pBbCKnXqghPMEeUNo1apV9OHQaPj1118rKyu7\ndOmCL4Sqg3/++SdNeJEDBw6QT+gIngdkzM3NsfAZmzZtYiKwr68vthSJiYlUzSwtLYuKirCW\nVVVVtra2CgcfM2YM1uXZs2dM5CFgbW0dHBysREcEQcaOHYvN/vHjR3IWMjI2NjZMVEn6jwKU\n7du3Q4fq1q0bNMvZhQsXyI2HDRvW2Hm/Oaxi19pp0VyxLQOr2KHq/tQCWMVOPVAujl1sbOyI\nESNsbGwGDhx46tQp8Bj8999/0C/xzJkzhwwZYmNjM2TIkCtXrly7dg38Onjw4MuXL4MBS0pK\nli9f7urq6uzsTOWOsGHDhvfv30+aNAmLdoHnxx9/RFH0xYsX/v7+dnZ2np6eISEhChUOBEFe\nvnwJZOjXrx+T9pqamliyij179kDbTJs2jbxHVVpaumLFCicnJ5rBQaoJwNSpU5nIQ4DL5crl\n8rCwMAsLC+hC0aCvr48XOD093d/fv02bNtra2lZWVkOHDoX2GjlypMKRTUxMGvWAoSgKna5N\nmzbFxcXQ9tBEFFpaWsqlKvmGsIpda4fNFcvCwvINKCkpCQ8PT09PNz
c39/PzI5iaKWTAgAED\nBgwgFFKlDbWzsztx4gS+xM/Pj9DG0NAQU5IKCwv//vtv8jgymaxr164RERHx8fHkPKHA7srd\n3R1LtBAbG7t//36F13Ls2LElS5Y4OjrW1NQobIwgSENDw7p164YNG+bl5QXNjYsgyPr168kp\nwgwMDEJDQz99+vT27VuqwfHWY1KplIk8BFAU5XA4wcHBwcHBDx8+9PT0ZN63vr4e/2unTp3O\nnTt39uzZV69eCYXC8vJyaC8mKXEZ2gtmZmaeOHEC5LrNyckhNxCJRFR53qD3AvzJwWRqFpYW\ng06xY5h9OSUlRUXCsLCwqAMvXrzw9fUtLS0Fv+7YsePMmTNU4WeZY21tbWlpSf4eN0q3QBDE\n1NTU1tY2LS2NUI7Z9vXu3Zt87kaepUePHjo6OvgUW1COHTsWHh4+ZcoUGn0LD4qiO3fu3Llz\n5+LFi+vq6sgNTExMoNldY2Jihg4dSq/i4BNjzJ49+9y5c0xEwoOPkOLi4qKrq8s8jK1YLJ4z\nZ87x48fBr5WVlf3791cYqy8rK0vhyEyegcjIyKlTp9LnZINGtgP069cPkxzfXqGVJwtLC0Pn\n+WLMjBaTlYWF5ftHJpNNnjwZ0+oQBKmrqwsMDCwpKWniyFwu99ixY4TCwMBAaGYweqAJYQ8c\nOCAWixEEIW+GIQhiaWlJKGnTps3evXuZTCeRSP7++2+q7TcqwsLC/vrrL3I5j8cj+/nW19f7\n+/sr3LhKS0v79OkT+HnIkCH0Kb/IcDic8+fPY7/q6uo2NrXu33//DTbMEARZuXIlkwjMChGJ\nRPv27aNvU1BQEBgYSK/VmZqabt68map2+vTphF1kHR2dQ4cONUpUFpaWoGWOhFUIa2OHqrsB\nAYC1sWulvHr1CvqqgWY+UIKnT5+OHj3a1tbWy8vr6NGjylk43bp1CypkXFwciqI2NjbkKn9/\nf+hQUVFRvr6+nTp18vHxOX/+/MmTJ7EQJAqZPn26q6urubm5qalpoxxUnz9/ThDj8ePHDPvu\n27cP6yWTycaNGwf2nGiCXHC5XC0trc6dO8fExJBX4M6dO8OGDevSpYutra2dnZ2dnd348eNj\nYmJ8fX2ho40fPx50NDQ0JNcyX4euXbs6OjrOnj378+fPCu/46dOnqcbp0KGDk5PTggULcnJy\nCLliCdTV1W3bts3Dw8PBwWHq1KmpqakK5/0OYW3sWjusjR0LC0uLQhUxWGEkYQRBUBTNyMjI\nycmxsbFp27Ytvqq4uDg5OdnMzKxnz56XL18uKytLTk4WiURyuRyvjjQ0NLx///7169empqa9\nevWiCm9GLyT0bJGqi6+vLzh3xtQUb2/v9u3bK7xYBEGWLl2KxVgxMDCgMjKjEkYqlX748KG6\nutrR0ZHJ8uL7Arhc7vnz5+/fv//mzZvevXvPnz8/ISGB3OX169f29vYlJSXZ2dkPHz40MDAo\nKSmxsrICB7uDBw+GhiPJzc2FKtAVFRUoiqanp0NlZh5d79ChQ97e3uDnhoaGlJSUiooKR0dH\n6E2nWZ/ly5cvWbIE/PzmzZvi4mJ7e3tobF6BQLB27VqaADQymSwtLa24uLhr1674oH0sLC1K\nC2mYqoPdsUPV/c8RALtj10opKiqCbrq8evWKvmN6ejreUmrq1KmVlZUoitbX1wcHB2OWTK6u\nrvPmzcMyMdja2sbHx4MRHj16hNeoeDzehg0boI9Qamoq9H0I9n6g/pKWlpZpaWlUwmMpxYqK\nikaNGsXw9evs7PzlyxfQkfk+n4aGRkFBQWxsLGYwp62tvWrVKobdr127hon97Nkz/LYZVfyR\nNWvWPHr0yM7OjlA+YMAAmt2y5ORk6GjLly+ncRBmmO2Ax+Nh0V4ePnyIhXrh8/k///wz+abT\n7Gjev38fRdHMzExMTUQQZNKkSUABZc6rV68wl2Qej7ds2bKGhoZGjdAysDt2rR023Il6ot5P\nLYBV7FovGzduJHw7p06dSt9FIpGQg/dOnz4dpY3QCzAyMsrOzs7NzYXukRw8eBA6Y1BQEKFl\naGgoqEpKStLW1iYP1a1bt7q6OuhomGI3YsQIemkJuLi4gJTzz549I0TRowqqp6+v//nzZ/K+\nVK9evRRO5+bmhh1ei8VicqZdqvxdVL6i7u7uNGeX5CjEJiYmzs7OVOIxjFqHIMimTZvAFDk5\nOWRTb2iyV2h2WnDCXl9f7+7uTqhqVGw8sIVJJeR3BavYtXZYxU49Ue+nFsAqdq2XhoaGAwcO\ngDwNZmZmoaGhCi8TGqOOw+FkZmbq6Ogo/Mxv3Lhx27Zt0CorKyvojGKxeOvWrcAlol27drt2\n7cIrKDdu3ICqOBcuXICOBhQ7er9XaL5XBEEww7W4uLhevXppamrq6uqSo73gmTx5MrnQ2tp6\nwYIFIGYyh8OByj9hwgRMZirPD5FIRDM1GXx2DfIiT5kyBYS743K5bm5u0EAzCILo6upOmDAB\n7F/Sq/JGRkb79+/HNsN++eUXcpu2bduShamurl6zZg2moerp6a1cubKqqgqltrnMysqifWz/\nP2FhYeTuenp6QGv/rmAVu9aOQsWOzQfHwsKiYjQ0NJYsWZKZmSkWizMzM9esWQPdAMOTmZlJ\nLkRRNCkpSWE8EdD98+fP0Krs7GxowDY+n79u3brs7GyxWPzly5dVq1bhw+1aWFigsPhkUDkx\nqGQAUHnFYmN6e3s/efKkpqamsrISGg4XA3rK+eXLlz/++EMikVRWVspkMpAHkgA+WMy7d++g\ngzPJYIGHZk34fP7p06fr6+srKipkMtnLly+pbN2io6P//fdfIDONZsnhcAoLC5cuXYodzUPX\nPC8vDzg44xEKhTt27KioqBCLxWKxuLKycvfu3bq6ujSXQH9DFbasqqoqLi5mOAILi6pgnSdY\nWFiaCz6fD43EhvHu3btt27Y9fvyYSnuzt7fX0tIiBLYlExUVRRX+18zMjD5BAraRlpGR8fvv\nv6elpVlaWvr5+XE4HLJu9/Lly+rqaqANkKG3D9PW1oa6ZRACqWhoaJw6derkyZM0Q3Xs2JEc\nKAToQ3K5/NKlS1evXoUG4E1PT587d662tvbXr1/T09Ohg3fq1OnFixc0sxOIiIi4c+eOh4eH\nvb391atXCwsLu3XrtnTpUrAadXV1Bw8efPLkiaam5qBBg6ChZBAE2bdvn0AgKCoqsrKyoglc\np6WlFRISEhwcjGmfBCcbgEAg8Pf3t7KymjVrVu/evQm14I7n5eUdOHDg7du3pqamVM4u5Bg3\nCILk5OQcOHDg/fv3ZmZmEyZMABaZUDG0tbWhnr8sLM1Ly+wcqhD2KBZV931mAHsUqx7QpBS7\ndOkSfWyLESNGoKTM641l8+bNTOS8d+8ewaate/fu0AE7duyIT9IKAEexMpmMKsIth8P5+eef\nyeXW1tZisRgbRy6XK/S9MDAwePfuHXQTdNy4cUwScNGgo6OTlpZG9qJob
233HLLyZMnOY47d+6c2+1etmxZbW1tjx49nn/+\nebfbXV9fX1hYyHHcJ598kpGRUVBQIJFIZDKZRCKZP3++w+GYO3duUVHRoUOH+vTps2PHDo7j\nZsyYsXz58lGjRn322Wf79u0bOXLkuHHjLBZLRUWF3W6/6qqrysrKDh06VF5eLpPJ/vznP6tU\nqkAgIJFI+vfvv2jRol9//VUul996663p6eklJSUOhwNF0QMHDng8ngcffLC2tra8vPz//u//\nli1bdvLkyYKCggceeODHH3+02+11dXVut/vRRx/1eDzHjh1jGGbPnj04jg8dOnT48OFLly6t\nqanp2bOnRqPx+/0Oh0OlUo0fP95qta5bt87r9fbr1y8QCFRXV2MY9thjj7lcrh9++AHH8Xvv\nvffo0aObN29WKpUymczn86WkpJhMpvr6epIkCwoKMjMzU1JSamtr/X7/sGHD6urqjh8/npWV\nxfO8Uqk0mUwHDx6cMGGC0WgcN26cuGuDwVBeXm4wGCoqKrxe78aNG2fMmJGenl5VVbV79261\nWm0ymXr27IkgyLFjxwRB0Ov1vXv3Hjp0aHZ2dnl5+YkTJ2pqavLy8nr37u3z+fLz8xmGOXfu\n3KHLbGBsAAAeLElEQVRDh3w+3x133IEgyN69e61Wq9fr3b179+DBg5999tm6urqysjK3211e\nXp6WlkYQhLhllUqFYVhxcXEgEKioqEhJSTEajdXV1RqNxmq1lpaWTps2zWAw7N+//+jRoyNH\njuzTp4/4As7Pz6+vrxf/qcvKyn7++WeSJPv161dcXJyUlIQgCMMwx48ft1gs+fn5Bw8eVKvV\nx48fHzZs2JAhQ0pKSk6fPj169OgjR45gGFZYWGgymcR/NBzHjUYjRVFqtdpsNufl5cnlcp7n\nS0pKTpw4MXLkyOzs7AMHDshkskGDBqEoeujQIZfLNWzYMJVKVVtb+9tvvw0dOlSv1+/Zs4fn\n+eHDh1MUFXrzCQQCJ0+edLvdOI4XFRVptdp43iVomj59+rTX69VqtYIgkCTZs2dPFEXF1v37\n95tMpnHjxsnl8ogbMgyDIEhSUlJGRkZoYXjU83g8Ho/HYDAQCXQNiZZBBUHo7D40j8PhePjh\nh7/66qvGVrBarQRBpKSkdGSvOlggEOB5Pvqln0jcbjdN01qtNvQ/n3g4jvN6vclxlDX/8MMP\nn3rqKbvdjiAIRVE0TSMIYjQaFyxYcOutt7Z7R+NAURQTViMARSPfWyiK+vHHH+fNm7dnz54O\n7x0AXUX4vwaO4xzHhS+M/seZOXPmgAED/v73vweDwVbumiTJiRMn/vLLL+IbSDgMw1AU5S5U\n9JXL5Q1RF3VAUTQpKcnlcokr+P3+8K6OGzfu888/z2rk5DPR66+//vzzzwcuvjJsbm7u4sWL\nLRbLAw884Pf7xR3dcMMNy5cvD62zaNGiF154we12IwgyePDgDz74YPDgwaHWM2fO3H///Vu3\nbkUQRCaTvfjii5MmTUIQhOqM0sExxxrbFsMwM2bM+P777xtbAYJdtwTBLjHEGexWrVp1/fXX\nN9a6YcOG8ePHt3XXmsdoNJrN5kuuhmEYH/9lHwEA3crgwYN37drVWJx6//33H3zwwZhNUqmU\npumIN4fZs2e/8847CIJ8/vnnd911V3hTamrq4cOHDQYDgiA+n++yyy47depU+Arz58+fO3du\n+Edkhx3G7QrBDmvvHgAAWunll19ucWvHiCfVIQgCqQ6ABHbgwIE1a9Y01jp//vzGmsShioiF\nH330UWM3rK+vf++998Tfly5dGpHqEAR57bXXIkYl+8TS5L3pxv7oh6IB6PrOnDnT4lYAAOgw\np0+fjrm8oaGhpqamWZtiGIZlWRRFy8vLo1tD73ulpaXRrR6Px2QyXfLAXaJO0YBgB0BXZzAY\nPI1f2dDQNUpiAgCA0WiMuVwmk6lUqibex6JhGCZOg9BqtVarNaI19L6n1+ujb0sQhFqtjn9f\n4RIg7cGhWAC6uvvuu6+J1lmzZnVYTxrTpSbnAgA6hV6vnzp1aswmFEXvvffeZm1t2LBh4i/R\nb4BSqfTOO+8Uf58xY0b0aco33nijUqls1u6a1s2O5Ardjd1uv/XWW5tYwWKxOByODutPp/D7\n/T6fr7N70b5cLpfFYuF5vrM70o5YlnU6nfGsdtttt8X8/33wwQe7yEMUz7vNY489JtZuAADE\nqUePHuPGjWurrel0utbcvImpbGq1etOmTU28RTQ0NEyePDn6hhRFvfTSS/fcc0/4wvT0dI/H\nI94wGAxed911oSaFQvHJJ5+Eb3n16tUajSa0wtixY+vr6zvlI7KkpKQD9kLT9LXXXtvECjAr\ntluCWbGJIf5yJwiCHDhwYM+ePXK5PDc3t7S0NBgMjhw5sl+/fu3dyfgVFRWJByz69OnzzTff\n7Nq1a9GiRWfOnBEEYfDgwRs3bhRLkW3atMlms504cWLp0qUulwvDMLVanZ6ejuN4XV0dhmFG\no3HEiBHDhw9vaGhYu3bt4cOHbTab3+9nWRbDMI1GQ1FUQ0MDQRAYhnm9XpZlOY4T38ooiiII\nQiwMIb4Din1LT0+XSCRer1dcR6lUqtVqiqLOnDnjcDjEm4vFIDQazZYtW9xut9vtZlk2KSmp\nT58+e/fuFStBEARx77337tu3z2KxaDQamqYrKirEwhAYhg0YMIDneZPJ1NDQ4PV6Q+eDUxQl\n1g9TKBQul0us6SB2VSKReDyeUKWY7Oxsh8PB87zYc5IkBUFAUTRUe0Imk9E0LXYGwzAkbEoK\niqIymQxFUYlEwnGcWJZCJJFIxEcp/PkS/7NQFA3vZ8Qp5yRJhlexQRqf2hxdpyO0PLSLUHWP\nJoS2g2GY+CkVvY5UKpXL5QqFIhgMil9sxG6LR/pQFNXpdKGnNXRPhbAiI6mpqWlpabW1tTab\njWVZsdKbSqXS6XQcx5WVlYmPnl6v1+v1FRUVfr8/GAwKgoBh2MCBAw0Gw5kzZywWC8MwGIZR\nFIVhmFwul8vler2+oKCA47iqqiqVSqXRaAKBAMMwtbW1gUBg9OjR119//f/+97+KiooBAwbM\nnj27rKzMZDLpdLpAIEDTdM+ePVeuXLlp0yaWZfv373/33XdPnjxZIpHs2bPniy++sNvtNE03\nNDRUVVVxHFdQUPDnP/957dq1NTU1arVa3FFeXh7LslVVVVarFUXRtLS0/Px8giCkUmnfvn3H\njx+/a9euN998s6KiorCwsH///iUlJRKJ5KabbpJKpV988YXP57v66quvueaaxYsXHz16tKio\nqG/fvj///LMgCNddd11xcfG2bdtomharDK5Zs6a8vFyr1Y4fP3769OnxfG3buXPnunXr6uvr\nk5OTxaJ0Y8aMyc/PRxBk//79H330kd1unzRpUkTOQxBk9+7d+/fvT05OvvLKK9PT0yNabTbb\nxo0bTSZT//79x4wZI74wEvUjEsqdJCYIdomhW
cGumxJHl5OSkjqlplSHsdvt4WMGiUccXZbJ\nZAqForP70o48Ho9UKiVJsrM70o5sNpv4baqzO9KOEvsjEsqdAAAAAAD8UUCwAwAAAABIEBDs\nAAAAAAASBAQ7AAAAAIAEAcEOAAAAACBBQLADAAAAAEgQEOwAAAAAABIEBDsAAAAAgAQBwQ4A\nAAAAIEFAsAMAAAAASBAQ7AAAAAAAEgQEOwAAAACABAHBDgAAAAAgQUCwAwAAAABIEBDsAAAA\nAAASBAQ7AAAAAIAEAcEOAAAAACBBQLADAAAAAEgQEOwAAAAAABIEBDsAAAAAgAQBwQ4AAAAA\nIEFAsAMAAAAASBAQ7AAAAAAAEgQEOwAAAACABEF0dgeaTRAEQRCCwWDT6zS9QnfHsizP84l9\nH3meRxAkGAyiKNrZfWkvPM8n/PPIsiyCIAzDCILQ2X1pRwn/nsNxnPgz4e8mwzDim08CS/iX\nK8uyCXwfL/l22v2CHYIggiAwDNOaFbo7juMS/j6KL1wxFiQq8VtKYj+P4mek+Irt7L60r8R+\nHsWnj+f5hL+b4tfmzu5IOxKfysR+HhP7I/KS96v7BTsURTEMUyqVja0QCASaXiEBBAIBnufl\ncnlnd6Qdud1ujuMUCkUCj9hxHOf1ehP7ter3+xmGkUqlFEV1dl/aEU3Tif08siwbDAZJklQo\nFJ3dl3bk8XikUilJkp3dkXYUDAbhI7JbYxim6Y9FOMcOAAAAACBBQLADAAAAAEgQEOwAAAAA\nABIEBDsAAAAAgAQBwQ4AAAAAIEFAsAMAAAAASBAQ7AAAAAAAEgQEOwAAAACABAHBDgAAAAAg\nQUCwAwAAAABIEBDsAAAAAAASBAQ7AAAAAIAEAcEOAAAAACBBQLADAAAAAEgQEOwAAAAAABIE\nBDsAAAAAgAQBwQ4AAAAAIEFAsAMAAAAASBAQ7AAAAAAAEgQEOwAAAACABAHBDgAAAAAgQUCw\nAwAAAABIEBDsAAAAAAASBAQ7AAAAAIAEAcEOAAAAACBBQLADAAAAAEgQEOwAAAAAABIEBDsA\nAAAAgAQBwQ4AAAAAIEFAsAMAAAAASBAQ7AAAAAAAEgQEOwAAAACABAHBDgAAAAAgQUCwAwAA\nAABIEERnd6CL2rVr144dOwiCGDdu3KBBgzq7OwC0mX379m3duhVBkDFjxgwZMqQFWzh8+PC7\n775bUVHRt2/fBx54oFevXgiCbNiwYfLkySzLIggik8nS09PtdrvD4QjdavXq1VdfffU999zz\nyy+/+Hw+mqYFQRCbUBTFMIzn+dCS8OUIgkQ3NReKoiiKEgRB03R0Uys33n5bA90UjuPCBeHL\nQ69nBEHEFyTLsoIgiK9PnucJgiAIIhAIRG8TRdHQLwRBSKVSkiQxDPN6vcFgUNwmcuG/hiCI\nzMxMHMcdDkcwGGQYhqIotVrt8XjcbjeKohRFaTSaq6++esSIER9//PGBAwfEf0mCIDAMwzCM\noiiVSpWdnX3LLbesXLny6NGjNE0bDIbc3Fy73X7mzBme56VSqbhfo9FoMBhomi4sLMRxvKSk\nxO/3a7VajUYjk8nUanUwGDxy5IjP59Pr9RkZGTRNZ2VljR07tqGhYfv27VarNScnZ9y4cTRN\nHzp0iKZpFEUlEsmQIUPGjx8f8xE+cuTIggULampqBg4cOG3atL1793IcN3r06GHDhokruFyu\nZcuWVVVV5eTkkCR55swZiqJQFKVpuri42GAw7Nix4/Tp0xKJpLi4eOLEibm5uQiCHD9+fOPG\njX6/f/jw4WPHjm2TF0MnELobu91+6623NrGCxWJxOBwt3j7P83feeWf4Q/T444+3eGvtxO/3\n+3y+zu5F+3K5XBaLRfxET1Qsyzqdzo7c45w5c8Jf2/fff39zH+HHH388fAsYhr3xxhvTpk2L\n590m9MkEAADxmDx5cjAYjHgXevrppxt7M7n77rt5nt+0aZNer49/L1Kp9N13333xxRcpigot\nvO666xiGabt33zZD0/S1117bxAoQ7CK99dZb0c/6N9980+INtgcIdomhg4PdZ599Fv3a/uCD\nD+LfwsqVK+N/rwQAgNZ75plnwt+Fdu/e3fRXxAULFmRmZjZ3L+GRLmT+/Plt/TbcBi4Z7OAc\nu0hffPFFnAsB6F5ivow///zz+Lfw5Zdftl13AADg0iLeo958802hybMd3nvvverq6ubuJfok\njehddxcQ7CLZ7fbohTabreN7AkDbav1rO+YWAACg/US8R1mt1qbXdzqd7bTr7gKCXaTCwsLo\nhX379u34ngDQtlr/2o65BQAAaD8R71GXnM4oTudqj113FxDsIr344otSqTR8iUqleuaZZzqr\nPwC0lXnz5snl8vAlMpns+eefj38LTz31VMQWEATR6XQwKwIA0E7+8Y9/hP85b948pVLZ2Moy\nmWzhwoVTp05t7l6ys7O1Wm3Ewpdffrm52+kKINhFuvzyy7/77rvQyMSgQYPWrl2bn5/fub0C\noPX69u27Zs2a4uLi0J8//PBD//79499CTk7Oxo0bc3JyQkv69eu3ZcuWdevWxZPtuunXXwAS\nWCd+K1MoFBKJJPRnSkpKUlJS+ApZWVlff/31NddcE74wKSlp+/btWVlZ4p8oiqpUKvH33r17\nr1q1avDgwR988MG9995LkiSCICRJinsRa8ogCKJSqYxGY2iDKIpOnDhxw4YN69evHz58uLiw\nZ8+eK1asuPLKK9v8XneA7ldyyeFwPPzww1999VVjK1itVoIgUlJSWrkji8WC47hGo2nldtpD\nIBDgeT567CSRuN1umqa1Wm0CjwZxHOf1epOTkzt4vzabTRAEnU7X4i04HI6ampoePXqE3lIR\nBKmvr581a1aPHj3eeuut48ePV1ZWFhcX9+vXjyTJnTt3il+WeJ7/+eefOY5LSUn56quvXC7X\nbbfdtnXr1oceemjDhg3Hjx/ftm2bUqm8++67z507d/nllzscDr1eb7FYzp07t2zZMpZln3nm\nmYaGhhMnTtjt9qSkpPT09EOHDmm12pMnT4r1ulQqVWZmplgPbOXKlS6Xa9asWRMnTqyoqBgy\nZEhDQ8Pnn3+uUqnq6+uHDx8+ePDgY8eOOZ3Ozz//3Gq1zpw502639+jR4+DBg1KpFEVRlmVT\nU1NdLteaNWsEQVizZs2uXbswDDt58uTu3btTU1Nvu+22LVu2/PDDD6NGjXr55Zerq6t37dpl\nsVjy8/NXrFhRUlKSkZExduxYm83mdrtxHJ8yZcorr7ySmZn54IMPbtiwgSAIcbPBYPChhx5i\nGObYsWNDhw5dvXp1IBB48sknd+7caTQaP/nkE6/XO2XKlMLCQr/fX1lZefDgwZkzZ+7duzcQ\nCOzevZskyZEjR4r1z2pra9VqNUEQBw4cwDDs8ccfP3PmzLp16zIyMgiCKCsr4zjutttuUyqV\nS5cuLS4u3rt3bzAYVCgUgwcPrqio6NWr1969e10u
V1ZWFo7jLMtKpdLS0lKpVJqenp6enl5f\nX19aWpqZmckwTEpKyunTpwVBGDBgwLlz5zweD8/zI0eOVCgU+/btc7lcQ4YMcTqdtbW1OI6P\nGDHi3LlzJ0+eHDVqlNVqLS8vNxqNDMPU19fjOD527Nhj/9/evce0VfdxHD8cem+hgAXKCpm7\n4pxBBCXbBJxb3JZsfxhnMqcGIskGarIYYlz2h8ZkRqMzJnPGzMXbFjKjUaOJydCZzURl2XRB\nbgtMRkqAcGs7eqGjpZfnj0YeHhys2UM566/v11897e8s3+XXb3+fnnN66OxctmyZRqPp7u4u\nLCwMBAJ+v7+4uHhkZGRycjIUCg0PD+/cudPlck1NTZlMJqfTabFYBgYG8vLyJicnfT6f3W7f\nunWrwWDo6+sLh8Nbtmy5du1aTk5OV1dXZ2enVqu1Wq1ms3lycrKsrEyj0Zw7d87n80UikYaG\nhsHBwdgdFicmJtLT0+12++7duzUajdPpjEQiPp/v999/LykpeeihhzIzM7u6ulatWtXd3a3T\n6UZGRkZHRzdt2tTR0WG1Wrdv367RaCwWy+XLl81ms9lsvnTp0vr16wcGBiorK6PR6OXLl/v6\n+p5++mlZljs6OsbGxrZv32632+12e0lJSUVFxalTp37++eeysjKPx5OVlTU2NrZnzx673Z6e\nnp6dne3xeNRq9ebNmy9evLhmzRpZlr/++uvS0tL29vZgMLhjx46//vqrqKjowQcfdLlc4+Pj\nBoPB4/FMTEzs2rXr0qVLV69etVgsOp2uoKCgoKDAYrE4HI6WlpbR0VGXy7Vly5br16+Pj49X\nVVX19PTce++9NpvN6XReuHAhNzc3Go2WlpYODQ0NDg46nc7y8vLBwcGJiYkNGza43e5AIGCx\nWHJyctrb241Go8lkCoVCJpMpEAhkZmY6HI6RkZF169Z5PB6NRhMOhwsKCiKRyNDQkE6nC4fD\nVqtVkqShoSGz2RyJRPx+f+yZBT6Fent7H3jgAZVK5XK5IpHIzGdabIlUq9XDw8M2m02W5cHB\nwVj9TqfTZrOlpaU5HA5ZlqempjIzM2cfAnS73VNTU7OT351menr6ySef/P777+cbQLBLSgQ7\nMSgV7JZS7NY8mZmZN72bgDBcLted+SVwscRuzaPX641Go9K1JJDX643d+FfpQhLI6XTKspyd\nna10IQkk9hJ5y2DHqVgAAABBEOwAAAAEQbADAAAQBMEOAABAEAQ7AAAAQRDsAAAABEGwAwAA\nEATBDgAAQBAEOwAAAEEQ7AAAAARBsAMAABAEwQ4AAEAQBDsAAABBEOwAAAAEQbADAAAQBMEO\nAABAEAQ7AAAAQRDsAAAABEGwAwAAEATBDgAAQBAEOwAAAEEQ7AAAAARBsAMAABAEwQ4AAEAQ\nBDsAAABBEOwAAAAEQbADAAAQhErpAm6Hy+U6efLkfK9OTk7KsqzX65eypCUWCoWi0aharVa6\nkAQKBAKhUMhgMKSlpSldS6JEIpFgMKjT6ZQuJIGmp6dj/8f09HSla0kgv99vMBiUriKBIpHI\njRs31Gq1RqNRupYECgQCKpVK+PdqWloaS2TyCofDCw9Ii0ajS1PKYgkGgz/88MMCA86cOWMy\nmaqqqpaspKUXiUQkSZJlkQ+4Xrx40eFwbNu2TdTmlCQpGo2Gw2GVKim/X8Wpt7e3p6envLzc\narUqXUsChUIhsefx+vXrLS0ty5cvv++++5SuJYHC4bAsywJ/mZQkqbm5Wa/XP/LII0oXkkDC\nL5Emk2nbtm3zvZp8we6WNm7cuGrVqqamJqULwf/lwIEDLS0t58+fz8jIULoW3L6TJ08eO3bs\nyJEjjz76qNK14PZ1dXXV1tY+9dRTL7/8stK14P9SXV1dUFDw5ZdfKl0IEkXYPAsAAJBqCHYA\nAACCINgBAAAIQsBr7AAAAFITR+wAAAAEQbADAAAQBMEOAABAEEl/R80DBw7Y7faZTZ1O99VX\nX80Z4/P5Tpw40d7ePj09XVxc3NDQkJeXt6RVIg4ul+vTTz9ta2sLBoMrV6587rnn1q5dO2dM\nPNMNRcTTZXRiUqATxcDimLKS/scTdXV1TzzxxIYNG2Kbsizn5OTMGfPGG2/4fL76+nqtVnv6\n9Gm73f7+++8LfE/qJNXY2KjRaPbv36/X60+fPt3a2vrxxx/P+XNb8Uw3FBFPl9GJSYFOFAOL\nY8pK+vnzer1Wq9Xyj3+/cR0Oxx9//LF///4VK1YsW7asoaFhaGioo6NDkWoxH6/Xm5ub++KL\nL65cubKgoKCmpsbj8QwMDPx72MLTDUXE02V0YlKgE4XB4piykvtU7PT0dCAQuHDhQlNTk9fr\nXb16dU1Njc1mmz3m77//VqvVK1asiG2aTKbCwsKenp77779fiZJxcxkZGYcOHZrZdDqdsixb\nLJbZY+KZbigini6jE5MCnSgGFsdUltxH7Px+f1ZWVigUeuGFFw4ePBgMBg8dOjQ5OTl7jMfj\nycjImP1Hnc1ms9vtXvJiES+v13vs2LHHH388Ozt79vPxTDcUEU+X0YlJh05MXiyOqSzJjtj9\n9ttv7777buzxW2+9tW7dulOnTs28+sorr9TW1ra0tDz22GOz95r9xsUd4t9TGXs8ODh4+PDh\n0tLS2traObuYzeZ4phuKiKfL6MQkQicmtTjniJYUUpIFu7KysqNHj8YeW63WOa/q9frc3FyH\nwzH7yaysLI/HE41GZ97Bbrd7zhdQLL2bTmVbW9s777yzd+/eXbt23fJfuOl0QxHxdBmdmETo\nRMGwOKaUJDsVazAYlv9Dq9X29/d/8MEHoVAo9urU1NT4+PicwLdmzZrp6elr167FNmMXAs8c\nH4JS5kylJElXrlx5++23Gxsb51tL4pluKCKeLqMTkwWdKAAWx1SW/vrrrytdw+1LT08/fvz4\n0NDQ3Xff7Xa7P/roI5/P9/zzz6tUqrNnz165cqW4uFiv1/f3958/f764uNjv93/44YdGo/GZ\nZ57hEPQdJRgMvvbaazt27CgrK/P/Q5bl2VO5wHQrXX6qW6DL6MTkQieKgcUxlSX9fez6+vo+\n++yz2K97iouL9+3bl5+fL0nSkSNHPB7P4cOHJUny+/0nTpxobW0Nh8Pr169vaGjgaPOdpq2t\n7dVXX53zZH19/c6dO2dP5XzTDcXN12V0YnKhE4XB4piykj7YAQAAICbJrrEDAADAfAh2AAAA\ngiDYAQAACIJgBwAAIAiCHQAAgCAIdgAAAIIg2AEAAAiCYAcg1VVWVt5zzz1KVwEAi4BgBwAA\nIAiCHQAAgCAIdgDwX9XV1VVVVa2trVu3bs3MzMzLy9u7d+/Y2Fg8+5aXl2/cuPHcuXMVFRUG\ngyEnJ6eurs7tdie6ZgCYoVK6AAC4g2g0mqtXr9bX17/55pslJSW//vrrnj17tFrt559/fst9\ntVptb2/vwYMHjx49unbt2ubm5rq6uomJiW+//TbxhQO
AJBHsAGCOgYGBL7744uGHH5Ykaffu\n3Zs3bz579mw8O8qyPD4+/s0332zatEmSpGefffaXX3755JNPBgYGioqKEls0AEiSxKlYAJjD\nYDDEUl1MYWHhyMhInPsajcbKysqZzerqakmSOjs7F7dCAJgPwQ4A/kdubu7sTZVKFYlE4tw3\nPz8/LS1tZvOuu+6SJGl0dHQRywOABRDsACBRQqGQJEmyzCctgCXCxw0ALJrh4eFwODyzGTtW\nl5+fr1xFAFILwQ4AFs2NGzd++umnmc0zZ85otdqKigoFSwKQUvhVLAAsmqKiopdeeqm/v3/1\n6tU//vjjd999V1NTk52drXRdAFIFwQ4AFo3RaGxqampsbPzzzz+1Wu2+ffvee+89pYsCkELS\notGo0jUAgAgqKysdDkd3d7fShQBIXVxjBwAAIAiCHQAAgCAIdgBwa83NzWkLOn78uNI1AgDX\n2AFAHHw+n91uX2CAzWbj168AFEewAwAAEASnYgEAAARBsAMAABAEwQ4AAEAQBDsAAABBEOwA\nAAAEQbADAAAQxH8AIYmJIlycgAsAAAAASUVORK5CYII=" - }, - "metadata": { - "image/png": { - "width": 420, - "height": 420 - } - } - } - ], - "id": "dNujhir1q_0N" - }, - { - "cell_type": "code", - "source": [ - "p2 <- ggplot(data, aes(x = ln_p, y = ln_q)) +\n", - " geom_smooth(method = \"lm\", color = \"red\") +\n", - " labs(title = \"Regression Line Only\")\n", - "print(p2)" - ], - "metadata": { - "id": "t7axTjsnq8qI", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 454 - }, - "outputId": "84a6aa7b-41cd-43f6-e371-2c46d15c86eb" - }, - "execution_count": 17, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "\u001b[1m\u001b[22m`geom_smooth()` using formula = 'y ~ x'\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "plot without title" - ], - "image/png": "iVBORw0KGgoAAAANSUhEUgAAA0gAAANICAIAAAByhViMAAAACXBIWXMAABJ0AAASdAHeZh94\nAAAgAElEQVR4nOzdd3wUdd7A8dnd9J4QQgglha4oKhZEnrOc4PlQgxCpggdSBIHjUQQUiaAE\nRSliRBABAUEQqXdnwVNPURRPaWqoCSEECCSQZFN2s+35Y2BuSSMkOzuzs5/3H77CL5PdX5I1\nfJjJ7lfncDgEAAAAeD690hsAAACAaxB2AAAAGkHYAQAAaARhBwAAoBGEHQAAgEYQdgAAABpB\n2AEAAGgEYecVpk+frtPp3n33XaU3Iggq20w9vPjiizqd7u2331Z6IwAAVOaNYXfgwAFddYKC\ngtq3bz9hwoQTJ04ovUcXi4yMjI+PDw0NlfuOxK9t69at1bAZSV5e3ssvv3zffffFxMT4+fk1\natTorrvumj59+qlTp9y2BwAA3EDnhZMnDhw4cPvtt4eEhPTs2VNadDgceXl5hw4dunz5cmBg\n4M6dOx9++GEFN+mhxK9tq1at1BPHK1asmDJlSnl5uZ+f3y233NKoUaOioqKDBw+aTCYfH585\nc+bMmDHjhm7wxRdffPXVV5cuXTpx4kSZ9gwAQP34KL0BxTRp0uSjjz6qtFhWVvbss88uW7Zs\n1KhRWVlZer03ntHUkvfee2/s2LE+Pj6vvPLKpEmTpNOE5eXly5cvnzFjxsyZM61W66xZs5Td\nJwAALkG4XCMoKOjtt98OCws7ffr04cOHpXWHw7Fy5cp77703NDQ0MDCwQ4cOs2bNKi0tdf7Y\n06dPDxkypHHjxkFBQXfdddfWrVsvXbqk0+nuuece8YAXXnhBp9Pt3LnznXfeadasWURERN1v\nfMuWLQ899FBUVJSfn19cXNyjjz766aef1v2Aqr/WZrFYlixZctddd4WGhgYEBLRu3XrixIln\nz56VDpg1a5a4299//z05OTkmJiYgIOC2227buHFjA7/IlTZTxzuqy1epkry8vMmTJwuCsHHj\nxhdeeMH54m9gYOCUKVN27Nih1+tTU1OPHj1av8+6W7duOp3uH//4R6X1b775RqfTdevW7Qa/\nNgAANAhhV5ler4+JiREEwWw2S4tPPPHEU089lZ2dPXbs2KlTp0ZERLzyyivdunUzGo3iAQUF\nBd26ddu4cWO7du2mTZvWqVOnoUOHLlu2TBCEgIAA8Rg/Pz9BEP79738/99xz999/f0pKSh1v\n/L333hs4cOBvv/2WkpIyffr0Rx99dN++fT179ly3bl0dD6jEbrf37dt3ypQpJSUlo0aNmjZt\nWtu2bdPT0++8887s7Gzn3e7fv/++++4rKysbNmzYww8/fPDgwSFDhnz11Vcu/ILX8Y6u+1Wq\nasWKFeXl5T179hwwYEC1B/To0WPQoEF2u33p0qX1+6yffPJJQRBWr15daX3z5s2CIAwfPvyG\nvhQAADSUw/vs379fEIRWrVpV+96jR4/q9XpfX9/CwkJxZdOmTYIgdO7cubi4WFyx2+3i71dN\nnz5dXHnhhRcEQRg4cKB0O99//31gYKAgCPfff7+4Mm/ePEEQwsPDP//8c+mwutz4LbfcIgjC\niRMnpI/KyckJDQ3t0qVLHQ94/vnnBUFYtmyZ+McVK1YIgnDvvfeaTCbpQ1588UVBEFJSUsQ/\npqWlCYLg5+e3bt066Zhnn31WEIQRI0bU72tb7Wbqckd1+SpVdf/99wuC4HyzVYkn29q2bVv3\nzYjf66VLlzocjuLi4qCgID8/v/z8fOl4q9UaExPj7+9/+fLlWu4aAACX44zdf+Xl5W3btq1n\nz552u33MmDHh4eHi+nvvvScIQlpamnQtT6fTzZ0719fX94MPPhBXdu3aJQjCc889J91a165d\nBw0a5Hz7Op1OEIQOHTr06NFDWqzLjRcWFup0uuDgYOmjmjdvnp+fv3fv3joeUIl4y7NmzfL3\n95cWn3vuOT8/v+3bt5eXl0uLd91117Bhw6Q/Dhw4UBCEY8eOVf8VbIDa76guX6WqMjMzBUHo\n2LFjLfd72223iUc6nJ5FVPfPOjQ09LHHHquoqNiwYYO0+PXXX1+4cKF3797S1XYAANzDe8Pu\n5MmTlV7uJDY2tn///idOnB
g1atTChQulI3/88UdBELp27er84RERER07djx37tzp06ftdvuR\nI0f0er1YCRLnZ91K7r33Xuc/XvfGBUHo3bu3w+F48MEHV61adf78efEY8aKh6LoHOHM4HL/8\n8kvVOw0LC2vXrl1FRcXvv/8uLXbp0sX5mMjISEEQnMvPVWq/o7p8laoSr9KGhITUcr9hYWGC\nIFitVudP6oY+a/Fq7Jo1a6QVrsMCAJTivc+KrfRyJ7m5uXv27Gnfvv3WrVs7dOggrZeXl5eU\nlAg190Fubm5ERERFRUV4eLivr6/zu+Lj46se37hx4xu68ZYtWy5evNhms61atWrUqFGCINx0\n0029evUaN25cYmKieNh1D3BWUlJiMpn8/PykU5KV9pafny+txMbGOh8gnnR0yPASObXcUR2/\nSlXXw8PDCwsLCwsLa7lf8b1+fn5BQUF12UxVDzzwQGJi4q+//nr48OFbbrnFarVu3bo1Ojr6\n0UcfreV+AQCQg/eGXaWXOykvL+/YseORI0eys7Odw078S12n07300kvV3k5sbKz4V754pLOq\nK4IgOMdfXW5c/JB333139uzZO3fu/PTTT7/66qvXX3998eLF69atE5+Bcd0Dqu6q2kyx2+01\nbVtBdfwqVdW+ffvs7Oxff/31zjvvrOnGDxw4IB7ZkO098cQTL7/88po1a958880vv/yyoKDg\nmWeeqVT5AAC4gfeGXSWBgYHp6emPPvro2LFjf/vtN+l3uQICAsLDw4uKiiZMmOB8ss2ZzWYz\nGAxGo1F8Q1rPycmp/U7rcuOSpk2bjh07duzYsSaTac2aNc8888zYsWP79u0r/Z7cdQ8QhYSE\nBAUFlZWVFRYWVvolsIsXLwrXnlNUgxv6Kjnr0aPH559/vmbNmjFjxtR0jPg6Jn/5y18assOR\nI0fOmTNn06ZNb7zxhvjLdk888URDbhAAgPrx3t+xq+ovf/nLwIEDT58+LT5tUyK+EN23335b\n6fhLly6JbxgMhsTERJvNduTIEecDPvvss+ve6XVvXBCE7Ozsc+fOSX8MCAgYN25c165dCwsL\nxecHXPeASsQzWN9//32lezx69GhgYODNN9983W27WV2+SlX99a9/DQsL27t37/Lly6s94Ouv\nv/7oo4/8/PwaOEMiISHhgQceyM3N/eyzz7Zt29ahQ4dazhECACAfwu4aixcvDgsLe/fdd50b\nQvzFtdTUVPGElui7775r0qSJ+HxJQRAeeeQRQRCkl0MTBGHfvn3Oz5SsyXVv/ODBgwkJCcOG\nDauoqJAOMBqNmZmZBoMhJibmugfUdKfz5s1z/pB58+ZZrdahQ4dWOsOnBnX5FlQVEREhJt3T\nTz/94osvOr/indlsXrZsWa9evex2+6JFi1q0aNHAHY4cOVIQhAkTJpSUlPC0CQCAUrgUe424\nuLi5c+dOnjx51KhRhw4dEl+ILiUlZfv27Rs3brz99tsff/zx0NDQ3377befOnYGBgdLrmzz7\n7LPr169fvnx5dnb23XffnZ2dvXXr1hdffLHSyb+qrnvjnTp1GjJkyIYNGzp06PDoo482atQo\nPz//H//4x5kzZyZPntyoUaNGjRrVfkDVOx0+fPjWrVt37NjRuXPnRx991NfX96effvrXv/7V\ntm3b+fPnN/zLeO7cuV69elVdHzJkyJAhQ+pxg3X5FlRr0KBBNptt3Lhxr7766oIFC26//XZx\nVuyBAwdKS0sDAgKWLVs2bty4emypkgEDBkycOFEcQ+f8UikAALgTYVfZxIkT165d+8svv8ya\nNeuNN94QF9evX//QQw+tWrXqvffeKy8vj42NHTx48IwZM6Rfuk9ISPjmm2+mTZv2/fff//DD\nD7fffvuOHTsaN278/PPPX/e5CNe98XXr1t13330ffvjhpk2bLl++HBIScuutt86ePVs8S1SX\nAyrR6XRbtmxJT09fu3Ztenq6zWZLTEycMWPGtGnTXPLSa2VlZVWnbAlXLwHXz3W/SjUZOnRo\njx49li1b9tlnnx0/fvzXX38NDQ29+eabe/To8fTTTzdt2rTeW3IWFBSUkpLy/vvvP/DAAw0/\n/wcAQP3o5HjpCoh++umnLl269OzZ8+9//7vSe4Hs0tLSZs6cuW7dOs7YAQCUwu/YuUZeXt4/\n//nPQ4cOOS+K87WqfTE5aIzFYlm2bFl0dHQtv/MHAIDcCDvX2L17d8+ePcePH2+xWMSVoqKi\nN998UxCEan/bDBozbdq0nJycSZMmqfCpJwAA78GlWNeoqKj485//vGfPnptvvvl///d/y8rK\nduzYcebMmeTk5K1btyq9O8jlyJEja9as+f777/fs2dOpU6e9e/eKT7gBAEARhJ3LGI3GRYsW\nbd68OTs722aztWvXbujQoVOmTPHx4RkqmvXVV1917949KCiod+/eS5YsUdtrOwMAvA1hBwAA\noBH8jh0AAIBGEHYAAAAaQdgBAABoBGEHAACgEYQdAACARhB2AAAAGkHYAQAAaARhBwAAoBHe\nNRShpKRk2rRpNb3X4XDYbDa9Xq/Xa7N3xc9Op9MpvRFZWK1WnU5nMBiU3ogsHA6Hw+HQ6iPT\nbrfb7XaDwaDVB6fNZtPwI5Mfm55L2z827Xa7IAiafGRGR0fPmTOnpvd6V9hZLJb8/PxFixZV\n+16r1Wo0GgMCArQ67rO0tNTf31+rI84KCwv1en1YWJjSG5FFRUWF1WoNCgpSeiOyKC0traio\nCAsL0+RfMA6Ho7i4ODw8XOmNyKKioqK0tDQwMDAgIEDpvcjCaDQGBQVp9ZFZWFjo4+MTGhqq\n9F5kYTKZBEHQ3iPTarVOmjSplgO0+Xd8LXx9fZs1a1btuywWS1FRUVBQkFb/+hSz1dfXV+mN\nyCIgIMBgMERERCi9EVmYzWar1RocHKz0RmRhNBrNZnNkZKRW//oMDg6OjIxUeiOyMJvNRqMx\nODhYq/8eLioqCgkJ0eojU/wbQav/6igvLxcEQXuPTIvFUvsBGjxFCQAA4J0IOwAAAI0g7AAA\nADSCsAMAANAIwg4AAEAjCDsAAACNIOwAAAA0grADAADQCMIOAABAIwg7AAAAjSDsAAAANIKw\nAwAA0AjCDgAAQCMIOwAAAI0g7AAAADSCsAMAANAIwg4AAEAjCDsAAACNIOwAAAA0grADAADQ\nCMIOAABAIwg7AAAAjSDsAAAANIKwAwAA0AjCDgAAQCMIOwAAAI0g7AAAADSCsAMAANAIwg4A\nAEAjCDsAAACNIOwAAAA0grADAADQCMJOFhkZGUpvAQAAeB3CTi60HQAAcDPCTka0HQAAcCfC\nTl60HQAAcBvCTna0HQAAcA/Czh1oOwAA4AaEnZvQdgAAQG6EnfvQdgAAQFaEnVvRdgAAQD6E\nnbvRdgAAQCaEnQJoOwAAIAfCThm0HQAAcDnCTjG0HQAAcC3CTkm0HQAAcCHCTmG0HQAAcBXC\nTnm0HQAAcAnCThVoOwAA0HCEn
VrQdgAAoIEIOxWh7QAAQEMQdupC2wEAgHrzUXoDV5w/f371\n6tV//PGH2Wzu3LnzuHHjwsPDnQ84fPjwCy+8UOmjxo4d27Nnz0mTJp06dUpaDAgI2Lx5sxv2\nLJOMjIwOHToovQsAAOB5VBF2FoslNTW1efPm8+fPt1qtK1eufO211+bNm+d8TPv27VetWiX9\n8cKFC6mpqbfeeqsgCCUlJWPGjOnSpYv4Lr3e409D0nYAAKAeVNFAWVlZZ8+eHT9+fLNmzeLj\n4ydPnvzbb79lZ2c7H+Pr6xvtZOPGjcnJyS1atBAEwWg0xsbGSu+KiopS6PNwJa7JAgCAG6WK\nsLNYLIIg+Pn5iX+MjIw0GAwnTpyo6fjvvvvu3LlzAwcOFD/WbDbv3bt3ypQpo0aNSktLy83N\ndc+25UbbAQCAG6KKS7FJSUlhYWEbNmwYNWqUIAjib8gZjcZqD7bb7Rs2bBg0aJCPj48gCGVl\nZREREVar9emnnxYEYePGjTNmzFi2bFlwcLB4/I4dO37//Xfxbb1eb7fbS0pKarplQRAqKirE\nNxrCbDY38BZEBw4caN26tUtuShAEq9VaXl7uqr2pjcPhqOWb6+lsNpvD4dDqZ2e1WgVBKCsr\n0+l0Su9FFtp+ZAqCYDabxTe0x2azafWR6XA4BEGw2WxafXCKP1i098i0WCzi964myoTdnj17\n3njjDfHttLS0Dh06TJ8+fenSpZ999pm/v3+fPn1iYmIMBkO1H/v999+bTKYHH3xQ/GN4ePja\ntWul906bNm3EiBE//PBD9+7dxZWff/75s88+kw6Ojo42mUy17M1qtYqPhoYQz0G6REZGRmJi\noqtuTXsPcWd2u732b66na/gjU820+k8OkeYfmRp+cGr7kcmPTY+j0rC74447lixZIr4dGxsr\nCELHjh2XL19eWlrq7+8vCMKWLVsaN25c7cd+/fXXXbt2rSn7AgMDGzdunJ+fL61MnTp1/Pjx\n4ttGo3HOnDmRkZHVfqzVajUajYGBgQEBAfX9zK7Iy8tr4C1UurX27ds3/HbEL694plN7CgsL\n9Xp9WFiY0huRRUVFhdVqDQoKUnojsigtLa2oqAgLC6vp/2uP5nA4iouLKz3NXzMqKipKS0td\n8mNTnYxGY1BQkFYfmYWFhT4+PqGhoUrvRRZisGrvkWmxWGp/kqgyf8cHBQXFx8dLf7TZbD/8\n8EPHjh3F5Pr5558dDsdNN91U9QNLS0v379/ft29faSU7O3vXrl3jxo0Te8VkMl28eFGMRZHz\ncykuX76s0+lq+l9UvAJbywF15/Jn5h47dqzhz5PV6XR6vV6TP6FELvneqZNer9fw9068zmUw\nGDT5CTocDm0/MsX/avUTFL93mvzsxLM+mn9wau+zu+5vi6ni5I3BYPjkk0/27Nnz1FNP5eXl\npaen9+jRQzz1snv3bpPJ1Lt3b/HIEydO2Gy2pk2bSh8bFRW1d+9eq9U6aNAgm822du3akJCQ\nrl27KvOZyInXQAEAALVTxbNiBUGYNm1aSUnJ+PHj58+f361bt6eeekpcP3DgwL59+6TDxFNu\nzifhQkND586dW1BQMGXKlOnTp9tstrS0NPF6rvZkZGTwVFkAAFATVZyxEwQhLi7u1Vdfrbr+\n3HPPOf/xgQceeOCBByodk5SUNHfuXPn2pjacugMAANVSyxk73BDO2wEAgKoIO09F2wEAgEoI\nOw9G2wEAAGeEnWej7QAAgISw83i0HQAAEBF2WkDbAQAAgbDTDNoOAAAQdtpB2wEA4OUIO02h\n7QAA8GaEndbQdgAAeC3CToNoOwAAvBNhp020HQAAXoiw0yzaDgAAb0PYaRltBwCAVyHsNI62\nAwDAexB22kfbAQDgJQg7r0DbAQDgDQg7b0HbAQCgeYSdFzlx4oTSWwAAADIi7LzL0aNHld4C\nAACQC2HndbgmCwCAVhF23oi2AwBAkwg7L0XbAQCgPYSd96LtAADQGMLOq9F2AABoCWHn7Wg7\nAAA0g7ADbQcAgEYQdhAE2g4AAE0g7HAFbQcAgKcj7PBftB0AAB6NsMM1aDsAADwXYYfKaDsA\nADwUYYdq0HYAAHgiwg7Vo+0AAPA4hB1qRNsBAOBZCDvUhrYDAMCDEHa4DtoOAABPQdjh+mg7\nAAA8AmGHOqHtAABQP8IOdUXbAQCgcoQdbgBtBwCAmhF2uDG0HQAAqkXY4YbRdgAAqBNhh/qg\n7QAAUCHCDvVE2wEAoDaEHeqPtgMAQFUIOzQIbQcAgHoQdmgo2g4AAJUg7OACtB0AAGpA2ME1\naDsAABRH2MFlaDsAAJRF2MGVaDsAABRE2MHFaDsAAJRC2MH1aDsAABRB2EEWtB0AAO5H2EEu\ntB0AAG5G2EFGtB0AAO5E2EFetB0AAG5D2EF2tB0AAO5B2MEdaDsAANyAsIOb0HYAAMiNsIP7\n0HYAAMiKsINb0XYAAMiHsIO70XYAAMiEsIMCaDsAAORA2EEZtB0AAC5H2EExtB0AAK5F2EFJ\ntB0AAC5E2EFhtB0AAK5C2EF5tB0AAC5B2EEVaDsAABqOsINa0HYAADQQYQcVoe0AAGgIwg7q\nQtsBAFBvhB1Uh7YDAKB+CDuoEW0HAEA9EHZQKdoOAIAb5aP0BtzK4XDY7fby8vJq32uz2QRB\nsFqtNR1QdxaLpYG3IAe73W61Wu12u9IbqatDhw61adOm7sfX8s31dOI3Tqufnfi/nslk0us1\n+E9Nh8PhcDi0+r2zWq2CWn/iuYTdbtfwI1PQ9I9NrT4sLRaL+L2riXeFnUin09W+XtMBcL/j\nx4+3bdu2jgfrdDqtfu/Ez0urn51Iw98+QbvfO+mRqdVPUND6Zyd4wYNT6Y242HU/I+8KO51O\np9frAwICqn2vxWIpLy/38fGp6YC68/X1beAtyMFms/n4+BgMBqU3cmOysrI6dOhw3cNKS0t1\nOl3Dv3fqZDabNfzZWSwWq9Xq7+/vcQ/OunA4HCaTSavfO7PZbDKZXPJjU53MZrOGH5mlpaW1\n/J3o6cTTWtr77AwGQ+1tp8HTy9CejIwMfuUOAIDrIuzgMWg7AABqR9jBk9B2AADUgrCDh6Ht\nAACoCWEHz0PbAQBQLcIOHom2AwCgKsJOHg6H3mRSehMaR9sBAFAJYSeLyA0bkvr0Cfz1V6U3\nonG0HQAAzgg7GWRkxLz5pu+ZM/EjRsQsXKizWpXekJbRdgAASAg7GTzzjHgdVmezNVq5Mn7E\nCN+cHKX3pGW0HQAAIsJOBuvXl/zP/0h/Cty/P6l//4jNmxXckebRdgAACISdLGJjc95991xq\nqv3qiDp9aWnT1NTmkycbCguV3ZqGZWVlKb0FAAAURtjJQ6crTEk5tXGjuU0baS109+6k5OTg\nvXsV3Je2nTx5UuktAACgJMJORuZ27bI2by4YPVrQX/k6++TltRw9umlqKi+GIhOuyQIAvBlh\nJy+Hv/+FqVNPL19ujYm5uuSI2Lw5
YfBg/+PHFd2aZtF2AACvRdi5Q+l992Vu327s3l1a8T96\nNDElJWrdOsHhUHBjWkXbAQC8E2HnJraIiDNLlpxNS7MHB4srOrO5SVpay9GjffLylN2bJtF2\nAAAvRNi5VVHfvpnbt5fdcYe0Erx3b1KfPmH/+IeCu9Iq2g4A4G0IO3ezNGt2+oMPLkyd6vDx\nEVcMRmOz556Lmz5dX1am7N60h7YDAHgVwk4BDoOhYPTo7A8/rIiPlxbDd+5M6ts3iPGyrkbb\nAQC8B2GnmPJbbsnatu3SsGHSim9ubkvGy8qAtgMAeAnCTkn2gIC8mTPPLFlii4gQV66Mlx06\n1C87W9m9aQxtBwDwBoSd8ozdu2fu2lXypz9JK4GHDycOGMB4Wdei7QAAmkfYqYK1UaOcZcsY\nLys32g4AoG2EnWqI42U//tjUvr20Frp7d1Lv3iHffqvgvjSGtgMAaBhhpy7mVq1OffTRNeNl\nCwpajB/PeFkXou0AAFpF2KmOw8/vwtSpp1eutDZpcnXJEbF5c8LAgQFHjii6Ne2g7QAAmkTY\nqVRply6ZO3cW9+wprfifPJkweHCjlSsFu13BjWkGbQcA0B7CTr1soaG5CxZUGi8bs3Ah42Vd\nhbYDAGgMYad21YyX/fFHxsu6Cm0HANASws4D1DZetrRU2b1pAG0HANAMws4zXBkvu25dRYsW\n0mL4zp2J/fsHHjig4Ma0gbYDAGgDYedJyjt1ytq2rbB/f2nFLycnfvjwxunpOptNwY1pAG0H\nANAAws7D2IOCzr3ySqXxstHp6fHDhvnl5Ci7N09H2wEAPB1h55GM3btnbt9e2rWrtBJ48GBi\ncnLE1q0K7koDaDsAgEcj7DyVNSbm9HvvXTNetqys6YsvNp80ifGyDUHbAQA8F2HnyaodL/vl\nl4yXbSDaDgDgoQg7j2du1erUxo2XnnhC0OnEFXG8bJPXXtOZzcruzXPRdgAAT0TYaYHD3z9v\n+vTT771njYm5uuSI+uCDxMcf9z92TNGteTDaDgDgcQg77Sjt2jVz165rxsseO5b4+OOMl603\n2g4A4FkIO00Rx8uee+WVyuNlx4zxuXBB2b15KNoOAOBBCDsNKuzfv/J42R9+SOrdO/KzzxTc\nleei7QAAnoKw06Zqx8smzJzZfOZMxsvWA20HAPAIhJ1mVTteNmLXLsbL1g9tBwBQP8JO4xgv\n60K0HQBA5Qg77bsyXvatt6yVxssOGeKXna3s3jwObQcAUDPCzlsYH374yKZNJc7jZQ8fTnzs\nMcbL3ijaDgCgWoSdF7E0bnzq3XerHy97+bKye/MstB0AQJ0IOy8jjZft0EFaC/3yy6Q+fUL+\n/W8F9+VxaDsAgAoRdt5IHC9bMHq0oL/yAPApKGjx9NNNU1P1JpOye/MgtB0AQG0IOy/l8PO7\nMHXq6ZUrLU2aXF1yRGzenDBwYMCRI4puzZPQdgAAVSHsvFpply6ZO3cW9+olrfifPJkwaBDj\nZeuOtgMAqAdh5+3soaG5r7+eu2iRLTRUXNFVVMQsXNhy9Gif8+eV3ZunoO0AACpB2EEQBKH4\nkUeytm0r69xZWgn+8cdWffuG/f3vCu7Kg9B2AAA1IOxwhSUuLnvNGufxsnqjsdm0ac3+9jeD\n0ajs3jwCbQcAUBxhByfieNkNGyoSEqS1sM8/T+zfP+iXX5Tblseg7QAAyiLsUFl5x46ZW7de\nGjZMWvHNzY0fOTJm4UKd1argxjwCbQcAUBBhh2o4AgLyZs4889ZbtsjIK0s2W6OVK+OHDPE7\ndUrJnXkC2g4AoBTCDjUyPvxw5s6dJfffL60E/vZbUv/+UevWKbgrj0DbAQAUQdihNtZGjXLe\necd5vKzOZGqSlsZ42eui7QAA7kfY4XoYL1tftB0AwM0IO9SJOF720pNPVhov2yQtTWc2K7s3\nNaPtAADuRNihrhx+fnnPPXf6/fedx8tGrVuX+NhjAeRLzWg7AIDbEHa4MaX33H/uhckAACAA\nSURBVJO1fXvxI49IK/6ZmQmDB0etXs142ZrQdgAA9yDscMNs4eG5ixadnTfPHhwsrugqKpos\nWMB42VpkZGSQdwAAuRF2qKeifv0yd+yoOl42nPGyNaPtAACyIuxQf9WOl41jvGytaDsAgHwI\nOzSMOF52/fqKli2ltbDPP0987LHA/fsV3Jea0XYAAJkQdnCB8ltvzdy+/ZrxsmfOJDzxRMzC\nhTqLRcGNqRZtBwCQA2EH16hxvOzw4X6nTyu6NZWi7QAALkfYwZWMDz+cuX17abdu0krgoUOJ\n/ftHbNmi4K5Ui7YDALgWYQcXszZufHr5cufxsvqysqYvvcR42WrRdgAAFyLsIANxvOyWLYyX\nrQvaDgDgKoQd5GJOSjq1aVP+hAmVxss2TU3Vm0zK7k1taDsAgEsQdpCRw8fn4oQJlcbLRmze\nnDBwIONlK6HtAAANR9hBdqX33JO5c2dRr17Siv/JkwmPP944PZ3xss5oOwBAAxF2cAd7aOjZ\n11/PXbTIFhoqruis1uj09JajRjFe1hltBwBoCB/332Vubu6iRYtOnDixfft2abGkpGTFihWH\nDh2yWCzt2rUbN25cTExMpQ+s6Zi6fCzUoPiRR8pvuSVu+vSg//xHXAn+6adWffuenzXL+Xye\nl8vIyOjg9KQTAADqzt1n7L777ruZM2c2b9680vrixYsvXLgwe/bsBQsWBAUFzZkzx17lIl1N\nx9TlY6ESlri47NWrL0yd6vD1FVcYL1sV5+0AAPXj7rCzWCxvvPFGly5dnBfz8/N//vnnMWPG\nJCYmxsXFjRs3Ljc39/Dhw3U5pi4fC3URx8t++GFFQoK0Fvb554nJydKZPNB2AIB6cPel2Ice\nekgQhJMnTzovHj9+3NfXNzExUfxjSEhI8+bNjx492qlTp+seU1ZWVvvHnj17tqioSHy7tLTU\n4XBYrdZq92az2QRBsNvtNR1Qd6o9ZWi323U6ndK7EARBKL3pppOffBKzcGGjDz8UV3zPnm05\ncmTBk09emDhROp93o1T7la+H33//vV27duLbNpvNJY9MdXI4HIIg2Gw28Q2NcTgctfzY8XQu\n/LGpTg6HQ8OPTPG/Wv3eiX8daO+zu+5npMDv2FVVXFwcGhrqHBzh4eFSjdV+THh4eO0f+847\n73z22WfSu6KjowsLC2vZjMlkMjX4VdbKysoaeAsyUdtDvGTy5IK7706cM8e3oEAQBJ3dHv3+\n+4F792bOmWNq2fJGb83hcKj2K18/+/fvl/7RIgiC2WxWcDNyKy4uVnoLMqr9x46nKy8vLy8v\nV3oXctH2I9NqtWr+wan0FlzMYrHUfgpD3rDbs2fPG2+8Ib6dlpZWy6+E1+U0Uk3H1P6xd911\nV1BQkPi2Xq8/ePBgwNVRV5XY7faKigofHx8fn4Z+WXzre8JJVjabTa/Xq+SMnaTsT386unlz\ny5dfDvv2W3El+I8/bh469NykSReHDKn
77VgsFp1O1/DvndqcOXOmdevW4jkD7X12IovFYrPZ\n/P391fbgdBWz2ezv76/0LmRhs9ksFotLfmyqU0VFha+vryYfmQ6Hw2w26/V6Pz8/pfciC/FE\nhvYemQaDofYHpLyf8B133LFkyRLx7djY2JoOi4iIKC4udjgc0l6LiooiIyPrcsx1P7Zv3759\n+/YV3758+fLEiRNDQkKq3YbFYqmoqPDz85NCsN7U+UPcZDL5+voaDAalN1JF06a5y5aVfvxx\nk/nzxaEUerO52YIF4Xv3nps3zxodXZfbsFqtOp1OnV/5BsrJyUlKSrJarcHBwUrvRRZGo9Fm\nswUFBanxwdlgDofDYrHU9GPH05nNZovF4u/vHxgYqPReZFFUVKThR6bZbDYYDFp9cIrn6rT3\nyBTPYtRygLxPnggKCoq/qpa/cdu0aWOxWKRfvCsuLs7Jyal0eq+mY+rysfAA1Y2XDdmzJ7Ff\nv5BvvlFuW2px7NgxpbcAAPAA7n5W7OXLl/Pz841GoyAI+fn5+fn5JpMpKirq3nvvTU9Pz8rK\nEl/lrlWrVjfddJMgCLt37961a5cgCDUdU8vHwuNUM1720qUWEyYwXlYQhOPHjyu9BQCA2unc\n/GSf0aNHX7hwodJKnz59ysrKVqxYsX//fpvNdvPNN48bN068nLpgwYLi4uK5c+cKglDTMTWt\nVyVeiv3w6tMwK7FYLOJZ94ZfilXna1Wo91JsFcE//dR0+nTfvDxpxdy69dkFC0xXnyVaVWlp\nqU6na/j3Tp2sVqv4W2iaPBttNBrNZnNkZKRHPDhvlMPhKCwsrOmHkqczm81GozE4OFh7F7xE\nRUVFISEhWn1kFhQU+Pr6hoeHK70XWWj4UuyAAQN27NhR0wHuDjtlEXaeEnaCIBiKi2NTU8Ou\nPqNZEASHn9/FyZMLRoyQzuc585KwEwRBe21H2Hkuws5zEXYe6rphx6xYqJQtLCx34cLcRYts\nYWHiiq6iImbBAsbLqvOfDQAANSDsoGrFjzyStXVr2Z13SivieNnwv/9dwV0pjrYDAFSLsIPa\nWeListesyZsxo5rxspp+4dDa0XYAgKoIO3gCvf7S8OHVjJft39+bx8vSdgCASgg7eIzyjh2z\nPvmkMCVFWvE9ezb+yScbL1miU9moNLeh7QAAzgg7eBJ7YOC51NQzb79ti4q6smSzRS9fHj9k\nSMDp04puTTG0HQBAQtjB8xgfeihz586SBx6QVgJ/++2mIUNiNm4UvOnleyS0HQBARNjBI1mj\nonLS08+/8IIjIEBc0ZvNLd58s/nEiYZLl5TdmyJoOwCAQNjBg+l0l4cOzdqyxeQ0QS7066+T\n+vTxzvGytB0AgLCDZzMnJZ366KP8CRMcVcfLlpcruzf3o+0AwMsRdvB4Dh+fixMmHEtPr2jS\n5OqSI2Lz5oSUlADvCx3aDgC8GWEHjTDeeecfGzcWP/qotOJ/8mTCoEGN3n9fsNsV3Jj70XYA\n4LUIO2iHLSws9803z772mj00VFzRWSwxb74Z/9e/+nrZeFnaDgC8E2EHrSnq3Ttz69ayO+6Q\nVoL27Uvs1y/s008V3JX70XYA4IUIO2iQpVmz7LVrncfLGoqLm/3f/3nbeFnaDgC8DWEHjdLr\nLw0ffsrrx8vSdgDgVQg7aJmJ8bK0HQB4E8IOGlfLeFm/U6eU3Jkb0XYA4CUIO3iFasfLJvXv\nH7VunZeMl6XtAMAbEHbwFlXHy+pMpiZpac2fecZLxsvSdgCgeYQdvEm142W/+sp7xsvSdgCg\nbYQdvI40XlYwGMQVrxovS9sBgIYRdvBG4njZU2vXWlq0uLrkiNi8OWHgwIA//lB0a+5A2wGA\nVhF28F7lt9+euWVLUe/e0op/ZmbCoEGN09MFm03BjbkBbQcAmkTYwavZQ0PPvvZa7qJFtrAw\ncUVntUanpyc88YRvTo6ye5NbRkYGeQcAGkPYAULxI49kbd1adued0krg/v1JAwaE79ql4K7c\ng7YDAC0h7ABBEARLXFz2mjXO42X1RmPc8897w3hZ2g4ANIOwA66SxssmJkprXjJelrYDAG0g\n7IBrmDp2zNqy5XLV8bKLF2t7vCxtBwAaQNgBldkDA8+npp5escLauPGVJZstesWKhCFD/LKy\nFN2avGg7APB0hB1QvdJu3bK2bSt58EFpJeC335Iee0zb42VpOwDwaIQdUCNrVFTO22+ff/HF\nyuNlJ07U8HhZ2g4APBdhB9RKp7s8ZEhmpfGyX3+d1KdPyNdfK7gvWdF2AOChCDvg+iqSkk5t\n2lR1vGzc9OlaHS9L2wGAJyLsgDpxGAwXJ0zIfv99S9Om0mL4zp0JAwcGaLSBaDsA8DiEHXAD\nyu6+O3PHjqI+faQV/8zMhJQUrY6Xpe0AwLMQdsCNsYeEnJ0//+zrr9tDQ8UVnc0WnZ4eP2qU\n77lzyu5NDrQdAHgQwg6oj6JevTK3bXMeLxu0b19iv35h//iHgruSCW0HAJ6CsAPqyRIXl/3B\nB87jZQ1GY7PnntPkeFnaDgA8AmEHNIBOd2n48KyPPza3bSuthX3+eWJyctDPPyu4LznQdgCg\nfoSdLDp06KD0FuA+5rZtT3300eXHHxd0OnHF99y5+L/+NUZz42VpOwBQOcJOLh06dCDvvIc9\nIOD87Nmnly93Hi/baMWKxAED/I8dU3RrLkbbAYCaEXby6nCV0huBO5R265a1dWvJ/fdLK/7H\njiUMGhS5aZOWxsvSdgCgWoSdm5B3XsLaqFHOO++cnzVLGi+rN5liX365xYQJPhoaL0vbAYA6\nEXZuRd55BZ3u8uDBlcbLhnzzTaK2xsvSdgCgQoSdArg+6w28YbwsbQcAakPYKYm807baxsv+\n8YeCG3Mh2g4AVIWwUx55p21ld9+duX17Ue/e0op/ZmbC449rZrwsbQcA6kHYqQXXZzXMHhp6\n9rXXzr72mlbHy9J2AKAShJ3qkHdaVdS7d+bWrWWdO0srV8bL/vOfCu7KVWg7AFADwk6lOIGn\nSZZmzbLXrq08XvbZZ7UxXpa2AwDFEXZqR95pjU53afjwUx9+WJGYKK1pZrwsbQcAyiLsPAN5\npzGmjh2zPvlEk+NlaTsAUBBh50m4PqslNY2XTRgyxC8rS9GtNRRtBwBKIew8EnmnGVXHywb8\n9lviY495+nhZ2g4AFEHYeTDyThuk8bJ2bY2Xpe0AwP0IO4/H9Vkt0OkuDx6ctWWL6eabpTUN\njJel7QDAzQg77SDvPF1FUtKpjz7S2HhZ2g4A3Imw0xpO4Hk0cbzsqXXrLC1aSIvhO3e2HjQo\nyGMLibYDALch7DSLvPNc5bfdlrVlS3GvXtKKf1ZWmxEjGr33nmC3K7ixesvIyCDvAMANCDuN\nI+88lC00NPf113Nff90mjZe1WGIWLYofOdL37Fll91ZvtB0AyI2w8wpi3rVu3VrpjeDGFPfq\nlbV9e9ldd0krQf/5T1Jyctjf/67grhqCtg
MAWRF23qVdu3acwPMslqZNs9esOff889J4Wb3R\n2GzatGZ/+5uhqEjZvdUPbQcA8iHsvBHXZz2MTpc/dOjRDz80t20rrYV9/nli//5B+/YpuK96\no+0AQCaEnffi+bOexdSmTdamTZeGDbtmvOyTTzaZN09nsSi7t3qg7QBADoQdOIHnMRz+/nkz\nZ55eseK/42Udjqj16xMHDvQ/dkzRrdUHbQcALkfY4QryzlOU3ndf5rZtxgcflFb8jx1LfPzx\nqHXrPG68LG0HAK5F2OEaXJ/1CLaoqDPp6WfT0uyBgeKKzmxukpbWcswYnwsXlN3bjaLtAMCF\nCDtUj7xTv6K+fbM+/th5vGzw998n9e8f6mnjZU+cOKH0FgBAIwg71IYTeCpXdbys4dKl5h44\nXjYrK0vpLQCAFhB2qBPyTrWk8bIV146XTRw4MOD33xXc2I06cuSI0lsAAI9H2OEGkHeqVX7b\nbVmffFKYkiKt+GVmJgwa1Dg9XbDZFNzYDeH37QCggQg73DCuz6qTPSTkXGpq7qJFtvBwcUVn\ns0WnpycMH+6Xk6Ps3uqOtgOAhiDsUH/knQoVP/JI1rZtZXffLa0EHjiQ+NhjEZs3K7irG0Lb\nAUC9EXZoKPJObSyxsdmrV+fNmPHf8bIlJU1TUz1ovCxtBwD1Q9jBNbg+qy463aXhw7O2bDG3\nayetedZ4WdoOAOqBsIOLkXfqYW7TJuujjzx3vCxtBwA3ykfpDbiVw+Gw2WyXL1+u6b2CIJhM\nJrPZ7N59uYndbrdYLLqrf8fLKjY2VhCEzMxMN9yXSPz2lZWVue0e3Un87Gz1en5r1pQpl+66\nK/7ll30vXRJvK2r9+oD//Cdr7lxTYqJr91k/4mdXXl5e9cH5yy+/JCUlKbEpV6rlx46nk/6/\nM5lMSu9FFna7vaioyD0/NhVhtVq1/eDU3iPTYrHY7fZaDvCusNPpdAaDITIystr3WiyWoqKi\ngICAoKAgN2/MPYxGY0BAgO/V37tyg86dO4tvuOHUS2lpqU6n0+r3zmq12mw2f3//+n24pXv3\nU507N501K+TqUIqgI0duGj4879lnLw8eLCj9l5bJZLJarYGBgXp9NdcQzp8/79HngB0OR2Fh\nYU0/djyd2Ww2Go1BQUGBV6fbaUxRUVFISIjh6guAa4nD4SgoKPDx8Qm/+jx6jSkvLxcEQXuP\nTIvFUu2PSgmXYuEOXJ9VnDUqKkccL3u1fXUmU+wrr3jEeFmuyQJAHRF2cB/yTnHVjpdN7N8/\nRPXjZWk7AKgLwg7uxvNnlVWRmFhpvKzPpUstxPGy6v4NRdoOAK6LsINiyDul1DReNik5OXD/\nfgU3dl20HQDUjrCDwjiBp5Ty227L2rKlqFcvacU3Jyd+xIhGK1aoebwsbQcAtSDsoBbknfvZ\nQ0PPvv567oIFttBQcUVntcYsXhw/cqTv2bPK7q0WtB0A1ISwg7qQd+5X3LNn1vbtzuNlg375\nJalfPzWPl83IyCDvAKAqwg5qxPVZN7M0bZq9atWFqVMrj5d99lmD0ajs3mpB2wFAJYQdVI28\ncx+9vmD06MrjZf/5z8S+fdU8Xpa2AwBnhB08AHnnNtWMlz1//sp42YoKZfdWE9oOACSEHTwG\n12fdw+Hvnzdz5ukVK6wxMVeXHFHr1ycOHOh/9KiiW6sRbQcAIsIOnoe8c4PS++7L3LrV+NBD\n0or/8eOJgwZFrVsnOBwKbqwmtB0ACIQdPBd5JzdbVNSZt9++Zrys2dwkLU2142VpOwAg7ODZ\nuD4rt6K+fTO3by+//XZpJfj775P69w/96isFd1UT2g6AlyPsoBGJiYmtWrVSehfaZGnePHvt\nWufxsoZLl5pPnKjO8bK0HQBvRthBUzh7J5Mr42XXr688XrZfPxWOl6XtAHgtwg4axPVZmZR3\n6pT1ySeFKSnSiu+ZM/FPPNE4PV1t42VpOwDeibCDlpF3LmcPCTmXmpq7eLEtPFxc0dls0enp\nCcOG+eXkKLu3Smg7AF6IsIP2kXcuV9yjR9a2bWX33COtBB48mPjYY2obL0vbAfA2hB28Bddn\nXcsSG5u9alXejBkOPz9x5cp42SlTDEVFyu7NGW0HwKsQdvA65J3L6HSXhg/P+vjja8bLfvFF\nYnJy0E8/KbivSmg7AN6DsIOX4gSeq4jjZQtGjxb0V36e+J4/H//Xv6pqvCxtB8BLEHbwduRd\nwzn8/S9MnZqzbJk1OvrqkiNq/fqEwYP9T55UdGv/RdsB8AaEHSAI5J0rlPzP/2Ru3+48XjYg\nIyNxwIBGK1cKdruCG5PQdgA0j7AD/ovrsw0kjpc9l5pqDwwUV3Rmc8zChS3Gj/fJz1d2byLa\nDoC2EXZANci7hihMScn65JPyjh2llZDvvkvq1y/0668V3JWEtgOgYYQdUCPyrt4qEhKyN26s\nPF52wgSVjJfNyMgg7wBoEmEHXAfXZ+vnv+NlW7aUFq+Ml/31VwU3JqHtAGgPYQfUFXlXD+Wd\nOmVt2VLUp4+04nvmTPzIkdHLl6thvCxtB0BjCDvgxpB3N8oeEnJ2/vxrxstarY2XLEkYOtTv\n9Gll9ybQdgC0hbAD6oPrszequEePzG3bSp3Hyx46lDhgQPiOHQruSkTbAdAMwg5oEPKu7qyx\nsafff//Cc885fH3FFX1JSdyMGQnPP+9jNCq7N9oOgDYQdoALkHd1pdcXPPlk1pYtzuNlI3fv\n7piSErJnj4L7Emg7AJpA2AEuw/XZOqpmvGxBQfz48YqPl6XtAHg6wg5wPfLuumocLztokLLj\nZWk7AB6NsAPkQt5dlzhetuiBB6SVgCNHFB8vS9sB8FyEHSAvrs/WzhYVlblw4akZM1Q1Xpa2\nA+ChCDvATci7WlxMTj758ceqGi9L2wHwRIQd4FbkXU1UOF6WtgPgcQg7QAFcn62WCsfL0nYA\nPAthByiJvKtKHC9bmJIirfieORM/YkTj9HRFxsvSdgA8CGEHKI+8q8QeEnIuNfWM83hZmy06\nPT1h2DBFxsvSdgA8BWEHqAXXZysx9uiRuWtXSbdu0krgwYOJAwZEbN7s/s1kZGSQdwDUj7AD\nVIe8k1ijo3OWL8+bMcPh5yeu6EtKmqamNpsyxVBU5P790HYAVI6wA1SKvLtCp7s0fHjWli0m\np/GyYV98kdS7tyLjZWk7AGpG2AGqJuZd27Ztld6IwsytW5+6drysT35+i7FjFRkvS9sBUK06\nhZ2vr29wcHBIHci9XcBrtWnTxstP4InjZU+vWGGNibm65Ihavz5xwAD/o0fdvBnaDoA6+dTl\noDFjxnzzzTfHjh2788474+Li7Hb7qVOnDh482KlTp/bt2zscDrl3CUAktp03V0Vp166Z27Y1\nfeml0H/9S1zxP3EicdCgixMmFPz1r9L5PDfIyMjw8tQGoEJ1CrsHH3zwu+++y87OjouLkxaP\nHDnSr1+/oUOH9urVS7btAaiGl+edLTLyzNKl4
Tt2xM6dKw6lEMfLBv/449l58/57Pk9+tB0A\ntanTv25TU1Nfeukl56oTBKF9+/ZTpkx58cUX5dkYgOvw8pdHKerbN3P79vI77pBWgn/4ISk5\nWTqT5x5em9cA1KlOYXf8+PGIiIiq640aNTpy5IirtwTgxnht3lmaN8/+4IMLU6c6fK5cfDBc\nvtz8mWfcPF6WtgOgHnUKu+jo6NWrV1f6XTqbzbZu3bqoqCh5Ngbgxnhn3jkMhoLRo7OVHi9L\n2wFQiTr9jt3o0aPnzJmzb9++7t27x8TECIKQn5//9ddf//HHHzNmzJB5hwBugNR2XpUa5bfe\nmvXJJ00WLJCGUojjZS89+eTFSZOk83my4vftAKhBnX7ezZ49OyAgYOnSpcuWLZMWGzduPHv2\n7FmzZsm2NwD1521PsLAHB59LTS3t0iU2NdVQXCwIgs5ma7RyZdCvv+bOn29p3twNe6DtACiu\nTpdi9Xr9jBkzcnNzs7Ozf/rppx9//PHkyZPnz59PTU01GAzSYe++++7ly5dl2yqAG+Zt12eL\n//KXzO3bS++5R1oJ/PXXpP79w3fscM8GGCkLQFk38JpPOp2uZcuWd9999z333JOUlKSv8npR\n48ePz83Nden2ALiAVz1/1hobe3rVqkrjZeNmzGg+ZYqhsNA9e6DtACiFkWKAF/GWvNPpLg0f\nfmrTJnPr1tJa6BdfJCYnB//4o3u2QNsBUARhB3gdL8k7U7t2WR9/7Dxe1jcvr+WoUW4bL0vb\nAXA/wg7wUt5wffbKeNn33rM2aXJ16cp42QC3jJel7QC4GWEHeDvN513pvfdmbt1qfPhhacX/\nxImEQYOi1q8X5B91TdsBcCfCDoAgaD3vbJGRZ95669ycOfagIHFFZzY3mTev5dixPhcvyn3v\nmZmZct8FAIgIOwD/pe28KxwwIHPHjjLn8bJ79iT16hX2z3/KfdectwPgHoQdgMo0/Ot3lmbN\nTn/wQf7TTzuuvganwWhs9uyzTV96Se7xsrQdADcg7ADUSJN55zAYLk6cmP3hh87jZSO2bEnq\n21fu8bK8fDEAuRF2AK5Dk3knjpctTEmRVnxzc+NHjIhZuFBntcp617QdAPm4Muw+/fTThIQE\nF94gAPXQ3vVZcbxs7uLFtvBwcUUcLxs/YoRvTo6sd03bAZBJncLuwoULI0eObNasmcFg0FUh\nHfaXv/wlJCREtq0CUAWN5V1xjx6Z27eX3nuvtBK4f39S//7h27bJer+0HQA5+NTloIkTJ27b\ntu3+++/v3r27j0+dPgSAtoltp406sTZpcnrlyoiPP24yf77eZBIEQV9aGvfCC6HffHPu5Zdt\nEREy3W9GRoaWEhmAGtSp0r766qstW7b07dtX7t0A8CzayTudrjAlpbxTp2bTpvkfPy6uhe7e\nHXjo0Nl585zP57kWbQfAtep0Kba8vLxr165ybwWAh9LMr9+Z27XL2rzZebysT15ey9GjZR0v\nq4UsBqAadQq7zp07//7773JvBYCn00DeKTJelrYD4Cp1CrtFixY9//zze/fulXs3ADRAA3l3\nZbzsn/8srcg9XpaXuAPgEnUKu8mTJ587d65r167BwcEJVci8QwAeydPzzhYZeWbp0rNpaZXH\ny44e7ZOXJ9Od0nYAGqhOT57Q6/Vt27Zt27at3LsBoDFS23loshT17VvWuXOz6dOloRTBe/cm\n9elzfvbs4v/9XznukadTAGiIOoXdt99+K/c+AGib5z5/1tK8efYHH0S/806jFSt0Nptwdbxs\n8I8/5k2fLp3PcyHaDkC9MVIMgPt46PVZh8Fw8Zlnqhkv26+fTONlPbGAAahBbWfs2rdvX5eb\nOHLkiIs2A8AreOjZO3G8bJMFCyI2bxZXfM+ciR8x4tKTT16cNMnh6hdv57wdgHqo7SdRdHS0\nHHeZm5u7aNGiEydObN++XVosKSlZsWLFoUOHLBZLu3btxo0bFxMTU+kDL126tGrVqoMHD1ZU\nVCQlJT355JPir/1NmjTp1KlT0mEBAQGbr/7YBaBaUrXs379f2Z3UnThetrRr19jZsw1FRcLV\n8bJBv/ySO3++pUUL194dbQfgRtUWdnv27HH5/X333XcrV668/fbbT5w44by+ePHikpKS2bNn\n+/v7b9iwYc6cOW+99ZZef82V4ldeecXPz+/ll18ODAwUj1m5cmVAQEBJScmYMWO6dOkiHlbp\nowCoXOvWrc1mc55sTzV1ueIePcruuKPpCy+EfPeduCKOl8177rnClBTX3pd4XpO8A1BH7m4g\ni8XyxhtvSBEmys/P//nnn8eMGZOYmBgXFzdu3Ljc3NzDhw87H2M0Ghs3bjxhwoSkpKSmTZs+\n8cQTxcXFOTk54rtiY2Ojr4qKinLrpwTAFdq3b+9B+WKNjs559928adMcfn7iir60tGlqarMp\nU8Qzea7lcZetASjF3WH30EMPNW7cuNLi8ePHfX19ExMTxT+GhIQ0b9786LUv8h4aGjpjxowW\nV690FBQU6PX66Ohoi8ViNpv37t07ZcqUUaNGpaWl5ebmuuETASAHT3p2YDAdiQAAIABJREFU\nhU53aeTIrE2bzG3aSGthX3yR1K9fsAwv507bAagLF/+2b/0UFxeHhobqdDppJTw8vKjmf/Ua\njcalS5f269cvMjKyqKgoIiLCarU+/fTTgiBs3LhxxowZy5YtCw4OFg9evXr1zz//LL7t5+dn\ns9lqumWHwyEIgtlstlgsrvrUVMVms9lsNuevs5Y4HI5avrmezm63OxwOq9Wq9EZkYbPZBEEw\nGo3SgzMuLk584+TJk4ptq27KW7YsXru26fLlTdauFex24ep42fzk5DP/93/2gABBEOx2e3l5\necPv69dff23VqlXDb8eF7Ha7IAgmk6lCtlm6yrJarc6PTO2xWq0a/rEpCIL2HpkWi0X81Goi\nb9jt2bPnjTfeEN9OS0ur5R/idf/f5syZM3Pnzr3ttttGjBghCEJ4ePjatWul906bNm3EiBE/\n/PBD9+7dxZWTJ0/u27dPfDs8PFw8yVfL7Yv1U8fNeJzaHw2ezuFwaDXKRdr+9lWbrS1bthQE\nISsry+3buQE2g+H0008X3nln4ssv+128KAiC4HBEb90afOBA5pw5ZW3bClfjteGOHTsmXdxQ\nD23/2NTqP6hEmv+xqb1H5nW/X/KG3R133LFkyRLx7djY2JoOi4iIKC4udjgcUt4VFRVFRkZW\nPfLgwYOvv/764MGDe/XqVe1NBQYGNm7cOD8/X1p55ZVXXnnlFfHty5cvT5w4saZn+1oslqKi\noqCgoCAZXnFUDYxGY0BAgK+vr9IbkUVBQYHBYIiIiFB6I7Iwm81Wq1U6D60xRqPRbDZHRkYa\nDIZqDxD/n1X5tUj7Qw+duuOOprNnh+7eLa4EZmbeNGpU3pQpuQMGBLnue3fx4kX1XK02m81G\nozE4ODgwMFDpvciiqKgoJCSkpkemR3M4HAUFBb6+
vuHh4UrvRRbiaXLtPTItFkvtTxKV93fs\ngoKC4q/y9/ev6bA2bdpYLBbpmov4rIiqP7n++OOP1157berUqc5Vl52d/fbbb0v/ojKZTBcv\nXqwlIgF4KPX/+p0tIuLMkiWVxsvGvvZamwkTXDteNiMjQ+WZC0Ap7n7yxOXLl/Pz841GoyAI\n+fn5+fn5JpMpKirq3nvvTU9Pz8rKEl/lrlWrVjfddJMgCLt37961a5cgCBUVFYsXL+7Tp098\nfHz+VeLH7t279+233z5//rz4sSEhIV27dnXz5wXAPTpcpfRGalTUt2/W1q3lnTpJK2H79iUl\nJ0tn8lyFtgNQlU58xoDbjB49+sKFC5VW+vTpU1ZWtmLFiv3799tstptvvnncuHHipdgFCxYU\nFxfPnTv34MGDs2bNqnRrY8eO7dmzZ2Zm5urVq8Wn1rZr1+6pp55q0qRJtfcuXor98MMPq30v\nl2I9GpdiPdd1L8XWQrVxo7PZot99t9G77+qcfsWnsH//vJkzXTteVtnG5VKs5+JSrIeyWCwD\nBgzYsWNHTQe4O+yURdgRdh6KsKudavMu8PDhuGnT/LKzpRVLs2ZnX3ut7I47XHgvCrYdYee5\nCDsPdd2wY0gDAI+n2ouz5bfckrl164VBg6QV39zcliNGxCxcqHPdcy1V27UA3I+wA6AR6sw7\ne0BAzrPPnlmyxHb1dLI4XjZ+6FDnM3kNxNMpAIgIOwCaos68M3bvnrlrV8mf/iStBB4+nJic\nHLVunQvvhbYDQNgB0CAVPnnW2qhRzrJl51JTxXEUgiDoTaYmaWnNJ082FBa66l5oO8DLEXYA\ntExdeafTFaaknPr4Y1P79tJa6O7dSb17h3z7ravuhLYDvBlhB0D7VJV35latTn30UcHo0cLV\nl4/3KShoMX5809RUvcnkkrug7QCvRdgB8BbqyTuHn9+FqVNPr1xplV500+GI2Lw5YeDAgCNH\nXHIXPJ0C8E6EHQDvop68K+3SJXPbNmOPHtKK/8mTCYMHR61dK7joFUZpO8DbEHYAvJFK8s4W\nEXFm8eKzaWn2q68+rTObm8yf33LUKJ/z511yF7Qd4FUIOwDeSyVPnr0yXva226SV4B9/TOrf\nP/SLL1xy+1yWBbwHYQcAyp/Aq2jRInvduosTJjiuTq8yFBY2nzKl6Qsv6EtLXXIXtB3gDQg7\nALhC2bxzGAz5EyZkb9hQER8vLUZs25bUr1/QL7+45C5oO0DzCDsAuIayeVd+yy1Z27ZdGjZM\nWvHNzW05cqSrxsvSdoC2EXYAUA0F884eEJA3c6Z842X5lTtAwwg7AKiRgnlX/XjZAQMiNm92\nye3TdoAmEXYAcB1K5Z04Xjbv+ecdfn7iir60tGlqavMpU1wyXpa2A7SHsAOAOlEm73S6SyNG\nZG3aZG7TRloL/eKLpH79gn/4oeE3z2VZQGMIOwC4AYrknbldu6yPP75mvOyFCy2fespV42Vp\nO0AzCDsAuGHuzzu5x8vSdoA2EHYAUE/uz7vSLl0yd+4s7tlTWvE/eTJh0KBGK1cKdnsDb5zL\nsoAGEHYA0CBuzjtbaGjuggXnXn31v+NlKypiFi5s+dRTPnl5Db992g7waIQdALiAm/OuMDm5\n8njZvXuTkpNdMl6WtgM8F2EHAC7jzryTdbwsl2UBD0XYAYCLdbhK7juSe7wsbQd4HMIOAOTi\nnrN35bfckinbeFlO3QGehbADABl16NAhKSlJ7ntxiONl33rLFhkprrh8vGzDbwSAGxB2ACA7\n91yZNT78cObOnSX33y+tBB4+nJScHLVuXcNvnLYDPAJhBwBu4oa8szZqlPPOO+dSU+0BAeKK\nzmRqkpbWfNIkw+XLDbxxLssC6kfYAYBbyZ53Ol1hSsqpjz82Od1L6JdfJvXpE/Lvfzf85mk7\nQM0IOwBQgNx5Z27V6tTGjdeMly0oaPH00y4ZL8upO0C1CDsAUIyseXdlvOz771uqjpd1RZbR\ndoAKEXYAoDBZ8670nnsyd+4s7tVLWvE/eTJh8GCXjJc9duxYVlZWA28EgAsRdgCgCvLlnT00\nNPf113MXLbKFhoorV8bLjh7tc/58w2//+PHjDb8RAC5B2AGAisiXd8WPPJK1bVtZ587SSvCP\nP7bq2zfs739v+I3zW3eAShB2AKA6MuWdJS4ue82aC1OnOnx8xBW90dhs2rRmf/ubwWhs+O3T\ndoDiCDsAUClZTt0ZDAWjR2dv2FCRkCCthX3+eWJysqvGy5J3gIIIOwBQL5lO3ZV37Ji5des1\n42XPno130XhZgbwDlEPYAYDayZF3VcfLCuJ42SFD/E6dcsld0HaA+xF2AOAZ5Mg748MPZ27f\nXtqtm7QS+NtviQMGRGzZ4pLb59Qd4GaEHQB4EpfnnbVx49PLl+fNnOnw9xdX9GVlTV96qfkz\nzzR8vKyItgPchrADAM/j4lN3Ot2lYcOytmy5Zrzsv/7lqvGyAqfuAHch7ADAI7n81J04XvbS\nyJGCTieuiONlm8ybpzObXXIX5B0gN8IOADyYa/PO4eeXN23a6VWrnMfLRq1fn/jYYy4ZLyui\n7QD5EHYA4PFcm3fVjJfNzEx4/PHG6ekNHy8r4tQdIBPCDgA0woVtJ46XPfvqq/bgYHFFZ7VG\np6e3HD3aJy/PVfdC3gEuR9gBgHa49tRdUXJy5o4dlcfL9ukT7orxshLaDnAhwg4AtMaFeVft\neNk4142XFXHqDnAVwg4AtMllp+7E8bLr11fEx0trYZ9/njhgQNCBA665C0EQyDvAFQg7ANAs\nF566K7/11sxt264ZL5uTkzRyZPO339ZZLC65CxF5BzQEYQcAGueqtrsyXnbpUmm8rM5ub7p2\nbasRI1w1XlZC2wH1Q9gBgPa58NSd8c9/zty5s+T++6WVoN9/T+rfP2rdOpfcvoRTd0A9EHYA\n4C1clXfWRo1y3nnnXGqqPSBAXNGZTE3S0ppPmuSq8bIS8g64IYQdAHgX15y60+kKU1JOfPRR\nWbt20lrol1+6cLysM/IOqCPCDgC8jqtO3ZmTkv5YvTpv3DhBf+VvE3G8bNPUVL3J1PDbr4S8\nA66LsAMAL+WStnP4+OSNH3/6/fedx8tGbN6cMGCAC8fLOiPvgFoQdgDgvVx16k4cL1sk53jZ\nSsg7oFqEHQB4O5e0nT009Ozrr+cuWmQLCxNXroyXHTXK5/z5ht9+tWg7oBLC7v/bu/P4qMs7\ngePPHMnkNCEQCBCECUeMuB7YuoLIeqy6r1qRW1SEFTmyhqWYKhWtmooFrayABcUsQqEpIHJ5\n1KLsVvcFSq2ugLgiAjmEcOZgMiHMZI7f/vGDMYRkCGTmdzzzef/hK3kymXniXB9+M8kXABCx\nQ3d1d91VtmFDw09+ElpJ/vzz3vfeG9nxsk1x6A5oirADAJwRkbbzdetWsXz58cJCJS5OXYnG\neNlmyDtARdgBAH4UmUN36njZP/2psVev0NplH37oHD4
86csv23vmrSPvAMIOANBcRA7dnb7q\nqtING84ZL3v4cM+HH+78yiuRHS/bDHmHWEbYAQBaEJFDd+p42YPFxf5Onc4sBQIdly7t+eCD\nER8v2wx5h9hE2AEAWhWRQ3f1gweXbdpUf8stoZXEb76JxnjZ85F3iDWEHQAgnIgcuvNnZBxc\nvPj88bI9pkyxV1W1e48XQN4hdhB2AIALi8ChO4vl5Jgx5evWeZqcVcq2bc5hw1I++aS9Z94G\n5B1iAWEHAGiTSI2XLX/rraqCgh/Hy9bU9CgoiNJ42fORd5AbYQcAaKvI/EaF3X6ioKBCw/Gy\n5yPvICvCDgBwcSJy6K5B8/Gy5yPvIB/CDgBw0SI4Xvbw3LnBlBR1RYPxsucj7yATwg4AcCny\n8vL69evX/vNx3Xtv6aZNWo6XbRF5BzkQdgCAS+d0Ott/Jup42RPTpyt2u7qijpftNmuWtb6+\n/effduQdzI6wAwC0S9++fSNwLjZbVX5+xapVTcfLpr3zTs6IEUlffRWB878Y5B3Mi7ADALRX\nRH5bVjQdL2uxqCtxhw71HD++y5w5UR0v26I9e/YcOHBA4wsF2omwAwBERkTa7sx42Tfe8Gdm\nnlkKBjNKSno9+GB8WVn7z/9ifffddxy9g4kQdgCAiIlI2wl1vOzGjU3HyyZ8803OyJEZf/yj\nUJSIXMRF2XOW9hcNXBTCDgAQSZF6WfbH8bKJieqKOl728qlT7SdOtP/8Lw15B4Mj7AAAkReZ\nQ3fqeNm1a5uOl03ets05cmTK1q0ROP9LRd7BsOx6b0BTiqIEg8H6Vn55PhgMCiEaGxuDWv3R\nc435/f7Tp097vV69NxIV4a9cswsEAsFgUNHjFSgN+P1+IURDQ4Pl7PvlJSP3LVMI4fV61Q+a\n6dGjx/79+9t/Kd7sbPeKFV1fe63zypXqUAp7VVWP/PyqMWMOP/ZY0OFo/0W0JhgMNjY2tnbL\n3LlzpxCiT58+0dtA9KiPJ4FAQNYbp/rA0uIt09R8Pl/454LYCjuLxWKxWBytPAr4/f7Gxka7\n3d7aCcwuEAjExcXZ7XJe6V6vN8yVa3Y+ny8QCMj60wUCgUAgEB8fb7VK+BqCoiiNjY2yXnc+\nn8/n84V52Ozfv78QYu/eve29JLv9+C9/eermm7OffjpOHUqhKJ3eeiv1iy8Ovfji6Qi9se98\ngUDAbreH/ydHeXm5ECI3NzdKe4gSRVG8Xq/VapX1xqmS76e74OOknM/xYVgslri4uDAnsFqt\n4U9gXlar1W63y/rTiTZcuealHq6T9adTH6fsdrvNZtN7L5GnKIrct0whhM1mC/8DXnXVVRF5\n4dIzcGDpO+9kvfBC2nvvqSuO0tKcBx6onjr1RH6+iMLtx2KxWK3WtvyTQz02GanfHdGAetRH\n4hunesROyp8u/L80JPz3MQDAaCJVPMHU1MMvvVQ5f37gssvUFXW8bK/x4+MOHozIRbQH772D\n7gg7AIAWIng0q+6uu8rWrTs9YEBoJXHHDufo0Zdt3hypi2gP8g46IuwAABqJYNv5srPLV6w4\n8YtfhMbL2urquhcWdnvySY3Hy7aGP30HXRB2AADtROqv3AkhhM1WNXVq8/Gy776bM2xY0pdf\nRuYiIoG8g5YIOwCA1iJ46K6F8bKHD/f813/VZbxsGOQdtEHYAQB0EMG2U8fL/mCY8bJhkHeI\nNsIOAKCPyP5xkFMGGy8bBnmH6CHsAAC6ieRb7lofL9tD1/GyreG3KxANhB0AQGeRPHSnjpd9\n+23PlVeG1lK2bXMOH57yyScRu5SIIu8QQYQdAEB/kX1Z1puTU75mTVVBQWgchb2mpkdBQdei\nIuvp0xG8oAgi7xARhB0AwBAi23aK3X6ioKB85Upfjx5nl5T0tWt7jR6d8O23EbygyCLv0E6E\nHQDAKCI+a/X0ddeVrlvnuuee0IqjtLTX2LGZixeLQCCylxVB5B0uGWEHADCQyP46hTD8eNkw\n+O0KXALCDgBgOBE/dFd3111l69c3nD9e9i9/iewFRQN5h7Yj7AAARhTxtvN1716xYsWJGTPO\nGS/7y192e/JJq9sd2cuKBvIObUHYAQAMKuJtJ2y2qilTyletanQ6Q2tnxst+8UWELys6yDuE\nR9gBAIwr8m0nhOeqq8rWrasdMya0EnfkSM+JEzMXLLD4/RG/uGgg79Aawg4AYGjRaLtgYuLR\noqIfiot/HC8bCHQqLu71wANGGy8bBr9dgfMRdgAAo4v4r8qqzoyXvfXW0Iphx8uGR94hhLAD\nAJhDNNrOn5FxcNGio7/+tZKQoK6cGS87bZq9pibiFxdV5B0EYQcAMJFotJ2wWGofeKBs3bpz\nxst+/LHz3nvTPv008hcXZeRdjCPsAABmEpW2U8fLvvXWOeNlq6t7T5/efdYsw46XDYO338Us\nwg4AYDJRajvFZjtRUFCxbJmva9fQYvp77xl8vGx45F2sIewAAOYTpbYTQjT89Kel77zjGjo0\ntOIoLe11330GHy8bHnkXOwg7AIApRa/tgikph1988ZzxsoFAp8WLez30kMHHy4ZH3sUCwg4A\nYFZR+jMoqrq77tqzalXDT34SWkncudM5atRlf/5zlC5RG7z9Tm6EHQDA3KLXdo1du5YvW3a8\n6XhZt7v7E090nznTFONlwysrKztw4IDeu0CEEXYAANOLXtspVmv1lCll69Z5+/ULLV72/vsm\nGi8bHkfvJEPYAQBkEL22E0J4+/UrX7Om9r77QivqeNnO5hkvGx6vz0qDsAMASCKqbRdMSDj6\n3HPNxst2LC52jhrl+P776F2uxsg7syPsAADyiGrbiZbGyzq+/945dqzpxsuGR96ZF2EHAJBK\ntNtOpvGy4ZF3ZkTYAQBkE+22U8fLlp4/Xnbo0JSPP47uRWuOt9+ZC2EHAJBQ1NtOiMacnPLV\nq6sfeURYzzyZ2mtqekyblvXCCxaPJ9qXrj3yzhQIOwCAnKL654tVSlzc8V/+8pzxsorSYdUq\n56hRCZI2EHlncIQdAEBmGhy6a7jhhhbGy44ZY+rxsuHx+qxhEXYAAMlp0Hayjpe9IPLOaAg7\nAID8NGg7IUTdXXeVbdjQwnjZDz7Q4NJ1RN4ZB2EHAIgJ2rSdr1u3ihUrjs2apcTFqSs2t7v7\n4493f+wxW12dBhvQEXlnBIQdACBWaNN2wmKpeeih8lWrGp3O0NplH37oHDEi6csvtdiArnj7\nnb4IOwBADNGo7YTw9O9ftn597X33CYtFXYk7fLjnww9LM172gsg7XRB2AIDYolnbnRkv+8Yb\nco+XDY+80xhhBwCIOZq1nRDi1ODBpRs3umUfLxser89qhrADAMQiLdsukJFxaNGio88+G2w2\nXragQLLxshdE3kUbYQcAiFEajKb4kcVSO3Zs2bp1nv79Q2spn3ySM3RoqnTjZS+IvIsewg4A\nENO0PHTXmJNTvmpV9eTJwmZTV2w1NdnTpmU9/7xVxvGy4f
H6bDQQdgCAWKdl2ylxcccfe6xi\n+XJft25nl5QOa9Y4R41K+L//02wbhkLeRRBhBwCApm0nhGj4yU9KN206OWZMaCW+tLTX2LES\nj5e9IPIuIgg7AACE0LztgikpR4qKKufPD6SlqSuh8bLxUo+XDY/XZ9uJsAMA4AyN206Exsve\ncENoJXHnTufIkelr12q8E6Mh7y4NYQcAwI+0bztf164Vy5c3HS9rra/vWlTU/bHHbC6Xxpsx\nGvLuYhF2AACcQ/u2U8fLlr39trdfv9DamfGyf/+71psxHl6fbTvCDgCA5nRoOyG8/fqVvfVW\nzbhxP46XPXKk58MPd5kzx+Lzab8fAyLvLoiwAwCgBbq0neJwHHvqqR+Ki/2dO59dUjJKSpyj\nR8fOeNkLIu/CIOwAAGhZ7969dbncUzfdVLphQ/PxsvfdF1PjZS+IvGsRYQcAQKuuuOIKXS43\nkJFxaPHiw3PnBhMT1RWL19tl7tzLp0yxHz+uy5aMibffNUPYAQAQjqYjZc/luvfesrffbjpe\nNvnTT3NGjIjB8bIXRN6pCDsAAC5Mr7ZrzMkpX7OmqqDgnPGyBQXdnnzSevq0LlsyMvKOsAMA\noE30ajvFZjtRUFBeUtLYo0doMe3dd52jR8fseNnw9uzZs2/fvn379um9ER0QdgAAtJVebSeE\nOH3NNWXr1zNe9qLE4AE8wg4AgIugY9sxXvbSxFTeEXYAAFwcHdtOqONlN25kvOzFipHfnyXs\nAAC4aPq2nS8rq2LZsuOFhc3Hyz7+uM3t1nFjpiB33hF2AABcCn3bTlit1ZMmla9e7c3JCa1d\n9sEHzmHDGC/bFrLmHWEHAMAl0rnthPBceWXZunW1999/znjZiRM7v/IK42XbQr7XZwk7AAAu\nne5tpyQkHH3mmR+Ki/2ZmWeWgsGOS5c6R41y7N2r69bMRJq8I+wAAGgX3dtOqONlN248Z7zs\nvn3OsWMZL3tRJMg7wg4AgPYyQtsxXjZSTP36LGEHAEAEGKHtRCvjZZ0jRqR88ol+mzIrM+Yd\nYQcAQGQYpO0ac3LKV6+unjw5NF7WXlPTo6Ag6ze/sXo8+u7NjMyVd4QdAAARY5C2U+z24489\nVv7HP/44XlZROrz1lnPUKMbLXhqzvD5L2AEAEEkGaTshxOlrry1bt87185+HVuJLS3vdf3/H\n4mLGy14yg+cdYQcAQIQZp+2CqamHf/e7c8bL+v2dFyxwPvSQ49AhffdmaoY9gEfYAQAQeXl5\necbJuxbGy+7a1f+hhzpt2KDjruRgtLwj7AAAiBbjtN3542Vtp05d/tvfMl42IoyTd4QdAABR\nZJy2OzNeds0axstGiRHajrADACC6DNR2Qnjy8srWr68ZN+6c8bIPP9xlzhxLY6O+e0P72bW/\nyMrKyvnz5+/fv3/Tpk2hxfr6+uLi4q+//trn8+Xm5ubn53fu3LnZN06fPr28vDz0aUJCwtq1\na9v4vQAA6CgvL88Ih3NUisNxdNasquuvd86eHVdTI4QQipJRUpL0v/97+OWXmx7Pg+lofcRu\n69atTz31VHZ2drP1BQsWHD9+/Lnnnnv55ZeTkpKef/75YDDY7DT19fVTpkxZdtaSJUva/r0A\nAOjLUMfthBCum27as3at+7bbQisJe/Y4R45kvKypaR12Pp9v3rx5N954Y9PFqqqqL774YsqU\nKU6ns1u3bvn5+ZWVlbt37272vW63Oysrq9NZGRkZbf9eAAB0Z7S283focGjRosNz5waTktQV\nxsuandYvxd52221CiAMHDjRd3LdvX1xcnNPpVD9NSUnJzs7eu3fvNddcEzqNz+fzer3bt28v\nKSlxu919+vQZP3589+7dL/i933zzzdGjR9WPGxsbFUXxer0t7i0QCKj/be0EZhcIBHw+n8SH\nM8NcuWanXnGy/nTqbbKxsdFqlfBdv4qiyH3LFEL4/X5Zf8BgMBjxW2ZOTs73338fwTO8NIqi\nqP/1+/3Vd9/tvvbaHrNmJe3cqX41+dNPncOHVxYV1d16q67bvHTqc7rl7PsINRPt+4LP51PC\nHk/V4T1256urq0tNTW36fz8tLc3lcjU9TUNDQ3p6ut/vf/TRR4UQq1evnjVr1uuvv37B712z\nZs3mzZtDX+rUqZM77O91e71eWR+hhBB+v1/vLURRIBAIf+WaXaPU72s+deqU3luIIrlvmXI/\nbEbjltm1a9eysrKIn+0lCAaDHo9HCOHp2PHb11/vtmxZ1zfftASDQgh7bW3PX/yi+mc/K585\nM3Q8z3S0f9aL9p1d57Dbtm3bvHnz1I/nzp0b5hD0BZs6LS1t5cqVoU9nzpw5YcKEzz777ILf\ne+edd/bt21f9OBgMfvTRR8nJyS2eMhAIeDye+Pj4uLN/40cyXq/Xbrfbzs6ElkxDQ4PFYklM\nTNR7I1Hh9/uDwWB8fLzeG4kKr9fr9/sTExNlPWLn8XgkvmV6vV6JHzbVJ4Vo3DKvuuqqffv2\nRfxs205RFPVgZNPrrmratIYhQy5/6qn4gwfVlY4ffJC6a9cPc+Y0XHutTju9ROoRO+2f8lpr\njEjx+Xzhsye6YTdgwICFCxeqH2dlZbV2svT09Lq6OkVRQnt1uVwdOnQIc86JiYmZmZlVVVU5\nOTnhv3fIkCFDhgxRP66trf2v//qv1h5hfT6fx+Ox2+0SPwQ7HA5ZH38bGhqsVqus110offTe\nSFT4/X6/35+QkCDlvzrU12Flve7UY3VxcXGy/oCNjY3Ru2VeffXVOv6erBp2Foul2ZOC7/rr\ny9av7zJvXvratepKfGVl74kTq/PzT+TnC7PdSbV/yov2fcFut4cPu+j++zgpKannWQ6Ho7WT\n9e3b1+fzhd54V1dXd/DgwWaH9yoqKhYtWhQ6purxeE6cOJGVldWW7wUAwICM+WwVTEk5UlRU\nuWDBj+NlA4FOixf3GjcudCQPhqX1Cx+1tbVVVVXqK9BVVVVVVVUejycjI2PgwIGLFy8uKytT\n/8pd7969r7zySiHEli1b3nvvPSFERkbG9u3bFy1adPToUfU0KSkpgwYNCvO9AAAYnDHbTghR\nd+edZRs3NvzjP4ZWEnftco4cGTqSB2OyhH8LXsRNmjTp+Lm/QT0UOBINAAAaDUlEQVRp0qSh\nQ4c2NDQUFxfv2LEjEAj0798/Pz9ffTn15Zdfrqurmz17thCitLR0+fLl6q/B5ubmTp48uUuX\nLkKI1r73fLW1tdOmTfvTn/7U4ld9Pp/L5UpKSkoy7btEw3O73QkJCbK+FFtdXW2z2dLT0/Xe\nSFSoL8VG+60benG73V6vt0OHDrK+FHvy5Mnw7y0xL6/X63a7k5OTZX0p1uVypaSkaHDL1P41\nWUVRTp06ZbPZLnDdKUpGSUnn//iPpkMp6u688+hvfhM6nmdM6q9sa/+UF+1S9/l8o0aNeued\nd1o7gdZhpy/CjrAzKcLOvAg7U9Ms7ITmbdfWsBNCCOHYt6/7z
JmOvXtDK76srMNz5zY9nmc0\nMRt2Ev4OGgAApmPY12SFEN6+fcvWrKmeNEmc/QXhuKNHe06cyHhZAyLsAAAwBCO3neJwHC8s\n/KG42B+axq4oGSUlztGjmx7Jg+4IOwAAjMLIbSeEODVoUOmGDU3Hyzr27XOOHdtx6VIh71gj\ncyHsAAAwEIO3XSAj4/zxsp1feYXxsgZB2AEAYCwGbzshhOvee8vWrTt91VWhleTPPssZMSL1\nr3/VcVcQhB0AAAaUl5dn8Lxr7NWrYvXqqoKC0DgKW01N9rRp3Z580trQoO/eYhlhBwCAQRm8\n7RSb7URBQcWKFb7u3UOLae++6xw1KvGbb3TcWCwj7AAAMC6Dt50QomHAgNKNG11Dh4ZW4svL\nez7wQKc33hCBgI4bi02EHQAAhmb8tgumpBx+8cVzxsv6/ZkLF/YaNy7+hx/03VusIewAADA6\n47edaG287KhRjJfVEmEHAIAJmKLtfFlZFW++efzxx5Wzs7ys9fVdi4q6Fxba6ur03VuMIOwA\nADAHU7SdsFqrJ04sW7fOm5sbWrts82bnsGFJn3+u475iBGEHAIBpmKPt1PGyq1fXPvigsFjU\nlbijR3s+8kjnefMsPp++e5MbYQcAgJmYpe2UhISjTz99cMkSf6dOZ5aCwY7LlvW67z7HgQO6\nbk1mhB0AACZjlrYTQtTffHPppk1Nx8smfPedc9QoxstGCWEHAID5mKjtGC+rJcIOAABTMlHb\nCSFc995bumnT6QEDQiuMl40Gwg4AALMyV9v5srMrVqxgvGxUEXYAAJiYudpOHS9bXlLSePnl\nocW0d9/NGTYs8auvdNyYNAg7AADMzVxtJ4Q4fc01ZevWnRwzJrQSd+hQzwkTMhcvZrxsOxF2\nAACYnunaLpiScqSo6NCCBYH0dHXFEgh0WryY8bLtRNgBACAD07WdEMJ9552l775bP3hwaIXx\nsu1E2AEAIAkztp2/U6eDb7xxbNYsJT5eXVHHy2bPmGFzufTdmxkRdgAAyMOMbScslpqHHipb\nt87TZLxs6kcf5dxzT8q2bTruy4wIOwAApGLKthPC26dP+Zo11ZMmCeuZOLFXVfWYOrXLnDmW\nxkZ992YihB0AALIxadspDsfxwsIf/vM//Z07n11SMkpKnKNGJezdq+vWTIOwAwBAQiZtOyHE\nqYEDSzdudN9+e2jFsX9/r7FjGS/bFoQdAAByMm/bBTp0OPT73zNe9hIQdgAASMu8bSdaGy87\nfHjqf/+3jrsyOMIOAACZmbrt1PGyxwsLFbtdXbHV1mb/+78zXrY1hB0AAJIzddspNlv1pEkV\nf/iDLzs7tJj27rvOkSMTd+/WcWPGRNgBACA/U7edEOL0gAGlGza4hg0LrcRXVPR84IFOS5Yw\nXrYpwg4AgJhg9rYLpqQcnjOn2XjZzFdf7fXgg4yXDSHsAACIFXl5eWbPuzPjZW++ObSS+PXX\nzpEjGS+rIuwAAIgtZm87f6dOB5csOWe87KlTZ8bLnjyp7950R9gBABBzzN526njZ8rfe8vbp\nE1pL/egj5/DhyX/7m4770h1hBwBALHI6nXpvob08ubllb79dM26csFjUlbhjxy6fNKnLyy/H\n7HhZwg4AgBjVu3dvvbfQXorDceypp3544w1/ZuaZpWAwY/nyvuPGJezfr+vW9EHYAQAQu0z/\nmqwQQohTgweXvv9+3c9+FlpJ2Lu37wMPxOB4WcIOAICYJkfbBVJTK+fNa2G87OTJ9mPH9N2b\nlgg7AABinRxtJ9Txsu+809B0vOz27TlDh172wQc67kpLhB0AAJCn7Xzdu/+wYsWxqVMVm01d\nsbnd3R9/vOszz8TCeFnCDgAACCFR2yk227FHHz2wfLmvR4/QYvr69c4RIxK//lrHjWmAsAMA\nAGdI03ZCiIZrrindsME1fHhoJf6HH3o++GCn116zyDtelrADAAA/kqntgsnJh3/728r58wNp\naeqKJRDIXLSo50MPxR08qO/eooSwAwAA55Cp7YQQdXfdVfree+eMl925M2fECCnHyxJ2AACg\nOcnartXxsr/4hWTjZQk7AADQAsnaruXxslu25Mg1XpawAwAALZOt7VoaL2uXa7wsYQcAAFol\nX9udGS+7dKm/S5czS8FgxvLlzlGjEvbu1XVrEUDYAQCAcORrOyHEqYEDSzdscP/zP4dWHPv3\n9xo7NqOkRCiKjhtrJ8IOAABcgJRtF+jQ4dCrrzYbL9tlzpzLJ00y73hZwg4AAFyYlG0npBsv\nS9gBAIA2kbXt1PGyVY8+2ny87LPPmm68LGEHAADaSta2U2y2E9OmVaxcec542XXrTDdelrAD\nAAAXQda2E0Kcvu660g0bTo4ZE1pRx8t2fuUVi9+v48bajrADAAAXR+K2CyYnHykqqlywoOl4\n2Y5Ll/YcP94U42UJOwAAcNEkbjshRN2dd5a+/379kCGhFbOMlyXsAADApZC77fwdOx58/fUj\nRUXBhAR1xRTjZQk7AABwieRuO2GxnBwzpnz1am/fvqG1M+Nlt2/XcV9hEHYAAODSSd52Qnhz\nc8vWrq0ZP775eNmXXjLgeFnCDgAAtIv0bac4HMeefPKc8bKKkrFihXPUqITvvtN1a80RdgAA\noL2kbzuhjpfduNF9xx2hFcf+/b3uvz9j5UrjjJcl7AAAQATEQtsF0tMPLVx4eO7cYHKyumLx\neru8+OLljzxikPGyhB0AAIiMWGg7oY6X3bTpnPGyf/tbztChl/35zzruSkXYAQCAiImRtjsz\nXrag4Jzxsk88IV56Sd+NEXYAACCSYqTtFJvtREFBxapVjT17qivBxEQxfLi+uyLsAABAhMVI\n2wkhTv/DP5Rt3FgzbpwQ4tiTT4p+/fTdD2EHAAAiL3baLpiQcOyppypKSk6OHq33Xgg7AAAQ\nHbHTdkKIpr9LoSPCDgAAREtMtZ0REHYAACCK8vLyyDvNEHYAACDqaDttEHYAAEALtJ0GCDsA\nAKAR2i7aCDsAAKAd2i6q7HpvQGuKogQCgRa/FAwGw5/A7BRFCQaDsv50QurrLhgMSvzTKYoi\nhOCnMyP1YVPiBxbp73d6/YD9+vX77rvvonoRoRtnVC/lfNH+/3nB84+tsFPLpr6+vrWvCiEa\nGxv9fr+2+9JIIBAIBAJWq5yHacNfuWYnd9ipP1dDQ4PFYtF7L1Eh9y1TyP6wKfEtUwgRCAT0\nunFmZ2cfOHAgeuevPqdrH3bR/v/p8/nC/1CxFXYWi8Vms6WlpbX4VZ/P53K5HA5HUlKSxhvT\nhtvtTkhIiIuL03sjUVFdXR3myjU7r9fr9/uTk5P13khUuN1ur9ebmppqOztLWyaKopw8eVLi\nW6b6wJKYmKj3XqLC5XKlpKTIesusrq622+063jgHDBiwZ8+eKJ25z+cTQmj/lBft/58+ny/8\nARo5D94AAADj4/12EUfYAQAA3dB2kUXYAQAAPdF2EUTYAQAAndF2
kULYAQAA/dF2EUHYAQAA\nQ6Dt2o+wAwAARkHbtRNhBwAADIS2aw/CDgAAGAttd8kIOwAAYDi03aUh7AAAgBHRdpeAsAMA\nAAZF210swg4AABgXbXdRCDsAAGBotF3bEXYAAMDoaLs2IuwAAIAJ0HZtQdgBAABzoO0uiLAD\nAACmQduFR9gBAAAzoe3CIOwAAIDJ0HatIewAAID50HYtIuwAAIAp0XbnI+wAAIBZ0XbNEHYA\nAMDEaLumCDsAAGButF0IYQcAAEyPtlMRdgAAQAa0nSDsAACANGg7wg4AAMgjxtuOsAMAAFKJ\n5bYj7AAAgGz69u2r9xb0QdgBAAAJxWbbEXYAAEBOMfiaLGEHAACkFWttR9gBAACZxVTbEXYA\nAEBysdN2hB0AAJBfjLQdYQcAAGJCLLQdYQcAAGKF9G1H2AEAgBgid9sRdgAAILZI3HaEHQAA\niDmyth1hBwAAYpGUbUfYAQCAGJWXlydZ3hF2AAAgpsnUdoQdAACIddK0HWEHAAAgSdsRdgAA\nAEJI0XaEHQAAwBlmbzvCDgAA4EembjvCDgAA4BzmbTvCDgAAoDmTth1hBwAA0AIzth1hBwAA\n0DLTtR1hBwAA0CpztR1hBwAAEI6J2o6wAwAAuACztB1hBwAAcGGmaDvCDgAAoE2M33aEHQAA\nQFsZvO0IOwAAgItg5LYj7AAAAC6OYduOsAMAALhoxmw7wg4AAOBSGLDtCDsAAIBLZLS2I+wA\nAAAunaHajrADAABoF+O0HWEHAADQXgZpO8IOAAAgAozQdoQdAACAJAg7AAAASRB2AAAAkiDs\nAAAAJEHYAQAASIKwAwAAkARhBwAAIAnCDgAAQBJ27S+ysrJy/vz5+/fv37RpU2ixvr6+uLj4\n66+/9vl8ubm5+fn5nTt3bvpdu3fvfvrpp5ud1dSpU+++++7p06eXl5eHFhMSEtauXRvNnwAA\nAMCItA67rVu3Ll269Lrrrtu/f3/T9QULFtTX1z/33HMOh2PVqlXPP//8q6++arX+eEDxiiuu\nWLZsWejT48ePFxUVXX311UKI+vr6KVOm3HjjjeqXmn4XAABA7NC6gXw+37x580IRpqqqqvri\niy+mTJnidDq7deuWn59fWVm5e/fupqeJi4vr1MTq1auHDx/eo0cPIYTb7c7Kygp9KSMjQ9Mf\nCQAAwBi0DrvbbrstMzOz2eK+ffvi4uKcTqf6aUpKSnZ29t69e1s7k61btx45cmT06NFCCJ/P\n5/V6t2/fPmPGjEceeWTu3LmVlZXR2z8AAIBh6fAeu/PV1dWlpqZaLJbQSlpamsvlavHEwWBw\n1apVY8eOtdvtQoiGhob09HS/3//oo48KIVavXj1r1qzXX389OTlZPf0rr7zyP//zP+rHycnJ\nwWCwtra2xXNWFEUI4fF4vF5v5H44AwkGgz6fr+n/Z5koihIIBFq7cs1OvXE2NjbqvZGoCAaD\nQgiXyyXrjTPMw47ZqbfMhoYGj8ej916iIhgM1tXV6b2LKPL7/XLfOOW7Zfp8PvUxszXRDbtt\n27bNmzdP/Xju3Ll5eXmtnbLtD+iffvqpx+O59dZb1U/T0tJWrlwZ+urMmTMnTJjw2Wef3XHH\nHZe6awAAAFOKbtgNGDBg4cKF6sdZWVmtnSw9Pb2urk5RlFDeuVyuDh06tHjijz/+eNCgQTab\nrcWvJiYmZmZmVlVVhVYKCwsLCwvVj2tra6dNm9baOft8PpfLlZCQkJSUdKGfzJTcbndCQkJc\nXJzeG4mK6upqm82Wnp6u90aiwuv1+v3+0HFoybjdbq/Xm5aW1tr92tQURTl58mRrDztm5/V6\n3W53UlJSYmKi3nuJCpfLlZKSIusts7q62m63p6Wl6b2XqDh9+rQQQr5bps/nC/9LotF9j11S\nUlLPsxwOR2sn69u3r8/nO3DggPppXV3dwYMHWzy8d+rUqR07dtxwww2hlYqKikWLFvn9fvVT\nj8dz4sSJMBEJAAAgK63fY1dbWxsIBNxutxBCPa6WkpKSkZExcODAxYsXT58+PT4+funSpb17\n977yyiuFEFu2bPF4PPfcc4/67fv37w8EAl27dg2dYUZGxvbt2/1+/9ixYwOBwMqVK1NSUgYN\nGqTxzwUAAKA7rcPuiSeeOH78uPrxxIkThRCTJk0aOnTo9OnTi4uLi4qKAoFA//79f/3rX6sv\ny+7cubOuri4UdrW1tRaLpekfNElNTZ09e/by5ctnzJgRFxeXm5s7d+7cMEcHAQAAZKV12C1d\nurTF9aSkpBkzZpy//sQTTzT99JZbbrnllluanSYnJ2f27NkR2iAAAIBZMaQBAABAEoQdAACA\nJAg7AAAASRB2AAAAkiDsAAAAJEHYAQAASIKwAwAAkARhBwAAIAnCDgAAQBKEHQAAgCQIOwAA\nAEkQdgAAAJIg7AAAACRh13sDWqupqVmxYkWLXwoEAh6PJz4+Pi4uTuNdacPr9drtdpvNpvdG\noqKhocFisSQmJuq9kajw+/3BYDA+Pl7vjUSF1+v1+/2JiYlWq4T/1FQUxePxSHzL9Hq9Ej9s\nqk8Kst4yGxoabDZbQkKC3nuJCp/PJ4SQ75YZCATCn8CiKIo2WzGCxsbG999/v7WvVlVVff75\n53369MnNzdVyV5oJBAJWq9Visei9kajYvHlzYmLiP/3TP+m9kagIBoNCCCmfXYQQO3bsOHz4\n8K233pqUlKT3XqLC7/fb7XL+K7qysnLnzp15eXk5OTl67yUq/H6/zWaT8mEzEAhs3rw5IyNj\n4MCBeu8lKiR+2ExJSbnzzjtb+6qcjzWtiY+PHzFiRGtf/fvf/15cXHz99deHOQ0Ma/78+d27\nd+e6M6Ovvvpq165dzz//fPfu3fXeCy7ORx99tGLFittvv527nul4PJ6XXnrppz/9KdedZCQs\nWQAAgNhE2AEAAEiCsAMAAJBEbP3yBAAAgMQ4YgcAACAJwg4AAEAShB0AAIAkYuvv2DUzffr0\n8vLy0KcJCQlr165tdpr6+vri4uKvv/7a5/Pl5ubm5+d37txZ012iJTU1NcuWLdu1a1djY2NO\nTs7DDz/cr1+/Zqdpy/ULLbXl3sQ9zrC405kXT3YxJaZ/eWLixIkjRoy48cYb1U+tVmtGRkaz\n07zwwgv19fVTp051OByrVq0qLy9/9dVXpfxL1uZSWFgYHx8/ZcqUxMTEVatW7dixY+nSpc0G\n47Tl+oWW2nJv4h5nWNzpzIsnu5gS09eZ2+3OysrqdNb5N/SqqqovvvhiypQpTqezW7du+fn5\nlZWVu3fv1mW3CHG73ZmZmQUFBTk5OV27dh0/fnxdXd3BgwfPP1n46xdaasu9iXucYXGnMzWe\n7GJK7L4U6/P5vF7v9u3bS0pK3G53nz59xo8f32yi0b59++Li4pxOp/ppSkpKdnb23r17r7nm\nGj22jDNSU1NnzZoV+rS6utpqtXbq1Knpadpy/UJLbbk3cY8zLO505sW
TXayJ3SN2DQ0N6enp\nfr//0Ucf/dWvftXY2Dhr1qxTp041PU1dXV1qamrT8c9paWkul0vzzaJVbrf797///bBhwzp0\n6NB0vS3XL7TUlnsT9zhT4E5nLjzZxZoYOmK3bdu2efPmqR/PnTs3Ly9v5cqVoa/OnDlzwoQJ\nn3322R133NH0u5re0KGX86879eNDhw7Nnj372muvnTBhQrNvSUtLa8v1Cy215d7EPc7guNOZ\nThuvF+560oihsBswYMDChQvVj7Oyspp9NTExMTMzs6qqqulienp6XV2doiihW7zL5Wr2j1Ro\noMXrbteuXb/73e/uv//+n//85xc8hxavX2ipLfcm7nEGx51OAjzZSS+GXopNSkrqeZbD4aio\nqFi0aJHf71e/6vF4Tpw40Sz4+vbt6/P5Dhw4oH6qvlk4dLgImml23Qkhvv3225deeqmwsLC1\nJ5i2XL/QUlvuTdzjjIw7nUnxZBdrbEVFRXrvQR82m23JkiWVlZW9evVyuVxvvPFGfX39v/3b\nv9nt9i1btnz77be5ubmJiYkVFRUff/xxbm5uQ0PDa6+9lpyc/OCDD3LIWl+NjY3PPvvsv/zL\nvwwYMKDhLKvV2vS6C3P96r39GBXm3sQ9zvi405kXT3axJqb/jl1paeny5cvV3wbKzc2dPHly\nly5dhBAvv/xyXV3d7NmzhRANDQ3FxcU7duwIBAL9+/fPz8/n6LTudu3a9cwzzzRbnDp16t13\n3930umvt+oVeWrs3cY8zPu50psaTXUyJ6bADAACQSQy9xw4AAEBuhB0AAIAkCDsAAABJEHYA\nAACSIOwAAAAkQdgBAABIgrADAACQBGEHAOEMHjz4iiuu0HsXANAmhB0AAIAkCDsAAABJEHYA\n0FZDhgy5+eabd+zYcfvtt1922WWdO3e+//77jx8/3pbvvf766wcOHPjXv/71hhtuSEpKysjI\nmDhxosvlivaeAcQUu94bAADTiI+P//7776dOnTpnzpyrr75669at9913n8Ph+MMf/nDB73U4\nHPv37//Vr361cOHCfv36bd68eeLEiSdPntywYUP0Nw4gVhB2AHARDh48uHr16ptuukkIMXLk\nyFtuuWXLli1t+Uar1XrixIn169cPGjRICDFu3LhPPvnkzTffPHjwYI8ePaK7aQAxg5diAeAi\nJCUlqVWnys7OPnr0aBu/Nzk5efDgwaFPhwwZIoT45ptvIrtDALGMsAOAi5CZmdn0U7vdHgwG\n2/i9Xbp0sVgsoU87duwohDh27FgEtwcgxhF2AKAPv98vhLBaeRwGEDE8oACARo4cORIIBEKf\nqsfqunTpot+OAMiGsAMAjZw+ffqjjz4KffqXv/zF4XDccMMNOm4JgGT4rVgA0EiPHj1mzJhR\nUVHRp0+fDz/8cNOmTePHj+/QoYPe+wIgD8IOADSSnJxcUlJSWFj45ZdfOhyOyZMnv/LKK3pv\nCoBULIqi6L0HAJDf4MGDq6qqvvvuO703AkBmvMcOAABAEoQdAACAJAg7AGivzZs3W8JasmSJ\n3nsEEBN4jx0AtFd9fX15eXmYE3Tv3p3ffgWgAcIOAABAErwUCwAAIAnCDgAAQBKEHQAAgCQI\nOwAAAEkQdgAAAJIg7AAAACTx/6OzExQog4ycAAAAAElFTkSuQmCC" - }, - "metadata": { - "image/png": { - "width": 420, - "height": 420 - } - } - } - ], - "id": "t7axTjsnq8qI" - }, - { - "cell_type": "code", - "source": [ - "model <- lm(ln_q ~ ln_p, data = data)\n", - "elasticity <- coef(model)[\"ln_p\"]\n", - "se <- summary(model)$coefficients[\"ln_p\", \"Std. Error\"]\n", - "r_squared_adj <- summary(model)$adj.r.squared\n", - "cat(sprintf(\"Elasticity: %f, SE: %f, R2: %f\\n\\n\", elasticity, se, r_squared_adj))\n", - "conf_intervals <- confint(model, c(\"(Intercept)\", \"ln_p\"), level = 0.95)\n", - "print(conf_intervals)" - ], - "metadata": { - "id": "aZPOUT5SvR4o", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "c5186cae-c1da-4c35-91b0-08f0b9986919" - }, - "execution_count": 18, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Elasticity: -0.062508, SE: 0.012995, R2: 0.002953\n", - "\n", - " 2.5 % 97.5 %\n", - "(Intercept) -10.24409466 -10.11228387\n", - "ln_p -0.08798159 -0.03703361\n" - ] - } - ], - "id": "aZPOUT5SvR4o" - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cr2Ha215l4QK" - }, - "source": [ - "Let's begin with a simple prediction task. We will discover how well can we explain the price of these products using their textual descriptions." 
- ], - "id": "cr2Ha215l4QK" - }, - { - "cell_type": "code", - "source": [ - "install.packages(\"caTools\")\n", - "install.packages(\"base\")\n", - "library(caTools)" - ], - "metadata": { - "id": "pGt-G-qpciGd", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "dfdf62d9-66b7-4787-b00c-b019f9064cf4" - }, - "execution_count": 19, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "also installing the dependency ‘bitops’\n", - "\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Warning message:\n", - "“package ‘base’ is a base package, and should not be updated”\n" - ] - } - ], - "id": "pGt-G-qpciGd" - }, - { - "cell_type": "code", - "source": [ - "library(caTools)\n", - "set.seed(124)\n", - "split <- sample.split(Y = data$ln_p, SplitRatio = 0.8)\n", - "train_main <- data[split, ]\n", - "holdout <- data[!split, ]\n", - "split_main <- sample.split(Y = train_main$ln_p, SplitRatio = 0.75)\n", - "train <- train_main[split_main, ]\n", - "val <- train_main[!split_main, ]" - ], - "metadata": { - "id": "EPX_-K9CPpuO" - }, - "id": "EPX_-K9CPpuO", - "execution_count": 48, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "library(reticulate)\n", - "use_python(\"/usr/bin/python3\", required = TRUE)\n", - "py_run_string('import tensorflow as tf')\n", - "\n", - "py$train_texts <- train$text\n", - "train_tensors <- py_run_string(\"\n", - "tensors = tokenizer(\n", - " list(train_texts),\n", - " padding=True,\n", - " truncation=True,\n", - " max_length=128,\n", - " return_tensors='tf'\n", - ")\")\n", - "train_tensors <- py$tensors\n", - "\n", - "py$val_texts <- val$text\n", - "val_tensors <- py_run_string(\"\n", - "val_tensors = tokenizer(\n", - " list(val_texts),\n", - " padding=True,\n", - " truncation=True,\n", - " max_length=128,\n", - " return_tensors='tf'\n", - ")\")\n", - "val_tensors <- py$val_tensors\n", - "\n", - "py$holdout_texts <- holdout$text\n", - "tensors_holdout <- py_run_string(\"\n", - "tensors_holdout = tokenizer(\n", - " list(holdout_texts),\n", - " padding=True,\n", - " truncation=True,\n", - " max_length=128,\n", - " return_tensors='tf'\n", - ")\")\n", - "tensors_holdout <- py$tensors_holdout\n", - "ln_p <- train$ln_p\n", - "ln_q <- train$ln_q\n", - "val_ln_p <- val$ln_p\n", - "val_ln_q <- val$ln_q" - ], - "metadata": { - "id": "IGFzO9sZPvAJ" - }, - "id": "IGFzO9sZPvAJ", - "execution_count": 49, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "ln_p <- train$ln_p\n", - "ln_q <- train$ln_q\n", - "val_ln_p <- val$ln_p\n", - "val_ln_q <- val$ln_q" - ], - "metadata": { - "id": "XQ4DMJQ0drZm" - }, - "execution_count": 64, - "outputs": [], - "id": "XQ4DMJQ0drZm" - }, - { - "cell_type": "markdown", - "source": [ - "# Using BERT as Feature Extractor" - ], - "metadata": { - "id": "gPYEMuKZ7ylj" - }, - "id": "gPYEMuKZ7ylj" - }, - { - "cell_type": "code", - "source": [ - "library(reticulate)\n", - "#Sys.setenv(RETICULATE_PYTHON = \"/usr/bin/python\")\n", - "library(keras)\n", - "#install_keras()" - ], - "metadata": { - "id": "ytZhy46hdDdr", - "collapsed": true - }, - "id": "ytZhy46hdDdr", - "execution_count": 26, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "library(caTools)\n", - "library(dplyr)\n", - "library(readr)\n", - "library(reticulate)\n", - "library(keras)\n", - "library(caret)\n", - "library(glmnet)\n", - 
"library(stringr)\n", - "\n", - "use_python(\"/usr/bin/python3\", required = TRUE)\n", - "py_run_string('import tensorflow as tf')\n", - "py_run_string('from transformers import BertTokenizer, TFBertModel')\n", - "py_run_string('\n", - "tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n", - "bert_model = TFBertModel.from_pretrained(\"bert-base-uncased\")\n", - "')\n", - "\n", - "py$train_texts <- train$text\n", - "train_tensors <- py_run_string(\"\n", - "tensors = tokenizer(\n", - " list(train_texts),\n", - " padding=True,\n", - " truncation=True,\n", - " max_length=128,\n", - " return_tensors='tf'\n", - ")\")\n", - "train_tensors <- py$tensors\n", - "\n", - "py$val_texts <- val$text\n", - "val_tensors <- py_run_string(\"\n", - "val_tensors = tokenizer(\n", - " list(val_texts),\n", - " padding=True,\n", - " truncation=True,\n", - " max_length=128,\n", - " return_tensors='tf'\n", - ")\")\n", - "val_tensors <- py$val_tensors\n", - "\n", - "py$holdout_texts <- holdout$text\n", - "tensors_holdout <- py_run_string(\"\n", - "tensors_holdout = tokenizer(\n", - " list(holdout_texts),\n", - " padding=True,\n", - " truncation=True,\n", - " max_length=128,\n", - " return_tensors='tf'\n", - ")\")\n", - "tensors_holdout <- py$tensors_holdout\n", - "\n", - "ln_p <- train$ln_p\n", - "val_ln_p <- val$ln_p\n", - "holdout_ln_p <- holdout$ln_p\n", - "\n", - "py_run_string('\n", - "import tensorflow as tf\n", - "from transformers import TFBertModel\n", - "\n", - "# Define the input layers\n", - "input_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name=\"input_ids\")\n", - "token_type_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name=\"token_type_ids\")\n", - "attention_mask = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name=\"attention_mask\")\n", - "\n", - "# Load the pre-trained BERT model\n", - "bert_model = TFBertModel.from_pretrained(\"bert-base-uncased\")\n", - "outputs = bert_model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)\n", - "\n", - "# Define the embedding model\n", - "embedding_model = tf.keras.models.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=outputs.last_hidden_state[:, 0, :])\n", - "')\n", - "\n", - "py_run_string('\n", - "import numpy as np\n", - "embeddings = embedding_model.predict({\n", - " \"input_ids\": tf.convert_to_tensor(tensors[\"input_ids\"]),\n", - " \"token_type_ids\": tf.convert_to_tensor(tensors[\"token_type_ids\"]),\n", - " \"attention_mask\": tf.convert_to_tensor(tensors[\"attention_mask\"])\n", - "})\n", - "')\n", - "\n", - "embeddings <- py$embeddings\n", - "\n", - "py$ln_p <- ln_p\n", - "py_run_string('\n", - "from sklearn.linear_model import LassoCV\n", - "from sklearn.model_selection import KFold\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.pipeline import make_pipeline\n", - "\n", - "lcv = make_pipeline(StandardScaler(), LassoCV(cv=KFold(n_splits=5, shuffle=True, random_state=123), random_state=123))\n", - "lcv.fit(embeddings, ln_p)\n", - "')\n", - "\n", - "py_run_string('\n", - "embeddings_val = embedding_model.predict({\n", - " \"input_ids\": tf.convert_to_tensor(val_tensors[\"input_ids\"]),\n", - " \"token_type_ids\": tf.convert_to_tensor(val_tensors[\"token_type_ids\"]),\n", - " \"attention_mask\": tf.convert_to_tensor(val_tensors[\"attention_mask\"])\n", - "})\n", - "val_predictions = lcv.predict(embeddings_val)\n", - "')\n", - "\n", - "val_predictions <- py$val_predictions\n", - "\n", - "r2_val <- caret::R2(val_predictions, 
val_ln_p)\n", - "\n", - "py_run_string('\n", - "embeddings_holdout = embedding_model.predict({\n", - " \"input_ids\": tf.convert_to_tensor(tensors_holdout[\"input_ids\"]),\n", - " \"token_type_ids\": tf.convert_to_tensor(tensors_holdout[\"token_type_ids\"]),\n", - " \"attention_mask\": tf.convert_to_tensor(tensors_holdout[\"attention_mask\"])\n", - "})\n", - "holdout_predictions = lcv.predict(embeddings_holdout)\n", - "')\n", - "\n", - "holdout_predictions <- py$holdout_predictions\n", - "\n", - "r2_holdout <- caret::R2(holdout_predictions, holdout_ln_p)\n", - "\n", - "print(r2_val)\n", - "print(r2_holdout)\n", - "ln_p_hat_holdout <- holdout_predictions" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 835 - }, - "id": "TsqfQ3OH-HR3", - "outputId": "a4107a7d-ce9e-4d09-a63e-1bab3174bf76" - }, - "id": "TsqfQ3OH-HR3", - "execution_count": 65, - "outputs": [ - { - "output_type": "error", - "ename": "ERROR", - "evalue": "Exception encountered when calling layer 'embeddings' (type TFBertEmbeddings).\n\nCould not build a TypeSpec for name: \"tf.debugging.assert_less_4/assert_less/Assert/Assert\"\nop: \"Assert\"\ninput: \"tf.debugging.assert_less_4/assert_less/All\"\ninput: \"tf.debugging.assert_less_4/assert_less/Assert/Assert/data_0\"\ninput: \"tf.debugging.assert_less_4/assert_less/Assert/Assert/data_1\"\ninput: \"tf.debugging.assert_less_4/assert_less/Assert/Assert/data_2\"\ninput: \"Placeholder\"\ninput: \"tf.debugging.assert_less_4/assert_less/Assert/Assert/data_4\"\ninput: \"tf.debugging.assert_less_4/assert_less/y\"\nattr {\n key: \"T\"\n value {\n list {\n type: DT_STRING\n type: DT_STRING\n type: DT_STRING\n type: DT_INT32\n type: DT_STRING\n type: DT_INT32\n }\n }\n}\nattr {\n key: \"summarize\"\n value {\n i: 3\n }\n}\n of unsupported type .\n\nCall arguments received by layer 'embeddings' (type TFBertEmbeddings):\n • input_ids=\n • position_ids=None\n • token_type_ids=\n • inputs_embeds=None\n • past_key_values_length=0\n • training=False", - "traceback": [ - "Exception encountered when calling layer 'embeddings' (type TFBertEmbeddings).\n\nCould not build a TypeSpec for name: \"tf.debugging.assert_less_4/assert_less/Assert/Assert\"\nop: \"Assert\"\ninput: \"tf.debugging.assert_less_4/assert_less/All\"\ninput: \"tf.debugging.assert_less_4/assert_less/Assert/Assert/data_0\"\ninput: \"tf.debugging.assert_less_4/assert_less/Assert/Assert/data_1\"\ninput: \"tf.debugging.assert_less_4/assert_less/Assert/Assert/data_2\"\ninput: \"Placeholder\"\ninput: \"tf.debugging.assert_less_4/assert_less/Assert/Assert/data_4\"\ninput: \"tf.debugging.assert_less_4/assert_less/y\"\nattr {\n key: \"T\"\n value {\n list {\n type: DT_STRING\n type: DT_STRING\n type: DT_STRING\n type: DT_INT32\n type: DT_STRING\n type: DT_INT32\n }\n }\n}\nattr {\n key: \"summarize\"\n value {\n i: 3\n }\n}\n of unsupported type .\n\nCall arguments received by layer 'embeddings' (type TFBertEmbeddings):\n • input_ids=\n • position_ids=None\n • token_type_ids=\n • inputs_embeds=None\n • past_key_values_length=0\n • training=FalseTraceback:\n", - "1. 
py_run_string(\"\\nimport tensorflow as tf\\nfrom transformers import TFBertModel\\n\\n# Define the input layers\\ninput_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name=\\\"input_ids\\\")\\ntoken_type_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name=\\\"token_type_ids\\\")\\nattention_mask = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name=\\\"attention_mask\\\")\\n\\n# Load the pre-trained BERT model\\nbert_model = TFBertModel.from_pretrained(\\\"bert-base-uncased\\\")\\noutputs = bert_model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)\\n\\n# Define the embedding model\\nembedding_model = tf.keras.models.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=outputs.last_hidden_state[:, 0, :])\\n\")", - "2. py_run_string_impl(code, local, convert)" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Linear Probing: Training Only Final Layer after BERT" - ], - "metadata": { - "id": "mOc1_C5p7ta7" - }, - "id": "mOc1_C5p7ta7" - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ck1xqRIrmx8I" - }, - "outputs": [], - "source": [ - "### Now let's prepare our model\n", - "\n", - "from tensorflow.keras import Model, Input\n", - "from tensorflow.keras.layers import Dense, Dropout, Concatenate\n", - "import tensorflow_addons as tfa\n", - "from tensorflow.keras import regularizers\n", - "\n", - "tf.keras.utils.set_random_seed(123)\n", - "\n", - "input_ids = Input(shape=(128,), dtype=tf.int32)\n", - "token_type_ids = Input(shape=(128,), dtype=tf.int32)\n", - "attention_mask = Input(shape=(128,), dtype=tf.int32)\n", - "\n", - "# # First we compute the text embedding\n", - "Z = bert(input_ids, token_type_ids, attention_mask)\n", - "\n", - "for layer in bert.layers:\n", - " layer.trainable=False\n", - " for w in layer.weights: w._trainable=False\n", - "\n", - "# # We want the \"pooled / summary\" embedding, not individual word embeddings\n", - "Z = Z[1]\n", - "\n", - "# # Then we do a regular regression\n", - "# Z = Dropout(0.2)(Z)\n", - "ln_p_hat = Dense(1, activation='linear',\n", - " kernel_regularizer=regularizers.L2(1e-3))(Z)\n", - "\n", - "PricePredictionNetwork = Model([\n", - " input_ids,\n", - " token_type_ids,\n", - " attention_mask,\n", - " ], ln_p_hat)\n", - "PricePredictionNetwork.compile(\n", - " optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),\n", - " loss=tf.keras.losses.MeanSquaredError(),\n", - " metrics=tfa.metrics.RSquare(),\n", - ")\n", - "PricePredictionNetwork.summary()" - ], - "id": "Ck1xqRIrmx8I" - }, - { - "cell_type": "code", - "source": [ - "from livelossplot import PlotLossesKeras\n", - "\n", - "tf.keras.utils.set_random_seed(123)\n", - "earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)\n", - "modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(\"/content/gdrive/MyDrive/pweights.hdf5\", monitor='val_loss', save_best_only=True, save_weights_only=True)\n", - "\n", - "PricePredictionNetwork.fit(\n", - " x= [tensors['input_ids'],\n", - " tensors['token_type_ids'],\n", - " tensors['attention_mask'],],\n", - " y=ln_p,\n", - " validation_data = (\n", - " [val_tensors['input_ids'],\n", - " val_tensors['token_type_ids'],\n", - " val_tensors['attention_mask']], val_ln_p\n", - " ),\n", - " epochs=5,\n", - " callbacks = [earlystopping, modelcheckpoint,\n", - " PlotLossesKeras(groups = {'train_loss': ['loss'], 'train_rsq':['r_square'], 'val_loss': ['val_loss'], 'val_rsq': 
['val_r_square']})],\n", - " batch_size=16,\n", - " shuffle=True)" - ], - "metadata": { - "id": "XhTREb3NcZhH" - }, - "execution_count": null, - "outputs": [], - "id": "XhTREb3NcZhH" - }, - { - "cell_type": "markdown", - "source": [ - "# Fine Tuning starting from the Linear Probing Trained Weights\n", - "\n", - "Now we train the whole network, initializing the weights based on the result of the linear probing phase in the previous section." - ], - "metadata": { - "id": "MyFxR5GC8C3K" - }, - "id": "MyFxR5GC8C3K" - }, - { - "cell_type": "code", - "source": [ - "### Now let's prepare our model\n", - "\n", - "from tensorflow.keras import Model, Input\n", - "from tensorflow.keras.layers import Dense, Dropout, Concatenate\n", - "import tensorflow_addons as tfa\n", - "from tensorflow.keras import regularizers\n", - "\n", - "tf.keras.utils.set_random_seed(123)\n", - "\n", - "input_ids = Input(shape=(128,), dtype=tf.int32)\n", - "token_type_ids = Input(shape=(128,), dtype=tf.int32)\n", - "attention_mask = Input(shape=(128,), dtype=tf.int32)\n", - "\n", - "# # First we compute the text embedding\n", - "Z = bert(input_ids, token_type_ids, attention_mask)\n", - "\n", - "for layer in bert.layers:\n", - " layer.trainable=True\n", - " for w in layer.weights: w._trainable=True\n", - "\n", - "# # We want the \"pooled / summary\" embedding, not individual word embeddings\n", - "Z = Z[1]\n", - "\n", - "# # Then we do a regularized linear regression\n", - "ln_p_hat = Dense(1, activation='linear',\n", - " kernel_regularizer=regularizers.L2(1e-3))(Z)\n", - "\n", - "PricePredictionNetwork = Model([\n", - " input_ids,\n", - " token_type_ids,\n", - " attention_mask,\n", - " ], ln_p_hat)\n", - "PricePredictionNetwork.compile(\n", - " optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),\n", - " loss=tf.keras.losses.MeanSquaredError(),\n", - " metrics=tfa.metrics.RSquare(),\n", - ")\n", - "PricePredictionNetwork.summary()" - ], - "metadata": { - "id": "NzWCkTY87luH" - }, - "execution_count": null, - "outputs": [], - "id": "NzWCkTY87luH" - }, - { - "cell_type": "code", - "source": [ - "PricePredictionNetwork.load_weights(\"/content/gdrive/MyDrive/pweights.hdf5\")" - ], - "metadata": { - "id": "PWauCl0T7nUo" - }, - "execution_count": null, - "outputs": [], - "id": "PWauCl0T7nUo" - }, - { - "cell_type": "code", - "source": [ - "from livelossplot import PlotLossesKeras\n", - "\n", - "tf.keras.utils.set_random_seed(123)\n", - "\n", - "earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)\n", - "modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(\"/content/gdrive/MyDrive/pweights.hdf5\", monitor='val_loss', save_best_only=True, save_weights_only=True)\n", - "\n", - "PricePredictionNetwork.fit(\n", - " x= [tensors['input_ids'],\n", - " tensors['token_type_ids'],\n", - " tensors['attention_mask'],],\n", - " y=ln_p,\n", - " validation_data = (\n", - " [val_tensors['input_ids'],\n", - " val_tensors['token_type_ids'],\n", - " val_tensors['attention_mask']], val_ln_p\n", - " ),\n", - " epochs=10,\n", - " callbacks = [earlystopping, modelcheckpoint,\n", - " PlotLossesKeras(groups = {'train_loss': ['loss'], 'train_rsq':['r_square'], 'val_loss': ['val_loss'], 'val_rsq': ['val_r_square']})],\n", - " batch_size=16,\n", - " shuffle=True)" - ], - "metadata": { - "id": "iDSBZWAe8nhE" - }, - "execution_count": null, - "outputs": [], - "id": "iDSBZWAe8nhE" - }, - { - "cell_type": "code", - "source": [ - 
"PricePredictionNetwork.load_weights(\"/content/gdrive/MyDrive/pweights.hdf5\")" - ], - "metadata": { - "id": "wchpbXoqBAJu" - }, - "execution_count": null, - "outputs": [], - "id": "wchpbXoqBAJu" - }, - { - "cell_type": "code", - "source": [ - "# Compute predictions\n", - "ln_p_hat_holdout = PricePredictionNetwork.predict([\n", - " tensors_holdout['input_ids'],\n", - " tensors_holdout['token_type_ids'],\n", - " tensors_holdout['attention_mask'],\n", - " ])" - ], - "metadata": { - "id": "jpUmDHYfkJEZ" - }, - "execution_count": null, - "outputs": [], - "id": "jpUmDHYfkJEZ" - }, - { - "cell_type": "code", - "source": [ - "print('Neural Net R^2, Price Prediction:')\n", - "get_r2(holdout['ln_p'], ln_p_hat_holdout)" - ], - "metadata": { - "id": "g_XK81hpkQMN" - }, - "execution_count": null, - "outputs": [], - "id": "g_XK81hpkQMN" - }, - { - "cell_type": "code", - "source": [ - "import matplotlib.pyplot as plt\n", - "plt.hist(ln_p_hat_holdout)\n", - "plt.show()" - ], - "metadata": { - "id": "GR4QP4DJPQk0" - }, - "execution_count": null, - "outputs": [], - "id": "GR4QP4DJPQk0" - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "RAGwE4peL1Me" - }, - "id": "RAGwE4peL1Me", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bafy9ftcoBed" - }, - "source": [ - "Now, let's go one step further and construct a DML estimator of the average price elasticity. In particular, we will model market share $q_i$ as\n", - "$$\\ln q_i = \\alpha + \\beta \\ln p_i + \\psi(d_i) + \\epsilon_i,$$ where $d_i$ denotes the description of product $i$ and $\\psi$ is the composition of text embedding and a linear layer." - ], - "id": "bafy9ftcoBed" - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Qiteu6FaoctV" - }, - "outputs": [], - "source": [ - "## Build the quantity prediction network\n", - "\n", - "tf.keras.utils.set_random_seed(123)\n", - "\n", - "# Initialize new BERT model from original\n", - "bert2 = TFBertModel.from_pretrained(\"bert-base-uncased\")\n", - "\n", - "# for layer in bert2.layers:\n", - "# layer.trainable=False\n", - "# for w in layer.weights: w._trainable=False\n", - "\n", - "# Define inputs\n", - "input_ids = Input(shape=(128,), dtype=tf.int32)\n", - "token_type_ids = Input(shape=(128,), dtype=tf.int32)\n", - "attention_mask = Input(shape=(128,), dtype=tf.int32)\n", - "\n", - "# First we compute the text embedding\n", - "Z = bert2(input_ids, token_type_ids, attention_mask)\n", - "\n", - "# We want the \"pooled / summary\" embedding, not individual word embeddings\n", - "Z = Z[1]\n", - "\n", - "ln_q_hat = Dense(1, activation='linear', kernel_regularizer=regularizers.L2(1e-3))(Z)\n", - "\n", - "# Compile model and optimization routine\n", - "QuantityPredictionNetwork = Model([\n", - " input_ids,\n", - " token_type_ids,\n", - " attention_mask,\n", - " ], ln_q_hat)\n", - "QuantityPredictionNetwork.compile(\n", - " optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),\n", - " loss=tf.keras.losses.MeanSquaredError(),\n", - " metrics=tfa.metrics.RSquare(),\n", - ")\n", - "QuantityPredictionNetwork.summary()" - ], - "id": "Qiteu6FaoctV" - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "aaxHV0gGMqpw" - }, - "outputs": [], - "source": [ - "## Fit the quantity prediction network in the main sample\n", - "tf.keras.utils.set_random_seed(123)\n", - "\n", - "earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)\n", - 
"modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(\"/content/gdrive/MyDrive/qweights.hdf5\", monitor='val_loss', save_best_only=True, save_weights_only=True)\n", - "\n", - "QuantityPredictionNetwork.fit(\n", - " [\n", - " tensors['input_ids'],\n", - " tensors['token_type_ids'],\n", - " tensors['attention_mask'],\n", - " ],\n", - " ln_q,\n", - " validation_data = (\n", - " [val_tensors['input_ids'],\n", - " val_tensors['token_type_ids'],\n", - " val_tensors['attention_mask']], val_ln_q\n", - " ),\n", - " epochs=10,\n", - " callbacks = [earlystopping, modelcheckpoint,\n", - " PlotLossesKeras(groups = {'train_loss': ['loss'], 'train_rsq':['r_square'], 'val_loss': ['val_loss'], 'val_rsq': ['val_r_square']})],\n", - " batch_size=16,\n", - " shuffle=True)" - ], - "id": "aaxHV0gGMqpw" - }, - { - "cell_type": "code", - "source": [ - "QuantityPredictionNetwork.load_weights(\"/content/gdrive/MyDrive/qweights.hdf5\")" - ], - "metadata": { - "id": "TfyQV3lw-xf2" - }, - "execution_count": null, - "outputs": [], - "id": "TfyQV3lw-xf2" - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "YADpNj0jMygZ" - }, - "outputs": [], - "source": [ - "## Predict in the holdout sample, residualize and regress\n", - "\n", - "ln_q_hat_holdout = QuantityPredictionNetwork.predict([\n", - " tensors_holdout['input_ids'],\n", - " tensors_holdout['token_type_ids'],\n", - " tensors_holdout['attention_mask'],\n", - " ])" - ], - "id": "YADpNj0jMygZ" - }, - { - "cell_type": "code", - "source": [ - "print('Neural Net R^2, Quantity Prediction:')\n", - "get_r2(holdout['ln_q'], ln_q_hat_holdout)" - ], - "metadata": { - "id": "jh4criU1hGIP" - }, - "execution_count": null, - "outputs": [], - "id": "jh4criU1hGIP" - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ir-_yAfkPM6f" - }, - "outputs": [], - "source": [ - "# Compute residuals\n", - "r_p = holdout[\"ln_p\"] - ln_p_hat_holdout.reshape((-1,))\n", - "r_q = holdout[\"ln_q\"] - ln_q_hat_holdout.reshape((-1,))\n", - "\n", - "# Regress to obtain elasticity estimate\n", - "beta = np.mean(r_p * r_q) / np.mean(r_p * r_p)\n", - "\n", - "# standard error on elastiticy estimate\n", - "se = np.sqrt(np.mean( (r_p* r_q)**2)/(np.mean(r_p*r_p)**2)/holdout[\"ln_p\"].size)\n", - "\n", - "print('Elasticity of Demand with Respect to Price: {}'.format(beta))\n", - "print('Standard Error: {}'.format(se))" - ], - "id": "ir-_yAfkPM6f" - }, - { - "cell_type": "markdown", - "source": [ - "# Heterogeneous Elasticities within Major Product Categories\n", - "\n", - "We now look at the major product categories that have many products and we investigate whether the \"within group\" price elasticities" - ], - "metadata": { - "id": "VCqeRTB_BNEH" - }, - "id": "VCqeRTB_BNEH" - }, - { - "cell_type": "code", - "source": [ - "holdout['category'] = holdout['amazon_category_and_sub_category'].str.split('>').apply(lambda x: x[0])" - ], - "metadata": { - "id": "XRUZEXqc8HPG" - }, - "execution_count": null, - "outputs": [], - "id": "XRUZEXqc8HPG" - }, - { - "cell_type": "code", - "source": [ - "# Elasticity within the main product categories\n", - "sql.run(\"\"\"\n", - " SELECT category, COUNT(*)\n", - " FROM holdout\n", - " GROUP BY 1\n", - " HAVING COUNT(*)>=100\n", - " ORDER BY 2 desc\n", - "\"\"\")" - ], - "metadata": { - "id": "ymWJv4Ej7lt9" - }, - "execution_count": null, - "outputs": [], - "id": "ymWJv4Ej7lt9" - }, - { - "cell_type": "code", - "source": [ - "main_cats = sql.run(\"\"\"\n", - " SELECT category\n", - " FROM holdout\n", - " GROUP BY 
1\n", - " HAVING COUNT(*)>=100\n", - "\"\"\")['category']\n", - "\n", - "dfs = []\n", - "for cat in main_cats:\n", - " r_p = holdout[holdout['category'] == cat][\"ln_p\"] - ln_p_hat_holdout.reshape((-1,))[holdout['category'] == cat]\n", - " r_q = holdout[holdout['category'] == cat][\"ln_q\"] - ln_q_hat_holdout.reshape((-1,))[holdout['category'] == cat]\n", - " # Regress to obtain elasticity estimate\n", - " beta = np.mean(r_p * r_q) / np.mean(r_p * r_p)\n", - "\n", - " # standard error on elastiticy estimate\n", - " se = np.sqrt(np.mean( (r_p* r_q)**2)/(np.mean(r_p*r_p)**2)/holdout[\"ln_p\"].size)\n", - "\n", - " df = pd.DataFrame({'point': beta, 'se': se, 'lower': beta - 1.96 * se, 'upper': beta + 1.96 * se}, index=[0])\n", - " df['category'] = cat\n", - " df['N'] = holdout[holdout['category'] == cat].shape[0]\n", - " dfs.append(df)\n", - "\n", - "df = pd.concat(dfs)\n", - "df" - ], - "metadata": { - "id": "E3ncPtwt8nJi" - }, - "execution_count": null, - "outputs": [], - "id": "E3ncPtwt8nJi" - }, - { - "cell_type": "markdown", - "source": [ - "## Clustering Products\n", - "\n", - "In this final part of the notebook, we'll illustrate how the BERT text embeddings can be used to cluster products based on their descriptions.\n", - "\n", - "Intiuitively, our neural network has now learned which aspects of the text description are relevant to predict prices and market shares.\n", - "We can therefore use the embeddings produced by our network to cluster products, and we might expect that the clusters reflect market-relevant information.\n", - "\n", - "In the following block of cells, we compute embeddings using our learned models and cluster them using $k$-means clustering with $k=10$. Finally, we will explore how the estimated price elasticity differs across clusters.\n", - "\n", - "### Overview of **$k$-means clustering**\n", - "The $k$-means clustering algorithm seeks to divide $n$ data vectors into $k$ groups, each of which contain points that are \"close together.\"\n", - "\n", - "In particular, let $C_1, \\ldots, C_k$ be a partitioning of the data into $k$ disjoint, nonempty subsets (clusters), and define\n", - "$$\\bar{C_i}=\\frac{1}{\\#C_i}\\sum_{x \\in C_i} x$$\n", - "to be the *centroid* of the cluster $C_i$. 
The $k$-means clustering score $\\mathrm{sc}(C_1 \\ldots C_k)$ is defined to be\n", - "$$\\mathrm{sc}(C_1 \\ldots C_k) = \\sum_{i=1}^k \\sum_{x \\in C_i} \\left(x - \\bar{C_i}\\right)^2.$$\n", - "\n", - "The $k$-means clustering is then defined to be any partitioning $C^*_1 \\ldots C^*_k$ that minimizes the score $\\mathrm{sc}(-)$.\n" - ], - "metadata": { - "id": "QFTjf9vP6Zfu" - }, - "id": "QFTjf9vP6Zfu" - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Mc7I00JPK6wJ" - }, - "outputs": [], - "source": [ - "## STEP 1: Compute embeddings\n", - "\n", - "input_ids = Input(shape=(128,), dtype=tf.int32)\n", - "token_type_ids = Input(shape=(128,), dtype=tf.int32)\n", - "attention_mask = Input(shape=(128,), dtype=tf.int32)\n", - "\n", - "Y1 = bert(input_ids, token_type_ids, attention_mask)[1]\n", - "Y2 = bert2(input_ids, token_type_ids, attention_mask)[1]\n", - "Y = Concatenate()([Y1,Y2])\n", - "\n", - "embedding_model = Model([input_ids, token_type_ids, attention_mask], Y)\n", - "\n", - "embeddings = embedding_model.predict([tensors_holdout['input_ids'],\n", - " tensors_holdout['token_type_ids'],\n", - " tensors_holdout['attention_mask']])" - ], - "id": "Mc7I00JPK6wJ" - }, - { - "cell_type": "markdown", - "source": [ - "### Dimension reduction and the **Johnson-Lindenstrauss transform**\n", - "\n", - "Our learned embeddings have dimension in the $1000$s, and $k$-means clustering is often an expensive operation. To improve the situation, we will use a neat trick that is used extensively in machine learning applications: the *Johnson-Lindenstrauss transform*.\n", - "\n", - "This trick involves finding a low-dimensional linear projection of the embeddings that approximately preserves pairwise distances.\n", - "\n", - "In fact, Johnson and Lindenstrauss proved a much more interesting statement: a Gaussian random matrix will *almost always* approximately preserve pairwise distances.\n", - "\n" - ], - "metadata": { - "id": "hCG2MunU6iF-" - }, - "id": "hCG2MunU6iF-" - }, - { - "cell_type": "code", - "source": [ - "# STEP 2 Make low-dimensional projections\n", - "from sklearn.random_projection import GaussianRandomProjection\n", - "\n", - "jl = GaussianRandomProjection(eps=.25)\n", - "embeddings_lowdim = jl.fit_transform(embeddings)" - ], - "metadata": { - "id": "afGiLR7v6ecJ" - }, - "execution_count": null, - "outputs": [], - "id": "afGiLR7v6ecJ" - }, - { - "cell_type": "code", - "source": [ - "# STEP 3 Compute clusters\n", - "from sklearn.cluster import KMeans\n", - "\n", - "k_means = KMeans(n_clusters=10)\n", - "k_means.fit(embeddings_lowdim)\n", - "cluster_ids = k_means.labels_" - ], - "metadata": { - "id": "9Tl9AM3J6j3X" - }, - "execution_count": null, - "outputs": [], - "id": "9Tl9AM3J6j3X" - }, - { - "cell_type": "code", - "source": [ - "# STEP 4 Regress within each cluster\n", - "\n", - "betas = np.zeros(10)\n", - "ses = np.zeros(10)\n", - "\n", - "r_p = holdout[\"ln_p\"] - ln_p_hat_holdout.reshape((-1,))\n", - "r_q = holdout[\"ln_q\"] - ln_q_hat_holdout.reshape((-1,))\n", - "\n", - "for c in range(10):\n", - "\n", - " r_p_c = r_p[cluster_ids == c]\n", - " r_q_c = r_q[cluster_ids == c]\n", - "\n", - " # Regress to obtain elasticity estimate\n", - " betas[c] = np.mean(r_p_c * r_q_c) / np.mean(r_p_c * r_p_c)\n", - "\n", - " # standard error on elastiticy estimate\n", - " ses[c] = np.sqrt(np.mean( (r_p_c * r_q_c)**2)/(np.mean(r_p_c*r_p_c)**2)/r_p_c.size)" - ], - "metadata": { - "id": "0l7De-Do6mD0" - }, - "execution_count": null, - "outputs": [], - "id": 
"0l7De-Do6mD0" - }, - { - "cell_type": "code", - "source": [ - "# STEP 5 Plot\n", - "from matplotlib import pyplot as plt\n", - "\n", - "plt.bar(range(10), betas, yerr = 1.96 * ses)" - ], - "metadata": { - "id": "oXoe98f06njT" - }, - "execution_count": null, - "outputs": [], - "id": "oXoe98f06njT" - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.0.5" - }, - "papermill": { - "default_parameters": {}, - "duration": 427.936706, - "end_time": "2022-04-19T09:13:53.230849", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2022-04-19T09:06:45.294143", - "version": "2.3.4" - }, + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": { + "id": "dYwg9btt1wJH" + }, + "source": [ + "# BERT\n", + "\n", + "**Bidirectional Encoder Representations from Transformers.**\n", + "\n", + "_ | _\n", + "- | -\n", + "![alt](https://pytorch.org/assets/images/bert1.png) | ![alt](https://pytorch.org/assets/images/bert2.png)\n", + "\n", + "\n", + "### **Overview**\n", + "\n", + "BERT was released together with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin *et al.* The model is based on the Transformer architecture introduced in [Attention Is All You Need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani *et al.* and has led to significant improvements in a wide range of natural language tasks.\n", + "\n", + "At the highest level, BERT maps from a block of text to a numeric vector which summarizes the relevant information in the text.\n", + "\n", + "What is remarkable is that numeric summary is sufficiently informative that, for example, the numeric summary of a paragraph followed by a reading comprehension question contains all the information necessary to satisfactorily answer the question.\n", + "\n", + "#### **Transfer Learning**\n", + "\n", + "BERT is a great example of a paradigm called *transfer learning*, which has proved very effective in recent years. In the first step, a network is trained on an unsupervised task using massive amounts of data. In the case of BERT, it was trained to predict missing words and to detect when pairs of sentences are presented in reversed order using all of Wikipedia. This was initially done by Google, using intense computational resources.\n", + "\n", + "Once this network has been trained, it is then used to perform many other supervised tasks using only limited data and computational resources: for example, sentiment classification in tweets or quesiton answering. The network is re-trained to perform these other tasks in such a way that only the final, output parts of the network are allowed to adjust by very much, so that most of the \"information'' originally learned the network is preserved. This process is called *fine tuning*." + ] + }, + { + "cell_type": "markdown", + "id": "1", + "metadata": { + "id": "wNjNs3ViKTiO" + }, + "source": [ + "##Getting to know BERT\n", + "\n", + "BERT, and many of its variants, are made avialable to the public by the open source [Huggingface Transformers](https://huggingface.co/transformers/) project. 
This is an amazing resource, giving researchers and practitioners easy-to-use access to this technology.\n", + "\n", + "In order to use BERT for modeling, we simply need to download the pre-trained neural network and fine tune it on our dataset, which is illustrated below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9rooQWVdri1m", + "outputId": "e8d7baf0-1420-4405-d5d5-c63466fbbcdd" + }, + "outputs": [], + "source": [ + "install.packages(\"remotes\")\n", + "remotes::install_github(\"rstudio/tensorflow\")\n", + "install.packages(\"dplyr\")\n", + "install.packages(\"DBI\")\n", + "install.packages(\"ggplot2\")\n", + "install.packages(\"reticulate\")\n", + "install.packages(\"readr\")\n", + "install.packages(\"stringr\")\n", + "install.packages(\"tidyr\")\n", + "install.packages(\"purrr\")\n", + "install.packages(\"glmnet\")\n", + "install.packages(\"caret\")\n", + "install.packages(\"keras\")" + ] + }, + { + "cell_type": "markdown", + "id": "3", + "metadata": { + "id": "TgWpXdSIl5KL" + }, + "source": [ + "##Getting to know BERT\n", + "\n", + "BERT, and many of its variants, are made avialable to the public by the open source [Huggingface Transformers](https://huggingface.co/transformers/) project. This is an amazing resource, giving researchers and practitioners easy-to-use access to this technology.\n", + "\n", + "In order to use BERT for modeling, we simply need to download the pre-trained neural network and fine tune it on our dataset, which is illustrated below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4", + "metadata": { "colab": { - "provenance": [] - } + "base_uri": "https://localhost:8080/" + }, + "id": "ppJlcoIatlAw", + "outputId": "3a5cbcea-04c9-4c67-ccb1-1b45e691ccf8" + }, + "outputs": [], + "source": [ + "library(reticulate)\n", + "library(ggplot2)\n", + "library(DBI)\n", + "library(dplyr)\n", + "theme_set(theme_bw())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5", + "metadata": { + "id": "GmOhRKEG4jEy" + }, + "outputs": [], + "source": [ + "use_python(\"/usr/bin/python3\", required = TRUE) # Adjust the path as needed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6", + "metadata": { + "id": "bUEb1TDIs4TK" + }, + "outputs": [], + "source": [ + "py_run_string('\n", + "import tensorflow as tf\n", + "import numpy as np\n", + "import pandas as pd\n", + "from transformers import BertTokenizer, TFBertModel\n", + "import warnings\n", + "warnings.simplefilter(\"ignore\")\n", + "')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7", + "metadata": { + "id": "A7HTpjkA4u54" + }, + "outputs": [], + "source": [ + "# Check GPU availability\n", + "# py_run_string('\n", + "# device_name = tf.test.gpu_device_name()\n", + "# if device_name != \"/device:GPU:0\":\n", + "# raise SystemError(\"GPU device not found\")\n", + "# print(\"Found GPU at:\", device_name)\n", + "# ')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8", + "metadata": { + "id": "hZaltj7Fv5Gh" + }, + "outputs": [], + "source": [ + "ssq <- function(x) sum(x * x)\n", + "\n", + "get_r2 <- function(y, yhat) {\n", + " resids <- yhat - y\n", + " flucs <- y - mean(y)\n", + " rss <- ssq(resids)\n", + " tss <- ssq(flucs)\n", + " cat(sprintf(\"RSS: %f, TSS + MEAN^2: %f, TSS: %f, R^2: %f\", rss, tss + mean(y)^2, tss, 1 - rss/tss))\n", + "}" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "9", + "metadata": { + "id": "CB3ur5xF41o-" + }, + "outputs": [], + "source": [ + "py_run_string('\n", + "tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n", + "bert = TFBertModel.from_pretrained(\"bert-base-uncased\")\n", + "')" + ] + }, + { + "cell_type": "markdown", + "id": "10", + "metadata": { + "id": "26mRwUFwardQ" + }, + "source": [ + "### Tokenization\n", + "\n", + "The first step in using BERT (or any similar text embedding tool) is to *tokenize* the data. This step standardizes blocks of text, so that meaningless differences in text presentation don't affect the behavior of our algorithm.\n", + "\n", + "Typically the text is transformed into a sequence of 'tokens,' each of which corresponds to a numeric code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11", + "metadata": { + "id": "cER5mL4fMSCr" + }, + "outputs": [], + "source": [ + "py_run_string('\n", + "s = \"What happens to this string?\"\n", + "tensors = tokenizer.encode_plus(s, add_special_tokens = True, return_tensors = \"tf\")\n", + "output = bert(tensors)\n", + "')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 329 + }, + "id": "KVETer7w5euE", + "outputId": "3b7b87f7-69f4-4727-bccf-3bd8d6162e00" + }, + "outputs": [], + "source": [ + "# Let's try it out!\n", + "s <- \"What happens to this string?\"\n", + "py_run_string\n", + "input_ids <- py$tensors$input_ids\n", + "attention_mask <- py$tensors$attention_mask\n", + "token_type_ids <- py$tensors$token_type_ids\n", + "\n", + "print(sprintf('Original String: \"%s\"', s))\n", + "print(\"Numeric encoding: \")\n", + "print(list(\n", + " input_ids = input_ids,\n", + " attention_mask = attention_mask,\n", + " token_type_ids = token_type_ids\n", + "))\n", + "# What does this mean?\n", + "py_run_string('tokens = tokenizer.convert_ids_to_tokens(tensors[\"input_ids\"].numpy().flatten().tolist())')\n", + "tokens <- py$tokens\n", + "print(\"Actual tokens:\")\n", + "print(tokens)" + ] + }, + { + "cell_type": "markdown", + "id": "13", + "metadata": { + "id": "JJaz6eEocefa" + }, + "source": [ + "### BERT in a nutshell\n", + "\n", + "Once we have our numeric tokens, we can simply plug them into the BERT network and get a numeric vector summary. Note that in applications, the BERT summary will be \"fine tuned\" to a particular task, which hasn't happened yet." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Q1ODAgBMa3Zg", + "outputId": "7ccd4481-6489-4830-c86b-a3c40cb1c37c" + }, + "outputs": [], + "source": [ + "# Load the reticulate library\n", + "library(reticulate)\n", + "\n", + "input_text <- \"What happens to this string?\"\n", + "\n", + "\n", + "cat(sprintf(\"Input: \\\"%s\\\"\\n\\n\", input_text))\n", + "\n", + "py_run_string(sprintf('\n", + "tensors_tf = tokenizer(\"%s\", return_tensors=\"tf\")\n", + "output = bert(tensors_tf)\n", + "', input_text))\n", + "\n", + "output <- py$output\n", + "\n", + "py_run_string('\n", + "from pprint import pformat\n", + "output_type = str(type(output[\"pooler_output\"]))\n", + "output_shape = output[\"pooler_output\"].shape\n", + "output_preview = pformat(output[\"pooler_output\"].numpy())\n", + "')\n", + "\n", + "output_type <- py$output_type\n", + "output_shape <- py$output_shape\n", + "output_preview <- py$output_preview\n", + "\n", + "cat(sprintf(\n", + "\"Output type: %s\\n\\nOutput shape: %s\\n\\nOutput preview: %s\\n\",\n", + "output_type,\n", + "paste(output_shape, collapse=\", \"),\n", + "output_preview\n", + "))\n" + ] + }, + { + "cell_type": "markdown", + "id": "15", + "metadata": { + "id": "y_CnEClsl_1p" + }, + "source": [ + "# A practical introduction to BERT\n", + "\n", + "In the next part of the notebook, we are going to explore how a tool like BERT may be useful for causal inference.\n", + "\n", + "In particular, we are going to apply BERT to a subset of data from the Amazon marketplace consisting of roughly 10,000 listings for products in the toy category. Each product comes with a text description, a price, and a number of times reviewed (which we'll use as a proxy for demand / market share).\n", + "\n", + "For more information on the dataset, checkout the [Dataset README](https://github.com/CausalAIBook/MetricsMLNotebooks/blob/main/data/amazon_toys.md).\n", + "\n", + "**For thought**:\n", + "What are some issues you may anticipate when using number of reviews as a proxy for demand or market share?\n", + "\n", + "### Getting to know the data\n", + "\n", + "First, we'll download and clean up the data, and do some preliminary inspection." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16", + "metadata": { + "id": "_d5eA3xyzdtb" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 247 + }, + "id": "5kzXygwH0BKw", + "outputId": "f368dd28-317f-4815-d05b-a0c2ce7f05d6" + }, + "outputs": [], + "source": [ + "library(readr)\n", + "library(stringr)\n", + "library(tidyr)\n", + "library(purrr)\n", + "data_url <- \"https://github.com/CausalAIBook/MetricsMLNotebooks/raw/main/data/amazon_toys.csv\"\n", + "data <- read_csv(data_url, show_col_types = FALSE)\n", + "problems(data)\n", + "\n", + "data <- data %>%\n", + " mutate(\n", + " number_of_reviews = as.numeric(str_replace_all(number_of_reviews, \",\", \"\"))\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1Su5vOGhD3Df", + "outputId": "577bbbd9-1a7d-4f7e-d9bd-decd46830dc0" + }, + "outputs": [], + "source": [ + "data <- data %>%\n", + " mutate(\n", + " number_of_reviews = as.numeric(str_replace_all(number_of_reviews, \"\\\\D+\", \"\")),\n", + " price = as.numeric(str_extract(price, \"\\\\d+\\\\.?\\\\d*\"))\n", + " ) %>%\n", + " filter(number_of_reviews > 0) %>%\n", + " mutate(\n", + " ln_p = log(price),\n", + " ln_q = log(number_of_reviews / sum(number_of_reviews)),\n", + " text = str_c(product_name, manufacturer, product_description, sep = \" | \")\n", + " ) %>%\n", + " select(text, ln_p, ln_q, amazon_category_and_sub_category) %>%\n", + " drop_na()\n", + "print(head(data))\n", + "data$text_num_words <- str_split(data$text, \"\\\\s+\") %>% map_int(length)\n", + "print(quantile(data$text_num_words, 0.99, na.rm = TRUE))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 437 + }, + "id": "lovFEHaWp4lC", + "outputId": "2c191aaa-d7ed-4b62-ef28-c513fabfab05" + }, + "outputs": [], + "source": [ + "ggplot(data, aes(x = text_num_words)) +\n", + " geom_density() +\n", + " labs(title = \"Density Plot of Text Lengths in Words\")" + ] + }, + { + "cell_type": "markdown", + "id": "20", + "metadata": { + "id": "CDlHPQZfcv7I" + }, + "source": [ + "Let's make a two-way scatter plot of prices and (proxied) market shares." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 454 + }, + "id": "dNujhir1q_0N", + "outputId": "06ba2da1-4ffc-4dc4-d762-f0bd6cdc039c" + }, + "outputs": [], + "source": [ + "p1 <- ggplot(data, aes(x = ln_p, y = ln_q)) +\n", + " geom_point() +\n", + " geom_smooth(method = \"lm\", color = \"red\") +\n", + " labs(title = \"Scatter Plot with Regression Line\")\n", + "print(p1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 454 + }, + "id": "t7axTjsnq8qI", + "outputId": "84a6aa7b-41cd-43f6-e371-2c46d15c86eb" + }, + "outputs": [], + "source": [ + "p2 <- ggplot(data, aes(x = ln_p, y = ln_q)) +\n", + " geom_smooth(method = \"lm\", color = \"red\") +\n", + " labs(title = \"Regression Line Only\")\n", + "print(p2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aZPOUT5SvR4o", + "outputId": "c5186cae-c1da-4c35-91b0-08f0b9986919" + }, + "outputs": [], + "source": [ + "model <- lm(ln_q ~ ln_p, data = data)\n", + "elasticity <- coef(model)[\"ln_p\"]\n", + "se <- summary(model)$coefficients[\"ln_p\", \"Std. Error\"]\n", + "r_squared_adj <- summary(model)$adj.r.squared\n", + "cat(sprintf(\"Elasticity: %f, SE: %f, R2: %f\\n\\n\", elasticity, se, r_squared_adj))\n", + "conf_intervals <- confint(model, c(\"(Intercept)\", \"ln_p\"), level = 0.95)\n", + "print(conf_intervals)" + ] + }, + { + "cell_type": "markdown", + "id": "24", + "metadata": { + "id": "cr2Ha215l4QK" + }, + "source": [ + "Let's begin with a simple prediction task. We will discover how well can we explain the price of these products using their textual descriptions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pGt-G-qpciGd", + "outputId": "dfdf62d9-66b7-4787-b00c-b019f9064cf4" + }, + "outputs": [], + "source": [ + "install.packages(\"caTools\")\n", + "install.packages(\"base\")\n", + "library(caTools)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26", + "metadata": { + "id": "EPX_-K9CPpuO" + }, + "outputs": [], + "source": [ + "library(caTools)\n", + "set.seed(124)\n", + "split <- sample.split(Y = data$ln_p, SplitRatio = 0.8)\n", + "train_main <- data[split, ]\n", + "holdout <- data[!split, ]\n", + "split_main <- sample.split(Y = train_main$ln_p, SplitRatio = 0.75)\n", + "train <- train_main[split_main, ]\n", + "val <- train_main[!split_main, ]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27", + "metadata": { + "id": "IGFzO9sZPvAJ" + }, + "outputs": [], + "source": [ + "library(reticulate)\n", + "use_python(\"/usr/bin/python3\", required = TRUE)\n", + "py_run_string('import tensorflow as tf')\n", + "\n", + "py$train_texts <- train$text\n", + "train_tensors <- py_run_string(\"\n", + "tensors = tokenizer(\n", + " list(train_texts),\n", + " padding=True,\n", + " truncation=True,\n", + " max_length=128,\n", + " return_tensors='tf'\n", + ")\")\n", + "train_tensors <- py$tensors\n", + "\n", + "py$val_texts <- val$text\n", + "val_tensors <- py_run_string(\"\n", + "val_tensors = tokenizer(\n", + " list(val_texts),\n", + " padding=True,\n", + " truncation=True,\n", + " max_length=128,\n", + " return_tensors='tf'\n", + ")\")\n", + "val_tensors <- py$val_tensors\n", + "\n", + "py$holdout_texts <- holdout$text\n", + "tensors_holdout <- py_run_string(\"\n", + "tensors_holdout = tokenizer(\n", + " list(holdout_texts),\n", + " padding=True,\n", + " truncation=True,\n", + " max_length=128,\n", + " return_tensors='tf'\n", + ")\")\n", + "tensors_holdout <- py$tensors_holdout\n", + "ln_p <- train$ln_p\n", + "ln_q <- train$ln_q\n", + "val_ln_p <- val$ln_p\n", + "val_ln_q <- val$ln_q" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28", + "metadata": { + "id": "XQ4DMJQ0drZm" + }, + "outputs": [], + "source": [ + "ln_p <- train$ln_p\n", + "ln_q <- train$ln_q\n", + "val_ln_p <- val$ln_p\n", + "val_ln_q <- val$ln_q" + ] + }, + { + "cell_type": "markdown", + "id": "29", + "metadata": { + "id": "gPYEMuKZ7ylj" + }, + "source": [ + "# Using BERT as Feature Extractor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30", + "metadata": { + "id": "ytZhy46hdDdr" + }, + "outputs": [], + "source": [ + "library(reticulate)\n", + "#Sys.setenv(RETICULATE_PYTHON = \"/usr/bin/python\")\n", + "library(keras)\n", + "#install_keras()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 835 + }, + "id": "TsqfQ3OH-HR3", + "outputId": "a4107a7d-ce9e-4d09-a63e-1bab3174bf76" + }, + "outputs": [], + "source": [ + "library(caTools)\n", + "library(dplyr)\n", + "library(readr)\n", + "library(reticulate)\n", + "library(keras)\n", + "library(caret)\n", + "library(glmnet)\n", + "library(stringr)\n", + "\n", + "use_python(\"/usr/bin/python3\", required = TRUE)\n", + "py_run_string('import tensorflow as tf')\n", + "py_run_string('from transformers import BertTokenizer, TFBertModel')\n", + "py_run_string('\n", + "tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n", + 
"bert_model = TFBertModel.from_pretrained(\"bert-base-uncased\")\n", + "')\n", + "\n", + "py$train_texts <- train$text\n", + "train_tensors <- py_run_string(\"\n", + "tensors = tokenizer(\n", + " list(train_texts),\n", + " padding=True,\n", + " truncation=True,\n", + " max_length=128,\n", + " return_tensors='tf'\n", + ")\")\n", + "train_tensors <- py$tensors\n", + "\n", + "py$val_texts <- val$text\n", + "val_tensors <- py_run_string(\"\n", + "val_tensors = tokenizer(\n", + " list(val_texts),\n", + " padding=True,\n", + " truncation=True,\n", + " max_length=128,\n", + " return_tensors='tf'\n", + ")\")\n", + "val_tensors <- py$val_tensors\n", + "\n", + "py$holdout_texts <- holdout$text\n", + "tensors_holdout <- py_run_string(\"\n", + "tensors_holdout = tokenizer(\n", + " list(holdout_texts),\n", + " padding=True,\n", + " truncation=True,\n", + " max_length=128,\n", + " return_tensors='tf'\n", + ")\")\n", + "tensors_holdout <- py$tensors_holdout\n", + "\n", + "ln_p <- train$ln_p\n", + "val_ln_p <- val$ln_p\n", + "holdout_ln_p <- holdout$ln_p\n", + "\n", + "py_run_string('\n", + "import tensorflow as tf\n", + "from transformers import TFBertModel\n", + "\n", + "# Define the input layers\n", + "input_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name=\"input_ids\")\n", + "token_type_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name=\"token_type_ids\")\n", + "attention_mask = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name=\"attention_mask\")\n", + "\n", + "# Load the pre-trained BERT model\n", + "bert_model = TFBertModel.from_pretrained(\"bert-base-uncased\")\n", + "outputs = bert_model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)\n", + "\n", + "# Define the embedding model\n", + "embedding_model = tf.keras.models.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=outputs.last_hidden_state[:, 0, :])\n", + "')\n", + "\n", + "py_run_string('\n", + "import numpy as np\n", + "embeddings = embedding_model.predict({\n", + " \"input_ids\": tf.convert_to_tensor(tensors[\"input_ids\"]),\n", + " \"token_type_ids\": tf.convert_to_tensor(tensors[\"token_type_ids\"]),\n", + " \"attention_mask\": tf.convert_to_tensor(tensors[\"attention_mask\"])\n", + "})\n", + "')\n", + "\n", + "embeddings <- py$embeddings\n", + "\n", + "py$ln_p <- ln_p\n", + "py_run_string('\n", + "from sklearn.linear_model import LassoCV\n", + "from sklearn.model_selection import KFold\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.pipeline import make_pipeline\n", + "\n", + "lcv = make_pipeline(StandardScaler(), LassoCV(cv=KFold(n_splits=5, shuffle=True, random_state=123), random_state=123))\n", + "lcv.fit(embeddings, ln_p)\n", + "')\n", + "\n", + "py_run_string('\n", + "embeddings_val = embedding_model.predict({\n", + " \"input_ids\": tf.convert_to_tensor(val_tensors[\"input_ids\"]),\n", + " \"token_type_ids\": tf.convert_to_tensor(val_tensors[\"token_type_ids\"]),\n", + " \"attention_mask\": tf.convert_to_tensor(val_tensors[\"attention_mask\"])\n", + "})\n", + "val_predictions = lcv.predict(embeddings_val)\n", + "')\n", + "\n", + "val_predictions <- py$val_predictions\n", + "\n", + "r2_val <- caret::R2(val_predictions, val_ln_p)\n", + "\n", + "py_run_string('\n", + "embeddings_holdout = embedding_model.predict({\n", + " \"input_ids\": tf.convert_to_tensor(tensors_holdout[\"input_ids\"]),\n", + " \"token_type_ids\": tf.convert_to_tensor(tensors_holdout[\"token_type_ids\"]),\n", + " \"attention_mask\": 
tf.convert_to_tensor(tensors_holdout[\"attention_mask\"])\n", + "})\n", + "holdout_predictions = lcv.predict(embeddings_holdout)\n", + "')\n", + "\n", + "holdout_predictions <- py$holdout_predictions\n", + "\n", + "r2_holdout <- caret::R2(holdout_predictions, holdout_ln_p)\n", + "\n", + "print(r2_val)\n", + "print(r2_holdout)\n", + "ln_p_hat_holdout <- holdout_predictions" + ] + }, + { + "cell_type": "markdown", + "id": "32", + "metadata": { + "id": "mOc1_C5p7ta7" + }, + "source": [ + "# Linear Probing: Training Only Final Layer after BERT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33", + "metadata": { + "id": "Ck1xqRIrmx8I" + }, + "outputs": [], + "source": [ + "### Now let's prepare our model\n", + "\n", + "from tensorflow.keras import Model, Input\n", + "from tensorflow.keras.layers import Dense, Dropout, Concatenate\n", + "import tensorflow_addons as tfa\n", + "from tensorflow.keras import regularizers\n", + "\n", + "tf.keras.utils.set_random_seed(123)\n", + "\n", + "input_ids = Input(shape=(128,), dtype=tf.int32)\n", + "token_type_ids = Input(shape=(128,), dtype=tf.int32)\n", + "attention_mask = Input(shape=(128,), dtype=tf.int32)\n", + "\n", + "# # First we compute the text embedding\n", + "Z = bert(input_ids, token_type_ids, attention_mask)\n", + "\n", + "for layer in bert.layers:\n", + " layer.trainable=False\n", + " for w in layer.weights: w._trainable=False\n", + "\n", + "# # We want the \"pooled / summary\" embedding, not individual word embeddings\n", + "Z = Z[1]\n", + "\n", + "# # Then we do a regular regression\n", + "# Z = Dropout(0.2)(Z)\n", + "ln_p_hat = Dense(1, activation='linear',\n", + " kernel_regularizer=regularizers.L2(1e-3))(Z)\n", + "\n", + "PricePredictionNetwork = Model([\n", + " input_ids,\n", + " token_type_ids,\n", + " attention_mask,\n", + " ], ln_p_hat)\n", + "PricePredictionNetwork.compile(\n", + " optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),\n", + " loss=tf.keras.losses.MeanSquaredError(),\n", + " metrics=tfa.metrics.RSquare(),\n", + ")\n", + "PricePredictionNetwork.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34", + "metadata": { + "id": "XhTREb3NcZhH" + }, + "outputs": [], + "source": [ + "from livelossplot import PlotLossesKeras\n", + "\n", + "tf.keras.utils.set_random_seed(123)\n", + "earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)\n", + "modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(\"/content/gdrive/MyDrive/pweights.hdf5\", monitor='val_loss', save_best_only=True, save_weights_only=True)\n", + "\n", + "PricePredictionNetwork.fit(\n", + " x= [tensors['input_ids'],\n", + " tensors['token_type_ids'],\n", + " tensors['attention_mask'],],\n", + " y=ln_p,\n", + " validation_data = (\n", + " [val_tensors['input_ids'],\n", + " val_tensors['token_type_ids'],\n", + " val_tensors['attention_mask']], val_ln_p\n", + " ),\n", + " epochs=5,\n", + " callbacks = [earlystopping, modelcheckpoint,\n", + " PlotLossesKeras(groups = {'train_loss': ['loss'], 'train_rsq':['r_square'], 'val_loss': ['val_loss'], 'val_rsq': ['val_r_square']})],\n", + " batch_size=16,\n", + " shuffle=True)" + ] + }, + { + "cell_type": "markdown", + "id": "35", + "metadata": { + "id": "MyFxR5GC8C3K" + }, + "source": [ + "# Fine Tuning starting from the Linear Probing Trained Weights\n", + "\n", + "Now we train the whole network, initializing the weights based on the result of the linear probing phase in the previous section." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36", + "metadata": { + "id": "NzWCkTY87luH" + }, + "outputs": [], + "source": [ + "### Now let's prepare our model\n", + "\n", + "from tensorflow.keras import Model, Input\n", + "from tensorflow.keras.layers import Dense, Dropout, Concatenate\n", + "import tensorflow_addons as tfa\n", + "from tensorflow.keras import regularizers\n", + "\n", + "tf.keras.utils.set_random_seed(123)\n", + "\n", + "input_ids = Input(shape=(128,), dtype=tf.int32)\n", + "token_type_ids = Input(shape=(128,), dtype=tf.int32)\n", + "attention_mask = Input(shape=(128,), dtype=tf.int32)\n", + "\n", + "# # First we compute the text embedding\n", + "Z = bert(input_ids, token_type_ids, attention_mask)\n", + "\n", + "for layer in bert.layers:\n", + " layer.trainable=True\n", + " for w in layer.weights: w._trainable=True\n", + "\n", + "# # We want the \"pooled / summary\" embedding, not individual word embeddings\n", + "Z = Z[1]\n", + "\n", + "# # Then we do a regularized linear regression\n", + "ln_p_hat = Dense(1, activation='linear',\n", + " kernel_regularizer=regularizers.L2(1e-3))(Z)\n", + "\n", + "PricePredictionNetwork = Model([\n", + " input_ids,\n", + " token_type_ids,\n", + " attention_mask,\n", + " ], ln_p_hat)\n", + "PricePredictionNetwork.compile(\n", + " optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),\n", + " loss=tf.keras.losses.MeanSquaredError(),\n", + " metrics=tfa.metrics.RSquare(),\n", + ")\n", + "PricePredictionNetwork.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37", + "metadata": { + "id": "PWauCl0T7nUo" + }, + "outputs": [], + "source": [ + "PricePredictionNetwork.load_weights(\"/content/gdrive/MyDrive/pweights.hdf5\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38", + "metadata": { + "id": "iDSBZWAe8nhE" + }, + "outputs": [], + "source": [ + "from livelossplot import PlotLossesKeras\n", + "\n", + "tf.keras.utils.set_random_seed(123)\n", + "\n", + "earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)\n", + "modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(\"/content/gdrive/MyDrive/pweights.hdf5\", monitor='val_loss', save_best_only=True, save_weights_only=True)\n", + "\n", + "PricePredictionNetwork.fit(\n", + " x= [tensors['input_ids'],\n", + " tensors['token_type_ids'],\n", + " tensors['attention_mask'],],\n", + " y=ln_p,\n", + " validation_data = (\n", + " [val_tensors['input_ids'],\n", + " val_tensors['token_type_ids'],\n", + " val_tensors['attention_mask']], val_ln_p\n", + " ),\n", + " epochs=10,\n", + " callbacks = [earlystopping, modelcheckpoint,\n", + " PlotLossesKeras(groups = {'train_loss': ['loss'], 'train_rsq':['r_square'], 'val_loss': ['val_loss'], 'val_rsq': ['val_r_square']})],\n", + " batch_size=16,\n", + " shuffle=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39", + "metadata": { + "id": "wchpbXoqBAJu" + }, + "outputs": [], + "source": [ + "PricePredictionNetwork.load_weights(\"/content/gdrive/MyDrive/pweights.hdf5\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40", + "metadata": { + "id": "jpUmDHYfkJEZ" + }, + "outputs": [], + "source": [ + "# Compute predictions\n", + "ln_p_hat_holdout = PricePredictionNetwork.predict([\n", + " tensors_holdout['input_ids'],\n", + " tensors_holdout['token_type_ids'],\n", + " tensors_holdout['attention_mask'],\n", + " ])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "41", + "metadata": { + "id": "g_XK81hpkQMN" + }, + "outputs": [], + "source": [ + "print('Neural Net R^2, Price Prediction:')\n", + "get_r2(holdout['ln_p'], ln_p_hat_holdout)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42", + "metadata": { + "id": "GR4QP4DJPQk0" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "plt.hist(ln_p_hat_holdout)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43", + "metadata": { + "id": "RAGwE4peL1Me" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "44", + "metadata": { + "id": "bafy9ftcoBed" + }, + "source": [ + "Now, let's go one step further and construct a DML estimator of the average price elasticity. In particular, we will model market share $q_i$ as\n", + "$$\\ln q_i = \\alpha + \\beta \\ln p_i + \\psi(d_i) + \\epsilon_i,$$ where $d_i$ denotes the description of product $i$ and $\\psi$ is the composition of text embedding and a linear layer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45", + "metadata": { + "id": "Qiteu6FaoctV" + }, + "outputs": [], + "source": [ + "## Build the quantity prediction network\n", + "\n", + "tf.keras.utils.set_random_seed(123)\n", + "\n", + "# Initialize new BERT model from original\n", + "bert2 = TFBertModel.from_pretrained(\"bert-base-uncased\")\n", + "\n", + "# for layer in bert2.layers:\n", + "# layer.trainable=False\n", + "# for w in layer.weights: w._trainable=False\n", + "\n", + "# Define inputs\n", + "input_ids = Input(shape=(128,), dtype=tf.int32)\n", + "token_type_ids = Input(shape=(128,), dtype=tf.int32)\n", + "attention_mask = Input(shape=(128,), dtype=tf.int32)\n", + "\n", + "# First we compute the text embedding\n", + "Z = bert2(input_ids, token_type_ids, attention_mask)\n", + "\n", + "# We want the \"pooled / summary\" embedding, not individual word embeddings\n", + "Z = Z[1]\n", + "\n", + "ln_q_hat = Dense(1, activation='linear', kernel_regularizer=regularizers.L2(1e-3))(Z)\n", + "\n", + "# Compile model and optimization routine\n", + "QuantityPredictionNetwork = Model([\n", + " input_ids,\n", + " token_type_ids,\n", + " attention_mask,\n", + " ], ln_q_hat)\n", + "QuantityPredictionNetwork.compile(\n", + " optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),\n", + " loss=tf.keras.losses.MeanSquaredError(),\n", + " metrics=tfa.metrics.RSquare(),\n", + ")\n", + "QuantityPredictionNetwork.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46", + "metadata": { + "id": "aaxHV0gGMqpw" + }, + "outputs": [], + "source": [ + "## Fit the quantity prediction network in the main sample\n", + "tf.keras.utils.set_random_seed(123)\n", + "\n", + "earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)\n", + "modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(\"/content/gdrive/MyDrive/qweights.hdf5\", monitor='val_loss', save_best_only=True, save_weights_only=True)\n", + "\n", + "QuantityPredictionNetwork.fit(\n", + " [\n", + " tensors['input_ids'],\n", + " tensors['token_type_ids'],\n", + " tensors['attention_mask'],\n", + " ],\n", + " ln_q,\n", + " validation_data = (\n", + " [val_tensors['input_ids'],\n", + " val_tensors['token_type_ids'],\n", + " val_tensors['attention_mask']], val_ln_q\n", + " ),\n", + " epochs=10,\n", + " callbacks = [earlystopping, modelcheckpoint,\n", + " PlotLossesKeras(groups = {'train_loss': ['loss'], 'train_rsq':['r_square'], 'val_loss': 
['val_loss'], 'val_rsq': ['val_r_square']})],\n", + " batch_size=16,\n", + " shuffle=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47", + "metadata": { + "id": "TfyQV3lw-xf2" + }, + "outputs": [], + "source": [ + "QuantityPredictionNetwork.load_weights(\"/content/gdrive/MyDrive/qweights.hdf5\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48", + "metadata": { + "id": "YADpNj0jMygZ" + }, + "outputs": [], + "source": [ + "## Predict in the holdout sample, residualize and regress\n", + "\n", + "ln_q_hat_holdout = QuantityPredictionNetwork.predict([\n", + " tensors_holdout['input_ids'],\n", + " tensors_holdout['token_type_ids'],\n", + " tensors_holdout['attention_mask'],\n", + " ])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49", + "metadata": { + "id": "jh4criU1hGIP" + }, + "outputs": [], + "source": [ + "print('Neural Net R^2, Quantity Prediction:')\n", + "get_r2(holdout['ln_q'], ln_q_hat_holdout)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50", + "metadata": { + "id": "ir-_yAfkPM6f" + }, + "outputs": [], + "source": [ + "# Compute residuals\n", + "r_p = holdout[\"ln_p\"] - ln_p_hat_holdout.reshape((-1,))\n", + "r_q = holdout[\"ln_q\"] - ln_q_hat_holdout.reshape((-1,))\n", + "\n", + "# Regress to obtain elasticity estimate\n", + "beta = np.mean(r_p * r_q) / np.mean(r_p * r_p)\n", + "\n", + "# standard error on elastiticy estimate\n", + "se = np.sqrt(np.mean( (r_p* r_q)**2)/(np.mean(r_p*r_p)**2)/holdout[\"ln_p\"].size)\n", + "\n", + "print('Elasticity of Demand with Respect to Price: {}'.format(beta))\n", + "print('Standard Error: {}'.format(se))" + ] + }, + { + "cell_type": "markdown", + "id": "51", + "metadata": { + "id": "VCqeRTB_BNEH" + }, + "source": [ + "# Heterogeneous Elasticities within Major Product Categories\n", + "\n", + "We now look at the major product categories that have many products and we investigate whether the \"within group\" price elasticities" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52", + "metadata": { + "id": "XRUZEXqc8HPG" + }, + "outputs": [], + "source": [ + "holdout['category'] = holdout['amazon_category_and_sub_category'].str.split('>').apply(lambda x: x[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53", + "metadata": { + "id": "ymWJv4Ej7lt9" + }, + "outputs": [], + "source": [ + "# Elasticity within the main product categories\n", + "sql.run(\"\"\"\n", + " SELECT category, COUNT(*)\n", + " FROM holdout\n", + " GROUP BY 1\n", + " HAVING COUNT(*)>=100\n", + " ORDER BY 2 desc\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54", + "metadata": { + "id": "E3ncPtwt8nJi" + }, + "outputs": [], + "source": [ + "main_cats = sql.run(\"\"\"\n", + " SELECT category\n", + " FROM holdout\n", + " GROUP BY 1\n", + " HAVING COUNT(*)>=100\n", + "\"\"\")['category']\n", + "\n", + "dfs = []\n", + "for cat in main_cats:\n", + " r_p = holdout[holdout['category'] == cat][\"ln_p\"] - ln_p_hat_holdout.reshape((-1,))[holdout['category'] == cat]\n", + " r_q = holdout[holdout['category'] == cat][\"ln_q\"] - ln_q_hat_holdout.reshape((-1,))[holdout['category'] == cat]\n", + " # Regress to obtain elasticity estimate\n", + " beta = np.mean(r_p * r_q) / np.mean(r_p * r_p)\n", + "\n", + " # standard error on elastiticy estimate\n", + " se = np.sqrt(np.mean( (r_p* r_q)**2)/(np.mean(r_p*r_p)**2)/holdout[\"ln_p\"].size)\n", + "\n", + " df = pd.DataFrame({'point': beta, 'se': se, 
'lower': beta - 1.96 * se, 'upper': beta + 1.96 * se}, index=[0])\n", + " df['category'] = cat\n", + " df['N'] = holdout[holdout['category'] == cat].shape[0]\n", + " dfs.append(df)\n", + "\n", + "df = pd.concat(dfs)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "55", + "metadata": { + "id": "QFTjf9vP6Zfu" + }, + "source": [ + "## Clustering Products\n", + "\n", + "In this final part of the notebook, we'll illustrate how the BERT text embeddings can be used to cluster products based on their descriptions.\n", + "\n", + "Intiuitively, our neural network has now learned which aspects of the text description are relevant to predict prices and market shares.\n", + "We can therefore use the embeddings produced by our network to cluster products, and we might expect that the clusters reflect market-relevant information.\n", + "\n", + "In the following block of cells, we compute embeddings using our learned models and cluster them using $k$-means clustering with $k=10$. Finally, we will explore how the estimated price elasticity differs across clusters.\n", + "\n", + "### Overview of **$k$-means clustering**\n", + "The $k$-means clustering algorithm seeks to divide $n$ data vectors into $k$ groups, each of which contain points that are \"close together.\"\n", + "\n", + "In particular, let $C_1, \\ldots, C_k$ be a partitioning of the data into $k$ disjoint, nonempty subsets (clusters), and define\n", + "$$\\bar{C_i}=\\frac{1}{\\#C_i}\\sum_{x \\in C_i} x$$\n", + "to be the *centroid* of the cluster $C_i$. The $k$-means clustering score $\\mathrm{sc}(C_1 \\ldots C_k)$ is defined to be\n", + "$$\\mathrm{sc}(C_1 \\ldots C_k) = \\sum_{i=1}^k \\sum_{x \\in C_i} \\left(x - \\bar{C_i}\\right)^2.$$\n", + "\n", + "The $k$-means clustering is then defined to be any partitioning $C^*_1 \\ldots C^*_k$ that minimizes the score $\\mathrm{sc}(-)$.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56", + "metadata": { + "id": "Mc7I00JPK6wJ" + }, + "outputs": [], + "source": [ + "## STEP 1: Compute embeddings\n", + "\n", + "input_ids = Input(shape=(128,), dtype=tf.int32)\n", + "token_type_ids = Input(shape=(128,), dtype=tf.int32)\n", + "attention_mask = Input(shape=(128,), dtype=tf.int32)\n", + "\n", + "Y1 = bert(input_ids, token_type_ids, attention_mask)[1]\n", + "Y2 = bert2(input_ids, token_type_ids, attention_mask)[1]\n", + "Y = Concatenate()([Y1,Y2])\n", + "\n", + "embedding_model = Model([input_ids, token_type_ids, attention_mask], Y)\n", + "\n", + "embeddings = embedding_model.predict([tensors_holdout['input_ids'],\n", + " tensors_holdout['token_type_ids'],\n", + " tensors_holdout['attention_mask']])" + ] + }, + { + "cell_type": "markdown", + "id": "57", + "metadata": { + "id": "hCG2MunU6iF-" + }, + "source": [ + "### Dimension reduction and the **Johnson-Lindenstrauss transform**\n", + "\n", + "Our learned embeddings have dimension in the $1000$s, and $k$-means clustering is often an expensive operation. 
To improve the situation, we will use a neat trick that is used extensively in machine learning applications: the *Johnson-Lindenstrauss transform*.\n", + "\n", + "This trick involves finding a low-dimensional linear projection of the embeddings that approximately preserves pairwise distances.\n", + "\n", + "In fact, Johnson and Lindenstrauss proved a much more interesting statement: a Gaussian random matrix will *almost always* approximately preserve pairwise distances.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58", + "metadata": { + "id": "afGiLR7v6ecJ" + }, + "outputs": [], + "source": [ + "# STEP 2 Make low-dimensional projections\n", + "from sklearn.random_projection import GaussianRandomProjection\n", + "\n", + "jl = GaussianRandomProjection(eps=.25)\n", + "embeddings_lowdim = jl.fit_transform(embeddings)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59", + "metadata": { + "id": "9Tl9AM3J6j3X" + }, + "outputs": [], + "source": [ + "# STEP 3 Compute clusters\n", + "from sklearn.cluster import KMeans\n", + "\n", + "k_means = KMeans(n_clusters=10)\n", + "k_means.fit(embeddings_lowdim)\n", + "cluster_ids = k_means.labels_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60", + "metadata": { + "id": "0l7De-Do6mD0" + }, + "outputs": [], + "source": [ + "# STEP 4 Regress within each cluster\n", + "\n", + "betas = np.zeros(10)\n", + "ses = np.zeros(10)\n", + "\n", + "r_p = holdout[\"ln_p\"] - ln_p_hat_holdout.reshape((-1,))\n", + "r_q = holdout[\"ln_q\"] - ln_q_hat_holdout.reshape((-1,))\n", + "\n", + "for c in range(10):\n", + "\n", + " r_p_c = r_p[cluster_ids == c]\n", + " r_q_c = r_q[cluster_ids == c]\n", + "\n", + " # Regress to obtain elasticity estimate\n", + " betas[c] = np.mean(r_p_c * r_q_c) / np.mean(r_p_c * r_p_c)\n", + "\n", + " # standard error on elastiticy estimate\n", + " ses[c] = np.sqrt(np.mean( (r_p_c * r_q_c)**2)/(np.mean(r_p_c*r_p_c)**2)/r_p_c.size)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61", + "metadata": { + "id": "oXoe98f06njT" + }, + "outputs": [], + "source": [ + "# STEP 5 Plot\n", + "from matplotlib import pyplot as plt\n", + "\n", + "plt.bar(range(10), betas, yerr = 1.96 * ses)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.0.5" }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "papermill": { + "default_parameters": {}, + "duration": 427.936706, + "end_time": "2022-04-19T09:13:53.230849", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2022-04-19T09:06:45.294143", + "version": "2.3.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 8606b8ca049ec0642421477452bc4ea744827a56 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 06:02:30 -0700 Subject: [PATCH 068/261] Update r_convergence_hypothesis_double_lasso.irnb --- ...r_convergence_hypothesis_double_lasso.irnb | 41 +++++++++---------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/PM2/r_convergence_hypothesis_double_lasso.irnb b/PM2/r_convergence_hypothesis_double_lasso.irnb index 56196c9b..8e58138e 100644 --- 
a/PM2/r_convergence_hypothesis_double_lasso.irnb +++ b/PM2/r_convergence_hypothesis_double_lasso.irnb @@ -80,7 +80,7 @@ "library(xtable)\n", "library(lmtest)\n", "library(sandwich)\n", - "library(glmnet) # For LassoCV\n", + "library(glmnet) # For LassoCV\n", "library(ggplot2)" ] }, @@ -138,8 +138,7 @@ }, "outputs": [], "source": [ - "getdata <- function(...)\n", - "{\n", + "getdata <- function(...) {\n", " e <- new.env()\n", " name <- data(..., envir = e)[1]\n", " e[[name]]\n", @@ -631,21 +630,21 @@ } ], "source": [ - "fit <- lm(Outcome ~ ., data=X)\n", + "fit <- lm(Outcome ~ ., data = X)\n", "est <- summary(fit)$coef[\"gdpsh465\", 1]\n", "\n", - "hcv.coefs <- vcovHC(fit, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", - "se <- sqrt(diag(hcv.coefs))[2] # Estimated std errors\n", + "hcv_coefs <- vcovHC(fit, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", + "se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors\n", "\n", "# print unconditional effect of gdpsh465 and the corresponding standard error\n", "cat (\"The estimated coefficient on gdpsh465 is\", est,\n", " \" and the corresponding robust standard error is\", se)\n", "\n", "# Calculate the 95% confidence interval for 'gdpsh465'\n", - "lower.ci <- est - 1.96 * se\n", - "upper.ci <- est + 1.96 * se\n", + "lower_ci <- est - 1.96 * se\n", + "upper_ci <- est + 1.96 * se\n", "\n", - "cat (\"95% Confidence Interval: [\", lower.ci, \",\", upper.ci, \"]\")" + "cat(\"95% Confidence Interval: [\", lower_ci, \",\", upper_ci, \"]\")" ] }, { @@ -727,7 +726,7 @@ "outputs": [], "source": [ "y <- growth$Outcome\n", - "W <- growth[-which(colnames(growth) %in% c('Outcome', 'intercept', 'gdpsh465'))]\n", + "W <- growth[-which(colnames(growth) %in% c(\"Outcome\", \"intercept\", \"gdpsh465\"))]\n", "D <- growth$gdpsh465" ] }, @@ -925,7 +924,7 @@ }, "outputs": [], "source": [ - "tmp_df = as.data.frame(cbind(res_y, res_d))\n", + "tmp_df <- as.data.frame(cbind(res_y, res_d))\n", "colnames(tmp_df) = c(\"res_y\", \"res_D\")" ] }, @@ -1058,20 +1057,20 @@ "# Plot Outcome Lasso-CV Model\n", "ggplot(data = results_y, aes(x = Alphas, y = OutOfSampleR2)) +\n", " geom_line() +\n", - " labs(\n", - " title = \"Outcome Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n", - " x = \"Penalty Level\",\n", - " y = \"Out-of-sample R-squared\"\n", - " )\n", + " labs(\n", + " title = \"Outcome Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n", + " x = \"Penalty Level\",\n", + " y = \"Out-of-sample R-squared\"\n", + " )\n", "\n", "# Plot Treatment Lasso-CV Model\n", "ggplot(data = results_d, aes(x = (Alphas), y = OutOfSampleR2)) +\n", " geom_line() +\n", - " labs(\n", - " title = \"Treatment Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n", - " x = \"Penalty Level\",\n", - " y = \"Out-of-sample R-squared\"\n", - " )\n" + " labs(\n", + " title = \"Treatment Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n", + " x = \"Penalty Level\",\n", + " y = \"Out-of-sample R-squared\"\n", + " )\n" ] } ], From df42f81a9b3a8a5f0caead1de4e16f5bc2244079 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 15 Jul 2024 13:12:42 +0000 Subject: [PATCH 069/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in AC1 --- AC1/r-proxy-controls.Rmd | 178 +++ AC1/r-proxy-controls.irnb | 670 ++++----- ...analysis-with-sensmakr-and-debiased-ml.Rmd | 282 ++++ ...nalysis-with-sensmakr-and-debiased-ml.irnb | 1220 ++++++++--------- 4 files changed, 
1405 insertions(+), 945 deletions(-) create mode 100644 AC1/r-proxy-controls.Rmd create mode 100644 AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.Rmd diff --git a/AC1/r-proxy-controls.Rmd b/AC1/r-proxy-controls.Rmd new file mode 100644 index 00000000..7e5ce3ad --- /dev/null +++ b/AC1/r-proxy-controls.Rmd @@ -0,0 +1,178 @@ +--- +title: An R Markdown document converted from "AC1/r-proxy-controls.irnb" +output: html_document +--- + +# Negative (Proxy) Controls for Unobserved Confounding + +Consider the following SEM, where $Y$ is the outcome, $D$ is the treatment, $A$ is some unobserved confounding, and $Q$, $X$, $S$ are the observed covariates. In particular, $Q$ is considered to be the proxy control treatment as it a priori has no effect on the actual outcome $Y$, and $S$ is considered to be the proxy control outcome as it a priori is not affected by the actual treatment $D$. See also [An Introduction to Proximal Causal Learning](https://arxiv.org/pdf/2009.10982.pdf), for more information on this setting. + +![proxy_dag.png](https://raw.githubusercontent.com/stanford-msande228/winter23/main/proxy_dag.png) + +Under linearity assumptions, the average treatment effect can be estimated by solving the vector of moment equations: +\begin{align} +E\left[(\tilde{Y} - \alpha \tilde{D} - \delta \tilde{S}) \left(\begin{aligned}\tilde{D}\\ \tilde{Q}\end{aligned}\right) \right] = 0 +\end{align} +where for every variable $V$ we denote with $\tilde{V} = V - E[V|X]$. + +When the dimension of the proxy treatment variables $Q$ is larger than the dimension of proxy outcome variables $S$, then the above system of equations is over-identified. In these settings, we first project the "technical instrument" variables $\tilde{V}=(\tilde{D}, \tilde{Q})$ onto the space of "technical treatment" variables $\tilde{W}=(\tilde{D}, \tilde{S})$ and use the projected $\tilde{V}$ as a new "technical instrument". In particular, we run an OLS regression of $\tilde{W}$ on $\tilde{V},$ and define $\tilde{Z} = E[\tilde{W}\mid \tilde{V}] = B \tilde{V}$, where the $t$-th row $\beta_t$ of the matrix $B$ is the OLS coefficient in the regression of $\tilde{W}_t$ on $\tilde{V}$. These new variables $\tilde{Z}$, can also be viewed as engineered technical instrumental variables. Then we have the exactly identified system of equations: +\begin{align} +E\left[(\tilde{Y} - \alpha \tilde{D} - \delta \tilde{S}) \tilde{Z} \right] := E\left[(\tilde{Y} - \alpha \tilde{D} - \delta \tilde{S}) B \left(\begin{aligned}\tilde{D}\\ \tilde{Q}\end{aligned}\right) \right] = 0 +\end{align} + +The solution to this system of equations is numerically equivalent to the following two stage algorithm: +- Run OLS of $\tilde{W}=(\tilde{D}, \tilde{S})$ on $\tilde{V}=(\tilde{D}, \tilde{Q})$ +- Define $\tilde{Z}$ as the predictions of the OLS model +- Run OLS of $\tilde{Y}$ on $\tilde{Z}$. +This is the well-known Two-Stage-Least-Squares (2SLS) algorithm for instrumental variable regression. + +Since we're considering only linear models and in a low-dimensional setting, we'll focus on just using linear IV methods. + +```{r} +install.packages("hdm") + +library(hdm) + +set.seed(1) +``` + +# Analyzing Simulated Data + +First, let's evaluate the methods on simulated data generated from a linear SEM characterized by the above DAG. For this simulation, we'll set the ATE to 2. 
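A minimal hand-rolled version of the two-stage procedure described above can make the moment equations concrete. The sketch below is only an illustration, not part of the patched file: it uses a toy one-covariate variant of the simulated design (all names here are illustrative), and its first coefficient should agree with the packaged `tsls` estimate up to sampling and numerical error.

```r
# Sketch: manual 2SLS for the proxy-control moments (illustrative, assumed names)
set.seed(1)
n <- 5000
X <- rnorm(n)                                     # observed covariate (one column for brevity)
A <- 2 * X + rnorm(n)                             # unobserved confounder
Q <- 10 * A + 2 * X + rnorm(n)                    # proxy treatment control
S <- 5 * A + X + rnorm(n)                         # proxy outcome control
D <- Q - A + 2 * X + rnorm(n)                     # treatment
Y <- 2 * D + 5 * A + 2 * S + 0.5 * X + rnorm(n)   # outcome, true ATE = 2

partial_out <- function(v) lm.fit(cbind(1, X), v)$residuals  # the "tilde" operation
Yt <- partial_out(Y); Dt <- partial_out(D)
Qt <- partial_out(Q); St <- partial_out(S)

W <- cbind(Dt, St)             # technical treatments
V <- cbind(Dt, Qt)             # technical instruments
Z <- fitted(lm(W ~ V - 1))     # stage 1: engineered instruments E[W | V]
stage2 <- lm(Yt ~ Z - 1)       # stage 2: OLS of the outcome residual on Z
coef(stage2)[1]                # coefficient on the D component; should be close to 2
```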
+ +```{r} +gen_data <- function(n, ate) { + X <- matrix(rnorm(n * 10), ncol = 10) + A <- 2 * X[, 1] + rnorm(n) + Q <- 10 * A + 2 * X[, 1] + rnorm(n) + S <- 5 * A + X[, 1] + rnorm(n) + D <- Q - A + 2 * X[, 1] + rnorm(n) + Y <- ate * D + 5 * A + 2 * S + 0.5 * X[, 1] + rnorm(n) + return(list(X, A, Q, S, D, Y)) +} +``` + +```{r} +data_list <- gen_data(5000, 2) +X <- data_list[[1]] +A <- data_list[[2]] +Q <- data_list[[3]] +S <- data_list[[4]] +D <- data_list[[5]] +Y <- data_list[[6]] +``` + +We define the technical instrument $V=(D, Q)$ and technical treatment $W=(D, S)$. Estimating the treatement effect is then just a matter of solving an instrument variable regression problem with instruments $V$ and treatments $W$ and looking at the first coefficient associated with $D$. + +```{r} +W = cbind(D,S) +V = cbind(D,Q) +``` + +```{r} +piv <- tsls(X, W, Y, V, homoscedastic = FALSE) +cat('Estimated coefficient:',piv$coefficients['D',1],'\n') +cat('Standard error:',piv$se['D'],'\n') +``` + +# With Cross-Fitting + +We can also consider partialling out the controls using DML with cross-fitting + +```{r} +lm.DML.for.PROXYIV <- function(x, d, q, s, y, dreg, qreg, yreg, sreg, nfold=5) { + # this implements DML for a partially linear IV model + nobs <- nrow(x) + foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] + I <- split(1:nobs, foldid) + # create residualized objects to fill + ytil <- dtil <- qtil<- stil <- rep(NA, nobs) + # obtain cross-fitted residuals + cat("fold: ") + for(b in 1:length(I)){ + dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out + qfit <- qreg(x[-I[[b]],], q[-I[[b]]]) #take a fold out + yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) #take a fold out + sfit <- sreg(x[-I[[b]],], s[-I[[b]]]) #take a fold out + dtil[I[[b]]] <- (d[I[[b]]] - x[I[[b]],]%*%as.matrix(dfit$coefficients)) #record residual + qtil[I[[b]]] <- (q[I[[b]]] - x[I[[b]],]%*%as.matrix(qfit$coefficients)) #record residual + ytil[I[[b]]] <- (y[I[[b]]] - x[I[[b]],]%*%as.matrix(yfit$coefficients)) #record residial + stil[I[[b]]] <- (s[I[[b]]] - x[I[[b]],]%*%as.matrix(sfit$coefficients)) #record residual + cat(b," ") + } + ivfit= tsls(y=ytil,d=cbind(dtil,stil), x=NULL, z=cbind(dtil,qtil), intercept=FALSE, homoscedastic = FALSE) + coef.est <- ivfit$coef[1] #extract coefficient + se <- ivfit$se[1] #record standard error + cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) + return( list(coef.est =coef.est , se=se, dtil=dtil, qtil = qtil, ytil=ytil, stil=stil, foldid=foldid, spI = I) ) +} +``` + +We'll just use OLS for partialling out again. We could of course try something more elaborate if we wanted. + +```{r} +dreg <- function(x,d){ lm.fit(x, d) } #ML method=ols +qreg <- function(x,q){ lm.fit(x, q) } #ML method=ols +yreg <- function(x,y){ lm.fit(x, y) } #ML method=ols +sreg <- function(x,s){ lm.fit(x, s) } #ML method=ols + +DML.piv = lm.DML.for.PROXYIV(X, D, Q, S, Y, dreg, qreg, yreg, sreg, nfold=5) +``` + +## Real Data - Effects of Smoking on Birth Weight + +In this study, we will be studying the effects of smoking on baby weight. 
We will consider the following stylized setup: + +Outcome ($Y$): baby weight + +Treatment ($D$): smoking + +Unobserved confounding ($A$): family income + +The observed covariates are put in to 3 groups: + + +* Proxy treatment control ($Q$): mother's education +* Proxy outcome control ($S$): parity (total number of previous pregnancies) +* Other observed covariates ($X$): mother's race and age + + +Education serves as a proxy treatment control $Q$ because it reflects unobserved confounding due to household income $A$ but has no direct medical effect on birth weight $Y$. Parity serves as a proxy outcome control $S$ because family size reflects household income $A$ but is not directly caused by smoking $D$ or education $Q$. + +A description of the data used can be found [here](https://www.stat.berkeley.edu/users/statlabs/data/babies.readme). + +```{r} +data <- read.table("https://www.stat.berkeley.edu/users/statlabs/data/babies23.data", header = TRUE) +summary(data) +``` + +```{r} +# Filter data to exclude entries where income, number of cigarettes smoked, +# race, age are not asked or not known +data = data[data$race != 99, ] +data = data[!(data$number %in% c(98,99)), ] +data = data[!(data$inc %in% c(98,99)), ] +data = data[data$age != 99, ] +dim(data) +``` + +```{r} +# Create matrices for X, D, Q, S, A, Y +X <- model.matrix(~0 + C(race) + age, data) +D <- model.matrix(~0 + number, data) +Q <- model.matrix(~0 + ed, data) +S <- model.matrix(~0 + parity, data) +A <- model.matrix(~0 + inc, data) +Y <- model.matrix(~0 + wt, data) +``` + +```{r} +# Use cross-fitting with OLS to estimate treatment effect within linear model context +dreg <- function(x,d){ lm.fit(x, d) } #ML method=ols +qreg <- function(x,q){ lm.fit(x, q) } #ML method=ols +yreg <- function(x,y){ lm.fit(x, y) } #ML method=ols +sreg <- function(x,s){ lm.fit(x, s) } #ML method=ols + +DML.bw.piv = lm.DML.for.PROXYIV(X, D, Q, S, Y, dreg, qreg, yreg, sreg, nfold=5) +``` + diff --git a/AC1/r-proxy-controls.irnb b/AC1/r-proxy-controls.irnb index fc58d023..672a08ff 100644 --- a/AC1/r-proxy-controls.irnb +++ b/AC1/r-proxy-controls.irnb @@ -1,338 +1,338 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "ir", - "display_name": "R" - }, - "language_info": { - "name": "R" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "6GIJSF7hQ544" + }, + "source": [ + "# Negative (Proxy) Controls for Unobserved Confounding" + ] }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Negative (Proxy) Controls for Unobserved Confounding" - ], - "metadata": { - "id": "6GIJSF7hQ544" - } - }, - { - "cell_type": "markdown", - "source": [ - "Consider the following SEM, where $Y$ is the outcome, $D$ is the treatment, $A$ is some unobserved confounding, and $Q$, $X$, $S$ are the observed covariates. In particular, $Q$ is considered to be the proxy control treatment as it a priori has no effect on the actual outcome $Y$, and $S$ is considered to be the proxy control outcome as it a priori is not affected by the actual treatment $D$. 
See also [An Introduction to Proximal Causal Learning](https://arxiv.org/pdf/2009.10982.pdf), for more information on this setting.\n", - "\n", - "![proxy_dag.png](https://raw.githubusercontent.com/stanford-msande228/winter23/main/proxy_dag.png)" - ], - "metadata": { - "id": "fJom98ALQ7oY" - } - }, - { - "cell_type": "markdown", - "source": [ - "Under linearity assumptions, the average treatment effect can be estimated by solving the vector of moment equations:\n", - "\\begin{align}\n", - "E\\left[(\\tilde{Y} - \\alpha \\tilde{D} - \\delta \\tilde{S}) \\left(\\begin{aligned}\\tilde{D}\\\\ \\tilde{Q}\\end{aligned}\\right) \\right] = 0\n", - "\\end{align}\n", - "where for every variable $V$ we denote with $\\tilde{V} = V - E[V|X]$.\n", - "\n", - "When the dimension of the proxy treatment variables $Q$ is larger than the dimension of proxy outcome variables $S$, then the above system of equations is over-identified. In these settings, we first project the \"technical instrument\" variables $\\tilde{V}=(\\tilde{D}, \\tilde{Q})$ onto the space of \"technical treatment\" variables $\\tilde{W}=(\\tilde{D}, \\tilde{S})$ and use the projected $\\tilde{V}$ as a new \"technical instrument\". In particular, we run an OLS regression of $\\tilde{W}$ on $\\tilde{V},$ and define $\\tilde{Z} = E[\\tilde{W}\\mid \\tilde{V}] = B \\tilde{V}$, where the $t$-th row $\\beta_t$ of the matrix $B$ is the OLS coefficient in the regression of $\\tilde{W}_t$ on $\\tilde{V}$. These new variables $\\tilde{Z}$, can also be viewed as engineered technical instrumental variables. Then we have the exactly identified system of equations:\n", - "\\begin{align}\n", - "E\\left[(\\tilde{Y} - \\alpha \\tilde{D} - \\delta \\tilde{S}) \\tilde{Z} \\right] := E\\left[(\\tilde{Y} - \\alpha \\tilde{D} - \\delta \\tilde{S}) B \\left(\\begin{aligned}\\tilde{D}\\\\ \\tilde{Q}\\end{aligned}\\right) \\right] = 0\n", - "\\end{align}\n", - "\n", - "The solution to this system of equations is numerically equivalent to the following two stage algorithm:\n", - "- Run OLS of $\\tilde{W}=(\\tilde{D}, \\tilde{S})$ on $\\tilde{V}=(\\tilde{D}, \\tilde{Q})$\n", - "- Define $\\tilde{Z}$ as the predictions of the OLS model\n", - "- Run OLS of $\\tilde{Y}$ on $\\tilde{Z}$.\n", - "This is the well-known Two-Stage-Least-Squares (2SLS) algorithm for instrumental variable regression." - ], - "metadata": { - "id": "KM447zRQREMP" - } - }, - { - "cell_type": "markdown", - "source": [ - "Since we're considering only linear models and in a low-dimensional setting, we'll focus on just using linear IV methods." - ], - "metadata": { - "id": "WC_ssbHee1bO" - } - }, - { - "cell_type": "code", - "source": [ - "install.packages(\"hdm\")\n", - "\n", - "library(hdm)\n", - "\n", - "set.seed(1)" - ], - "metadata": { - "id": "s8ANZZ8sRF3C" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Analyzing Simulated Data\n", - "\n", - "First, let's evaluate the methods on simulated data generated from a linear SEM characterized by the above DAG. For this simulation, we'll set the ATE to 2." 
- ], - "metadata": { - "id": "TYBp5cB3fTVB" - } - }, - { - "cell_type": "code", - "source": [ - "gen_data <- function(n, ate) {\n", - " X <- matrix(rnorm(n * 10), ncol = 10)\n", - " A <- 2 * X[, 1] + rnorm(n)\n", - " Q <- 10 * A + 2 * X[, 1] + rnorm(n)\n", - " S <- 5 * A + X[, 1] + rnorm(n)\n", - " D <- Q - A + 2 * X[, 1] + rnorm(n)\n", - " Y <- ate * D + 5 * A + 2 * S + 0.5 * X[, 1] + rnorm(n)\n", - " return(list(X, A, Q, S, D, Y))\n", - "}" - ], - "metadata": { - "id": "hLFCX0YnfUfv" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "data_list <- gen_data(5000, 2)\n", - "X <- data_list[[1]]\n", - "A <- data_list[[2]]\n", - "Q <- data_list[[3]]\n", - "S <- data_list[[4]]\n", - "D <- data_list[[5]]\n", - "Y <- data_list[[6]]" - ], - "metadata": { - "id": "jwNPCZT8fxDW" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "We define the technical instrument $V=(D, Q)$ and technical treatment $W=(D, S)$. Estimating the treatement effect is then just a matter of solving an instrument variable regression problem with instruments $V$ and treatments $W$ and looking at the first coefficient associated with $D$." - ], - "metadata": { - "id": "64UHEtu1f8BU" - } - }, - { - "cell_type": "code", - "source": [ - "W = cbind(D,S)\n", - "V = cbind(D,Q)" - ], - "metadata": { - "id": "fSZ0AyGyf7Mn" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "piv <- tsls(X, W, Y, V, homoscedastic = FALSE)\n", - "cat('Estimated coefficient:',piv$coefficients['D',1],'\\n')\n", - "cat('Standard error:',piv$se['D'],'\\n')" - ], - "metadata": { - "id": "9inuCWFCg8pc" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# With Cross-Fitting\n", - "\n", - "We can also consider partialling out the controls using DML with cross-fitting" - ], - "metadata": { - "id": "ewgCBYN6lCu4" - } - }, - { - "cell_type": "code", - "source": [ - "lm.DML.for.PROXYIV <- function(x, d, q, s, y, dreg, qreg, yreg, sreg, nfold=5) {\n", - " # this implements DML for a partially linear IV model\n", - " nobs <- nrow(x)\n", - " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)]\n", - " I <- split(1:nobs, foldid)\n", - " # create residualized objects to fill\n", - " ytil <- dtil <- qtil<- stil <- rep(NA, nobs)\n", - " # obtain cross-fitted residuals\n", - " cat(\"fold: \")\n", - " for(b in 1:length(I)){\n", - " dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out\n", - " qfit <- qreg(x[-I[[b]],], q[-I[[b]]]) #take a fold out\n", - " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) #take a fold out\n", - " sfit <- sreg(x[-I[[b]],], s[-I[[b]]]) #take a fold out\n", - " dtil[I[[b]]] <- (d[I[[b]]] - x[I[[b]],]%*%as.matrix(dfit$coefficients)) #record residual\n", - " qtil[I[[b]]] <- (q[I[[b]]] - x[I[[b]],]%*%as.matrix(qfit$coefficients)) #record residual\n", - " ytil[I[[b]]] <- (y[I[[b]]] - x[I[[b]],]%*%as.matrix(yfit$coefficients)) #record residial\n", - " stil[I[[b]]] <- (s[I[[b]]] - x[I[[b]],]%*%as.matrix(sfit$coefficients)) #record residual\n", - " cat(b,\" \")\n", - " }\n", - " ivfit= tsls(y=ytil,d=cbind(dtil,stil), x=NULL, z=cbind(dtil,qtil), intercept=FALSE, homoscedastic = FALSE)\n", - " coef.est <- ivfit$coef[1] #extract coefficient\n", - " se <- ivfit$se[1] #record standard error\n", - " cat(sprintf(\"\\ncoef (se) = %g (%g)\\n\", coef.est , se))\n", - " return( list(coef.est =coef.est , se=se, dtil=dtil, qtil = qtil, ytil=ytil, stil=stil, 
foldid=foldid, spI = I) )\n", - "}" - ], - "metadata": { - "id": "59a2apIRlImz" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "We'll just use OLS for partialling out again. We could of course try something more elaborate if we wanted." - ], - "metadata": { - "id": "wcytUEblmIB6" - } - }, - { - "cell_type": "code", - "source": [ - "dreg <- function(x,d){ lm.fit(x, d) } #ML method=ols\n", - "qreg <- function(x,q){ lm.fit(x, q) } #ML method=ols\n", - "yreg <- function(x,y){ lm.fit(x, y) } #ML method=ols\n", - "sreg <- function(x,s){ lm.fit(x, s) } #ML method=ols\n", - "\n", - "DML.piv = lm.DML.for.PROXYIV(X, D, Q, S, Y, dreg, qreg, yreg, sreg, nfold=5)\n" - ], - "metadata": { - "id": "oYx9OoxnmAYq" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Real Data - Effects of Smoking on Birth Weight" - ], - "metadata": { - "id": "23OIAhYa84ng" - } - }, - { - "cell_type": "markdown", - "source": [ - "In this study, we will be studying the effects of smoking on baby weight. We will consider the following stylized setup:\n", - "\n", - "Outcome ($Y$): baby weight\n", - "\n", - "Treatment ($D$): smoking\n", - "\n", - "Unobserved confounding ($A$): family income\n", - "\n", - "The observed covariates are put in to 3 groups:\n", - "\n", - "\n", - "* Proxy treatment control ($Q$): mother's education\n", - "* Proxy outcome control ($S$): parity (total number of previous pregnancies)\n", - "* Other observed covariates ($X$): mother's race and age\n", - "\n", - "\n", - "Education serves as a proxy treatment control $Q$ because it reflects unobserved confounding due to household income $A$ but has no direct medical effect on birth weight $Y$. Parity serves as a proxy outcome control $S$ because family size reflects household income $A$ but is not directly caused by smoking $D$ or education $Q$.\n", - "\n", - "A description of the data used can be found [here](https://www.stat.berkeley.edu/users/statlabs/data/babies.readme)." 
- ], - "metadata": { - "id": "qFG-RtmR85rv" - } - }, - { - "cell_type": "code", - "source": [ - "data <- read.table(\"https://www.stat.berkeley.edu/users/statlabs/data/babies23.data\", header = TRUE)\n", - "summary(data)" - ], - "metadata": { - "id": "6FjiuPrt8_rk" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Filter data to exclude entries where income, number of cigarettes smoked,\n", - "# race, age are not asked or not known\n", - "data = data[data$race != 99, ]\n", - "data = data[!(data$number %in% c(98,99)), ]\n", - "data = data[!(data$inc %in% c(98,99)), ]\n", - "data = data[data$age != 99, ]\n", - "dim(data)" - ], - "metadata": { - "id": "ooZVwo0y-H9E" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Create matrices for X, D, Q, S, A, Y\n", - "X <- model.matrix(~0 + C(race) + age, data)\n", - "D <- model.matrix(~0 + number, data)\n", - "Q <- model.matrix(~0 + ed, data)\n", - "S <- model.matrix(~0 + parity, data)\n", - "A <- model.matrix(~0 + inc, data)\n", - "Y <- model.matrix(~0 + wt, data)" - ], - "metadata": { - "id": "8vv9F6fe_AoI" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Use cross-fitting with OLS to estimate treatment effect within linear model context\n", - "dreg <- function(x,d){ lm.fit(x, d) } #ML method=ols\n", - "qreg <- function(x,q){ lm.fit(x, q) } #ML method=ols\n", - "yreg <- function(x,y){ lm.fit(x, y) } #ML method=ols\n", - "sreg <- function(x,s){ lm.fit(x, s) } #ML method=ols\n", - "\n", - "DML.bw.piv = lm.DML.for.PROXYIV(X, D, Q, S, Y, dreg, qreg, yreg, sreg, nfold=5)" - ], - "metadata": { - "id": "E-KCdxFH_TtC" - }, - "execution_count": null, - "outputs": [] - } - ] + { + "cell_type": "markdown", + "metadata": { + "id": "fJom98ALQ7oY" + }, + "source": [ + "Consider the following SEM, where $Y$ is the outcome, $D$ is the treatment, $A$ is some unobserved confounding, and $Q$, $X$, $S$ are the observed covariates. In particular, $Q$ is considered to be the proxy control treatment as it a priori has no effect on the actual outcome $Y$, and $S$ is considered to be the proxy control outcome as it a priori is not affected by the actual treatment $D$. See also [An Introduction to Proximal Causal Learning](https://arxiv.org/pdf/2009.10982.pdf), for more information on this setting.\n", + "\n", + "![proxy_dag.png](https://raw.githubusercontent.com/stanford-msande228/winter23/main/proxy_dag.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KM447zRQREMP" + }, + "source": [ + "Under linearity assumptions, the average treatment effect can be estimated by solving the vector of moment equations:\n", + "\\begin{align}\n", + "E\\left[(\\tilde{Y} - \\alpha \\tilde{D} - \\delta \\tilde{S}) \\left(\\begin{aligned}\\tilde{D}\\\\ \\tilde{Q}\\end{aligned}\\right) \\right] = 0\n", + "\\end{align}\n", + "where for every variable $V$ we denote with $\\tilde{V} = V - E[V|X]$.\n", + "\n", + "When the dimension of the proxy treatment variables $Q$ is larger than the dimension of proxy outcome variables $S$, then the above system of equations is over-identified. In these settings, we first project the \"technical instrument\" variables $\\tilde{V}=(\\tilde{D}, \\tilde{Q})$ onto the space of \"technical treatment\" variables $\\tilde{W}=(\\tilde{D}, \\tilde{S})$ and use the projected $\\tilde{V}$ as a new \"technical instrument\". 
In particular, we run an OLS regression of $\\tilde{W}$ on $\\tilde{V},$ and define $\\tilde{Z} = E[\\tilde{W}\\mid \\tilde{V}] = B \\tilde{V}$, where the $t$-th row $\\beta_t$ of the matrix $B$ is the OLS coefficient in the regression of $\\tilde{W}_t$ on $\\tilde{V}$. These new variables $\\tilde{Z}$, can also be viewed as engineered technical instrumental variables. Then we have the exactly identified system of equations:\n", + "\\begin{align}\n", + "E\\left[(\\tilde{Y} - \\alpha \\tilde{D} - \\delta \\tilde{S}) \\tilde{Z} \\right] := E\\left[(\\tilde{Y} - \\alpha \\tilde{D} - \\delta \\tilde{S}) B \\left(\\begin{aligned}\\tilde{D}\\\\ \\tilde{Q}\\end{aligned}\\right) \\right] = 0\n", + "\\end{align}\n", + "\n", + "The solution to this system of equations is numerically equivalent to the following two stage algorithm:\n", + "- Run OLS of $\\tilde{W}=(\\tilde{D}, \\tilde{S})$ on $\\tilde{V}=(\\tilde{D}, \\tilde{Q})$\n", + "- Define $\\tilde{Z}$ as the predictions of the OLS model\n", + "- Run OLS of $\\tilde{Y}$ on $\\tilde{Z}$.\n", + "This is the well-known Two-Stage-Least-Squares (2SLS) algorithm for instrumental variable regression." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WC_ssbHee1bO" + }, + "source": [ + "Since we're considering only linear models and in a low-dimensional setting, we'll focus on just using linear IV methods." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "s8ANZZ8sRF3C" + }, + "outputs": [], + "source": [ + "install.packages(\"hdm\")\n", + "\n", + "library(hdm)\n", + "\n", + "set.seed(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TYBp5cB3fTVB" + }, + "source": [ + "# Analyzing Simulated Data\n", + "\n", + "First, let's evaluate the methods on simulated data generated from a linear SEM characterized by the above DAG. For this simulation, we'll set the ATE to 2." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "hLFCX0YnfUfv" + }, + "outputs": [], + "source": [ + "gen_data <- function(n, ate) {\n", + " X <- matrix(rnorm(n * 10), ncol = 10)\n", + " A <- 2 * X[, 1] + rnorm(n)\n", + " Q <- 10 * A + 2 * X[, 1] + rnorm(n)\n", + " S <- 5 * A + X[, 1] + rnorm(n)\n", + " D <- Q - A + 2 * X[, 1] + rnorm(n)\n", + " Y <- ate * D + 5 * A + 2 * S + 0.5 * X[, 1] + rnorm(n)\n", + " return(list(X, A, Q, S, D, Y))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jwNPCZT8fxDW" + }, + "outputs": [], + "source": [ + "data_list <- gen_data(5000, 2)\n", + "X <- data_list[[1]]\n", + "A <- data_list[[2]]\n", + "Q <- data_list[[3]]\n", + "S <- data_list[[4]]\n", + "D <- data_list[[5]]\n", + "Y <- data_list[[6]]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "64UHEtu1f8BU" + }, + "source": [ + "We define the technical instrument $V=(D, Q)$ and technical treatment $W=(D, S)$. Estimating the treatement effect is then just a matter of solving an instrument variable regression problem with instruments $V$ and treatments $W$ and looking at the first coefficient associated with $D$." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fSZ0AyGyf7Mn" + }, + "outputs": [], + "source": [ + "W = cbind(D,S)\n", + "V = cbind(D,Q)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9inuCWFCg8pc" + }, + "outputs": [], + "source": [ + "piv <- tsls(X, W, Y, V, homoscedastic = FALSE)\n", + "cat('Estimated coefficient:',piv$coefficients['D',1],'\\n')\n", + "cat('Standard error:',piv$se['D'],'\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ewgCBYN6lCu4" + }, + "source": [ + "# With Cross-Fitting\n", + "\n", + "We can also consider partialling out the controls using DML with cross-fitting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "59a2apIRlImz" + }, + "outputs": [], + "source": [ + "lm.DML.for.PROXYIV <- function(x, d, q, s, y, dreg, qreg, yreg, sreg, nfold=5) {\n", + " # this implements DML for a partially linear IV model\n", + " nobs <- nrow(x)\n", + " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)]\n", + " I <- split(1:nobs, foldid)\n", + " # create residualized objects to fill\n", + " ytil <- dtil <- qtil<- stil <- rep(NA, nobs)\n", + " # obtain cross-fitted residuals\n", + " cat(\"fold: \")\n", + " for(b in 1:length(I)){\n", + " dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out\n", + " qfit <- qreg(x[-I[[b]],], q[-I[[b]]]) #take a fold out\n", + " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) #take a fold out\n", + " sfit <- sreg(x[-I[[b]],], s[-I[[b]]]) #take a fold out\n", + " dtil[I[[b]]] <- (d[I[[b]]] - x[I[[b]],]%*%as.matrix(dfit$coefficients)) #record residual\n", + " qtil[I[[b]]] <- (q[I[[b]]] - x[I[[b]],]%*%as.matrix(qfit$coefficients)) #record residual\n", + " ytil[I[[b]]] <- (y[I[[b]]] - x[I[[b]],]%*%as.matrix(yfit$coefficients)) #record residial\n", + " stil[I[[b]]] <- (s[I[[b]]] - x[I[[b]],]%*%as.matrix(sfit$coefficients)) #record residual\n", + " cat(b,\" \")\n", + " }\n", + " ivfit= tsls(y=ytil,d=cbind(dtil,stil), x=NULL, z=cbind(dtil,qtil), intercept=FALSE, homoscedastic = FALSE)\n", + " coef.est <- ivfit$coef[1] #extract coefficient\n", + " se <- ivfit$se[1] #record standard error\n", + " cat(sprintf(\"\\ncoef (se) = %g (%g)\\n\", coef.est , se))\n", + " return( list(coef.est =coef.est , se=se, dtil=dtil, qtil = qtil, ytil=ytil, stil=stil, foldid=foldid, spI = I) )\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wcytUEblmIB6" + }, + "source": [ + "We'll just use OLS for partialling out again. We could of course try something more elaborate if we wanted." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oYx9OoxnmAYq" + }, + "outputs": [], + "source": [ + "dreg <- function(x,d){ lm.fit(x, d) } #ML method=ols\n", + "qreg <- function(x,q){ lm.fit(x, q) } #ML method=ols\n", + "yreg <- function(x,y){ lm.fit(x, y) } #ML method=ols\n", + "sreg <- function(x,s){ lm.fit(x, s) } #ML method=ols\n", + "\n", + "DML.piv = lm.DML.for.PROXYIV(X, D, Q, S, Y, dreg, qreg, yreg, sreg, nfold=5)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "23OIAhYa84ng" + }, + "source": [ + "## Real Data - Effects of Smoking on Birth Weight" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qFG-RtmR85rv" + }, + "source": [ + "In this study, we will be studying the effects of smoking on baby weight. 
We will consider the following stylized setup:\n", + "\n", + "Outcome ($Y$): baby weight\n", + "\n", + "Treatment ($D$): smoking\n", + "\n", + "Unobserved confounding ($A$): family income\n", + "\n", + "The observed covariates are put in to 3 groups:\n", + "\n", + "\n", + "* Proxy treatment control ($Q$): mother's education\n", + "* Proxy outcome control ($S$): parity (total number of previous pregnancies)\n", + "* Other observed covariates ($X$): mother's race and age\n", + "\n", + "\n", + "Education serves as a proxy treatment control $Q$ because it reflects unobserved confounding due to household income $A$ but has no direct medical effect on birth weight $Y$. Parity serves as a proxy outcome control $S$ because family size reflects household income $A$ but is not directly caused by smoking $D$ or education $Q$.\n", + "\n", + "A description of the data used can be found [here](https://www.stat.berkeley.edu/users/statlabs/data/babies.readme)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6FjiuPrt8_rk" + }, + "outputs": [], + "source": [ + "data <- read.table(\"https://www.stat.berkeley.edu/users/statlabs/data/babies23.data\", header = TRUE)\n", + "summary(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ooZVwo0y-H9E" + }, + "outputs": [], + "source": [ + "# Filter data to exclude entries where income, number of cigarettes smoked,\n", + "# race, age are not asked or not known\n", + "data = data[data$race != 99, ]\n", + "data = data[!(data$number %in% c(98,99)), ]\n", + "data = data[!(data$inc %in% c(98,99)), ]\n", + "data = data[data$age != 99, ]\n", + "dim(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8vv9F6fe_AoI" + }, + "outputs": [], + "source": [ + "# Create matrices for X, D, Q, S, A, Y\n", + "X <- model.matrix(~0 + C(race) + age, data)\n", + "D <- model.matrix(~0 + number, data)\n", + "Q <- model.matrix(~0 + ed, data)\n", + "S <- model.matrix(~0 + parity, data)\n", + "A <- model.matrix(~0 + inc, data)\n", + "Y <- model.matrix(~0 + wt, data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "E-KCdxFH_TtC" + }, + "outputs": [], + "source": [ + "# Use cross-fitting with OLS to estimate treatment effect within linear model context\n", + "dreg <- function(x,d){ lm.fit(x, d) } #ML method=ols\n", + "qreg <- function(x,q){ lm.fit(x, q) } #ML method=ols\n", + "yreg <- function(x,y){ lm.fit(x, y) } #ML method=ols\n", + "sreg <- function(x,s){ lm.fit(x, s) } #ML method=ols\n", + "\n", + "DML.bw.piv = lm.DML.for.PROXYIV(X, D, Q, S, Y, dreg, qreg, yreg, sreg, nfold=5)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "name": "ir" + }, + "language_info": { + "name": "R" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.Rmd b/AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.Rmd new file mode 100644 index 00000000..153e91b1 --- /dev/null +++ b/AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.Rmd @@ -0,0 +1,282 @@ +--- +title: An R Markdown document converted from "AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.irnb" +output: html_document +--- + +# Sensitivity Analysis for Unobserved Confounder with DML and Sensmakr + +Here we experiment with using package "sensemakr" in conjunction with debiased ML. 
+ +## Partially Linear SEM + +Consider the SEM +\begin{eqnarray*} +Y & := & \alpha D + \delta A + f_Y(X) + \epsilon_Y, \\ +D & := & \gamma A + f_D(X) + \epsilon_D, \\ +A & : = & f_A(X) + \epsilon_A, \\ +X & := & \epsilon_X, +\end{eqnarray*} +where, conditional on $X$, $\epsilon_Y, \epsilon_D, \epsilon_A$ are mean zero +and mutually uncorrelated. We further normalize +$$ +E[\epsilon_A^2] =1. +$$ +The key structural +parameter is $\alpha$: $$\alpha = \partial_d Y(d)$$ +where $$Y(d) := (Y: do (D=d)).$$ + +To give context to our example, we can interpret $Y$ as earnings, +$D$ as education, $A$ as ability, and $X$ as a set of observed background variables. In this example, we can interpret $\alpha$ as the returns to schooling. + +We start by applying the partialling out operator to get rid of the $X$'s in all of the equations. Define the partialling out operation of any random vector $V$ with respect to another random vector $X$ as the residual that is left after subtracting the best predictor of $V$ given $X$: +$$\tilde V = V - E [V \mid X].$$ +If $f$'s are linear, we can replace $E [V \mid X]$ +by linear projection. After partialling out, we have a simplified system: +\begin{eqnarray*} +\tilde Y & := & \alpha \tilde D + \delta \tilde A + \epsilon_Y, \\ +\tilde D & := & \gamma \tilde A + \epsilon_D, \\ +\tilde A & : = & \epsilon_A, +\end{eqnarray*} +where $\epsilon_Y$, $\epsilon_D$, and $\epsilon_A$ are uncorrelated. + +Then the projection of $\tilde Y$ on $\tilde D$ recovers +$$ +\beta = E [\tilde Y \tilde D]/ E [\tilde D^2] = \alpha + \phi, +$$ +where +$$ +\phi = \delta \gamma/ E \left[(\gamma^2 + \epsilon^2_D)\right], +$$ +is the omitted confounder bias or omitted variable bias. + +The formula follows from inserting the expression for $\tilde D$ into the definition of $\beta$ and then simplifying the resulting expression using the assumptions on the $\epsilon$'s. + +We can use this formula to bound $\phi$ directly by making assumptions on the size of $\delta$ +and $\gamma$. An alternative approach can be based on the following characterization, +based on partial $R^2$'s. This characterization essentially follows +from Cinelli and Hazlett, with the slight difference that we have adapted +the result to the partially linear model. + +*Theorem* [Omitted Confounder Bias in Terms of Partial $R^2$'s] + +In the partially linear SEM setting above, +$$ +\phi^2 = \frac{R^2_{\tilde Y \sim \tilde A \mid \tilde D} R^2_{\tilde D \sim \tilde A} }{ (1 - R^2_{\tilde D \sim \tilde A}) } \ +\frac{E \left[ (\tilde Y - \beta \tilde D)^2 \right] }{E \left[ ( \tilde D )^2 \right]}, +$$ +where $R^2_{V \sim W \mid X}$ denotes the population $R^2$ in the linear regression of $V$ on $W$, after partialling out $X$ from $V$ and $W$ linearly. + + +Therefore, if we place bounds on how much of the variation in $\tilde Y$ and in $\tilde D$ +the unobserved confounder $\tilde A$ is able to explain, we can bound the omitted confounder bias by $$\sqrt{\phi^2}.$$ + +# Empirical Example + +We consider an empirical example based on data surrounding the Darfur war. Specifically, we are interested in the effect of having experienced direct war violence on attitudes towards peace. Data is described here +https://cran.r-project.org/web/packages/sensemakr/vignettes/sensemakr.html + +The main outcome is attitude towards peace -- ``peacefactor``. +The key variable of interest is whether the responders were directly harmed (``directlyharmed``). 
+We want to know if being directly harmed in the conflict causes people to support peace-enforcing measures. +The measured confounders include dummy variables ``female``, ``farmer``, ``herder``, ``pastvoted``, along with ``age`` and household size (``hhsize``). +There is also a village indicator. We deal with village by and partialling out a full set of village dummy variables before conducting +the analysis. The standard errors will be clustered at the village level. + + +## Outline + +We will: +- mimic the partialling out procedure with machine learning tools; +- invoke Sensmakr to compute $\phi^2$ and plot sensitivity results. + +```{r} +# load package +install.packages("sensemakr") +install.packages("lfe") +install.packages("hdm") +install.packages("randomForest") +library(sensemakr) +library(lfe) +library(hdm) +library(randomForest) + +set.seed(1) +``` + +```{r} +file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/darfur.csv" +data <- read.csv(file) +dim(data) +attach(darfur) + +# # Alternatively load data and save as follows: +# data("darfur") +# # write.csv(darfur, "darfur.csv", row.names=FALSE) +``` + +## Preprocessing +Take out village fixed effects and run basic linear analysis + +```{r} +#get rid of village fixed effects +peacefactorR <- lm(peacefactor~village)$res +directlyharmedR <- lm(directlyharmed~village)$res +femaleR <- lm(female~village)$res +ageR <- lm(age~village)$res +farmerR <- lm(farmer_dar~village)$res +herderR <- lm(herder_dar~village)$res +pastvotedR <- lm(pastvoted~village)$res +hhsizeR <- lm(hhsize_darfur~village)$res + + +# Preliminary linear model analysis +summary(felm(peacefactorR~ directlyharmedR+ femaleR + + ageR + farmerR+ herderR + pastvotedR + hhsizeR |0|0|village)) + +# here we are clustering standard errors at the village level +summary(felm(peacefactorR~ femaleR + + ageR + farmerR+ herderR + pastvotedR + hhsizeR |0|0|village)) + +summary(felm(directlyharmedR~ femaleR + + ageR + farmerR+ herderR + pastvotedR + hhsizeR |0|0|village)) +``` + +For benchmarking, we want to understand the "partial $R^2$" of the controls on the outcome after partialling out the variable of interest. 
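As a quick cross-check on this benchmarking step, the same partial $R^2$ can be read off from two plain regressions via the identity $R^2_{Y \sim W \mid D} = (R^2_{Y \sim D + W} - R^2_{Y \sim D}) / (1 - R^2_{Y \sim D})$. The short sketch below is an editorial aside rather than part of the file; it reuses only the village-demeaned variables created in the preprocessing chunk above and should agree, up to numerical error, with the $R^2$ reported by the residual-on-residual regression that follows.

```r
# Partial R^2 of the controls for the outcome, after partialling out the treatment
# (sketch; assumes the village-demeaned *R variables from the preprocessing chunk exist)
r2_full  <- summary(lm(peacefactorR ~ directlyharmedR + femaleR + ageR +
                         farmerR + herderR + pastvotedR + hhsizeR))$r.squared
r2_treat <- summary(lm(peacefactorR ~ directlyharmedR))$r.squared
(r2_full - r2_treat) / (1 - r2_treat)
```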
+ +```{r} +#partial out variable of interest +peacefactorR2 <- lm(peacefactorR~directlyharmedR)$res +femaleR2 <- lm(femaleR~directlyharmedR)$res +ageR2 <- lm(ageR~directlyharmedR)$res +farmerR2 <- lm(farmerR~directlyharmedR)$res +herderR2 <- lm(herderR~directlyharmedR)$res +pastvotedR2 <- lm(pastvotedR~directlyharmedR)$res +hhsizeR2 <- lm(hhsizeR~directlyharmedR)$res + + +# R^2 of controls after partialling out variable of interest +summary(lm(peacefactorR2 ~ femaleR2 + + ageR2 + farmerR2 + herderR2 + pastvotedR2 + hhsizeR2)) + +``` + +## Lasso for partialling out controls + +```{r} +resY = rlasso(peacefactorR ~ (femaleR + + ageR + farmerR+ herderR + pastvotedR + hhsizeR)^3, post=F)$res + +resD = rlasso(directlyharmedR ~ (femaleR + + ageR + farmerR + herderR + pastvotedR + hhsizeR)^3 , post=F)$res + +print(c("Controls explain the following fraction of variance of Outcome", 1-var(resY)/var(peacefactorR))) +print(c("Controls explain the following fraction of variance of Treatment", 1-var(resD)/var(directlyharmedR))) + +dml.darfur.model = felm(resY ~ resD|0|0|village) # cluster SEs by village + +summary(dml.darfur.model,robust=T) # cluster SE by village + +dml.darfur.model = lm(resY ~ resD) # linear model to use as input in sensemakr +``` + +## Manual Bias Analysis + +```{r} +# Main estimate +beta = dml.darfur.model$coef[2] + +# Hypothetical values of partial R2s +R2.YC = .13; R2.DC = .01 + +# Elements of the bias equation +kappa<- (R2.YC * R2.DC)/(1- R2.DC) +varianceRatio<- mean(dml.darfur.model$res^2)/mean(resD^2) + +# Compute square bias +BiasSq <- kappa*varianceRatio + +# Compute absolute value of the bias +print(cat("absolute value of the bias:", sqrt(BiasSq)[1])) + +# plotting +gridR2.DC<- seq(0,.3, by=.001) +gridR2.YC<- kappa*(1 - gridR2.DC)/gridR2.DC +gridR2.YC<- ifelse(gridR2.YC> 1, 1, gridR2.YC); + +plot(gridR2.DC, gridR2.YC, type="l", col=4, xlab = "Partial R2 of Treatment with Confounder", + ylab = "Partial R2 of Outcome with Confounder", + main = paste("Combination of R2 such that |Bias| < ", round(sqrt(BiasSq), digits=4)) +) +``` + +## Bias Analysis with Sensemakr + +```{r} +dml.darfur.sensitivity <- sensemakr(model = dml.darfur.model, + treatment = "resD") +summary(dml.darfur.sensitivity) +plot(dml.darfur.sensitivity, nlevels = 15) +``` + +## Random Forest for partialling out + +The following code does DML with clustered standard errors by village + +```{r} +DML2.for.PLM <- function(x, d, y, dreg, yreg, nfold=2, clusterID) { + nobs <- nrow(x) #number of observations + foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices + I <- split(1:nobs, foldid) #split observation indices into folds + ytil <- dtil <- rep(NA, nobs) + cat("fold: ") + for(b in 1:length(I)){ + dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out + yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a foldt out + dhat <- predict(dfit, x[I[[b]],], type="response") #predict the left-out fold + yhat <- predict(yfit, x[I[[b]],], type="response") #predict the left-out fold + dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold + ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold + cat(b," ") + } + rfit <- felm(ytil ~ dtil |0|0|clusterID) #get clustered standard errors using felm + rfitSummary<- summary(rfit) + coef.est <- rfitSummary$coef[2] #extract coefficient + se <- rfitSummary$coef[2,2] #record robust standard error + cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) #printing output + return( list(coef.est =coef.est , se=se, 
dtil=dtil, ytil=ytil) ) #save output and residuals +} +``` + +```{r} +x = model.matrix(~ femaleR + ageR + farmerR + herderR + pastvotedR + hhsizeR) +dim(x) + +d = directlyharmedR +y = peacefactorR; + +# DML with Random Forest: +dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest +yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest + +DML2.RF = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10, clusterID=village) + +resY = DML2.RF$ytil +resD = DML2.RF$dtil + +print(c("Controls explain the following fraction of variance of Outcome", max(1-var(resY)/var(peacefactorR),0))) +print(c("Controls explain the following fraction of variance of Treatment", max(1-var(resD)/var(directlyharmedR),0))) + +dml.darfur.model = lm(resY~resD) +``` + +## Bias Analysis with Sensemakr + + +```{r} +dml.darfur.sensitivity <- sensemakr(model = dml.darfur.model, + treatment = "resD") +summary(dml.darfur.sensitivity) + +plot(dml.darfur.sensitivity,nlevels = 15) +``` + diff --git a/AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.irnb b/AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.irnb index 7bb64c78..4e3064ca 100644 --- a/AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.irnb +++ b/AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.irnb @@ -1,629 +1,629 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "martial-negative", - "metadata": { - "papermill": { - "duration": 0.024232, - "end_time": "2021-04-03T12:55:25.038730", - "exception": false, - "start_time": "2021-04-03T12:55:25.014498", - "status": "completed" - }, - "tags": [], - "id": "martial-negative" - }, - "source": [ - "# Sensitivity Analysis for Unobserved Confounder with DML and Sensmakr\n", - "\n", - "Here we experiment with using package \"sensemakr\" in conjunction with debiased ML." - ] - }, - { - "cell_type": "markdown", - "id": "criminal-workplace", - "metadata": { - "papermill": { - "duration": 0.019939, - "end_time": "2021-04-03T12:55:25.120184", - "exception": false, - "start_time": "2021-04-03T12:55:25.100245", - "status": "completed" - }, - "tags": [], - "id": "criminal-workplace" - }, - "source": [ - "## Partially Linear SEM\n", - "\n", - "Consider the SEM\n", - "\\begin{eqnarray*}\n", - "Y & := & \\alpha D + \\delta A + f_Y(X) + \\epsilon_Y, \\\\\n", - "D & := & \\gamma A + f_D(X) + \\epsilon_D, \\\\\n", - "A & : = & f_A(X) + \\epsilon_A, \\\\\n", - "X & := & \\epsilon_X,\n", - "\\end{eqnarray*}\n", - "where, conditional on $X$, $\\epsilon_Y, \\epsilon_D, \\epsilon_A$ are mean zero\n", - "and mutually uncorrelated. We further normalize\n", - "$$\n", - "E[\\epsilon_A^2] =1.\n", - "$$\n", - "The key structural\n", - "parameter is $\\alpha$: $$\\alpha = \\partial_d Y(d)$$\n", - "where $$Y(d) := (Y: do (D=d)).$$\n", - "\n", - "To give context to our example, we can interpret $Y$ as earnings,\n", - "$D$ as education, $A$ as ability, and $X$ as a set of observed background variables. In this example, we can interpret $\\alpha$ as the returns to schooling.\n", - "\n", - "We start by applying the partialling out operator to get rid of the $X$'s in all of the equations. Define the partialling out operation of any random vector $V$ with respect to another random vector $X$ as the residual that is left after subtracting the best predictor of $V$ given $X$:\n", - "$$\\tilde V = V - E [V \\mid X].$$ \n", - "If $f$'s are linear, we can replace $E [V \\mid X]$\n", - "by linear projection. 
After partialling out, we have a simplified system:\n", - "\\begin{eqnarray*}\n", - "\\tilde Y & := & \\alpha \\tilde D + \\delta \\tilde A + \\epsilon_Y, \\\\\n", - "\\tilde D & := & \\gamma \\tilde A + \\epsilon_D, \\\\\n", - "\\tilde A & : = & \\epsilon_A,\n", - "\\end{eqnarray*}\n", - "where $\\epsilon_Y$, $\\epsilon_D$, and $\\epsilon_A$ are uncorrelated.\n", - "\n", - "Then the projection of $\\tilde Y$ on $\\tilde D$ recovers\n", - "$$\n", - "\\beta = E [\\tilde Y \\tilde D]/ E [\\tilde D^2] = \\alpha + \\phi,\n", - "$$\n", - "where\n", - "$$\n", - "\\phi = \\delta \\gamma/ E \\left[(\\gamma^2 + \\epsilon^2_D)\\right],\n", - "$$\n", - "is the omitted confounder bias or omitted variable bias.\n", - "\n", - "The formula follows from inserting the expression for $\\tilde D$ into the definition of $\\beta$ and then simplifying the resulting expression using the assumptions on the $\\epsilon$'s.\n", - "\n", - "We can use this formula to bound $\\phi$ directly by making assumptions on the size of $\\delta$\n", - "and $\\gamma$. An alternative approach can be based on the following characterization,\n", - "based on partial $R^2$'s. This characterization essentially follows\n", - "from Cinelli and Hazlett, with the slight difference that we have adapted\n", - "the result to the partially linear model.\n", - "\n", - "*Theorem* [Omitted Confounder Bias in Terms of Partial $R^2$'s]\n", - "\n", - "In the partially linear SEM setting above,\n", - "$$\n", - "\\phi^2 = \\frac{R^2_{\\tilde Y \\sim \\tilde A \\mid \\tilde D} R^2_{\\tilde D \\sim \\tilde A} }{ (1 - R^2_{\\tilde D \\sim \\tilde A}) } \\\n", - "\\frac{E \\left[ (\\tilde Y - \\beta \\tilde D)^2 \\right] }{E \\left[ ( \\tilde D )^2 \\right]},\n", - "$$\n", - "where $R^2_{V \\sim W \\mid X}$ denotes the population $R^2$ in the linear regression of $V$ on $W$, after partialling out $X$ from $V$ and $W$ linearly.\n", - "\n", - "\n", - "Therefore, if we place bounds on how much of the variation in $\\tilde Y$ and in $\\tilde D$\n", - "the unobserved confounder $\\tilde A$ is able to explain, we can bound the omitted confounder bias by $$\\sqrt{\\phi^2}.$$\n" - ] - }, - { - "cell_type": "markdown", - "id": "continuous-marshall", - "metadata": { - "papermill": { - "duration": 0.020014, - "end_time": "2021-04-03T12:55:25.160190", - "exception": false, - "start_time": "2021-04-03T12:55:25.140176", - "status": "completed" - }, - "tags": [], - "id": "continuous-marshall" - }, - "source": [ - "# Empirical Example\n", - "\n", - "We consider an empirical example based on data surrounding the Darfur war. Specifically, we are interested in the effect of having experienced direct war violence on attitudes towards peace. Data is described here\n", - "https://cran.r-project.org/web/packages/sensemakr/vignettes/sensemakr.html\n", - "\n", - "The main outcome is attitude towards peace -- ``peacefactor``.\n", - "The key variable of interest is whether the responders were directly harmed (``directlyharmed``).\n", - "We want to know if being directly harmed in the conflict causes people to support peace-enforcing measures.\n", - "The measured confounders include dummy variables ``female``, ``farmer``, ``herder``, ``pastvoted``, along with ``age`` and household size (``hhsize``).\n", - "There is also a village indicator. We deal with village by and partialling out a full set of village dummy variables before conducting\n", - "the analysis. The standard errors will be clustered at the village level." 
- ] - }, - { - "cell_type": "markdown", - "source": [ - "\n", - "## Outline\n", - "\n", - "We will:\n", - "- mimic the partialling out procedure with machine learning tools;\n", - "- invoke Sensmakr to compute $\\phi^2$ and plot sensitivity results.\n" - ], - "metadata": { - "id": "oW_mOo_wcpmV" - }, - "id": "oW_mOo_wcpmV" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "spanish-queue", - "metadata": { - "_execution_state": "idle", - "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", - "papermill": { - "duration": 13.050132, - "end_time": "2021-04-03T12:55:38.230528", - "exception": false, - "start_time": "2021-04-03T12:55:25.180396", - "status": "completed" - }, - "tags": [], - "id": "spanish-queue" - }, - "outputs": [], - "source": [ - "# load package\n", - "install.packages(\"sensemakr\")\n", - "install.packages(\"lfe\")\n", - "install.packages(\"hdm\")\n", - "install.packages(\"randomForest\")\n", - "library(sensemakr)\n", - "library(lfe)\n", - "library(hdm)\n", - "library(randomForest)\n", - "\n", - "set.seed(1)" - ] - }, - { - "cell_type": "code", - "source": [ - "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/darfur.csv\"\n", - "data <- read.csv(file)\n", - "dim(data)\n", - "attach(darfur)\n", - "\n", - "# # Alternatively load data and save as follows:\n", - "# data(\"darfur\")\n", - "# # write.csv(darfur, \"darfur.csv\", row.names=FALSE)\n" - ], - "metadata": { - "id": "zipYYvHdl60m" - }, - "id": "zipYYvHdl60m", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "id": "hidden-packing", - "metadata": { - "papermill": { - "duration": 0.021289, - "end_time": "2021-04-03T12:55:38.319389", - "exception": false, - "start_time": "2021-04-03T12:55:38.298100", - "status": "completed" - }, - "tags": [], - "id": "hidden-packing" - }, - "source": [ - "## Preprocessing\n", - "Take out village fixed effects and run basic linear analysis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "authorized-transformation", - "metadata": { - "papermill": { - "duration": 2.339638, - "end_time": "2021-04-03T12:55:40.680306", - "exception": false, - "start_time": "2021-04-03T12:55:38.340668", - "status": "completed" - }, - "tags": [], - "id": "authorized-transformation" - }, - "outputs": [], - "source": [ - "#get rid of village fixed effects\n", - "peacefactorR <- lm(peacefactor~village)$res\n", - "directlyharmedR <- lm(directlyharmed~village)$res\n", - "femaleR <- lm(female~village)$res\n", - "ageR <- lm(age~village)$res\n", - "farmerR <- lm(farmer_dar~village)$res\n", - "herderR <- lm(herder_dar~village)$res\n", - "pastvotedR <- lm(pastvoted~village)$res\n", - "hhsizeR <- lm(hhsize_darfur~village)$res\n", - "\n", - "\n", - "# Preliminary linear model analysis\n", - "summary(felm(peacefactorR~ directlyharmedR+ femaleR +\n", - " ageR + farmerR+ herderR + pastvotedR + hhsizeR |0|0|village))\n", - "\n", - "# here we are clustering standard errors at the village level\n", - "summary(felm(peacefactorR~ femaleR +\n", - " ageR + farmerR+ herderR + pastvotedR + hhsizeR |0|0|village))\n", - "\n", - "summary(felm(directlyharmedR~ femaleR +\n", - " ageR + farmerR+ herderR + pastvotedR + hhsizeR |0|0|village))" - ] - }, - { - "cell_type": "markdown", - "source": [ - "For benchmarking, we want to understand the \"partial $R^2$\" of the controls on the outcome after partialling out the variable of interest." 
- ], - "metadata": { - "id": "dpvDjIKNw7Nk" - }, - "id": "dpvDjIKNw7Nk" - }, - { - "cell_type": "code", - "source": [ - "#partial out variable of interest\n", - "peacefactorR2 <- lm(peacefactorR~directlyharmedR)$res\n", - "femaleR2 <- lm(femaleR~directlyharmedR)$res\n", - "ageR2 <- lm(ageR~directlyharmedR)$res\n", - "farmerR2 <- lm(farmerR~directlyharmedR)$res\n", - "herderR2 <- lm(herderR~directlyharmedR)$res\n", - "pastvotedR2 <- lm(pastvotedR~directlyharmedR)$res\n", - "hhsizeR2 <- lm(hhsizeR~directlyharmedR)$res\n", - "\n", - "\n", - "# R^2 of controls after partialling out variable of interest\n", - "summary(lm(peacefactorR2 ~ femaleR2 +\n", - " ageR2 + farmerR2 + herderR2 + pastvotedR2 + hhsizeR2))\n", - "\n" - ], - "metadata": { - "id": "ck8nzqQbv8dz" - }, - "id": "ck8nzqQbv8dz", - "execution_count": null, - "outputs": [] + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": { + "id": "martial-negative", + "papermill": { + "duration": 0.024232, + "end_time": "2021-04-03T12:55:25.038730", + "exception": false, + "start_time": "2021-04-03T12:55:25.014498", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "careful-dollar", - "metadata": { - "papermill": { - "duration": 0.041148, - "end_time": "2021-04-03T12:55:40.762964", - "exception": false, - "start_time": "2021-04-03T12:55:40.721816", - "status": "completed" - }, - "tags": [], - "id": "careful-dollar" - }, - "source": [ - "## Lasso for partialling out controls" - ] + "tags": [] + }, + "source": [ + "# Sensitivity Analysis for Unobserved Confounder with DML and Sensmakr\n", + "\n", + "Here we experiment with using package \"sensemakr\" in conjunction with debiased ML." + ] + }, + { + "cell_type": "markdown", + "id": "1", + "metadata": { + "id": "criminal-workplace", + "papermill": { + "duration": 0.019939, + "end_time": "2021-04-03T12:55:25.120184", + "exception": false, + "start_time": "2021-04-03T12:55:25.100245", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "remarkable-mozambique", - "metadata": { - "papermill": { - "duration": 0.310314, - "end_time": "2021-04-03T12:55:41.103965", - "exception": false, - "start_time": "2021-04-03T12:55:40.793651", - "status": "completed" - }, - "tags": [], - "id": "remarkable-mozambique" - }, - "outputs": [], - "source": [ - "resY = rlasso(peacefactorR ~ (femaleR +\n", - " ageR + farmerR+ herderR + pastvotedR + hhsizeR)^3, post=F)$res\n", - "\n", - "resD = rlasso(directlyharmedR ~ (femaleR +\n", - " ageR + farmerR + herderR + pastvotedR + hhsizeR)^3 , post=F)$res\n", - "\n", - "print(c(\"Controls explain the following fraction of variance of Outcome\", 1-var(resY)/var(peacefactorR)))\n", - "print(c(\"Controls explain the following fraction of variance of Treatment\", 1-var(resD)/var(directlyharmedR)))\n", - "\n", - "dml.darfur.model = felm(resY ~ resD|0|0|village) # cluster SEs by village\n", - "\n", - "summary(dml.darfur.model,robust=T) # cluster SE by village\n", - "\n", - "dml.darfur.model = lm(resY ~ resD) # linear model to use as input in sensemakr" - ] + "tags": [] + }, + "source": [ + "## Partially Linear SEM\n", + "\n", + "Consider the SEM\n", + "\\begin{eqnarray*}\n", + "Y & := & \\alpha D + \\delta A + f_Y(X) + \\epsilon_Y, \\\\\n", + "D & := & \\gamma A + f_D(X) + \\epsilon_D, \\\\\n", + "A & : = & f_A(X) + \\epsilon_A, \\\\\n", + "X & := & \\epsilon_X,\n", + "\\end{eqnarray*}\n", + "where, conditional on $X$, $\\epsilon_Y, \\epsilon_D, \\epsilon_A$ are mean zero\n", + "and mutually uncorrelated. 
We further normalize\n", + "$$\n", + "E[\\epsilon_A^2] =1.\n", + "$$\n", + "The key structural\n", + "parameter is $\\alpha$: $$\\alpha = \\partial_d Y(d)$$\n", + "where $$Y(d) := (Y: do (D=d)).$$\n", + "\n", + "To give context to our example, we can interpret $Y$ as earnings,\n", + "$D$ as education, $A$ as ability, and $X$ as a set of observed background variables. In this example, we can interpret $\\alpha$ as the returns to schooling.\n", + "\n", + "We start by applying the partialling out operator to get rid of the $X$'s in all of the equations. Define the partialling out operation of any random vector $V$ with respect to another random vector $X$ as the residual that is left after subtracting the best predictor of $V$ given $X$:\n", + "$$\\tilde V = V - E [V \\mid X].$$ \n", + "If $f$'s are linear, we can replace $E [V \\mid X]$\n", + "by linear projection. After partialling out, we have a simplified system:\n", + "\\begin{eqnarray*}\n", + "\\tilde Y & := & \\alpha \\tilde D + \\delta \\tilde A + \\epsilon_Y, \\\\\n", + "\\tilde D & := & \\gamma \\tilde A + \\epsilon_D, \\\\\n", + "\\tilde A & : = & \\epsilon_A,\n", + "\\end{eqnarray*}\n", + "where $\\epsilon_Y$, $\\epsilon_D$, and $\\epsilon_A$ are uncorrelated.\n", + "\n", + "Then the projection of $\\tilde Y$ on $\\tilde D$ recovers\n", + "$$\n", + "\\beta = E [\\tilde Y \\tilde D]/ E [\\tilde D^2] = \\alpha + \\phi,\n", + "$$\n", + "where\n", + "$$\n", + "\\phi = \\delta \\gamma/ E \\left[(\\gamma^2 + \\epsilon^2_D)\\right],\n", + "$$\n", + "is the omitted confounder bias or omitted variable bias.\n", + "\n", + "The formula follows from inserting the expression for $\\tilde D$ into the definition of $\\beta$ and then simplifying the resulting expression using the assumptions on the $\\epsilon$'s.\n", + "\n", + "We can use this formula to bound $\\phi$ directly by making assumptions on the size of $\\delta$\n", + "and $\\gamma$. An alternative approach can be based on the following characterization,\n", + "based on partial $R^2$'s. 
This characterization essentially follows\n", + "from Cinelli and Hazlett, with the slight difference that we have adapted\n", + "the result to the partially linear model.\n", + "\n", + "*Theorem* [Omitted Confounder Bias in Terms of Partial $R^2$'s]\n", + "\n", + "In the partially linear SEM setting above,\n", + "$$\n", + "\\phi^2 = \\frac{R^2_{\\tilde Y \\sim \\tilde A \\mid \\tilde D} R^2_{\\tilde D \\sim \\tilde A} }{ (1 - R^2_{\\tilde D \\sim \\tilde A}) } \\\n", + "\\frac{E \\left[ (\\tilde Y - \\beta \\tilde D)^2 \\right] }{E \\left[ ( \\tilde D )^2 \\right]},\n", + "$$\n", + "where $R^2_{V \\sim W \\mid X}$ denotes the population $R^2$ in the linear regression of $V$ on $W$, after partialling out $X$ from $V$ and $W$ linearly.\n", + "\n", + "\n", + "Therefore, if we place bounds on how much of the variation in $\\tilde Y$ and in $\\tilde D$\n", + "the unobserved confounder $\\tilde A$ is able to explain, we can bound the omitted confounder bias by $$\\sqrt{\\phi^2}.$$\n" + ] + }, + { + "cell_type": "markdown", + "id": "2", + "metadata": { + "id": "continuous-marshall", + "papermill": { + "duration": 0.020014, + "end_time": "2021-04-03T12:55:25.160190", + "exception": false, + "start_time": "2021-04-03T12:55:25.140176", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "built-enlargement", - "metadata": { - "papermill": { - "duration": 0.02335, - "end_time": "2021-04-03T12:55:41.169602", - "exception": false, - "start_time": "2021-04-03T12:55:41.146252", - "status": "completed" - }, - "tags": [], - "id": "built-enlargement" - }, - "source": [ - "## Manual Bias Analysis" - ] + "tags": [] + }, + "source": [ + "# Empirical Example\n", + "\n", + "We consider an empirical example based on data surrounding the Darfur war. Specifically, we are interested in the effect of having experienced direct war violence on attitudes towards peace. Data is described here\n", + "https://cran.r-project.org/web/packages/sensemakr/vignettes/sensemakr.html\n", + "\n", + "The main outcome is attitude towards peace -- ``peacefactor``.\n", + "The key variable of interest is whether the responders were directly harmed (``directlyharmed``).\n", + "We want to know if being directly harmed in the conflict causes people to support peace-enforcing measures.\n", + "The measured confounders include dummy variables ``female``, ``farmer``, ``herder``, ``pastvoted``, along with ``age`` and household size (``hhsize``).\n", + "There is also a village indicator. We deal with village by and partialling out a full set of village dummy variables before conducting\n", + "the analysis. The standard errors will be clustered at the village level." 
+ ] + }, + { + "cell_type": "markdown", + "id": "3", + "metadata": { + "id": "oW_mOo_wcpmV" + }, + "source": [ + "\n", + "## Outline\n", + "\n", + "We will:\n", + "- mimic the partialling out procedure with machine learning tools;\n", + "- invoke Sensmakr to compute $\\phi^2$ and plot sensitivity results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4", + "metadata": { + "_execution_state": "idle", + "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", + "id": "spanish-queue", + "papermill": { + "duration": 13.050132, + "end_time": "2021-04-03T12:55:38.230528", + "exception": false, + "start_time": "2021-04-03T12:55:25.180396", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "respective-sister", - "metadata": { - "papermill": { - "duration": 0.380639, - "end_time": "2021-04-03T12:55:41.573999", - "exception": false, - "start_time": "2021-04-03T12:55:41.193360", - "status": "completed" - }, - "tags": [], - "id": "respective-sister" - }, - "outputs": [], - "source": [ - "# Main estimate\n", - "beta = dml.darfur.model$coef[2]\n", - "\n", - "# Hypothetical values of partial R2s\n", - "R2.YC = .13; R2.DC = .01\n", - "\n", - "# Elements of the bias equation\n", - "kappa<- (R2.YC * R2.DC)/(1- R2.DC)\n", - "varianceRatio<- mean(dml.darfur.model$res^2)/mean(resD^2)\n", - "\n", - "# Compute square bias\n", - "BiasSq <- kappa*varianceRatio\n", - "\n", - "# Compute absolute value of the bias\n", - "print(cat(\"absolute value of the bias:\", sqrt(BiasSq)[1]))\n", - "\n", - "# plotting\n", - "gridR2.DC<- seq(0,.3, by=.001)\n", - "gridR2.YC<- kappa*(1 - gridR2.DC)/gridR2.DC\n", - "gridR2.YC<- ifelse(gridR2.YC> 1, 1, gridR2.YC);\n", - "\n", - "plot(gridR2.DC, gridR2.YC, type=\"l\", col=4, xlab = \"Partial R2 of Treatment with Confounder\",\n", - " ylab = \"Partial R2 of Outcome with Confounder\",\n", - " main = paste(\"Combination of R2 such that |Bias| < \", round(sqrt(BiasSq), digits=4))\n", - ")\n" - ] + "tags": [] + }, + "outputs": [], + "source": [ + "# load package\n", + "install.packages(\"sensemakr\")\n", + "install.packages(\"lfe\")\n", + "install.packages(\"hdm\")\n", + "install.packages(\"randomForest\")\n", + "library(sensemakr)\n", + "library(lfe)\n", + "library(hdm)\n", + "library(randomForest)\n", + "\n", + "set.seed(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5", + "metadata": { + "id": "zipYYvHdl60m" + }, + "outputs": [], + "source": [ + "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/darfur.csv\"\n", + "data <- read.csv(file)\n", + "dim(data)\n", + "attach(darfur)\n", + "\n", + "# # Alternatively load data and save as follows:\n", + "# data(\"darfur\")\n", + "# # write.csv(darfur, \"darfur.csv\", row.names=FALSE)\n" + ] + }, + { + "cell_type": "markdown", + "id": "6", + "metadata": { + "id": "hidden-packing", + "papermill": { + "duration": 0.021289, + "end_time": "2021-04-03T12:55:38.319389", + "exception": false, + "start_time": "2021-04-03T12:55:38.298100", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "sorted-hands", - "metadata": { - "papermill": { - "duration": 0.025659, - "end_time": "2021-04-03T12:55:41.626309", - "exception": false, - "start_time": "2021-04-03T12:55:41.600650", - "status": "completed" - }, - "tags": [], - "id": "sorted-hands" - }, - "source": [ - "## Bias Analysis with Sensemakr" - ] + "tags": [] + }, + "source": [ + "## Preprocessing\n", + "Take out village fixed effects and run basic linear analysis" + 
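As an aside on notation: `felm` from the `lfe` package takes a four-part formula, `outcome ~ regressors | fixed effects | instruments | cluster`, so the `|0|0|village` pieces in the next cell mean "no absorbed fixed effects, no instruments, cluster standard errors by village". A minimal sketch of the same syntax (the name `fit` is illustrative only, and the sketch assumes `lfe` is loaded and the `darfur` data frame is available, e.g. via `data("darfur")` from sensemakr):

```r
# Illustration of felm's formula parts only, not the analysis itself:
# absorb village fixed effects and cluster standard errors by village in one call.
fit <- felm(peacefactor ~ directlyharmed + female + age | village | 0 | village, data = darfur)
summary(fit)  # the coefficient table reports village-clustered standard errors
```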
] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7", + "metadata": { + "id": "authorized-transformation", + "papermill": { + "duration": 2.339638, + "end_time": "2021-04-03T12:55:40.680306", + "exception": false, + "start_time": "2021-04-03T12:55:38.340668", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "proper-accessory", - "metadata": { - "papermill": { - "duration": 0.572625, - "end_time": "2021-04-03T12:55:42.224511", - "exception": false, - "start_time": "2021-04-03T12:55:41.651886", - "status": "completed" - }, - "tags": [], - "id": "proper-accessory" - }, - "outputs": [], - "source": [ - "dml.darfur.sensitivity <- sensemakr(model = dml.darfur.model,\n", - " treatment = \"resD\")\n", - "summary(dml.darfur.sensitivity)\n", - "plot(dml.darfur.sensitivity, nlevels = 15)" - ] + "tags": [] + }, + "outputs": [], + "source": [ + "#get rid of village fixed effects\n", + "peacefactorR <- lm(peacefactor~village)$res\n", + "directlyharmedR <- lm(directlyharmed~village)$res\n", + "femaleR <- lm(female~village)$res\n", + "ageR <- lm(age~village)$res\n", + "farmerR <- lm(farmer_dar~village)$res\n", + "herderR <- lm(herder_dar~village)$res\n", + "pastvotedR <- lm(pastvoted~village)$res\n", + "hhsizeR <- lm(hhsize_darfur~village)$res\n", + "\n", + "\n", + "# Preliminary linear model analysis\n", + "summary(felm(peacefactorR~ directlyharmedR+ femaleR +\n", + " ageR + farmerR+ herderR + pastvotedR + hhsizeR |0|0|village))\n", + "\n", + "# here we are clustering standard errors at the village level\n", + "summary(felm(peacefactorR~ femaleR +\n", + " ageR + farmerR+ herderR + pastvotedR + hhsizeR |0|0|village))\n", + "\n", + "summary(felm(directlyharmedR~ femaleR +\n", + " ageR + farmerR+ herderR + pastvotedR + hhsizeR |0|0|village))" + ] + }, + { + "cell_type": "markdown", + "id": "8", + "metadata": { + "id": "dpvDjIKNw7Nk" + }, + "source": [ + "For benchmarking, we want to understand the \"partial $R^2$\" of the controls on the outcome after partialling out the variable of interest." 
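The $R^2$ from that benchmarking regression can also be extracted as a single number and used as an empirical yardstick when choosing hypothetical confounder strengths, such as `R2.YC` in the manual bias analysis further below. A minimal sketch, assuming the residualized variables `peacefactorR2`, `femaleR2`, ..., `hhsizeR2` defined in the next cell are in memory; `benchmark_fit` and `r2_benchmark` are illustrative names:

```r
benchmark_fit <- lm(peacefactorR2 ~ femaleR2 + ageR2 + farmerR2 + herderR2 + pastvotedR2 + hhsizeR2)
r2_benchmark <- summary(benchmark_fit)$r.squared  # partial R^2 of the observed controls with the outcome
r2_benchmark  # a reference point for plausible values of R2.YC
```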
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9", + "metadata": { + "id": "ck8nzqQbv8dz" + }, + "outputs": [], + "source": [ + "#partial out variable of interest\n", + "peacefactorR2 <- lm(peacefactorR~directlyharmedR)$res\n", + "femaleR2 <- lm(femaleR~directlyharmedR)$res\n", + "ageR2 <- lm(ageR~directlyharmedR)$res\n", + "farmerR2 <- lm(farmerR~directlyharmedR)$res\n", + "herderR2 <- lm(herderR~directlyharmedR)$res\n", + "pastvotedR2 <- lm(pastvotedR~directlyharmedR)$res\n", + "hhsizeR2 <- lm(hhsizeR~directlyharmedR)$res\n", + "\n", + "\n", + "# R^2 of controls after partialling out variable of interest\n", + "summary(lm(peacefactorR2 ~ femaleR2 +\n", + " ageR2 + farmerR2 + herderR2 + pastvotedR2 + hhsizeR2))\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "10", + "metadata": { + "id": "careful-dollar", + "papermill": { + "duration": 0.041148, + "end_time": "2021-04-03T12:55:40.762964", + "exception": false, + "start_time": "2021-04-03T12:55:40.721816", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "charged-mauritius", - "metadata": { - "papermill": { - "duration": 0.030825, - "end_time": "2021-04-03T12:55:42.286467", - "exception": false, - "start_time": "2021-04-03T12:55:42.255642", - "status": "completed" - }, - "tags": [], - "id": "charged-mauritius" - }, - "source": [ - "## Random Forest for partialling out" - ] + "tags": [] + }, + "source": [ + "## Lasso for partialling out controls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11", + "metadata": { + "id": "remarkable-mozambique", + "papermill": { + "duration": 0.310314, + "end_time": "2021-04-03T12:55:41.103965", + "exception": false, + "start_time": "2021-04-03T12:55:40.793651", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "charitable-placement", - "metadata": { - "papermill": { - "duration": 0.030332, - "end_time": "2021-04-03T12:55:42.347072", - "exception": false, - "start_time": "2021-04-03T12:55:42.316740", - "status": "completed" - }, - "tags": [], - "id": "charitable-placement" - }, - "source": [ - "The following code does DML with clustered standard errors by village" - ] + "tags": [] + }, + "outputs": [], + "source": [ + "resY = rlasso(peacefactorR ~ (femaleR +\n", + " ageR + farmerR+ herderR + pastvotedR + hhsizeR)^3, post=F)$res\n", + "\n", + "resD = rlasso(directlyharmedR ~ (femaleR +\n", + " ageR + farmerR + herderR + pastvotedR + hhsizeR)^3 , post=F)$res\n", + "\n", + "print(c(\"Controls explain the following fraction of variance of Outcome\", 1-var(resY)/var(peacefactorR)))\n", + "print(c(\"Controls explain the following fraction of variance of Treatment\", 1-var(resD)/var(directlyharmedR)))\n", + "\n", + "dml.darfur.model = felm(resY ~ resD|0|0|village) # cluster SEs by village\n", + "\n", + "summary(dml.darfur.model,robust=T) # cluster SE by village\n", + "\n", + "dml.darfur.model = lm(resY ~ resD) # linear model to use as input in sensemakr" + ] + }, + { + "cell_type": "markdown", + "id": "12", + "metadata": { + "id": "built-enlargement", + "papermill": { + "duration": 0.02335, + "end_time": "2021-04-03T12:55:41.169602", + "exception": false, + "start_time": "2021-04-03T12:55:41.146252", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "collect-neutral", - "metadata": { - "papermill": { - "duration": 0.050776, - "end_time": "2021-04-03T12:55:42.428137", - "exception": false, - "start_time": "2021-04-03T12:55:42.377361", - "status": "completed" - }, - "tags": [], - "id": 
"collect-neutral" - }, - "outputs": [], - "source": [ - "DML2.for.PLM <- function(x, d, y, dreg, yreg, nfold=2, clusterID) {\n", - " nobs <- nrow(x) #number of observations\n", - " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices\n", - " I <- split(1:nobs, foldid) #split observation indices into folds\n", - " ytil <- dtil <- rep(NA, nobs)\n", - " cat(\"fold: \")\n", - " for(b in 1:length(I)){\n", - " dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out\n", - " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a foldt out\n", - " dhat <- predict(dfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", - " yhat <- predict(yfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", - " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold\n", - " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold\n", - " cat(b,\" \")\n", - " }\n", - " rfit <- felm(ytil ~ dtil |0|0|clusterID) #get clustered standard errors using felm\n", - " rfitSummary<- summary(rfit)\n", - " coef.est <- rfitSummary$coef[2] #extract coefficient\n", - " se <- rfitSummary$coef[2,2] #record robust standard error\n", - " cat(sprintf(\"\\ncoef (se) = %g (%g)\\n\", coef.est , se)) #printing output\n", - " return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil) ) #save output and residuals\n", - "}" - ] + "tags": [] + }, + "source": [ + "## Manual Bias Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13", + "metadata": { + "id": "respective-sister", + "papermill": { + "duration": 0.380639, + "end_time": "2021-04-03T12:55:41.573999", + "exception": false, + "start_time": "2021-04-03T12:55:41.193360", + "status": "completed" }, - { - "cell_type": "code", - "source": [ - "x = model.matrix(~ femaleR + ageR + farmerR + herderR + pastvotedR + hhsizeR)\n", - "dim(x)\n", - "\n", - "d = directlyharmedR\n", - "y = peacefactorR;\n", - "\n", - "# DML with Random Forest:\n", - "dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest\n", - "yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest\n", - "\n", - "DML2.RF = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10, clusterID=village)\n", - "\n", - "resY = DML2.RF$ytil\n", - "resD = DML2.RF$dtil\n", - "\n", - "print(c(\"Controls explain the following fraction of variance of Outcome\", max(1-var(resY)/var(peacefactorR),0)))\n", - "print(c(\"Controls explain the following fraction of variance of Treatment\", max(1-var(resD)/var(directlyharmedR),0)))\n", - "\n", - "dml.darfur.model = lm(resY~resD)" - ], - "metadata": { - "id": "d8eMyN7NRhYO" - }, - "id": "d8eMyN7NRhYO", - "execution_count": null, - "outputs": [] + "tags": [] + }, + "outputs": [], + "source": [ + "# Main estimate\n", + "beta = dml.darfur.model$coef[2]\n", + "\n", + "# Hypothetical values of partial R2s\n", + "R2.YC = .13; R2.DC = .01\n", + "\n", + "# Elements of the bias equation\n", + "kappa<- (R2.YC * R2.DC)/(1- R2.DC)\n", + "varianceRatio<- mean(dml.darfur.model$res^2)/mean(resD^2)\n", + "\n", + "# Compute square bias\n", + "BiasSq <- kappa*varianceRatio\n", + "\n", + "# Compute absolute value of the bias\n", + "print(cat(\"absolute value of the bias:\", sqrt(BiasSq)[1]))\n", + "\n", + "# plotting\n", + "gridR2.DC<- seq(0,.3, by=.001)\n", + "gridR2.YC<- kappa*(1 - gridR2.DC)/gridR2.DC\n", + "gridR2.YC<- ifelse(gridR2.YC> 1, 1, gridR2.YC);\n", + "\n", + "plot(gridR2.DC, gridR2.YC, type=\"l\", col=4, xlab = \"Partial R2 of Treatment with Confounder\",\n", + " ylab = \"Partial R2 of 
Outcome with Confounder\",\n", + " main = paste(\"Combination of R2 such that |Bias| < \", round(sqrt(BiasSq), digits=4))\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "14", + "metadata": { + "id": "sorted-hands", + "papermill": { + "duration": 0.025659, + "end_time": "2021-04-03T12:55:41.626309", + "exception": false, + "start_time": "2021-04-03T12:55:41.600650", + "status": "completed" }, - { - "cell_type": "markdown", - "source": [ - "## Bias Analysis with Sensemakr\n", - "\n" - ], - "metadata": { - "id": "cUxDc1mYdMHH" - }, - "id": "cUxDc1mYdMHH" + "tags": [] + }, + "source": [ + "## Bias Analysis with Sensemakr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15", + "metadata": { + "id": "proper-accessory", + "papermill": { + "duration": 0.572625, + "end_time": "2021-04-03T12:55:42.224511", + "exception": false, + "start_time": "2021-04-03T12:55:41.651886", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "obvious-there", - "metadata": { - "papermill": { - "duration": 40.040643, - "end_time": "2021-04-03T12:56:22.614312", - "exception": false, - "start_time": "2021-04-03T12:55:42.573669", - "status": "completed" - }, - "tags": [], - "id": "obvious-there" - }, - "outputs": [], - "source": [ - "dml.darfur.sensitivity <- sensemakr(model = dml.darfur.model,\n", - " treatment = \"resD\")\n", - "summary(dml.darfur.sensitivity)\n", - "\n", - "plot(dml.darfur.sensitivity,nlevels = 15)\n" - ] + "tags": [] + }, + "outputs": [], + "source": [ + "dml.darfur.sensitivity <- sensemakr(model = dml.darfur.model,\n", + " treatment = \"resD\")\n", + "summary(dml.darfur.sensitivity)\n", + "plot(dml.darfur.sensitivity, nlevels = 15)" + ] + }, + { + "cell_type": "markdown", + "id": "16", + "metadata": { + "id": "charged-mauritius", + "papermill": { + "duration": 0.030825, + "end_time": "2021-04-03T12:55:42.286467", + "exception": false, + "start_time": "2021-04-03T12:55:42.255642", + "status": "completed" }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "lqXAYlPCTMl1" - }, - "id": "lqXAYlPCTMl1", - "execution_count": null, - "outputs": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" + "tags": [] + }, + "source": [ + "## Random Forest for partialling out" + ] + }, + { + "cell_type": "markdown", + "id": "17", + "metadata": { + "id": "charitable-placement", + "papermill": { + "duration": 0.030332, + "end_time": "2021-04-03T12:55:42.347072", + "exception": false, + "start_time": "2021-04-03T12:55:42.316740", + "status": "completed" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "3.6.3" + "tags": [] + }, + "source": [ + "The following code does DML with clustered standard errors by village" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18", + "metadata": { + "id": "collect-neutral", + "papermill": { + "duration": 0.050776, + "end_time": "2021-04-03T12:55:42.428137", + "exception": false, + "start_time": "2021-04-03T12:55:42.377361", + "status": "completed" }, + "tags": [] + }, + "outputs": [], + "source": [ + "DML2.for.PLM <- function(x, d, y, dreg, yreg, nfold=2, clusterID) {\n", + " nobs <- nrow(x) #number of observations\n", + " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices\n", + " I <- split(1:nobs, foldid) #split observation indices into folds\n", + " ytil <- dtil <- 
rep(NA, nobs)\n", + " cat(\"fold: \")\n", + " for(b in 1:length(I)){\n", + " dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out\n", + " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a foldt out\n", + " dhat <- predict(dfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", + " yhat <- predict(yfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", + " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold\n", + " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold\n", + " cat(b,\" \")\n", + " }\n", + " rfit <- felm(ytil ~ dtil |0|0|clusterID) #get clustered standard errors using felm\n", + " rfitSummary<- summary(rfit)\n", + " coef.est <- rfitSummary$coef[2] #extract coefficient\n", + " se <- rfitSummary$coef[2,2] #record robust standard error\n", + " cat(sprintf(\"\\ncoef (se) = %g (%g)\\n\", coef.est , se)) #printing output\n", + " return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil) ) #save output and residuals\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19", + "metadata": { + "id": "d8eMyN7NRhYO" + }, + "outputs": [], + "source": [ + "x = model.matrix(~ femaleR + ageR + farmerR + herderR + pastvotedR + hhsizeR)\n", + "dim(x)\n", + "\n", + "d = directlyharmedR\n", + "y = peacefactorR;\n", + "\n", + "# DML with Random Forest:\n", + "dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest\n", + "yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest\n", + "\n", + "DML2.RF = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10, clusterID=village)\n", + "\n", + "resY = DML2.RF$ytil\n", + "resD = DML2.RF$dtil\n", + "\n", + "print(c(\"Controls explain the following fraction of variance of Outcome\", max(1-var(resY)/var(peacefactorR),0)))\n", + "print(c(\"Controls explain the following fraction of variance of Treatment\", max(1-var(resD)/var(directlyharmedR),0)))\n", + "\n", + "dml.darfur.model = lm(resY~resD)" + ] + }, + { + "cell_type": "markdown", + "id": "20", + "metadata": { + "id": "cUxDc1mYdMHH" + }, + "source": [ + "## Bias Analysis with Sensemakr\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21", + "metadata": { + "id": "obvious-there", "papermill": { - "default_parameters": {}, - "duration": 60.897772, - "end_time": "2021-04-03T12:56:22.764591", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2021-04-03T12:55:21.866819", - "version": "2.3.3" + "duration": 40.040643, + "end_time": "2021-04-03T12:56:22.614312", + "exception": false, + "start_time": "2021-04-03T12:55:42.573669", + "status": "completed" }, - "colab": { - "provenance": [] - } + "tags": [] + }, + "outputs": [], + "source": [ + "dml.darfur.sensitivity <- sensemakr(model = dml.darfur.model,\n", + " treatment = \"resD\")\n", + "summary(dml.darfur.sensitivity)\n", + "\n", + "plot(dml.darfur.sensitivity,nlevels = 15)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22", + "metadata": { + "id": "lqXAYlPCTMl1" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.6.3" }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "papermill": 
{
+   "default_parameters": {},
+   "duration": 60.897772,
+   "end_time": "2021-04-03T12:56:22.764591",
+   "environment_variables": {},
+   "exception": null,
+   "input_path": "__notebook__.ipynb",
+   "output_path": "__notebook__.ipynb",
+   "parameters": {},
+   "start_time": "2021-04-03T12:55:21.866819",
+   "version": "2.3.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From 86946256b9cbabc9cd96c9da9ffdabee57aa8577 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Mon, 15 Jul 2024 13:16:41 +0000
Subject: [PATCH 070/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd
 files, and execute .R files in PM3

---
 ..._functional_approximation_by_nn_and_rf.Rmd |  184 +
 ...functional_approximation_by_nn_and_rf.irnb |  958 ++---
 PM3/r_ml_wage_prediction.Rmd                  |  565 +++
 PM3/r_ml_wage_prediction.irnb                 | 3594 ++++++++---------
 4 files changed, 3025 insertions(+), 2276 deletions(-)
 create mode 100644 PM3/r_functional_approximation_by_nn_and_rf.Rmd
 create mode 100644 PM3/r_ml_wage_prediction.Rmd

diff --git a/PM3/r_functional_approximation_by_nn_and_rf.Rmd b/PM3/r_functional_approximation_by_nn_and_rf.Rmd
new file mode 100644
index 00000000..0f46a695
--- /dev/null
+++ b/PM3/r_functional_approximation_by_nn_and_rf.Rmd
@@ -0,0 +1,184 @@
+---
+title: An R Markdown document converted from "PM3/r_functional_approximation_by_nn_and_rf.irnb"
+output: html_document
+---
+
+# Functional Approximations by Trees and Neural Networks
+
+Here we show how the function
+$$
+x \mapsto exp(4 x)
+$$
+can be easily approximated by tree-based methods (Trees, Random Forest) and a neural network (a 2-layered neural network).
+
+```{r}
+install.packages("randomForest")
+install.packages("rpart")
+install.packages("gbm")
+install.packages("keras")
+
+
+library(randomForest)
+library(rpart)
+library(gbm)
+library(keras)
+```
+
+# Functional Approximation by a Tree
+
+We play around with the penalty level $cp$ below to illustrate how it affects the complexity of the tree. Recall we may use this to prune the tree to improve predictive performance and lessen the noise in our final estimate. A simple penalty would be the number of leaves times a penalty level $\alpha$.
+
+Specifics on the penalty can be found [here](https://cran.r-project.org/web/packages/rpart/rpart.pdf).
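Beyond fixing `cp` in advance, `rpart` also stores a cost-complexity table with cross-validated error for a sequence of `cp` values, which can be used to prune a deliberately deep tree after the fact. A minimal, self-contained sketch (assuming `rpart` is loaded as above; `z`, `g`, `deep_tree`, `best_cp`, and `pruned_tree` are illustrative names, and the data are simulated only for this illustration):

```{r}
set.seed(1)
z <- runif(500)
g <- exp(4 * z)                                # noiseless target, as in the chunks below
deep_tree <- rpart(g ~ z, cp = 0.0001)         # grow a deliberately deep tree
printcp(deep_tree)                             # cp table: penalty vs. cross-validated error (xerror)
best_cp <- deep_tree$cptable[which.min(deep_tree$cptable[, "xerror"]), "CP"]
pruned_tree <- prune(deep_tree, cp = best_cp)  # prune back to the cp with the smallest xerror
```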
+ +```{r} +set.seed(1) +X_train <- matrix(runif(1000),1000,1) +Y_train <- exp(4*X_train) #Noiseless case Y=g(X) +dim(X_train) + + +# shallow tree +TreeModel<- rpart(Y_train~X_train, cp=.01) #cp is penalty level +pred.TM<- predict(TreeModel, newx=X_train) +plot(X_train, Y_train, type="p", pch=19, xlab="z", ylab="g(z)") +points(X_train, pred.TM, col=3, pch=19) +``` + +```{r} +set.seed(1) +X_train <- matrix(runif(1000),1000,1) +Y_train <- exp(4*X_train) #Noiseless case Y=g(X) +dim(X_train) + + +TreeModel<- rpart(Y_train~X_train, cp=.0005) #cp is penalty level +pred.TM<- predict(TreeModel, newx=X_train) +plot(X_train, Y_train, type="p", pch=19, xlab="z", ylab="g(z)") +points(X_train, pred.TM, col=3, pch=19) +``` + +# Functional Approximation by RF + +Here we show how the function +$$ +x \mapsto exp(4 x) +$$ +can be easily approximated by a tree-based method (Random Forest) and a neural network (2 Layered Neural Network) + +```{r} +RFmodel<- randomForest(Y_train~X_train) +pred.RF<- predict(RFmodel, newdata=X_train) +plot(X_train, Y_train, type="p", pch=19, xlab="z", ylab="g(z)") +points(X_train, pred.RF, col=4, pch=19,) +``` + +# Boosted Trees + +```{r} +data_train = as.data.frame(cbind(X_train, Y_train)) +BoostTreemodel<- gbm(Y_train~X_train, distribution= "gaussian", n.trees=100, shrinkage=.01, interaction.depth +=3) +# shrinkage is "learning rate" +# n.trees is the number of boosting steps +# interaction.depth is the max depth of each tree +pred.BT<- predict(BoostTreemodel, newdata=data_train, n.trees=100) +plot(X_train, Y_train, type="p", pch=19, xlab="z", ylab="g(z)") +points(X_train, pred.BT, col=4, pch=19,) +``` + +```{r} +data_train = as.data.frame(cbind(X_train, Y_train)) +BoostTreemodel<- gbm(Y_train~X_train, distribution= "gaussian", n.trees=1000, shrinkage=.01, interaction.depth +=3) +# shrinkage is "learning rate" +# n.trees is the number of boosting steps +# interaction.depth is the max depth of each tree +pred.BT<- predict(BoostTreemodel, newdata=data_train, n.trees=1000) +plot(X_train, Y_train, type="p", pch=19, xlab="z", ylab="g(z)") +points(X_train, pred.BT, col=4, pch=19,) +``` + +# Same Example with a Neural Network + +```{r} +build_model <- function() { + model <- keras_model_sequential() %>% + layer_dense(units = 200, activation = "relu", + input_shape = 1)%>% + layer_dense(units = 20, activation = "relu") %>% + layer_dense(units = 1) + + model %>% compile( + optimizer = optimizer_adam(lr = 0.01), + loss = "mse", + metrics = c("mae"), + ) +} +``` + +```{r} +model <- build_model() +summary(model) +``` + +```{r} +num_epochs <- 1 +model %>% fit(X_train, Y_train, + epochs = num_epochs, batch_size = 10, verbose = 0) +pred.NN <- model %>% predict(X_train) +plot(X_train, Y_train, type="p", pch=19, xlab="z", ylab="g(z)") +points(X_train, pred.NN, col=4, pch=19,) + +``` + +```{r} +num_epochs <- 100 +model %>% fit(X_train, Y_train, + epochs = num_epochs, batch_size = 10, verbose = 0) +pred.NN <- model %>% predict(X_train) +plot(X_train, Y_train, type="p", pch=19, xlab="z", ylab="g(z)") +points(X_train, pred.NN, col=4, pch=19,) + + +``` + +### Using Early Stopping + +```{r} +# Define the neural network architecture +model <- keras_model_sequential() %>% + layer_dense(units = 200, activation = 'relu', input_shape = 1) %>% + layer_dense(units = 20, activation = 'relu') %>% + layer_dense(units = 1) # Output layer with 1 unit for regression task + +# Compile the model +model %>% compile( + optimizer = optimizer_adam(lr = 0.01), + loss = "mse", + metrics = c("mae"), +) + +summary(model) 
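# A quick sanity check on the summary above: with a scalar input, the parameter count is
#   (1*200 + 200) + (200*20 + 20) + (20*1 + 1) = 4,441,
# so even this small two-hidden-layer network is heavily over-parameterized for a
# one-dimensional regression problem.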
+``` + +```{r} +num_epochs <- 100 + +# Define early stopping based on validation set (20%) performance +early_stopping <- callback_early_stopping(monitor = 'val_loss', patience = 5) # Patience set to 5 epochs (default in skorch is 5) + +# Train the model +model %>% fit( + X_train, Y_train, + epochs = num_epochs, + batch_size = 10, + validation_split = 0.2, # 20% validation set + verbose = 0, + callbacks = list(early_stopping) +) + +pred.NN <- model %>% predict(X_train) +plot(X_train, Y_train, type="p", pch=19, xlab="z", ylab="g(z)") +points(X_train, pred.NN, col=4, pch=19,) +``` + diff --git a/PM3/r_functional_approximation_by_nn_and_rf.irnb b/PM3/r_functional_approximation_by_nn_and_rf.irnb index cfe42599..eb4b1823 100644 --- a/PM3/r_functional_approximation_by_nn_and_rf.irnb +++ b/PM3/r_functional_approximation_by_nn_and_rf.irnb @@ -1,497 +1,497 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "elegant-proxy", - "metadata": { - "papermill": { - "duration": 0.011489, - "end_time": "2021-03-30T21:54:42.895419", - "exception": false, - "start_time": "2021-03-30T21:54:42.883930", - "status": "completed" - }, - "tags": [], - "id": "elegant-proxy" - }, - "source": [ - "# Functional Approximations by Trees and Neural Networks\n", - "\n", - "Here we show how the function\n", - "$$\n", - "x \\mapsto exp(4 x)\n", - "$$\n", - "can be easily approximated by a tree-based methods (Trees, Random Forest) and a neural network (2 Layered Neural Network)" - ] - }, - { - "cell_type": "code", - "source": [ - "install.packages(\"randomForest\")\n", - "install.packages(\"rpart\")\n", - "install.packages(\"gbm\")\n", - "install.packages(\"keras\")\n", - "\n", - "\n", - "library(randomForest)\n", - "library(rpart)\n", - "library(gbm)\n", - "library(keras)" - ], - "metadata": { - "id": "NULYR1oB9aWz" - }, - "id": "NULYR1oB9aWz", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "id": "widespread-mention", - "metadata": { - "papermill": { - "duration": 0.009467, - "end_time": "2021-03-30T21:54:42.915858", - "exception": false, - "start_time": "2021-03-30T21:54:42.906391", - "status": "completed" - }, - "tags": [], - "id": "widespread-mention" - }, - "source": [ - "# Functional Approximation by a Tree\n" - ] - }, - { - "cell_type": "markdown", - "source": [ - "We play around with the penalty level $cp$ below to illustrate how it affects the complexity of tree. Recall we may use this to prune the tree to improve predictive performance and lessen the noise in our final estimate. A simple penalty would be the number of leaves times a penalty level $\\alpha$.\n", - "\n", - "Specifics on the penalty can be found [here](https://cran.r-project.org/web/packages/rpart/rpart.pdf)." 
- ], - "metadata": { - "id": "C4WFqJKmC25Z" - }, - "id": "C4WFqJKmC25Z" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "registered-correction", - "metadata": { - "papermill": { - "duration": 0.694812, - "end_time": "2021-03-30T21:54:43.620078", - "exception": false, - "start_time": "2021-03-30T21:54:42.925266", - "status": "completed" - }, - "tags": [], - "id": "registered-correction" - }, - "outputs": [], - "source": [ - "set.seed(1)\n", - "X_train <- matrix(runif(1000),1000,1)\n", - "Y_train <- exp(4*X_train) #Noiseless case Y=g(X)\n", - "dim(X_train)\n", - "\n", - "\n", - "# shallow tree\n", - "TreeModel<- rpart(Y_train~X_train, cp=.01) #cp is penalty level\n", - "pred.TM<- predict(TreeModel, newx=X_train)\n", - "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", - "points(X_train, pred.TM, col=3, pch=19)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "banner-sleeve", - "metadata": { - "papermill": { - "duration": 0.294088, - "end_time": "2021-03-30T21:54:43.926159", - "exception": false, - "start_time": "2021-03-30T21:54:43.632071", - "status": "completed" - }, - "tags": [], - "id": "banner-sleeve" - }, - "outputs": [], - "source": [ - "set.seed(1)\n", - "X_train <- matrix(runif(1000),1000,1)\n", - "Y_train <- exp(4*X_train) #Noiseless case Y=g(X)\n", - "dim(X_train)\n", - "\n", - "\n", - "TreeModel<- rpart(Y_train~X_train, cp=.0005) #cp is penalty level\n", - "pred.TM<- predict(TreeModel, newx=X_train)\n", - "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", - "points(X_train, pred.TM, col=3, pch=19)" - ] - }, - { - "cell_type": "markdown", - "id": "local-saturn", - "metadata": { - "papermill": { - "duration": 0.013444, - "end_time": "2021-03-30T21:54:43.953303", - "exception": false, - "start_time": "2021-03-30T21:54:43.939859", - "status": "completed" - }, - "tags": [], - "id": "local-saturn" - }, - "source": [ - "# Functional Approximation by RF" - ] - }, - { - "cell_type": "markdown", - "id": "international-serum", - "metadata": { - "papermill": { - "duration": 0.01351, - "end_time": "2021-03-30T21:54:43.980273", - "exception": false, - "start_time": "2021-03-30T21:54:43.966763", - "status": "completed" - }, - "tags": [], - "id": "international-serum" - }, - "source": [ - "Here we show how the function\n", - "$$\n", - "x \\mapsto exp(4 x)\n", - "$$\n", - "can be easily approximated by a tree-based method (Random Forest) and a neural network (2 Layered Neural Network)" - ] + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": { + "id": "elegant-proxy", + "papermill": { + "duration": 0.011489, + "end_time": "2021-03-30T21:54:42.895419", + "exception": false, + "start_time": "2021-03-30T21:54:42.883930", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "further-siemens", - "metadata": { - "papermill": { - "duration": 1.170101, - "end_time": "2021-03-30T21:54:45.163992", - "exception": false, - "start_time": "2021-03-30T21:54:43.993891", - "status": "completed" - }, - "tags": [], - "id": "further-siemens" - }, - "outputs": [], - "source": [ - "RFmodel<- randomForest(Y_train~X_train)\n", - "pred.RF<- predict(RFmodel, newdata=X_train)\n", - "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", - "points(X_train, pred.RF, col=4, pch=19,)\n" - ] + "tags": [] + }, + "source": [ + "# Functional Approximations by Trees and Neural Networks\n", + "\n", + "Here we show how the function\n", + "$$\n", + "x \\mapsto exp(4 x)\n", + 
"$$\n", + "can be easily approximated by a tree-based methods (Trees, Random Forest) and a neural network (2 Layered Neural Network)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1", + "metadata": { + "id": "NULYR1oB9aWz" + }, + "outputs": [], + "source": [ + "install.packages(\"randomForest\")\n", + "install.packages(\"rpart\")\n", + "install.packages(\"gbm\")\n", + "install.packages(\"keras\")\n", + "\n", + "\n", + "library(randomForest)\n", + "library(rpart)\n", + "library(gbm)\n", + "library(keras)" + ] + }, + { + "cell_type": "markdown", + "id": "2", + "metadata": { + "id": "widespread-mention", + "papermill": { + "duration": 0.009467, + "end_time": "2021-03-30T21:54:42.915858", + "exception": false, + "start_time": "2021-03-30T21:54:42.906391", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "infrared-belgium", - "metadata": { - "papermill": { - "duration": 0.015474, - "end_time": "2021-03-30T21:54:45.201078", - "exception": false, - "start_time": "2021-03-30T21:54:45.185604", - "status": "completed" - }, - "tags": [], - "id": "infrared-belgium" - }, - "source": [ - "# Boosted Trees" - ] + "tags": [] + }, + "source": [ + "# Functional Approximation by a Tree\n" + ] + }, + { + "cell_type": "markdown", + "id": "3", + "metadata": { + "id": "C4WFqJKmC25Z" + }, + "source": [ + "We play around with the penalty level $cp$ below to illustrate how it affects the complexity of tree. Recall we may use this to prune the tree to improve predictive performance and lessen the noise in our final estimate. A simple penalty would be the number of leaves times a penalty level $\\alpha$.\n", + "\n", + "Specifics on the penalty can be found [here](https://cran.r-project.org/web/packages/rpart/rpart.pdf)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4", + "metadata": { + "id": "registered-correction", + "papermill": { + "duration": 0.694812, + "end_time": "2021-03-30T21:54:43.620078", + "exception": false, + "start_time": "2021-03-30T21:54:42.925266", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "naval-twenty", - "metadata": { - "papermill": { - "duration": 1.40312, - "end_time": "2021-03-30T21:54:46.619828", - "exception": false, - "start_time": "2021-03-30T21:54:45.216708", - "status": "completed" - }, - "tags": [], - "id": "naval-twenty" - }, - "outputs": [], - "source": [ - "data_train = as.data.frame(cbind(X_train, Y_train))\n", - "BoostTreemodel<- gbm(Y_train~X_train, distribution= \"gaussian\", n.trees=100, shrinkage=.01, interaction.depth\n", - "=3)\n", - "# shrinkage is \"learning rate\"\n", - "# n.trees is the number of boosting steps\n", - "# interaction.depth is the max depth of each tree\n", - "pred.BT<- predict(BoostTreemodel, newdata=data_train, n.trees=100)\n", - "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", - "points(X_train, pred.BT, col=4, pch=19,)" - ] + "tags": [] + }, + "outputs": [], + "source": [ + "set.seed(1)\n", + "X_train <- matrix(runif(1000),1000,1)\n", + "Y_train <- exp(4*X_train) #Noiseless case Y=g(X)\n", + "dim(X_train)\n", + "\n", + "\n", + "# shallow tree\n", + "TreeModel<- rpart(Y_train~X_train, cp=.01) #cp is penalty level\n", + "pred.TM<- predict(TreeModel, newx=X_train)\n", + "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", + "points(X_train, pred.TM, col=3, pch=19)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5", + "metadata": { + "id": "banner-sleeve", + "papermill": 
{ + "duration": 0.294088, + "end_time": "2021-03-30T21:54:43.926159", + "exception": false, + "start_time": "2021-03-30T21:54:43.632071", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "listed-michigan", - "metadata": { - "papermill": { - "duration": 0.411341, - "end_time": "2021-03-30T21:54:47.050501", - "exception": false, - "start_time": "2021-03-30T21:54:46.639160", - "status": "completed" - }, - "tags": [], - "id": "listed-michigan" - }, - "outputs": [], - "source": [ - "data_train = as.data.frame(cbind(X_train, Y_train))\n", - "BoostTreemodel<- gbm(Y_train~X_train, distribution= \"gaussian\", n.trees=1000, shrinkage=.01, interaction.depth\n", - "=3)\n", - "# shrinkage is \"learning rate\"\n", - "# n.trees is the number of boosting steps\n", - "# interaction.depth is the max depth of each tree\n", - "pred.BT<- predict(BoostTreemodel, newdata=data_train, n.trees=1000)\n", - "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", - "points(X_train, pred.BT, col=4, pch=19,)" - ] + "tags": [] + }, + "outputs": [], + "source": [ + "set.seed(1)\n", + "X_train <- matrix(runif(1000),1000,1)\n", + "Y_train <- exp(4*X_train) #Noiseless case Y=g(X)\n", + "dim(X_train)\n", + "\n", + "\n", + "TreeModel<- rpart(Y_train~X_train, cp=.0005) #cp is penalty level\n", + "pred.TM<- predict(TreeModel, newx=X_train)\n", + "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", + "points(X_train, pred.TM, col=3, pch=19)" + ] + }, + { + "cell_type": "markdown", + "id": "6", + "metadata": { + "id": "local-saturn", + "papermill": { + "duration": 0.013444, + "end_time": "2021-03-30T21:54:43.953303", + "exception": false, + "start_time": "2021-03-30T21:54:43.939859", + "status": "completed" }, - { - "cell_type": "markdown", - "id": "psychological-venice", - "metadata": { - "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", - "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", - "papermill": { - "duration": 0.018291, - "end_time": "2021-03-30T21:54:47.087924", - "exception": false, - "start_time": "2021-03-30T21:54:47.069633", - "status": "completed" - }, - "tags": [], - "id": "psychological-venice" - }, - "source": [ - "# Same Example with a Neural Network" - ] + "tags": [] + }, + "source": [ + "# Functional Approximation by RF" + ] + }, + { + "cell_type": "markdown", + "id": "7", + "metadata": { + "id": "international-serum", + "papermill": { + "duration": 0.01351, + "end_time": "2021-03-30T21:54:43.980273", + "exception": false, + "start_time": "2021-03-30T21:54:43.966763", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "behind-redhead", - "metadata": { - "papermill": { - "duration": 0.195046, - "end_time": "2021-03-30T21:54:47.301402", - "exception": false, - "start_time": "2021-03-30T21:54:47.106356", - "status": "completed" - }, - "tags": [], - "id": "behind-redhead" - }, - "outputs": [], - "source": [ - "build_model <- function() {\n", - " model <- keras_model_sequential() %>%\n", - " layer_dense(units = 200, activation = \"relu\",\n", - " input_shape = 1)%>%\n", - " layer_dense(units = 20, activation = \"relu\") %>%\n", - " layer_dense(units = 1)\n", - "\n", - " model %>% compile(\n", - " optimizer = optimizer_adam(lr = 0.01),\n", - " loss = \"mse\",\n", - " metrics = c(\"mae\"),\n", - " )\n", - "}" - ] + "tags": [] + }, + "source": [ + "Here we show how the function\n", + "$$\n", + "x \\mapsto exp(4 x)\n", + "$$\n", + "can be easily approximated by a tree-based method (Random 
Forest) and a neural network (2 Layered Neural Network)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8", + "metadata": { + "id": "further-siemens", + "papermill": { + "duration": 1.170101, + "end_time": "2021-03-30T21:54:45.163992", + "exception": false, + "start_time": "2021-03-30T21:54:43.993891", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "modified-monitor", - "metadata": { - "papermill": { - "duration": 7.599438, - "end_time": "2021-03-30T21:54:54.919929", - "exception": false, - "start_time": "2021-03-30T21:54:47.320491", - "status": "completed" - }, - "tags": [], - "id": "modified-monitor" - }, - "outputs": [], - "source": [ - "model <- build_model()\n", - "summary(model)" - ] + "tags": [] + }, + "outputs": [], + "source": [ + "RFmodel<- randomForest(Y_train~X_train)\n", + "pred.RF<- predict(RFmodel, newdata=X_train)\n", + "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", + "points(X_train, pred.RF, col=4, pch=19,)\n" + ] + }, + { + "cell_type": "markdown", + "id": "9", + "metadata": { + "id": "infrared-belgium", + "papermill": { + "duration": 0.015474, + "end_time": "2021-03-30T21:54:45.201078", + "exception": false, + "start_time": "2021-03-30T21:54:45.185604", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "early-savannah", - "metadata": { - "papermill": { - "duration": 1.37161, - "end_time": "2021-03-30T21:54:56.310872", - "exception": false, - "start_time": "2021-03-30T21:54:54.939262", - "status": "completed" - }, - "tags": [], - "id": "early-savannah" - }, - "outputs": [], - "source": [ - "num_epochs <- 1\n", - "model %>% fit(X_train, Y_train,\n", - " epochs = num_epochs, batch_size = 10, verbose = 0)\n", - "pred.NN <- model %>% predict(X_train)\n", - "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", - "points(X_train, pred.NN, col=4, pch=19,)\n", - "\n" - ] + "tags": [] + }, + "source": [ + "# Boosted Trees" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10", + "metadata": { + "id": "naval-twenty", + "papermill": { + "duration": 1.40312, + "end_time": "2021-03-30T21:54:46.619828", + "exception": false, + "start_time": "2021-03-30T21:54:45.216708", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "id": "answering-ready", - "metadata": { - "papermill": { - "duration": 13.865941, - "end_time": "2021-03-30T21:55:10.197721", - "exception": false, - "start_time": "2021-03-30T21:54:56.331780", - "status": "completed" - }, - "tags": [], - "id": "answering-ready" - }, - "outputs": [], - "source": [ - "num_epochs <- 100\n", - "model %>% fit(X_train, Y_train,\n", - " epochs = num_epochs, batch_size = 10, verbose = 0)\n", - "pred.NN <- model %>% predict(X_train)\n", - "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", - "points(X_train, pred.NN, col=4, pch=19,)\n", - "\n", - "\n" - ] + "tags": [] + }, + "outputs": [], + "source": [ + "data_train = as.data.frame(cbind(X_train, Y_train))\n", + "BoostTreemodel<- gbm(Y_train~X_train, distribution= \"gaussian\", n.trees=100, shrinkage=.01, interaction.depth\n", + "=3)\n", + "# shrinkage is \"learning rate\"\n", + "# n.trees is the number of boosting steps\n", + "# interaction.depth is the max depth of each tree\n", + "pred.BT<- predict(BoostTreemodel, newdata=data_train, n.trees=100)\n", + "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", + "points(X_train, pred.BT, col=4, 
pch=19,)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11", + "metadata": { + "id": "listed-michigan", + "papermill": { + "duration": 0.411341, + "end_time": "2021-03-30T21:54:47.050501", + "exception": false, + "start_time": "2021-03-30T21:54:46.639160", + "status": "completed" }, - { - "cell_type": "markdown", - "source": [ - "### Using Early Stopping" - ], - "metadata": { - "id": "RAE1DNS1TL8K" - }, - "id": "RAE1DNS1TL8K" + "tags": [] + }, + "outputs": [], + "source": [ + "data_train = as.data.frame(cbind(X_train, Y_train))\n", + "BoostTreemodel<- gbm(Y_train~X_train, distribution= \"gaussian\", n.trees=1000, shrinkage=.01, interaction.depth\n", + "=3)\n", + "# shrinkage is \"learning rate\"\n", + "# n.trees is the number of boosting steps\n", + "# interaction.depth is the max depth of each tree\n", + "pred.BT<- predict(BoostTreemodel, newdata=data_train, n.trees=1000)\n", + "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", + "points(X_train, pred.BT, col=4, pch=19,)" + ] + }, + { + "cell_type": "markdown", + "id": "12", + "metadata": { + "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", + "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", + "id": "psychological-venice", + "papermill": { + "duration": 0.018291, + "end_time": "2021-03-30T21:54:47.087924", + "exception": false, + "start_time": "2021-03-30T21:54:47.069633", + "status": "completed" }, - { - "cell_type": "code", - "source": [ - "# Define the neural network architecture\n", - "model <- keras_model_sequential() %>%\n", - " layer_dense(units = 200, activation = 'relu', input_shape = 1) %>%\n", - " layer_dense(units = 20, activation = 'relu') %>%\n", - " layer_dense(units = 1) # Output layer with 1 unit for regression task\n", - "\n", - "# Compile the model\n", - "model %>% compile(\n", - " optimizer = optimizer_adam(lr = 0.01),\n", - " loss = \"mse\",\n", - " metrics = c(\"mae\"),\n", - ")\n", - "\n", - "summary(model)" - ], - "metadata": { - "id": "_cyeRToRTORV" - }, - "id": "_cyeRToRTORV", - "execution_count": null, - "outputs": [] + "tags": [] + }, + "source": [ + "# Same Example with a Neural Network" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13", + "metadata": { + "id": "behind-redhead", + "papermill": { + "duration": 0.195046, + "end_time": "2021-03-30T21:54:47.301402", + "exception": false, + "start_time": "2021-03-30T21:54:47.106356", + "status": "completed" }, - { - "cell_type": "code", - "source": [ - "num_epochs <- 100\n", - "\n", - "# Define early stopping based on validation set (20%) performance\n", - "early_stopping <- callback_early_stopping(monitor = 'val_loss', patience = 5) # Patience set to 5 epochs (default in skorch is 5)\n", - "\n", - "# Train the model\n", - "model %>% fit(\n", - " X_train, Y_train,\n", - " epochs = num_epochs,\n", - " batch_size = 10,\n", - " validation_split = 0.2, # 20% validation set\n", - " verbose = 0,\n", - " callbacks = list(early_stopping)\n", - ")\n", - "\n", - "pred.NN <- model %>% predict(X_train)\n", - "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", - "points(X_train, pred.NN, col=4, pch=19,)" - ], - "metadata": { - "id": "FuBqP_e7Te5Y" - }, - "id": "FuBqP_e7Te5Y", - "execution_count": null, - "outputs": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" + "tags": [] + }, + "outputs": [], + "source": [ + "build_model <- function() {\n", + " model <- keras_model_sequential() %>%\n", + " layer_dense(units = 200, 
activation = \"relu\",\n", + " input_shape = 1)%>%\n", + " layer_dense(units = 20, activation = \"relu\") %>%\n", + " layer_dense(units = 1)\n", + "\n", + " model %>% compile(\n", + " optimizer = optimizer_adam(lr = 0.01),\n", + " loss = \"mse\",\n", + " metrics = c(\"mae\"),\n", + " )\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14", + "metadata": { + "id": "modified-monitor", + "papermill": { + "duration": 7.599438, + "end_time": "2021-03-30T21:54:54.919929", + "exception": false, + "start_time": "2021-03-30T21:54:47.320491", + "status": "completed" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "3.6.3" + "tags": [] + }, + "outputs": [], + "source": [ + "model <- build_model()\n", + "summary(model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15", + "metadata": { + "id": "early-savannah", + "papermill": { + "duration": 1.37161, + "end_time": "2021-03-30T21:54:56.310872", + "exception": false, + "start_time": "2021-03-30T21:54:54.939262", + "status": "completed" }, + "tags": [] + }, + "outputs": [], + "source": [ + "num_epochs <- 1\n", + "model %>% fit(X_train, Y_train,\n", + " epochs = num_epochs, batch_size = 10, verbose = 0)\n", + "pred.NN <- model %>% predict(X_train)\n", + "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", + "points(X_train, pred.NN, col=4, pch=19,)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16", + "metadata": { + "id": "answering-ready", "papermill": { - "default_parameters": {}, - "duration": 30.682213, - "end_time": "2021-03-30T21:55:10.531019", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2021-03-30T21:54:39.848806", - "version": "2.3.2" + "duration": 13.865941, + "end_time": "2021-03-30T21:55:10.197721", + "exception": false, + "start_time": "2021-03-30T21:54:56.331780", + "status": "completed" }, - "colab": { - "provenance": [] - } + "tags": [] + }, + "outputs": [], + "source": [ + "num_epochs <- 100\n", + "model %>% fit(X_train, Y_train,\n", + " epochs = num_epochs, batch_size = 10, verbose = 0)\n", + "pred.NN <- model %>% predict(X_train)\n", + "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", + "points(X_train, pred.NN, col=4, pch=19,)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "17", + "metadata": { + "id": "RAE1DNS1TL8K" + }, + "source": [ + "### Using Early Stopping" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18", + "metadata": { + "id": "_cyeRToRTORV" + }, + "outputs": [], + "source": [ + "# Define the neural network architecture\n", + "model <- keras_model_sequential() %>%\n", + " layer_dense(units = 200, activation = 'relu', input_shape = 1) %>%\n", + " layer_dense(units = 20, activation = 'relu') %>%\n", + " layer_dense(units = 1) # Output layer with 1 unit for regression task\n", + "\n", + "# Compile the model\n", + "model %>% compile(\n", + " optimizer = optimizer_adam(lr = 0.01),\n", + " loss = \"mse\",\n", + " metrics = c(\"mae\"),\n", + ")\n", + "\n", + "summary(model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19", + "metadata": { + "id": "FuBqP_e7Te5Y" + }, + "outputs": [], + "source": [ + "num_epochs <- 100\n", + "\n", + "# Define early stopping based on validation set (20%) 
performance\n", + "early_stopping <- callback_early_stopping(monitor = 'val_loss', patience = 5) # Patience set to 5 epochs (default in skorch is 5)\n", + "\n", + "# Train the model\n", + "model %>% fit(\n", + " X_train, Y_train,\n", + " epochs = num_epochs,\n", + " batch_size = 10,\n", + " validation_split = 0.2, # 20% validation set\n", + " verbose = 0,\n", + " callbacks = list(early_stopping)\n", + ")\n", + "\n", + "pred.NN <- model %>% predict(X_train)\n", + "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", + "points(X_train, pred.NN, col=4, pch=19,)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.6.3" }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "papermill": { + "default_parameters": {}, + "duration": 30.682213, + "end_time": "2021-03-30T21:55:10.531019", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2021-03-30T21:54:39.848806", + "version": "2.3.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/PM3/r_ml_wage_prediction.Rmd b/PM3/r_ml_wage_prediction.Rmd new file mode 100644 index 00000000..75065749 --- /dev/null +++ b/PM3/r_ml_wage_prediction.Rmd @@ -0,0 +1,565 @@ +--- +title: An R Markdown document converted from "PM3/r_ml_wage_prediction.irnb" +output: html_document +--- + +# Machine Learning Estimators for Wage Prediction + +We illustrate how to predict an outcome variable Y in a high-dimensional setting, where the number of covariates $p$ is large in relation to the sample size $n$. So far we have used linear prediction rules, e.g. Lasso regression, for estimation. +Now, we also consider nonlinear prediction rules including tree-based methods. + +```{r} +# Import relevant packages + +install.packages("xtable") +install.packages("hdm") +install.packages("glmnet") +install.packages("randomForest") +install.packages("rpart") +install.packages("nnet") +install.packages("gbm") +install.packages("rpart.plot") +install.packages("keras") + +library(hdm) +library(xtable) +library(glmnet) +library(randomForest) +library(rpart) +library(nnet) +library(gbm) +library(rpart.plot) +library(keras) +``` + +## Data + +Again, we consider data from the U.S. March Supplement of the Current Population Survey (CPS) in 2015. +The preproccessed sample consists of $5150$ never-married individuals. + +```{r} +file = "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv" +data <- read.csv(file) +dim(data) +``` + +The outcomes $Y_i$'s are hourly (log) wages of never-married workers living in the U.S. The raw regressors $Z_i$'s consist of a variety of characteristics, including experience, education and industry and occupation indicators. + +```{r} +Z <- subset(data,select=-c(lwage,wage)) # regressors +colnames(Z) +``` + +The following figure shows the weekly wage distribution from the US survey data. + +```{r} +hist(data$wage, xlab= "hourly wage", main="Empirical wage distribution from the US survey data", breaks= 35) +``` + +Wages show a high degree of skewness. Hence, wages are transformed in almost all studies by +the logarithm. 
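+
+As a quick visual check (a small sketch added here, not part of the original notebook), the `lwage` column already contains the log-transformed wages, so the raw and log distributions can be compared side by side:
+
+```{r}
+# Sketch: compare the skewed raw wage distribution with the log wage distribution
+par(mfrow = c(1, 2))
+hist(data$wage, breaks = 35, xlab = "hourly wage", main = "Raw wage")
+hist(data$lwage, breaks = 35, xlab = "log hourly wage", main = "Log wage")
+par(mfrow = c(1, 1))
+```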
+ +## Analysis + +Due to the skewness of the data, we are considering log wages which leads to the following regression model + +$$log(wage) = g(Z) + \epsilon.$$ + +We will estimate the two sets of prediction rules: Linear and Nonlinear Models. +In linear models, we estimate the prediction rule of the form + +$$\hat g(Z) = \hat \beta'X.$$ +Again, we generate $X$ in two ways: + +1. Basic Model: $X$ consists of a set of raw regressors (e.g. gender, experience, education indicators, regional indicators). + + +2. Flexible Model: $X$ consists of all raw regressors from the basic model plus occupation and industry indicators, transformations (e.g., ${exp}^2$ and ${exp}^3$) and additional two-way interactions. + +To evaluate the out-of-sample performance, we split the data first. + +```{r} +set.seed(1234) +training <- sample(nrow(data), nrow(data)*(3/4), replace=FALSE) + +data_train <- data[training,] +data_test <- data[-training,] +``` + +We construct the two different model matrices $X_{basic}$ and $X_{flex}$ for both the training and the test sample: + +```{r} +X_basic <- "sex + exp1 + shs + hsg+ scl + clg + mw + so + we + C(occ2)+ C(ind2)" +X_flex <- "sex + exp1 + shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)" +formula_basic <- as.formula(paste("lwage", "~", X_basic)) +formula_flex <- as.formula(paste("lwage", "~", X_flex)) + +model_X_basic_train <- model.matrix(formula_basic,data_train) +model_X_basic_test <- model.matrix(formula_basic,data_test) +p_basic <- dim(model_X_basic_train)[2] +model_X_flex_train <- model.matrix(formula_flex,data_train) +model_X_flex_test <- model.matrix(formula_flex,data_test) +p_flex <- dim(model_X_flex_train)[2] +``` + +```{r} +Y_train <- data_train$lwage +Y_test <- data_test$lwage +``` + +```{r} +p_basic +p_flex +``` + +As known from our first lab, the basic model consists of $51$ regressors and the flexible model of $246$ regressors. Let us fit our models to the training sample using the two different model specifications. We are starting by running a simple ols regression. + +### OLS + +We fit the basic model to our training data by running an ols regression and compute the mean squared error on the test sample. + +```{r} +# ols (basic model) +fit.lm.basic <- lm(formula_basic, data_train) +``` + +```{r} +# Compute the Out-Of-Sample Performance +yhat.lm.basic <- predict(fit.lm.basic, newdata=data_test) +cat("The mean squared error (MSE) using the basic model is equal to" , mean((Y_test-yhat.lm.basic)^2)) # MSE OLS (basic model) +``` + +To determine the out-of-sample $MSE$ and the standard error in one step, we can use the function *lm*: + +```{r} +MSE.lm.basic <- summary(lm((Y_test-yhat.lm.basic)^2~1))$coef[1:2] +MSE.lm.basic +``` + +We also compute the out-of-sample $R^2$: + +```{r} +R2.lm.basic <- 1-MSE.lm.basic[1]/var(Y_test) +cat("The R^2 using the basic model is equal to",R2.lm.basic) # MSE OLS (basic model) +``` + +We repeat the same procedure for the flexible model. + +```{r} +# ols (flexible model) +fit.lm.flex <- lm(formula_flex, data_train) +# Compute the Out-Of-Sample Performance +options(warn=-1) +yhat.lm.flex <- predict(fit.lm.flex, newdata=data_test) +MSE.lm.flex <- summary(lm((Y_test-yhat.lm.flex)^2~1))$coef[1:2] +R2.lm.flex <- 1-MSE.lm.flex[1]/var(Y_test) +cat("The R^2 using the flexible model is equal to",R2.lm.flex) # MSE OLS (flexible model) +``` + +We observe that ols regression works better for the basic model with smaller $p/n$ ratio. 
We now proceed by running lasso regressions and related penalized methods.
+
+### Lasso, Ridge and Elastic Net
+
+Considering the basic model, we first run a lasso/post-lasso regression and then compute measures of out-of-sample performance. Note that in applying the package *hdm* and the function *rlasso*, we rely on a theoretically based choice of the penalty level $\lambda$ in the lasso regression.
+
+```{r}
+# lasso and variants
+fit.rlasso <- rlasso(formula_basic, data_train, post=FALSE)
+fit.rlasso.post <- rlasso(formula_basic, data_train, post=TRUE)
+yhat.rlasso <- predict(fit.rlasso, newdata=data_test)
+yhat.rlasso.post <- predict(fit.rlasso.post, newdata=data_test)
+
+MSE.lasso <- summary(lm((Y_test-yhat.rlasso)^2~1))$coef[1:2]
+MSE.lasso.post <- summary(lm((Y_test-yhat.rlasso.post)^2~1))$coef[1:2]
+
+R2.lasso <- 1-MSE.lasso[1]/var(Y_test)
+R2.lasso.post <- 1-MSE.lasso.post[1]/var(Y_test)
+cat("The R^2 using the basic model is equal to",R2.lasso,"for lasso and",R2.lasso.post,"for post-lasso") # R^2 lasso/post-lasso (basic model)
+```
+
+Now, we repeat the same procedure for the flexible model.
+
+```{r}
+fit.rlasso.flex <- rlasso(formula_flex, data_train, post=FALSE)
+fit.rlasso.post.flex <- rlasso(formula_flex, data_train, post=TRUE)
+yhat.rlasso.flex <- predict(fit.rlasso.flex, newdata=data_test)
+yhat.rlasso.post.flex <- predict(fit.rlasso.post.flex, newdata=data_test)
+
+MSE.lasso.flex <- summary(lm((Y_test-yhat.rlasso.flex)^2~1))$coef[1:2]
+MSE.lasso.post.flex <- summary(lm((Y_test-yhat.rlasso.post.flex)^2~1))$coef[1:2]
+
+R2.lasso.flex <- 1-MSE.lasso.flex[1]/var(Y_test)
+R2.lasso.post.flex <- 1-MSE.lasso.post.flex[1]/var(Y_test)
+cat("The R^2 using the flexible model is equal to",R2.lasso.flex,"for lasso and",R2.lasso.post.flex,"for post-lasso") # R^2 lasso/post-lasso (flexible model)
+```
+
+In contrast to a theoretically based choice of the tuning parameter $\lambda$ in the lasso regression, we can also use cross-validation to determine the penalty level by applying the package *glmnet* and the function *cv.glmnet*. In this context, we also run a ridge and an elastic net regression by adjusting the parameter *alpha*.
+
+```{r}
+fit.lasso.cv <- cv.glmnet(model_X_basic_train, Y_train, family="gaussian", alpha=1)
+fit.ridge <- cv.glmnet(model_X_basic_train, Y_train, family="gaussian", alpha=0)
+fit.elnet <- cv.glmnet(model_X_basic_train, Y_train, family="gaussian", alpha=.5)
+
+yhat.lasso.cv <- predict(fit.lasso.cv, newx = model_X_basic_test)
+yhat.ridge <- predict(fit.ridge, newx = model_X_basic_test)
+yhat.elnet <- predict(fit.elnet, newx = model_X_basic_test)
+
+MSE.lasso.cv <- summary(lm((Y_test-yhat.lasso.cv)^2~1))$coef[1:2]
+MSE.ridge <- summary(lm((Y_test-yhat.ridge)^2~1))$coef[1:2]
+MSE.elnet <- summary(lm((Y_test-yhat.elnet)^2~1))$coef[1:2]
+
+R2.lasso.cv <- 1-MSE.lasso.cv[1]/var(Y_test)
+R2.ridge <- 1-MSE.ridge[1]/var(Y_test)
+R2.elnet <- 1-MSE.elnet[1]/var(Y_test)
+
+# R^2 using cross-validation (basic model)
+cat("R^2 using cross-validation for lasso, ridge and elastic net in the basic model:",R2.lasso.cv,R2.ridge,R2.elnet)
+```
+
+Note that the following calculations for the flexible model need some computation time.
+ +```{r} +fit.lasso.cv.flex <- cv.glmnet(model_X_flex_train, Y_train, family="gaussian", alpha=1) +fit.ridge.flex <- cv.glmnet(model_X_flex_train, Y_train, family="gaussian", alpha=0) +fit.elnet.flex <- cv.glmnet(model_X_flex_train, Y_train, family="gaussian", alpha=.5) + +yhat.lasso.cv.flex <- predict(fit.lasso.cv.flex , newx = model_X_flex_test) +yhat.ridge.flex <- predict(fit.ridge.flex , newx = model_X_flex_test) +yhat.elnet.flex <- predict(fit.elnet.flex , newx = model_X_flex_test) + +MSE.lasso.cv.flex <- summary(lm((Y_test-yhat.lasso.cv.flex )^2~1))$coef[1:2] +MSE.ridge.flex <- summary(lm((Y_test-yhat.ridge.flex )^2~1))$coef[1:2] +MSE.elnet.flex <- summary(lm((Y_test-yhat.elnet.flex )^2~1))$coef[1:2] + +R2.lasso.cv.flex <- 1-MSE.lasso.cv.flex [1]/var(Y_test) +R2.ridge.flex <- 1-MSE.ridge.flex [1]/var(Y_test) +R2.elnet.flex <- 1-MSE.elnet.flex [1]/var(Y_test) + +# R^2 using cross-validation (flexible model) +cat("R^2 using cross-validation for lasso, ridge and elastic net in the flexible model:",R2.lasso.cv.flex,R2.ridge.flex,R2.elnet.flex) +``` + +The performance of the lasso regression with cross-validated penalty is quite similar to the performance of lasso using a theoretical based choice of the tuning parameter. + +#Non-linear models + +Besides linear regression models, we consider nonlinear regression models to build a predictive model. We are applying regression trees, random forests, boosted trees and neural nets to estimate the regression function $g(X)$. + +## Regression Trees + +We fit a regression tree to the training data using the basic model. The variable *cp* controls the complexity of the regression tree, i.e. how deep we build the tree. + +```{r} +# tree +fit.trees <- rpart(formula_basic, data_train, minbucket=5, cp = 0.001) +prp(fit.trees, leaf.round=1, space=2, yspace=2, split.space=2,shadow.col = "gray",trace = 1) # plotting the tree +``` + +An important method to improve predictive performance is called "Pruning the Tree". This +means the process of cutting down the branches of a tree. We apply pruning to the complex tree above to reduce the depth. Initially, we determine the optimal complexity of the regression tree. + +```{r} +bestcp <- fit.trees$cptable[which.min(fit.trees$cptable[,"xerror"]),"CP"] +bestcp +``` + +Now, we can prune the tree and visualize the prediction rule. + +```{r} +fit.prunedtree <- prune(fit.trees,cp=bestcp) +prp(fit.prunedtree,leaf.round=1, space=3, yspace=3, split.space=7, shadow.col = "gray",trace = 1,yesno=1) +``` + +Finally, we calculate the mean-squared error and the $R^2$ on the test sample to evaluate the out-of-sample performance of the pruned tree. + +```{r} +yhat.pt <- predict(fit.prunedtree,newdata=data_test) +MSE.pt <- summary(lm((Y_test-yhat.pt)^2~1))$coef[1:2] +R2.pt <- 1-MSE.pt[1]/var(Y_test) + +# R^2 of the pruned tree +cat("R^2 of the pruned tree:",R2.pt) +``` + +## Random Forest and Boosted Trees + +In the next step, we apply the more advanced tree-based methods random forest and boosted trees. 
+ +```{r} +# random forest +fit.rf <- randomForest(model_X_basic_train, Y_train, ntree=2000, nodesize=20, data = data_train) + +## Evaluating the method +yhat.rf <- predict(fit.rf, newdata=model_X_basic_test) # prediction + +MSE.rf = summary(lm((Y_test-yhat.rf)^2~1))$coef[1:2] +R2.rf <- 1-MSE.rf[1]/var(Y_test) +``` + +```{r} +# boosting +fit.boost <- gbm(formula_basic, data=data_train, distribution= "gaussian", bag.fraction = .5, interaction.depth=2, n.trees=1000, shrinkage=.01) +best.boost <- gbm.perf(fit.boost, plot.it = FALSE) # cross-validation to determine when to stop + +## Evaluating the method +yhat.boost <- predict(fit.boost, newdata=data_test, n.trees=best.boost) + +MSE.boost = summary(lm((Y_test-yhat.boost)^2~1))$coef[1:2] +R2.boost <- 1-MSE.boost[1]/var(Y_test) +``` + +```{r} +# printing R^2 +cat("R^2 of the random forest and boosted trees:", R2.rf, R2.boost) +``` + +## NNets + +First, we need to determine the structure of our network. We are using the R package *keras* to build a simple sequential neural network with three dense layers -- 2 hidden and one output layer. + +```{r} +# Define the neural network architecture +model <- keras_model_sequential() %>% + layer_dense(units = 50, activation = 'relu', input_shape = dim(model_X_basic_train)[2]) %>% + layer_dense(units = 50, activation = 'relu') %>% + layer_dense(units = 1) # Output layer with 1 unit for regression task + +# Compile the model +model %>% compile( + optimizer = optimizer_adam(lr = 0.01), + loss = "mse", + metrics = c("mae"), +) + +summary(model) +``` + +```{r} +num_epochs <- 100 + +# Define early stopping based on validation set (20%) performance +early_stopping <- callback_early_stopping(monitor = 'val_loss', patience = 5) # Patience set to 5 epochs (default in skorch is 5) + +# Train the model +model %>% fit( + model_X_basic_train, Y_train, + epochs = num_epochs, + batch_size = 10, + validation_split = 0.2, # 20% validation set + verbose = 0, + callbacks = list(early_stopping) +) +``` + +```{r} +# evaluating the performance +model %>% evaluate(model_X_basic_test, Y_test, verbose = 0) +``` + +```{r} +# Calculating the performance measures +yhat.nn <- model %>% predict(model_X_basic_test) +MSE.nn = summary(lm((Y_test-yhat.nn)^2~1))$coef[1:2] +R2.nn <- 1-MSE.nn[1]/var(Y_test) +# printing R^2 +cat("R^2 of the neural network:",R2.nn) +``` + +To conclude, let us have a look at our results. + +## Results + +```{r} +table<- matrix(0, 16, 3) +table[1,1:2] <- MSE.lm.basic +table[2,1:2] <- MSE.lm.flex +table[3,1:2] <- MSE.lasso +table[4,1:2] <- MSE.lasso.post +table[5,1:2] <- MSE.lasso.flex +table[6,1:2] <- MSE.lasso.post.flex +table[7,1:2] <- MSE.lasso.cv +table[8,1:2] <- MSE.ridge +table[9,1:2] <- MSE.elnet +table[10,1:2] <- MSE.lasso.cv.flex +table[11,1:2] <- MSE.ridge.flex +table[12,1:2] <- MSE.elnet.flex +table[13,1:2] <- MSE.rf +table[14,1:2] <- MSE.boost +table[15,1:2] <- MSE.pt +table[16,1:2] <- MSE.nn + + + +table[1,3] <- R2.lm.basic +table[2,3] <- R2.lm.flex +table[3,3] <- R2.lasso +table[4,3] <- R2.lasso.post +table[5,3] <- R2.lasso.flex +table[6,3] <- R2.lasso.post.flex +table[7,3] <- R2.lasso.cv +table[8,3] <- R2.ridge +table[9,3] <- R2.elnet +table[10,3] <- R2.lasso.cv.flex +table[11,3] <- R2.ridge.flex +table[12,3] <- R2.elnet.flex +table[13,3] <- R2.rf +table[14,3] <- R2.boost +table[15,3] <- R2.pt +table[16,3] <- R2.nn + + + + +colnames(table)<- c("MSE", "S.E. 
for MSE", "R-squared")
+rownames(table)<- c("Least Squares (basic)","Least Squares (flexible)", "Lasso", "Post-Lasso","Lasso (flexible)","Post-Lasso (flexible)",
+                    "Cross-Validated lasso", "Cross-Validated ridge","Cross-Validated elnet","Cross-Validated lasso (flexible)","Cross-Validated ridge (flexible)","Cross-Validated elnet (flexible)",
+                    "Random Forest","Boosted Trees", "Pruned Tree", "Neural Net (Early)")
+tab <- xtable(table, digits =3)
+print(tab,type="latex") # set type="latex" for printing table in LaTeX
+tab
+```
+
+Above, we displayed the results for a single split of data into the training and testing part. The table shows the test MSE in column 1 as well as the standard error in column 2 and the test $R^2$
+in column 3. We see that most models perform similarly. For most of these methods, test MSEs are within one standard error of each other. Remarkably, OLS with just the basic variables performs extremely well. However, OLS on a flexible model with many regressors performs very poorly, giving nearly the highest test MSE. It is worth noticing that, as this is just a simple illustration meant to be relatively quick, the nonlinear models are not tuned. Thus, there is potential to improve the performance of the nonlinear methods we used in the analysis.
+
+### Ensemble learning
+
+In the final step, we can build a prediction model by combining the strengths of the models we considered so far. This ensemble method is of the form
+ $$ f(x) = \sum_{k=1}^K \alpha_k f_k(x) $$
+where the $f_k$'s denote our prediction rules from the table above and the $\alpha_k$'s are the corresponding weights.
+
+We first estimate the weights without penalization.
+
+```{r}
+ensemble.ols <- summary(lm(Y_test~ yhat.lm.basic + yhat.lm.flex + yhat.rlasso + yhat.rlasso.flex + yhat.rlasso.post + yhat.rlasso.post.flex + yhat.lasso.cv + yhat.lasso.cv.flex + yhat.ridge + yhat.ridge.flex + yhat.elnet + yhat.elnet.flex + yhat.pt + yhat.rf + yhat.boost + yhat.nn))
+ensemble.ols
+```
+
+Alternatively, we can determine the weights via lasso regression.
+
+```{r}
+ensemble.lasso <- summary(rlasso(Y_test~ yhat.lm.basic + yhat.lm.flex + yhat.rlasso +
+ yhat.rlasso.flex + yhat.rlasso.post + yhat.rlasso.post.flex + yhat.lasso.cv + yhat.lasso.cv.flex + yhat.ridge + yhat.ridge.flex + yhat.elnet + yhat.elnet.flex + yhat.pt + yhat.rf + yhat.boost + yhat.nn))
+ensemble.lasso
+```
+
+The estimated weights are shown in the following table.
+
+```{r}
+table<- matrix(0, 17, 2)
+table[1:17,1] <- ensemble.ols$coef[1:17]
+table[1:17,2] <- ensemble.lasso$coef[1:17]
+
+
+colnames(table)<- c("Weight OLS", "Weight Lasso")
+
+
+rownames(table)<- c("Constant","Least Squares (basic)", "Least Squares (flexible)", "Lasso (basic)",
+                    "Lasso (flexible)", "Post-Lasso (basic)", "Post-Lasso (flexible)", "LassoCV (basic)",
+                    "Lasso CV (flexible)", "Ridge CV (basic)", "Ridge CV (flexible)", "ElNet CV (basic)",
+                    "ElNet CV (flexible)", "Pruned Tree", "Random Forest","Boosted Trees", "Neural Net")
+tab <- xtable(table, digits =3)
+print(tab,type="latex") # set type="latex" for printing table in LaTeX
+tab
+```
+
+We note the superior $R^2$ performance of the ensembles. However, for a more unbiased performance evaluation, we should have left out a third sample to validate the performance of the stacked model; a rough sketch of such a split is given below.
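+
+The following is only a sketch (not part of the original analysis) of how such a three-way split could be set up; the object names are hypothetical, and the base learners and ensemble weights above would have to be re-estimated on the first two folds before evaluating on the third.
+
+```{r}
+# Sketch: train / ensemble / validation split for a more honest evaluation of stacking
+set.seed(1234)
+fold <- sample(rep(1:3, length.out = nrow(data)))
+data_base  <- data[fold == 1, ] # fit the individual prediction rules here
+data_stack <- data[fold == 2, ] # estimate the ensemble weights here
+data_valid <- data[fold == 3, ] # evaluate the stacked model here
+c(nrow(data_base), nrow(data_stack), nrow(data_valid))
+```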
+
+```{r}
+# print ensemble R^2
+cat("R^2 of stacking with LS weights:",ensemble.ols$adj.r.squared,"\n")
+cat("R^2 of stacking with Lasso weights:",ensemble.lasso$adj.r.squared,"\n")
+```
+
+# Automatic Machine Learning with H2O AutoML
+
+We illustrate how to predict an outcome variable Y in a high-dimensional setting, using the AutoML package *H2O*, which covers the complete pipeline from the raw dataset to the deployable machine learning model. In the last few years, AutoML, or automated machine learning, has become widely popular in the data science community. Again, we reanalyze the wage prediction problem using data from the U.S. March Supplement of the Current Population Survey (CPS) in 2015.
+
+We can use AutoML as a benchmark and compare it to the methods that we used previously, where we applied one machine learning method after the other.
+
+```{r}
+# load the H2O package
+install.packages("h2o")
+library(h2o)
+```
+
+```{r}
+# start h2o cluster
+h2o.init()
+```
+
+```{r}
+# convert data to h2o type
+train_h = as.h2o(data_train)
+test_h = as.h2o(data_test)
+
+# have a look at the data
+h2o.describe(train_h)
+```
+
+```{r}
+y_name = 'lwage'
+X_names = setdiff(names(data), c('lwage','wage','occ', 'ind'))
+
+# run AutoML for 10 base models and a maximal runtime of 100 seconds
+aml = h2o.automl(x=X_names, y=y_name,
+                 training_frame = train_h,
+                 leaderboard_frame = test_h,
+                 max_models = 10,
+                 seed = 1,
+                 max_runtime_secs = 100
+                )
+# AutoML Leaderboard
+lb = aml@leaderboard
+print(lb, n = nrow(lb))
+```
+
+We see that two Stacked Ensembles are at the top of the leaderboard. Stacked Ensembles often outperform a single model. The out-of-sample (test) MSE of the leading model is given by
+
+```{r}
+aml@leaderboard$mse[1]
+```
+
+The in-sample performance can be evaluated by
+
+```{r}
+aml@leader
+```
+
+This is in line with our previous results. To understand how the ensemble works, let's take a peek inside the Stacked Ensemble "All Models" model. The "All Models" ensemble is an ensemble of all of the individual models in the AutoML run. This is often the top performing model on the leaderboard.
+
+```{r}
+model_ids <- as.data.frame(aml@leaderboard$model_id)[,1]
+# Get the "All Models" Stacked Ensemble model
+se <- h2o.getModel(grep("StackedEnsemble_AllModels", model_ids, value = TRUE)[1])
+# Get the Stacked Ensemble metalearner model
+metalearner <- se@model$metalearner_model
+h2o.varimp(metalearner)
+```
+
+The table above gives us the variable importance of the metalearner in the ensemble. The AutoML Stacked Ensembles use the default metalearner algorithm (GLM with non-negative weights), so the variable importance of the metalearner is actually the standardized coefficient magnitudes of the GLM.
+
+```{r}
+h2o.varimp_plot(metalearner)
+```
+
+## Generating Predictions Using Leader Model
+
+We can also generate predictions on a test sample using the leader model object.
+
+```{r}
+pred <- as.matrix(h2o.predict(aml@leader,test_h)) # make prediction using x data from the test sample
+head(pred)
+```
+
+```{r}
+y_test <- as.matrix(test_h$lwage)
+R2_test <- 1-summary(lm((y_test-pred)^2~1))$coef[1]/var(y_test)
+cat("MSE, SE, R^2:" , summary(lm((y_test-pred)^2~1))$coef[1:2], R2_test)
+```
+
+We observe both a similar MSE and $R^2$ relative to the better performing models in our previous results.
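+
+As a small optional sketch (not in the original notebook), the test MSE of the AutoML leader can be placed next to two of the earlier single-method benchmarks, reusing objects computed above:
+
+```{r}
+# Sketch: line up the AutoML leader against earlier test MSEs from this notebook
+mse_automl <- summary(lm((y_test - pred)^2 ~ 1))$coef[1]
+round(c(OLS_basic = MSE.lm.basic[1], Random_Forest = MSE.rf[1], AutoML_leader = mse_automl), 4)
+```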
+ +```{r} +h2o.shutdown(prompt = F) # shut down the h20 automatically without prompting user +``` + diff --git a/PM3/r_ml_wage_prediction.irnb b/PM3/r_ml_wage_prediction.irnb index fb275dfd..ccd197c5 100644 --- a/PM3/r_ml_wage_prediction.irnb +++ b/PM3/r_ml_wage_prediction.irnb @@ -1,1799 +1,1799 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "_execution_state": "idle", - "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", - "papermill": { - "duration": 0.036479, - "end_time": "2021-02-13T18:19:43.396666", - "exception": false, - "start_time": "2021-02-13T18:19:43.360187", - "status": "completed" - }, - "tags": [], - "id": "nAE4EexhWVGB" - }, - "source": [ - "# Machine Learning Estimators for Wage Prediction" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.036639, - "end_time": "2021-02-13T18:19:43.468425", - "exception": false, - "start_time": "2021-02-13T18:19:43.431786", - "status": "completed" - }, - "tags": [], - "id": "yppG1kQBWVGC" - }, - "source": [ - "We illustrate how to predict an outcome variable Y in a high-dimensional setting, where the number of covariates $p$ is large in relation to the sample size $n$. So far we have used linear prediction rules, e.g. Lasso regression, for estimation.\n", - "Now, we also consider nonlinear prediction rules including tree-based methods." - ] - }, - { - "cell_type": "code", - "source": [ - "# Import relevant packages\n", - "\n", - "install.packages(\"xtable\")\n", - "install.packages(\"hdm\")\n", - "install.packages(\"glmnet\")\n", - "install.packages(\"randomForest\")\n", - "install.packages(\"rpart\")\n", - "install.packages(\"nnet\")\n", - "install.packages(\"gbm\")\n", - "install.packages(\"rpart.plot\")\n", - "install.packages(\"keras\")\n", - "\n", - "library(hdm)\n", - "library(xtable)\n", - "library(glmnet)\n", - "library(randomForest)\n", - "library(rpart)\n", - "library(nnet)\n", - "library(gbm)\n", - "library(rpart.plot)\n", - "library(keras)\n" - ], - "metadata": { - "id": "ww70bLKfEsOb" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.034705, - "end_time": "2021-02-13T18:19:43.537814", - "exception": false, - "start_time": "2021-02-13T18:19:43.503109", - "status": "completed" - }, - "tags": [], - "id": "bvL0TvgoWVGC" - }, - "source": [ - "## Data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.036082, - "end_time": "2021-02-13T18:19:43.609347", - "exception": false, - "start_time": "2021-02-13T18:19:43.573265", - "status": "completed" - }, - "tags": [], - "id": "KcgFOEEUWVGD" - }, - "source": [ - "Again, we consider data from the U.S. March Supplement of the Current Population Survey (CPS) in 2015.\n", - "The preproccessed sample consists of $5150$ never-married individuals." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.279387, - "end_time": "2021-02-13T18:19:43.923823", - "exception": false, - "start_time": "2021-02-13T18:19:43.644436", - "status": "completed" - }, - "tags": [], - "id": "vls0HahyWVGE" - }, - "outputs": [], - "source": [ - "file = \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv\"\n", - "data <- read.csv(file)\n", - "dim(data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.034902, - "end_time": "2021-02-13T18:19:43.994834", - "exception": false, - "start_time": "2021-02-13T18:19:43.959932", - "status": "completed" - }, - "tags": [], - "id": "Nu76eiIbWVGG" - }, - "source": [ - "The outcomes $Y_i$'s are hourly (log) wages of never-married workers living in the U.S. The raw regressors $Z_i$'s consist of a variety of characteristics, including experience, education and industry and occupation indicators." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.091723, - "end_time": "2021-02-13T18:19:44.123394", - "exception": false, - "start_time": "2021-02-13T18:19:44.031671", - "status": "completed" - }, - "tags": [], - "id": "G0M2NqjOWVGH" - }, - "outputs": [], - "source": [ - "Z <- subset(data,select=-c(lwage,wage)) # regressors\n", - "colnames(Z)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.037074, - "end_time": "2021-02-13T18:19:44.196749", - "exception": false, - "start_time": "2021-02-13T18:19:44.159675", - "status": "completed" - }, - "tags": [], - "id": "pa4XliHWWVGH" - }, - "source": [ - "The following figure shows the weekly wage distribution from the US survey data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.443391, - "end_time": "2021-02-13T18:19:44.677379", - "exception": false, - "start_time": "2021-02-13T18:19:44.233988", - "status": "completed" - }, - "tags": [], - "id": "H3nL_puaWVGI" - }, - "outputs": [], - "source": [ - "hist(data$wage, xlab= \"hourly wage\", main=\"Empirical wage distribution from the US survey data\", breaks= 35)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.036602, - "end_time": "2021-02-13T18:19:44.752465", - "exception": false, - "start_time": "2021-02-13T18:19:44.715863", - "status": "completed" - }, - "tags": [], - "id": "gHx9j6Z-WVGI" - }, - "source": [ - "Wages show a high degree of skewness. Hence, wages are transformed in almost all studies by\n", - "the logarithm." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.036009, - "end_time": "2021-02-13T18:19:44.826260", - "exception": false, - "start_time": "2021-02-13T18:19:44.790251", - "status": "completed" - }, - "tags": [], - "id": "pp584t1QWVGI" - }, - "source": [ - "## Analysis" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.036925, - "end_time": "2021-02-13T18:19:44.899159", - "exception": false, - "start_time": "2021-02-13T18:19:44.862234", - "status": "completed" - }, - "tags": [], - "id": "uqvQSWp3WVGI" - }, - "source": [ - "Due to the skewness of the data, we are considering log wages which leads to the following regression model\n", - "\n", - "$$log(wage) = g(Z) + \\epsilon.$$" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.036183, - "end_time": "2021-02-13T18:19:44.971528", - "exception": false, - "start_time": "2021-02-13T18:19:44.935345", - "status": "completed" - }, - "tags": [], - "id": "7mjPGAzeWVGI" - }, - "source": [ - "We will estimate the two sets of prediction rules: Linear and Nonlinear Models.\n", - "In linear models, we estimate the prediction rule of the form\n", - "\n", - "$$\\hat g(Z) = \\hat \\beta'X.$$\n", - "Again, we generate $X$ in two ways:\n", - "\n", - "1. Basic Model: $X$ consists of a set of raw regressors (e.g. gender, experience, education indicators, regional indicators).\n", - "\n", - "\n", - "2. Flexible Model: $X$ consists of all raw regressors from the basic model plus occupation and industry indicators, transformations (e.g., ${exp}^2$ and ${exp}^3$) and additional two-way interactions.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.037318, - "end_time": "2021-02-13T18:19:45.044959", - "exception": false, - "start_time": "2021-02-13T18:19:45.007641", - "status": "completed" - }, - "tags": [], - "id": "soVOsFWVWVGJ" - }, - "source": [ - "To evaluate the out-of-sample performance, we split the data first." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.062188, - "end_time": "2021-02-13T18:19:45.143118", - "exception": false, - "start_time": "2021-02-13T18:19:45.080930", - "status": "completed" - }, - "tags": [], - "id": "z93_1rRMWVGJ" - }, - "outputs": [], - "source": [ - "set.seed(1234)\n", - "training <- sample(nrow(data), nrow(data)*(3/4), replace=FALSE)\n", - "\n", - "data_train <- data[training,]\n", - "data_test <- data[-training,]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.038774, - "end_time": "2021-02-13T18:19:45.217757", - "exception": false, - "start_time": "2021-02-13T18:19:45.178983", - "status": "completed" - }, - "tags": [], - "id": "Fhb0MtAGWVGJ" - }, - "source": [ - "We construct the two different model matrices $X_{basic}$ and $X_{flex}$ for both the training and the test sample:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.094135, - "end_time": "2021-02-13T18:19:45.347955", - "exception": false, - "start_time": "2021-02-13T18:19:45.253820", - "status": "completed" - }, - "tags": [], - "id": "4N38OGcwWVGJ" - }, - "outputs": [], - "source": [ - "X_basic <- \"sex + exp1 + shs + hsg+ scl + clg + mw + so + we + C(occ2)+ C(ind2)\"\n", - "X_flex <- \"sex + exp1 + shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)\"\n", - "formula_basic <- as.formula(paste(\"lwage\", \"~\", X_basic))\n", - "formula_flex <- as.formula(paste(\"lwage\", \"~\", X_flex))\n", - "\n", - "model_X_basic_train <- model.matrix(formula_basic,data_train)\n", - "model_X_basic_test <- model.matrix(formula_basic,data_test)\n", - "p_basic <- dim(model_X_basic_train)[2]\n", - "model_X_flex_train <- model.matrix(formula_flex,data_train)\n", - "model_X_flex_test <- model.matrix(formula_flex,data_test)\n", - "p_flex <- dim(model_X_flex_train)[2]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.060969, - "end_time": "2021-02-13T18:19:45.445389", - "exception": false, - "start_time": "2021-02-13T18:19:45.384420", - "status": "completed" - }, - "tags": [], - "id": "iUbis0zFWVGJ" - }, - "outputs": [], - "source": [ - "Y_train <- data_train$lwage\n", - "Y_test <- data_test$lwage" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.062723, - "end_time": "2021-02-13T18:19:45.545189", - "exception": false, - "start_time": "2021-02-13T18:19:45.482466", - "status": "completed" - }, - "tags": [], - "id": "pKWfxw3sWVGK" - }, - "outputs": [], - "source": [ - "p_basic\n", - "p_flex" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.037704, - "end_time": "2021-02-13T18:19:45.622370", - "exception": false, - "start_time": "2021-02-13T18:19:45.584666", - "status": "completed" - }, - "tags": [], - "id": "BFWIXx9KWVGK" - }, - "source": [ - "As known from our first lab, the basic model consists of $51$ regressors and the flexible model of $246$ regressors. Let us fit our models to the training sample using the two different model specifications. We are starting by running a simple ols regression." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.038763, - "end_time": "2021-02-13T18:19:45.699126", - "exception": false, - "start_time": "2021-02-13T18:19:45.660363", - "status": "completed" - }, - "tags": [], - "id": "FvwCARMSWVGK" - }, - "source": [ - "### OLS" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.039458, - "end_time": "2021-02-13T18:19:45.779460", - "exception": false, - "start_time": "2021-02-13T18:19:45.740002", - "status": "completed" - }, - "tags": [], - "id": "TmXoHZ29WVGK" - }, - "source": [ - "We fit the basic model to our training data by running an ols regression and compute the mean squared error on the test sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.069537, - "end_time": "2021-02-13T18:19:45.887169", - "exception": false, - "start_time": "2021-02-13T18:19:45.817632", - "status": "completed" - }, - "tags": [], - "id": "dKpIzgYhWVGK" - }, - "outputs": [], - "source": [ - "# ols (basic model)\n", - "fit.lm.basic <- lm(formula_basic, data_train)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.074423, - "end_time": "2021-02-13T18:19:45.999870", - "exception": false, - "start_time": "2021-02-13T18:19:45.925447", - "status": "completed" - }, - "tags": [], - "id": "oVxJtcLqWVGK" - }, - "outputs": [], - "source": [ - "# Compute the Out-Of-Sample Performance\n", - "yhat.lm.basic <- predict(fit.lm.basic, newdata=data_test)\n", - "cat(\"The mean squared error (MSE) using the basic model is equal to\" , mean((Y_test-yhat.lm.basic)^2)) # MSE OLS (basic model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.052764, - "end_time": "2021-02-13T18:19:46.122829", - "exception": false, - "start_time": "2021-02-13T18:19:46.070065", - "status": "completed" - }, - "tags": [], - "id": "K3pW0712WVGK" - }, - "source": [ - "To determine the out-of-sample $MSE$ and the standard error in one step, we can use the function *lm*:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.076484, - "end_time": "2021-02-13T18:19:46.239015", - "exception": false, - "start_time": "2021-02-13T18:19:46.162531", - "status": "completed" - }, - "tags": [], - "id": "pQ8T93iCWVGL" - }, - "outputs": [], - "source": [ - "MSE.lm.basic <- summary(lm((Y_test-yhat.lm.basic)^2~1))$coef[1:2]\n", - "MSE.lm.basic" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.039088, - "end_time": "2021-02-13T18:19:46.317915", - "exception": false, - "start_time": "2021-02-13T18:19:46.278827", - "status": "completed" - }, - "tags": [], - "id": "xxRMlMvoWVGL" - }, - "source": [ - "We also compute the out-of-sample $R^2$:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.057098, - "end_time": "2021-02-13T18:19:46.413754", - "exception": false, - "start_time": "2021-02-13T18:19:46.356656", - "status": "completed" - }, - "tags": [], - "id": "wOGfwTlwWVGL" - }, - "outputs": [], - "source": [ - "R2.lm.basic <- 1-MSE.lm.basic[1]/var(Y_test)\n", - "cat(\"The R^2 using the basic model is equal to\",R2.lm.basic) # MSE OLS (basic model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.039585, - "end_time": "2021-02-13T18:19:46.492903", - "exception": false, - "start_time": 
"2021-02-13T18:19:46.453318", - "status": "completed" - }, - "tags": [], - "id": "7rsIX-qEWVGL" - }, - "source": [ - "We repeat the same procedure for the flexible model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.198636, - "end_time": "2021-02-13T18:19:46.730717", - "exception": false, - "start_time": "2021-02-13T18:19:46.532081", - "status": "completed" - }, - "tags": [], - "id": "lCRbc_HDWVGL" - }, - "outputs": [], - "source": [ - "# ols (flexible model)\n", - "fit.lm.flex <- lm(formula_flex, data_train)\n", - "# Compute the Out-Of-Sample Performance\n", - "options(warn=-1)\n", - "yhat.lm.flex <- predict(fit.lm.flex, newdata=data_test)\n", - "MSE.lm.flex <- summary(lm((Y_test-yhat.lm.flex)^2~1))$coef[1:2]\n", - "R2.lm.flex <- 1-MSE.lm.flex[1]/var(Y_test)\n", - "cat(\"The R^2 using the flexible model is equal to\",R2.lm.flex) # MSE OLS (flexible model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.051953, - "end_time": "2021-02-13T18:19:46.853182", - "exception": false, - "start_time": "2021-02-13T18:19:46.801229", - "status": "completed" - }, - "tags": [], - "id": "1DKdOWI8WVGL" - }, - "source": [ - "We observe that ols regression works better for the basic model with smaller $p/n$ ratio. We now proceed by running lasso regressions and related penalized methods." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.042521, - "end_time": "2021-02-13T18:19:46.935859", - "exception": false, - "start_time": "2021-02-13T18:19:46.893338", - "status": "completed" - }, - "tags": [], - "id": "-zfmqjNVWVGM" - }, - "source": [ - "### Lasso, Ridge and Elastic Net\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.040161, - "end_time": "2021-02-13T18:19:47.015626", - "exception": false, - "start_time": "2021-02-13T18:19:46.975465", - "status": "completed" - }, - "tags": [], - "id": "9fvu80MpWVGM" - }, - "source": [ - "Considering the basic model, we run a lasso/post-lasso regression first and then we compute the measures for the out-of-sample performance. Note that applying the package *hdm* and the function *rlasso* we rely on a theoretical based choice of the penalty level $\\lambda$ in the lasso regression." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.577781, - "end_time": "2021-02-13T18:19:47.634269", - "exception": false, - "start_time": "2021-02-13T18:19:47.056488", - "status": "completed" - }, - "tags": [], - "id": "OFhArnPNWVGM" - }, - "outputs": [], - "source": [ - "# lasso and variants\n", - "fit.rlasso <- rlasso(formula_basic, data_train, post=FALSE)\n", - "fit.rlasso.post <- rlasso(formula_basic, data_train, post=TRUE)\n", - "yhat.rlasso <- predict(fit.rlasso, newdata=data_test)\n", - "yhat.rlasso.post <- predict(fit.rlasso.post, newdata=data_test)\n", - "\n", - "MSE.lasso <- summary(lm((Y_test-yhat.rlasso)^2~1))$coef[1:2]\n", - "MSE.lasso.post <- summary(lm((Y_test-yhat.rlasso.post)^2~1))$coef[1:2]\n", - "\n", - "R2.lasso <- 1-MSE.lasso[1]/var(Y_test)\n", - "R2.lasso.post <- 1-MSE.lasso.post[1]/var(Y_test)\n", - "cat(\"The R^2 using the basic model is equal to\",R2.lasso,\"for lasso and\",R2.lasso.post,\"for post-lasso\") # R^2 lasso/post-lasso (basic model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.049543, - "end_time": "2021-02-13T18:19:47.757271", - "exception": false, - "start_time": "2021-02-13T18:19:47.707728", - "status": "completed" - }, - "tags": [], - "id": "NRUPJtOzWVGM" - }, - "source": [ - "Now, we repeat the same procedure for the flexible model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 3.430649, - "end_time": "2021-02-13T18:19:51.229007", - "exception": false, - "start_time": "2021-02-13T18:19:47.798358", - "status": "completed" - }, - "tags": [], - "id": "17doIQ14WVGM" - }, - "outputs": [], - "source": [ - "fit.rlasso.flex <- rlasso(formula_flex, data_train, post=FALSE)\n", - "fit.rlasso.post.flex <- rlasso(formula_flex, data_train, post=TRUE)\n", - "yhat.rlasso.flex <- predict(fit.rlasso.flex, newdata=data_test)\n", - "yhat.rlasso.post.flex <- predict(fit.rlasso.post.flex, newdata=data_test)\n", - "\n", - "MSE.lasso.flex <- summary(lm((Y_test-yhat.rlasso.flex)^2~1))$coef[1:2]\n", - "MSE.lasso.post.flex <- summary(lm((Y_test-yhat.rlasso.post.flex)^2~1))$coef[1:2]\n", - "\n", - "R2.lasso.flex <- 1-MSE.lasso.flex[1]/var(Y_test)\n", - "R2.lasso.post.flex <- 1-MSE.lasso.post.flex[1]/var(Y_test)\n", - "cat(\"The R^2 using the flexible model is equal to\",R2.lasso.flex,\"for lasso and\",R2.lasso.post.flex,\"for post-lasso\") # R^2 lasso/post-lasso (flexible model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.041452, - "end_time": "2021-02-13T18:19:51.436401", - "exception": false, - "start_time": "2021-02-13T18:19:51.394949", - "status": "completed" - }, - "tags": [], - "id": "i9xxyyM-WVGN" - }, - "source": [ - "In contrast to a theoretical based choice of the tuning parameter $\\lambda$ in the lasso regression, we can also use cross-validation to determine the penalty level by applying the package *glmnet* and the function cv.glmnet. In this context, we also run a ridge and a elastic net regression by adjusting the parameter *alpha*." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 2.248453, - "end_time": "2021-02-13T18:19:53.725885", - "exception": false, - "start_time": "2021-02-13T18:19:51.477432", - "status": "completed" - }, - "tags": [], - "id": "JfxqhppCWVGT" - }, - "outputs": [], - "source": [ - "fit.lasso.cv <- cv.glmnet(model_X_basic_train, Y_train, family=\"gaussian\", alpha=1)\n", - "fit.ridge <- cv.glmnet(model_X_basic_train, Y_train, family=\"gaussian\", alpha=0)\n", - "fit.elnet <- cv.glmnet(model_X_basic_train, Y_train, family=\"gaussian\", alpha=.5)\n", - "\n", - "yhat.lasso.cv <- predict(fit.lasso.cv, newx = model_X_basic_test)\n", - "yhat.ridge <- predict(fit.ridge, newx = model_X_basic_test)\n", - "yhat.elnet <- predict(fit.elnet, newx = model_X_basic_test)\n", - "\n", - "MSE.lasso.cv <- summary(lm((Y_test-yhat.lasso.cv)^2~1))$coef[1:2]\n", - "MSE.ridge <- summary(lm((Y_test-yhat.ridge)^2~1))$coef[1:2]\n", - "MSE.elnet <- summary(lm((Y_test-yhat.elnet)^2~1))$coef[1:2]\n", - "\n", - "R2.lasso.cv <- 1-MSE.lasso.cv[1]/var(Y_test)\n", - "R2.ridge <- 1-MSE.ridge[1]/var(Y_test)\n", - "R2.elnet <- 1-MSE.elnet[1]/var(Y_test)\n", - "\n", - "# R^2 using cross-validation (basic model)\n", - "cat(\"R^2 using cross-validation for lasso, ridge and elastic net in the basic model:\",R2.lasso.cv,R2.ridge,R2.elnet)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.042613, - "end_time": "2021-02-13T18:19:53.812553", - "exception": false, - "start_time": "2021-02-13T18:19:53.769940", - "status": "completed" - }, - "tags": [], - "id": "dv2mcitqWVGT" - }, - "source": [ - "Note that the following calculations for the flexible model need some computation time." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 13.588391, - "end_time": "2021-02-13T18:20:07.443188", - "exception": false, - "start_time": "2021-02-13T18:19:53.854797", - "status": "completed" - }, - "tags": [], - "id": "GxOMyb3LWVGU" - }, - "outputs": [], - "source": [ - "fit.lasso.cv.flex <- cv.glmnet(model_X_flex_train, Y_train, family=\"gaussian\", alpha=1)\n", - "fit.ridge.flex <- cv.glmnet(model_X_flex_train, Y_train, family=\"gaussian\", alpha=0)\n", - "fit.elnet.flex <- cv.glmnet(model_X_flex_train, Y_train, family=\"gaussian\", alpha=.5)\n", - "\n", - "yhat.lasso.cv.flex <- predict(fit.lasso.cv.flex , newx = model_X_flex_test)\n", - "yhat.ridge.flex <- predict(fit.ridge.flex , newx = model_X_flex_test)\n", - "yhat.elnet.flex <- predict(fit.elnet.flex , newx = model_X_flex_test)\n", - "\n", - "MSE.lasso.cv.flex <- summary(lm((Y_test-yhat.lasso.cv.flex )^2~1))$coef[1:2]\n", - "MSE.ridge.flex <- summary(lm((Y_test-yhat.ridge.flex )^2~1))$coef[1:2]\n", - "MSE.elnet.flex <- summary(lm((Y_test-yhat.elnet.flex )^2~1))$coef[1:2]\n", - "\n", - "R2.lasso.cv.flex <- 1-MSE.lasso.cv.flex [1]/var(Y_test)\n", - "R2.ridge.flex <- 1-MSE.ridge.flex [1]/var(Y_test)\n", - "R2.elnet.flex <- 1-MSE.elnet.flex [1]/var(Y_test)\n", - "\n", - "# R^2 using cross-validation (flexible model)\n", - "cat(\"R^2 using cross-validation for lasso, ridge and elastic net in the flexible model:\",R2.lasso.cv.flex,R2.ridge.flex,R2.elnet.flex)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.04263, - "end_time": "2021-02-13T18:20:07.529566", - "exception": false, - "start_time": "2021-02-13T18:20:07.486936", - "status": "completed" - }, - "tags": [], - "id": "QFat4RN3WVGU" - }, - "source": [ - 
"The performance of the lasso regression with cross-validated penalty is quite similar to the performance of lasso using a theoretical based choice of the tuning parameter." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.042859, - "end_time": "2021-02-13T18:20:07.614751", - "exception": false, - "start_time": "2021-02-13T18:20:07.571892", - "status": "completed" - }, - "tags": [], - "id": "j6tbPExhWVGU" - }, - "source": [ - "#Non-linear models" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.042125, - "end_time": "2021-02-13T18:20:07.699092", - "exception": false, - "start_time": "2021-02-13T18:20:07.656967", - "status": "completed" - }, - "tags": [], - "id": "OtyGNYtFWVGU" - }, - "source": [ - "Besides linear regression models, we consider nonlinear regression models to build a predictive model. We are applying regression trees, random forests, boosted trees and neural nets to estimate the regression function $g(X)$." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.043267, - "end_time": "2021-02-13T18:20:08.261600", - "exception": false, - "start_time": "2021-02-13T18:20:08.218333", - "status": "completed" - }, - "tags": [], - "id": "c2NCnTfZWVGV" - }, - "source": [ - "## Regression Trees" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.043445, - "end_time": "2021-02-13T18:20:08.348402", - "exception": false, - "start_time": "2021-02-13T18:20:08.304957", - "status": "completed" - }, - "tags": [], - "id": "ShRO5WNpWVGV" - }, - "source": [ - "We fit a regression tree to the training data using the basic model. The variable *cp* controls the complexity of the regression tree, i.e. how deep we build the tree." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 1.843972, - "end_time": "2021-02-13T18:20:10.235503", - "exception": false, - "start_time": "2021-02-13T18:20:08.391531", - "status": "completed" - }, - "tags": [], - "id": "3n80mXGeWVGV" - }, - "outputs": [], - "source": [ - "# tree\n", - "fit.trees <- rpart(formula_basic, data_train, minbucket=5, cp = 0.001)\n", - "prp(fit.trees, leaf.round=1, space=2, yspace=2, split.space=2,shadow.col = \"gray\",trace = 1) # plotting the tree" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.046456, - "end_time": "2021-02-13T18:20:10.328795", - "exception": false, - "start_time": "2021-02-13T18:20:10.282339", - "status": "completed" - }, - "tags": [], - "id": "KrqAEnpvWVGW" - }, - "source": [ - "An important method to improve predictive performance is called \"Pruning the Tree\". This\n", - "means the process of cutting down the branches of a tree. We apply pruning to the complex tree above to reduce the depth. Initially, we determine the optimal complexity of the regression tree." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.070106, - "end_time": "2021-02-13T18:20:10.445828", - "exception": false, - "start_time": "2021-02-13T18:20:10.375722", - "status": "completed" - }, - "tags": [], - "id": "vCrBEct1WVGW" - }, - "outputs": [], - "source": [ - "bestcp <- fit.trees$cptable[which.min(fit.trees$cptable[,\"xerror\"]),\"CP\"]\n", - "bestcp" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.047157, - "end_time": "2021-02-13T18:20:10.540327", - "exception": false, - "start_time": "2021-02-13T18:20:10.493170", - "status": "completed" - }, - "tags": [], - "id": "bTEqTX1ZWVGW" - }, - "source": [ - "Now, we can prune the tree and visualize the prediction rule." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.543483, - "end_time": "2021-02-13T18:20:11.131455", - "exception": false, - "start_time": "2021-02-13T18:20:10.587972", - "status": "completed" - }, - "tags": [], - "id": "a9Sirh4_WVGW" - }, - "outputs": [], - "source": [ - "fit.prunedtree <- prune(fit.trees,cp=bestcp)\n", - "prp(fit.prunedtree,leaf.round=1, space=3, yspace=3, split.space=7, shadow.col = \"gray\",trace = 1,yesno=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.04994, - "end_time": "2021-02-13T18:20:11.334448", - "exception": false, - "start_time": "2021-02-13T18:20:11.284508", - "status": "completed" - }, - "tags": [], - "id": "2oDS05mgWVGW" - }, - "source": [ - "Finally, we calculate the mean-squared error and the $R^2$ on the test sample to evaluate the out-of-sample performance of the pruned tree." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.079534, - "end_time": "2021-02-13T18:20:11.463701", - "exception": false, - "start_time": "2021-02-13T18:20:11.384167", - "status": "completed" - }, - "tags": [], - "id": "AvXxs5R3WVGW" - }, - "outputs": [], - "source": [ - "yhat.pt <- predict(fit.prunedtree,newdata=data_test)\n", - "MSE.pt <- summary(lm((Y_test-yhat.pt)^2~1))$coef[1:2]\n", - "R2.pt <- 1-MSE.pt[1]/var(Y_test)\n", - "\n", - "# R^2 of the pruned tree\n", - "cat(\"R^2 of the pruned tree:\",R2.pt)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.052287, - "end_time": "2021-02-13T18:20:11.566330", - "exception": false, - "start_time": "2021-02-13T18:20:11.514043", - "status": "completed" - }, - "tags": [], - "id": "03cG01y1WVGX" - }, - "source": [ - "## Random Forest and Boosted Trees" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.050794, - "end_time": "2021-02-13T18:20:11.667980", - "exception": false, - "start_time": "2021-02-13T18:20:11.617186", - "status": "completed" - }, - "tags": [], - "id": "7nRCkm1WWVGX" - }, - "source": [ - "In the next step, we apply the more advanced tree-based methods random forest and boosted trees." 
- ] - }, - { - "cell_type": "code", - "source": [ - "# random forest\n", - "fit.rf <- randomForest(model_X_basic_train, Y_train, ntree=2000, nodesize=20, data = data_train)\n", - "\n", - "## Evaluating the method\n", - "yhat.rf <- predict(fit.rf, newdata=model_X_basic_test) # prediction\n", - "\n", - "MSE.rf = summary(lm((Y_test-yhat.rf)^2~1))$coef[1:2]\n", - "R2.rf <- 1-MSE.rf[1]/var(Y_test)\n" - ], - "metadata": { - "id": "ZbLiUr0Lh4Le" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 56.677891, - "end_time": "2021-02-13T18:21:08.396363", - "exception": false, - "start_time": "2021-02-13T18:20:11.718472", - "status": "completed" - }, - "tags": [], - "id": "p1Q7NNZaWVGX" - }, - "outputs": [], - "source": [ - "# boosting\n", - "fit.boost <- gbm(formula_basic, data=data_train, distribution= \"gaussian\", bag.fraction = .5, interaction.depth=2, n.trees=1000, shrinkage=.01)\n", - "best.boost <- gbm.perf(fit.boost, plot.it = FALSE) # cross-validation to determine when to stop\n", - "\n", - "## Evaluating the method\n", - "yhat.boost <- predict(fit.boost, newdata=data_test, n.trees=best.boost)\n", - "\n", - "MSE.boost = summary(lm((Y_test-yhat.boost)^2~1))$coef[1:2]\n", - "R2.boost <- 1-MSE.boost[1]/var(Y_test)\n" - ] - }, - { - "cell_type": "code", - "source": [ - "# printing R^2\n", - "cat(\"R^2 of the random forest and boosted trees:\", R2.rf, R2.boost)" - ], - "metadata": { - "id": "WkzBr2OOi9GC" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## NNets\n", - "\n", - "First, we need to determine the structure of our network. We are using the R package *keras* to build a simple sequential neural network with three dense layers -- 2 hidden and one output layer." 
- ], - "metadata": { - "id": "tyLLLoTCunpl" - } - }, - { - "cell_type": "code", - "source": [ - "# Define the neural network architecture\n", - "model <- keras_model_sequential() %>%\n", - " layer_dense(units = 50, activation = 'relu', input_shape = dim(model_X_basic_train)[2]) %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", - " layer_dense(units = 1) # Output layer with 1 unit for regression task\n", - "\n", - "# Compile the model\n", - "model %>% compile(\n", - " optimizer = optimizer_adam(lr = 0.01),\n", - " loss = \"mse\",\n", - " metrics = c(\"mae\"),\n", - ")\n", - "\n", - "summary(model)" - ], - "metadata": { - "id": "hKNFcGgwt3gm" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "num_epochs <- 100\n", - "\n", - "# Define early stopping based on validation set (20%) performance\n", - "early_stopping <- callback_early_stopping(monitor = 'val_loss', patience = 5) # Patience set to 5 epochs (default in skorch is 5)\n", - "\n", - "# Train the model\n", - "model %>% fit(\n", - " model_X_basic_train, Y_train,\n", - " epochs = num_epochs,\n", - " batch_size = 10,\n", - " validation_split = 0.2, # 20% validation set\n", - " verbose = 0,\n", - " callbacks = list(early_stopping)\n", - ")" - ], - "metadata": { - "id": "c3guqZeeyDd3" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# evaluating the performance\n", - "model %>% evaluate(model_X_basic_test, Y_test, verbose = 0)" - ], - "metadata": { - "id": "oFRmau4lzDoa" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Calculating the performance measures\n", - "yhat.nn <- model %>% predict(model_X_basic_test)\n", - "MSE.nn = summary(lm((Y_test-yhat.nn)^2~1))$coef[1:2]\n", - "R2.nn <- 1-MSE.nn[1]/var(Y_test)\n", - "# printing R^2\n", - "cat(\"R^2 of the neural network:\",R2.nn)" - ], - "metadata": { - "id": "UZP6ytgUzAlz" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.051225, - "end_time": "2021-02-13T18:21:08.500313", - "exception": false, - "start_time": "2021-02-13T18:21:08.449088", - "status": "completed" - }, - "tags": [], - "id": "2KLGpmUTWVGX" - }, - "source": [ - "To conclude, let us have a look at our results." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.052403, - "end_time": "2021-02-13T18:21:08.603976", - "exception": false, - "start_time": "2021-02-13T18:21:08.551573", - "status": "completed" - }, - "tags": [], - "id": "rz0O_d-qWVGX" - }, - "source": [ - "## Results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.167847, - "end_time": "2021-02-13T18:21:08.823485", - "exception": false, - "start_time": "2021-02-13T18:21:08.655638", - "status": "completed" - }, - "tags": [], - "id": "gtM59T07WVGY" - }, - "outputs": [], - "source": [ - "table<- matrix(0, 16, 3)\n", - "table[1,1:2] <- MSE.lm.basic\n", - "table[2,1:2] <- MSE.lm.flex\n", - "table[3,1:2] <- MSE.lasso\n", - "table[4,1:2] <- MSE.lasso.post\n", - "table[5,1:2] <- MSE.lasso.flex\n", - "table[6,1:2] <- MSE.lasso.post.flex\n", - "table[7,1:2] <- MSE.lasso.cv\n", - "table[8,1:2] <- MSE.ridge\n", - "table[9,1:2] <- MSE.elnet\n", - "table[10,1:2] <- MSE.lasso.cv.flex\n", - "table[11,1:2] <- MSE.ridge.flex\n", - "table[12,1:2] <- MSE.elnet.flex\n", - "table[13,1:2] <- MSE.rf\n", - "table[14,1:2] <- MSE.boost\n", - "table[15,1:2] <- MSE.pt\n", - "table[16,1:2] <- MSE.nn\n", - "\n", - "\n", - "\n", - "table[1,3] <- R2.lm.basic\n", - "table[2,3] <- R2.lm.flex\n", - "table[3,3] <- R2.lasso\n", - "table[4,3] <- R2.lasso.post\n", - "table[5,3] <- R2.lasso.flex\n", - "table[6,3] <- R2.lasso.post.flex\n", - "table[7,3] <- R2.lasso.cv\n", - "table[8,3] <- R2.ridge\n", - "table[9,3] <- R2.elnet\n", - "table[10,3] <- R2.lasso.cv.flex\n", - "table[11,3] <- R2.ridge.flex\n", - "table[12,3] <- R2.elnet.flex\n", - "table[13,3] <- R2.rf\n", - "table[14,3] <- R2.boost\n", - "table[15,3] <- R2.pt\n", - "table[16,3] <- R2.nn\n", - "\n", - "\n", - "\n", - "\n", - "colnames(table)<- c(\"MSE\", \"S.E. for MSE\", \"R-squared\")\n", - "rownames(table)<- c(\"Least Squares (basic)\",\"Least Squares (flexible)\", \"Lasso\", \"Post-Lasso\",\"Lasso (flexible)\",\"Post-Lasso (flexible)\",\n", - " \"Cross-Validated lasso\", \"Cross-Validated ridge\",\"Cross-Validated elnet\",\"Cross-Validated lasso (flexible)\",\"Cross-Validated ridge (flexible)\",\"Cross-Validated elnet (flexible)\",\n", - " \"Random Forest\",\"Boosted Trees\", \"Pruned Tree\", \"Neural Net (Early)\")\n", - "tab <- xtable(table, digits =3)\n", - "print(tab,type=\"latex\") # set type=\"latex\" for printing table in LaTeX\n", - "tab" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.052897, - "end_time": "2021-02-13T18:21:08.930888", - "exception": false, - "start_time": "2021-02-13T18:21:08.877991", - "status": "completed" - }, - "tags": [], - "id": "-A-sNNZmWVGY" - }, - "source": [ - "Above, we displayed the results for a single split of data into the training and testing part. The table shows the test MSE in column 1 as well as the standard error in column 2 and the test $R^2$\n", - "in column 3. We see that most models perform similarly. For most of these methods, test MSEs are within one standard error of each other. Remarkably, OLS with just the basic variables performs extremely well. However, OLS on a flexible model with many regressors performs very poorly giving nearly the highest test MSE. It is worth noticing that, as this is just a simple illustration meant to be relatively quick, the nonlinear models are not tuned. Thus, there is potential to improve the performance of the nonlinear methods we used in the analysis." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.052594, - "end_time": "2021-02-13T18:21:09.036009", - "exception": false, - "start_time": "2021-02-13T18:21:08.983415", - "status": "completed" - }, - "tags": [], - "id": "G7vGZPhPWVGY" - }, - "source": [ - "### Ensemble learning" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.053134, - "end_time": "2021-02-13T18:21:09.146558", - "exception": false, - "start_time": "2021-02-13T18:21:09.093424", - "status": "completed" - }, - "tags": [], - "id": "-py__sTwWVGY" - }, - "source": [ - "In the final step, we can build a prediction model by combing the strengths of the models we considered so far. This ensemble method is of the form\n", - "\t$$ f(x) = \\sum_{k=1}^K \\alpha_k f_k(x) $$\n", - "where the $f_k$'s denote our prediction rules from the table above and the $\\alpha_k$'s are the corresponding weights.\n", - "\n", - "We first estimate the weights without penalization." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.079851, - "end_time": "2021-02-13T18:21:09.388686", - "exception": false, - "start_time": "2021-02-13T18:21:09.308835", - "status": "completed" - }, - "tags": [], - "id": "amnVg2qMWVGY" - }, - "outputs": [], - "source": [ - "ensemble.ols <- summary(lm(Y_test~ yhat.lm.basic + yhat.lm.flex + yhat.rlasso + yhat.rlasso.flex + yhat.rlasso.post + yhat.rlasso.post.flex + yhat.lasso.cv + yhat.lasso.cv.flex + yhat.ridge + yhat.ridge.flex + yhat.elnet + yhat.elnet.flex + yhat.pt + yhat.rf + yhat.boost + yhat.nn))\n", - "ensemble.ols\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.054822, - "end_time": "2021-02-13T18:21:09.498067", - "exception": false, - "start_time": "2021-02-13T18:21:09.443245", - "status": "completed" - }, - "tags": [], - "id": "9dNbETxxWVGY" - }, - "source": [ - "Alternatively, we can determine the weights via lasso regression." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.175196, - "end_time": "2021-02-13T18:21:09.727077", - "exception": false, - "start_time": "2021-02-13T18:21:09.551881", - "status": "completed" - }, - "tags": [], - "id": "tUNMMhdVWVGZ" - }, - "outputs": [], - "source": [ - "ensemble.lasso <- summary(rlasso(Y_test~ yhat.lm.basic + yhat.lm.flex + yhat.rlasso + + yhat.rlasso.flex + yhat.rlasso.post + yhat.rlasso.post.flex + yhat.lasso.cv + yhat.lasso.cv.flex + yhat.ridge + yhat.ridge.flex + yhat.elnet + yhat.elnet.flex + yhat.pt + yhat.rf + yhat.boost + yhat.nn))\n", - "ensemble.lasso" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.055874, - "end_time": "2021-02-13T18:21:09.838636", - "exception": false, - "start_time": "2021-02-13T18:21:09.782762", - "status": "completed" - }, - "tags": [], - "id": "Xv9P__WOWVGZ" - }, - "source": [ - "The estimated weights are shown in the following table." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "papermill": { - "duration": 0.094431, - "end_time": "2021-02-13T18:21:09.988946", - "exception": false, - "start_time": "2021-02-13T18:21:09.894515", - "status": "completed" - }, - "tags": [], - "id": "tHp5cn8rWVGZ" - }, - "outputs": [], - "source": [ - "table<- matrix(0, 17, 2)\n", - "table[1:17,1] <- ensemble.ols$coef[1:17]\n", - "table[1:17,2] <- ensemble.lasso$coef[1:17]\n", - "\n", - "\n", - "colnames(table)<- c(\"Weight OLS\", \"Weight Lasso\")\n", - "\n", - "\n", - "rownames(table)<- c(\"Constant\",\"Least Squares (basic)\", \"Least Squares (flexible)\", \"Lasso (basic)\",\n", - " \"Lasso (flexible)\", \"Post-Lasso (basic)\", \"Post-Lasso (flexible)\", \"LassoCV (basic)\",\n", - " \"Lasso CV (flexible)\", \"Ridge CV (basic)\", \"Ridge CV (flexible)\", \"ElNet CV (basic)\",\n", - " \"ElNet CV (flexible)\", \"Pruned Tree\", \"Random Forest\",\"Boosted Trees\", \"Neural Net\")\n", - "tab <- xtable(table, digits =3)\n", - "print(tab,type=\"latex\") # set type=\"latex\" for printing table in LaTeX\n", - "tab\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.056002, - "end_time": "2021-02-13T18:21:10.101284", - "exception": false, - "start_time": "2021-02-13T18:21:10.045282", - "status": "completed" - }, - "tags": [], - "id": "ac-Nynd9WVGZ" - }, - "source": [ - "We note the superior $R^2$ performance of the ensembles. Though for more unbiased performance evaluation, we should have left out a third sample to validate the performance of the stacked model." - ] - }, - { - "cell_type": "code", - "source": [ - "# print ensemble R^2\n", - "cat(\"R^2 of stacking with LS weights:\",ensemble.ols$adj.r.squared,\"\\n\")\n", - "cat(\"R^2 of stacking with Lasso weights:\",ensemble.lasso$adj.r.squared,\"\\n\")" - ], - "metadata": { - "id": "pcyQsL5xmKxR" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Automatic Machine Learning with H20 AutoML\n", - "\n", - "We illustrate how to predict an outcome variable Y in a high-dimensional setting, using the AutoML package *H2O* that covers the complete pipeline from the raw dataset to the deployable machine learning model. In last few years, AutoML or automated machine learning has become widely popular among data science community. Again, we reanalyze the wage prediction problem using data from the U.S. March Supplement of the Current Population Survey (CPS) in 2015." - ], - "metadata": { - "id": "0G7F6n_2ELZJ" - } - }, - { - "cell_type": "markdown", - "source": [ - "We can use AutoML as a benchmark and compare it to the methods that we used previously where we applied one machine learning method after the other." 
- ], - "metadata": { - "id": "E7yQsmgqEPsz" - } - }, - { - "cell_type": "code", - "source": [ - "# load the H2O package\n", - "install.packages(\"h2o\")\n", - "library(h2o)" - ], - "metadata": { - "id": "NPz9qeg2EPAN" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# start h2o cluster\n", - "h2o.init()" - ], - "metadata": { - "id": "fxz49VSXEZDC" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# convert data as h2o type\n", - "train_h = as.h2o(data_train)\n", - "test_h = as.h2o(data_test)\n", - "\n", - "# have a look at the data\n", - "h2o.describe(train_h)" - ], - "metadata": { - "id": "orzSZz_eEnWg" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "y_name = 'lwage'\n", - "X_names = setdiff(names(data), c('lwage','wage','occ', 'ind'))\n", - "\n", - "# run AutoML for 10 base models and a maximal runtime of 100 seconds\n", - "aml = h2o.automl(x=X_names, y=y_name,\n", - " training_frame = train_h,\n", - " leaderboard_frame = test_h,\n", - " max_models = 10,\n", - " seed = 1,\n", - " max_runtime_secs = 100\n", - " )\n", - "# AutoML Leaderboard\n", - "lb = aml@leaderboard\n", - "print(lb, n = nrow(lb))" - ], - "metadata": { - "id": "5PohiG13EqTn" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "We see that two Stacked Ensembles are at the top of the leaderboard. Stacked Ensembles often outperform a single model. The out-of-sample (test) MSE of the leading model is given by" - ], - "metadata": { - "id": "b6-CATAVHQtV" - } - }, - { - "cell_type": "code", - "source": [ - "aml@leaderboard$mse[1]" - ], - "metadata": { - "id": "fIBhP8LSGpA6" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "The in-sample performance can be evaluated by" - ], - "metadata": { - "id": "BikxGH4kHWDh" - } - }, - { - "cell_type": "code", - "source": [ - "aml@leader" - ], - "metadata": { - "id": "MDYChZcXHVgf" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "This is in line with our previous results. To understand how the ensemble works, let's take a peek inside the Stacked Ensemble \"All Models\" model. The \"All Models\" ensemble is an ensemble of all of the individual models in the AutoML run. This is often the top performing model on the leaderboard." - ], - "metadata": { - "id": "EEXG6snKHaXY" - } - }, - { - "cell_type": "code", - "source": [ - "model_ids <- as.data.frame(aml@leaderboard$model_id)[,1]\n", - "# Get the \"All Models\" Stacked Ensemble model\n", - "se <- h2o.getModel(grep(\"StackedEnsemble_AllModels\", model_ids, value = TRUE)[1])\n", - "# Get the Stacked Ensemble metalearner model\n", - "metalearner <- se@model$metalearner_model\n", - "h2o.varimp(metalearner)" - ], - "metadata": { - "id": "4mnpHT3wHYq9" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "The table above gives us the variable importance of the metalearner in the ensemble. 
The AutoML Stacked Ensembles use the default metalearner algorithm (GLM with non-negative weights), so the variable importance of the metalearner is actually the standardized coefficient magnitudes of the GLM.\n" - ], - "metadata": { - "id": "6m1zg2e-HhQV" - } - }, - { - "cell_type": "code", - "source": [ - "h2o.varimp_plot(metalearner)" - ], - "metadata": { - "id": "J6azOyskHcMu" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Generating Predictions Using Leader Model\n", - "\n", - "We can also generate predictions on a test sample using the leader model object." - ], - "metadata": { - "id": "QDyCj4RdH-PK" - } - }, - { - "cell_type": "code", - "source": [ - "pred <- as.matrix(h2o.predict(aml@leader,test_h)) # make prediction using x data from the test sample\n", - "head(pred)" - ], - "metadata": { - "id": "c6mkVpADH-hB" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "y_test <- as.matrix(test_h$lwage)\n", - "R2_test <- 1-summary(lm((y_test-pred)^2~1))$coef[1]/var(y_test)\n", - "cat(\"MSE, SE, R^2:\" , summary(lm((y_test-pred)^2~1))$coef[1:2], R2_test)\n" - ], - "metadata": { - "id": "GwLL8pywIBBI" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "We observe both a similar MSE and $R^2$ relative to the better performing models in our previous results." - ], - "metadata": { - "id": "brC7ST6qInti" - } - }, - { - "cell_type": "code", - "source": [ - "h2o.shutdown(prompt = F) # shut down the h20 automatically without prompting user" - ], - "metadata": { - "id": "yoJihU54Ioxs" - }, - "execution_count": null, - "outputs": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "3.6.3" - }, - "papermill": { - "default_parameters": {}, - "duration": 90.376935, - "end_time": "2021-02-13T18:21:10.266455", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2021-02-13T18:19:39.889520", - "version": "2.2.2" - }, - "colab": { - "provenance": [] - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "_execution_state": "idle", + "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", + "id": "nAE4EexhWVGB", + "papermill": { + "duration": 0.036479, + "end_time": "2021-02-13T18:19:43.396666", + "exception": false, + "start_time": "2021-02-13T18:19:43.360187", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Machine Learning Estimators for Wage Prediction" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yppG1kQBWVGC", + "papermill": { + "duration": 0.036639, + "end_time": "2021-02-13T18:19:43.468425", + "exception": false, + "start_time": "2021-02-13T18:19:43.431786", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We illustrate how to predict an outcome variable Y in a high-dimensional setting, where the number of covariates $p$ is large in relation to the sample size $n$. So far we have used linear prediction rules, e.g. Lasso regression, for estimation.\n", + "Now, we also consider nonlinear prediction rules including tree-based methods." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ww70bLKfEsOb" + }, + "outputs": [], + "source": [ + "# Import relevant packages\n", + "\n", + "install.packages(\"xtable\")\n", + "install.packages(\"hdm\")\n", + "install.packages(\"glmnet\")\n", + "install.packages(\"randomForest\")\n", + "install.packages(\"rpart\")\n", + "install.packages(\"nnet\")\n", + "install.packages(\"gbm\")\n", + "install.packages(\"rpart.plot\")\n", + "install.packages(\"keras\")\n", + "\n", + "library(hdm)\n", + "library(xtable)\n", + "library(glmnet)\n", + "library(randomForest)\n", + "library(rpart)\n", + "library(nnet)\n", + "library(gbm)\n", + "library(rpart.plot)\n", + "library(keras)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bvL0TvgoWVGC", + "papermill": { + "duration": 0.034705, + "end_time": "2021-02-13T18:19:43.537814", + "exception": false, + "start_time": "2021-02-13T18:19:43.503109", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KcgFOEEUWVGD", + "papermill": { + "duration": 0.036082, + "end_time": "2021-02-13T18:19:43.609347", + "exception": false, + "start_time": "2021-02-13T18:19:43.573265", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Again, we consider data from the U.S. March Supplement of the Current Population Survey (CPS) in 2015.\n", + "The preprocessed sample consists of $5150$ never-married individuals." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vls0HahyWVGE", + "papermill": { + "duration": 0.279387, + "end_time": "2021-02-13T18:19:43.923823", + "exception": false, + "start_time": "2021-02-13T18:19:43.644436", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "file = \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv\"\n", + "data <- read.csv(file)\n", + "dim(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Nu76eiIbWVGG", + "papermill": { + "duration": 0.034902, + "end_time": "2021-02-13T18:19:43.994834", + "exception": false, + "start_time": "2021-02-13T18:19:43.959932", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "The outcomes $Y_i$'s are hourly (log) wages of never-married workers living in the U.S. The raw regressors $Z_i$'s consist of a variety of characteristics, including experience, education and industry and occupation indicators." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "G0M2NqjOWVGH", + "papermill": { + "duration": 0.091723, + "end_time": "2021-02-13T18:19:44.123394", + "exception": false, + "start_time": "2021-02-13T18:19:44.031671", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "Z <- subset(data,select=-c(lwage,wage)) # regressors\n", + "colnames(Z)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pa4XliHWWVGH", + "papermill": { + "duration": 0.037074, + "end_time": "2021-02-13T18:19:44.196749", + "exception": false, + "start_time": "2021-02-13T18:19:44.159675", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "The following figure shows the hourly wage distribution from the US survey data."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "H3nL_puaWVGI", + "papermill": { + "duration": 0.443391, + "end_time": "2021-02-13T18:19:44.677379", + "exception": false, + "start_time": "2021-02-13T18:19:44.233988", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "hist(data$wage, xlab= \"hourly wage\", main=\"Empirical wage distribution from the US survey data\", breaks= 35)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gHx9j6Z-WVGI", + "papermill": { + "duration": 0.036602, + "end_time": "2021-02-13T18:19:44.752465", + "exception": false, + "start_time": "2021-02-13T18:19:44.715863", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Wages show a high degree of skewness. Hence, wages are transformed in almost all studies by\n", + "the logarithm." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pp584t1QWVGI", + "papermill": { + "duration": 0.036009, + "end_time": "2021-02-13T18:19:44.826260", + "exception": false, + "start_time": "2021-02-13T18:19:44.790251", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uqvQSWp3WVGI", + "papermill": { + "duration": 0.036925, + "end_time": "2021-02-13T18:19:44.899159", + "exception": false, + "start_time": "2021-02-13T18:19:44.862234", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Due to the skewness of the data, we are considering log wages which leads to the following regression model\n", + "\n", + "$$log(wage) = g(Z) + \\epsilon.$$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7mjPGAzeWVGI", + "papermill": { + "duration": 0.036183, + "end_time": "2021-02-13T18:19:44.971528", + "exception": false, + "start_time": "2021-02-13T18:19:44.935345", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We will estimate the two sets of prediction rules: Linear and Nonlinear Models.\n", + "In linear models, we estimate the prediction rule of the form\n", + "\n", + "$$\\hat g(Z) = \\hat \\beta'X.$$\n", + "Again, we generate $X$ in two ways:\n", + "\n", + "1. Basic Model: $X$ consists of a set of raw regressors (e.g. gender, experience, education indicators, regional indicators).\n", + "\n", + "\n", + "2. Flexible Model: $X$ consists of all raw regressors from the basic model plus occupation and industry indicators, transformations (e.g., ${exp}^2$ and ${exp}^3$) and additional two-way interactions.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "soVOsFWVWVGJ", + "papermill": { + "duration": 0.037318, + "end_time": "2021-02-13T18:19:45.044959", + "exception": false, + "start_time": "2021-02-13T18:19:45.007641", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "To evaluate the out-of-sample performance, we split the data first." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "z93_1rRMWVGJ", + "papermill": { + "duration": 0.062188, + "end_time": "2021-02-13T18:19:45.143118", + "exception": false, + "start_time": "2021-02-13T18:19:45.080930", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "set.seed(1234)\n", + "training <- sample(nrow(data), nrow(data)*(3/4), replace=FALSE)\n", + "\n", + "data_train <- data[training,]\n", + "data_test <- data[-training,]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Fhb0MtAGWVGJ", + "papermill": { + "duration": 0.038774, + "end_time": "2021-02-13T18:19:45.217757", + "exception": false, + "start_time": "2021-02-13T18:19:45.178983", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We construct the two different model matrices $X_{basic}$ and $X_{flex}$ for both the training and the test sample:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4N38OGcwWVGJ", + "papermill": { + "duration": 0.094135, + "end_time": "2021-02-13T18:19:45.347955", + "exception": false, + "start_time": "2021-02-13T18:19:45.253820", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "X_basic <- \"sex + exp1 + shs + hsg+ scl + clg + mw + so + we + C(occ2)+ C(ind2)\"\n", + "X_flex <- \"sex + exp1 + shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)\"\n", + "formula_basic <- as.formula(paste(\"lwage\", \"~\", X_basic))\n", + "formula_flex <- as.formula(paste(\"lwage\", \"~\", X_flex))\n", + "\n", + "model_X_basic_train <- model.matrix(formula_basic,data_train)\n", + "model_X_basic_test <- model.matrix(formula_basic,data_test)\n", + "p_basic <- dim(model_X_basic_train)[2]\n", + "model_X_flex_train <- model.matrix(formula_flex,data_train)\n", + "model_X_flex_test <- model.matrix(formula_flex,data_test)\n", + "p_flex <- dim(model_X_flex_train)[2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iUbis0zFWVGJ", + "papermill": { + "duration": 0.060969, + "end_time": "2021-02-13T18:19:45.445389", + "exception": false, + "start_time": "2021-02-13T18:19:45.384420", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "Y_train <- data_train$lwage\n", + "Y_test <- data_test$lwage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pKWfxw3sWVGK", + "papermill": { + "duration": 0.062723, + "end_time": "2021-02-13T18:19:45.545189", + "exception": false, + "start_time": "2021-02-13T18:19:45.482466", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "p_basic\n", + "p_flex" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BFWIXx9KWVGK", + "papermill": { + "duration": 0.037704, + "end_time": "2021-02-13T18:19:45.622370", + "exception": false, + "start_time": "2021-02-13T18:19:45.584666", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "As known from our first lab, the basic model consists of $51$ regressors and the flexible model of $246$ regressors. Let us fit our models to the training sample using the two different model specifications. We are starting by running a simple ols regression." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FvwCARMSWVGK", + "papermill": { + "duration": 0.038763, + "end_time": "2021-02-13T18:19:45.699126", + "exception": false, + "start_time": "2021-02-13T18:19:45.660363", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### OLS" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TmXoHZ29WVGK", + "papermill": { + "duration": 0.039458, + "end_time": "2021-02-13T18:19:45.779460", + "exception": false, + "start_time": "2021-02-13T18:19:45.740002", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We fit the basic model to our training data by running an ols regression and compute the mean squared error on the test sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dKpIzgYhWVGK", + "papermill": { + "duration": 0.069537, + "end_time": "2021-02-13T18:19:45.887169", + "exception": false, + "start_time": "2021-02-13T18:19:45.817632", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# ols (basic model)\n", + "fit.lm.basic <- lm(formula_basic, data_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oVxJtcLqWVGK", + "papermill": { + "duration": 0.074423, + "end_time": "2021-02-13T18:19:45.999870", + "exception": false, + "start_time": "2021-02-13T18:19:45.925447", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Compute the Out-Of-Sample Performance\n", + "yhat.lm.basic <- predict(fit.lm.basic, newdata=data_test)\n", + "cat(\"The mean squared error (MSE) using the basic model is equal to\" , mean((Y_test-yhat.lm.basic)^2)) # MSE OLS (basic model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "K3pW0712WVGK", + "papermill": { + "duration": 0.052764, + "end_time": "2021-02-13T18:19:46.122829", + "exception": false, + "start_time": "2021-02-13T18:19:46.070065", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "To determine the out-of-sample $MSE$ and the standard error in one step, we can use the function *lm*:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQ8T93iCWVGL", + "papermill": { + "duration": 0.076484, + "end_time": "2021-02-13T18:19:46.239015", + "exception": false, + "start_time": "2021-02-13T18:19:46.162531", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "MSE.lm.basic <- summary(lm((Y_test-yhat.lm.basic)^2~1))$coef[1:2]\n", + "MSE.lm.basic" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xxRMlMvoWVGL", + "papermill": { + "duration": 0.039088, + "end_time": "2021-02-13T18:19:46.317915", + "exception": false, + "start_time": "2021-02-13T18:19:46.278827", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We also compute the out-of-sample $R^2$:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wOGfwTlwWVGL", + "papermill": { + "duration": 0.057098, + "end_time": "2021-02-13T18:19:46.413754", + "exception": false, + "start_time": "2021-02-13T18:19:46.356656", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "R2.lm.basic <- 1-MSE.lm.basic[1]/var(Y_test)\n", + "cat(\"The R^2 using the basic model is equal to\",R2.lm.basic) # MSE OLS (basic model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7rsIX-qEWVGL", + "papermill": { + "duration": 0.039585, + "end_time": "2021-02-13T18:19:46.492903", + "exception": false, + 
"start_time": "2021-02-13T18:19:46.453318", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We repeat the same procedure for the flexible model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lCRbc_HDWVGL", + "papermill": { + "duration": 0.198636, + "end_time": "2021-02-13T18:19:46.730717", + "exception": false, + "start_time": "2021-02-13T18:19:46.532081", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# ols (flexible model)\n", + "fit.lm.flex <- lm(formula_flex, data_train)\n", + "# Compute the Out-Of-Sample Performance\n", + "options(warn=-1)\n", + "yhat.lm.flex <- predict(fit.lm.flex, newdata=data_test)\n", + "MSE.lm.flex <- summary(lm((Y_test-yhat.lm.flex)^2~1))$coef[1:2]\n", + "R2.lm.flex <- 1-MSE.lm.flex[1]/var(Y_test)\n", + "cat(\"The R^2 using the flexible model is equal to\",R2.lm.flex) # MSE OLS (flexible model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1DKdOWI8WVGL", + "papermill": { + "duration": 0.051953, + "end_time": "2021-02-13T18:19:46.853182", + "exception": false, + "start_time": "2021-02-13T18:19:46.801229", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We observe that ols regression works better for the basic model with smaller $p/n$ ratio. We now proceed by running lasso regressions and related penalized methods." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-zfmqjNVWVGM", + "papermill": { + "duration": 0.042521, + "end_time": "2021-02-13T18:19:46.935859", + "exception": false, + "start_time": "2021-02-13T18:19:46.893338", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### Lasso, Ridge and Elastic Net\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9fvu80MpWVGM", + "papermill": { + "duration": 0.040161, + "end_time": "2021-02-13T18:19:47.015626", + "exception": false, + "start_time": "2021-02-13T18:19:46.975465", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Considering the basic model, we run a lasso/post-lasso regression first and then we compute the measures for the out-of-sample performance. Note that applying the package *hdm* and the function *rlasso* we rely on a theoretical based choice of the penalty level $\\lambda$ in the lasso regression." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OFhArnPNWVGM", + "papermill": { + "duration": 0.577781, + "end_time": "2021-02-13T18:19:47.634269", + "exception": false, + "start_time": "2021-02-13T18:19:47.056488", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# lasso and variants\n", + "fit.rlasso <- rlasso(formula_basic, data_train, post=FALSE)\n", + "fit.rlasso.post <- rlasso(formula_basic, data_train, post=TRUE)\n", + "yhat.rlasso <- predict(fit.rlasso, newdata=data_test)\n", + "yhat.rlasso.post <- predict(fit.rlasso.post, newdata=data_test)\n", + "\n", + "MSE.lasso <- summary(lm((Y_test-yhat.rlasso)^2~1))$coef[1:2]\n", + "MSE.lasso.post <- summary(lm((Y_test-yhat.rlasso.post)^2~1))$coef[1:2]\n", + "\n", + "R2.lasso <- 1-MSE.lasso[1]/var(Y_test)\n", + "R2.lasso.post <- 1-MSE.lasso.post[1]/var(Y_test)\n", + "cat(\"The R^2 using the basic model is equal to\",R2.lasso,\"for lasso and\",R2.lasso.post,\"for post-lasso\") # R^2 lasso/post-lasso (basic model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NRUPJtOzWVGM", + "papermill": { + "duration": 0.049543, + "end_time": "2021-02-13T18:19:47.757271", + "exception": false, + "start_time": "2021-02-13T18:19:47.707728", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Now, we repeat the same procedure for the flexible model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "17doIQ14WVGM", + "papermill": { + "duration": 3.430649, + "end_time": "2021-02-13T18:19:51.229007", + "exception": false, + "start_time": "2021-02-13T18:19:47.798358", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "fit.rlasso.flex <- rlasso(formula_flex, data_train, post=FALSE)\n", + "fit.rlasso.post.flex <- rlasso(formula_flex, data_train, post=TRUE)\n", + "yhat.rlasso.flex <- predict(fit.rlasso.flex, newdata=data_test)\n", + "yhat.rlasso.post.flex <- predict(fit.rlasso.post.flex, newdata=data_test)\n", + "\n", + "MSE.lasso.flex <- summary(lm((Y_test-yhat.rlasso.flex)^2~1))$coef[1:2]\n", + "MSE.lasso.post.flex <- summary(lm((Y_test-yhat.rlasso.post.flex)^2~1))$coef[1:2]\n", + "\n", + "R2.lasso.flex <- 1-MSE.lasso.flex[1]/var(Y_test)\n", + "R2.lasso.post.flex <- 1-MSE.lasso.post.flex[1]/var(Y_test)\n", + "cat(\"The R^2 using the flexible model is equal to\",R2.lasso.flex,\"for lasso and\",R2.lasso.post.flex,\"for post-lasso\") # R^2 lasso/post-lasso (flexible model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i9xxyyM-WVGN", + "papermill": { + "duration": 0.041452, + "end_time": "2021-02-13T18:19:51.436401", + "exception": false, + "start_time": "2021-02-13T18:19:51.394949", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "In contrast to a theoretical based choice of the tuning parameter $\\lambda$ in the lasso regression, we can also use cross-validation to determine the penalty level by applying the package *glmnet* and the function cv.glmnet. In this context, we also run a ridge and a elastic net regression by adjusting the parameter *alpha*." 
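For reference, the role of *alpha* in *glmnet* can be read off its standard penalized objective (stated here in the package's usual parametrization),

$$ \min_{\beta_0, \beta} \; \frac{1}{2n} \sum_{i=1}^n \big(y_i - \beta_0 - x_i'\beta\big)^2 + \lambda \left[ \frac{1-\alpha}{2} \|\beta\|_2^2 + \alpha \|\beta\|_1 \right], $$

so *alpha* = 1 corresponds to the lasso, *alpha* = 0 to ridge, and *alpha* = 0.5 to an elastic net that mixes both penalties.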
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JfxqhppCWVGT", + "papermill": { + "duration": 2.248453, + "end_time": "2021-02-13T18:19:53.725885", + "exception": false, + "start_time": "2021-02-13T18:19:51.477432", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "fit.lasso.cv <- cv.glmnet(model_X_basic_train, Y_train, family=\"gaussian\", alpha=1)\n", + "fit.ridge <- cv.glmnet(model_X_basic_train, Y_train, family=\"gaussian\", alpha=0)\n", + "fit.elnet <- cv.glmnet(model_X_basic_train, Y_train, family=\"gaussian\", alpha=.5)\n", + "\n", + "yhat.lasso.cv <- predict(fit.lasso.cv, newx = model_X_basic_test)\n", + "yhat.ridge <- predict(fit.ridge, newx = model_X_basic_test)\n", + "yhat.elnet <- predict(fit.elnet, newx = model_X_basic_test)\n", + "\n", + "MSE.lasso.cv <- summary(lm((Y_test-yhat.lasso.cv)^2~1))$coef[1:2]\n", + "MSE.ridge <- summary(lm((Y_test-yhat.ridge)^2~1))$coef[1:2]\n", + "MSE.elnet <- summary(lm((Y_test-yhat.elnet)^2~1))$coef[1:2]\n", + "\n", + "R2.lasso.cv <- 1-MSE.lasso.cv[1]/var(Y_test)\n", + "R2.ridge <- 1-MSE.ridge[1]/var(Y_test)\n", + "R2.elnet <- 1-MSE.elnet[1]/var(Y_test)\n", + "\n", + "# R^2 using cross-validation (basic model)\n", + "cat(\"R^2 using cross-validation for lasso, ridge and elastic net in the basic model:\",R2.lasso.cv,R2.ridge,R2.elnet)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dv2mcitqWVGT", + "papermill": { + "duration": 0.042613, + "end_time": "2021-02-13T18:19:53.812553", + "exception": false, + "start_time": "2021-02-13T18:19:53.769940", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Note that the following calculations for the flexible model need some computation time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GxOMyb3LWVGU", + "papermill": { + "duration": 13.588391, + "end_time": "2021-02-13T18:20:07.443188", + "exception": false, + "start_time": "2021-02-13T18:19:53.854797", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "fit.lasso.cv.flex <- cv.glmnet(model_X_flex_train, Y_train, family=\"gaussian\", alpha=1)\n", + "fit.ridge.flex <- cv.glmnet(model_X_flex_train, Y_train, family=\"gaussian\", alpha=0)\n", + "fit.elnet.flex <- cv.glmnet(model_X_flex_train, Y_train, family=\"gaussian\", alpha=.5)\n", + "\n", + "yhat.lasso.cv.flex <- predict(fit.lasso.cv.flex , newx = model_X_flex_test)\n", + "yhat.ridge.flex <- predict(fit.ridge.flex , newx = model_X_flex_test)\n", + "yhat.elnet.flex <- predict(fit.elnet.flex , newx = model_X_flex_test)\n", + "\n", + "MSE.lasso.cv.flex <- summary(lm((Y_test-yhat.lasso.cv.flex )^2~1))$coef[1:2]\n", + "MSE.ridge.flex <- summary(lm((Y_test-yhat.ridge.flex )^2~1))$coef[1:2]\n", + "MSE.elnet.flex <- summary(lm((Y_test-yhat.elnet.flex )^2~1))$coef[1:2]\n", + "\n", + "R2.lasso.cv.flex <- 1-MSE.lasso.cv.flex [1]/var(Y_test)\n", + "R2.ridge.flex <- 1-MSE.ridge.flex [1]/var(Y_test)\n", + "R2.elnet.flex <- 1-MSE.elnet.flex [1]/var(Y_test)\n", + "\n", + "# R^2 using cross-validation (flexible model)\n", + "cat(\"R^2 using cross-validation for lasso, ridge and elastic net in the flexible model:\",R2.lasso.cv.flex,R2.ridge.flex,R2.elnet.flex)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QFat4RN3WVGU", + "papermill": { + "duration": 0.04263, + "end_time": "2021-02-13T18:20:07.529566", + "exception": false, + "start_time": "2021-02-13T18:20:07.486936", + "status": "completed" + }, + "tags": [] + }, + "source": [ + 
"The performance of the lasso regression with cross-validated penalty is quite similar to the performance of lasso using a theoretical based choice of the tuning parameter." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "j6tbPExhWVGU", + "papermill": { + "duration": 0.042859, + "end_time": "2021-02-13T18:20:07.614751", + "exception": false, + "start_time": "2021-02-13T18:20:07.571892", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#Non-linear models" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OtyGNYtFWVGU", + "papermill": { + "duration": 0.042125, + "end_time": "2021-02-13T18:20:07.699092", + "exception": false, + "start_time": "2021-02-13T18:20:07.656967", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Besides linear regression models, we consider nonlinear regression models to build a predictive model. We are applying regression trees, random forests, boosted trees and neural nets to estimate the regression function $g(X)$." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c2NCnTfZWVGV", + "papermill": { + "duration": 0.043267, + "end_time": "2021-02-13T18:20:08.261600", + "exception": false, + "start_time": "2021-02-13T18:20:08.218333", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Regression Trees" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ShRO5WNpWVGV", + "papermill": { + "duration": 0.043445, + "end_time": "2021-02-13T18:20:08.348402", + "exception": false, + "start_time": "2021-02-13T18:20:08.304957", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We fit a regression tree to the training data using the basic model. The variable *cp* controls the complexity of the regression tree, i.e. how deep we build the tree." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3n80mXGeWVGV", + "papermill": { + "duration": 1.843972, + "end_time": "2021-02-13T18:20:10.235503", + "exception": false, + "start_time": "2021-02-13T18:20:08.391531", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# tree\n", + "fit.trees <- rpart(formula_basic, data_train, minbucket=5, cp = 0.001)\n", + "prp(fit.trees, leaf.round=1, space=2, yspace=2, split.space=2,shadow.col = \"gray\",trace = 1) # plotting the tree" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KrqAEnpvWVGW", + "papermill": { + "duration": 0.046456, + "end_time": "2021-02-13T18:20:10.328795", + "exception": false, + "start_time": "2021-02-13T18:20:10.282339", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "An important method to improve predictive performance is called \"Pruning the Tree\". This\n", + "means the process of cutting down the branches of a tree. We apply pruning to the complex tree above to reduce the depth. Initially, we determine the optimal complexity of the regression tree." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vCrBEct1WVGW", + "papermill": { + "duration": 0.070106, + "end_time": "2021-02-13T18:20:10.445828", + "exception": false, + "start_time": "2021-02-13T18:20:10.375722", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "bestcp <- fit.trees$cptable[which.min(fit.trees$cptable[,\"xerror\"]),\"CP\"]\n", + "bestcp" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bTEqTX1ZWVGW", + "papermill": { + "duration": 0.047157, + "end_time": "2021-02-13T18:20:10.540327", + "exception": false, + "start_time": "2021-02-13T18:20:10.493170", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Now, we can prune the tree and visualize the prediction rule." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "a9Sirh4_WVGW", + "papermill": { + "duration": 0.543483, + "end_time": "2021-02-13T18:20:11.131455", + "exception": false, + "start_time": "2021-02-13T18:20:10.587972", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "fit.prunedtree <- prune(fit.trees,cp=bestcp)\n", + "prp(fit.prunedtree,leaf.round=1, space=3, yspace=3, split.space=7, shadow.col = \"gray\",trace = 1,yesno=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2oDS05mgWVGW", + "papermill": { + "duration": 0.04994, + "end_time": "2021-02-13T18:20:11.334448", + "exception": false, + "start_time": "2021-02-13T18:20:11.284508", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Finally, we calculate the mean-squared error and the $R^2$ on the test sample to evaluate the out-of-sample performance of the pruned tree." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "AvXxs5R3WVGW", + "papermill": { + "duration": 0.079534, + "end_time": "2021-02-13T18:20:11.463701", + "exception": false, + "start_time": "2021-02-13T18:20:11.384167", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "yhat.pt <- predict(fit.prunedtree,newdata=data_test)\n", + "MSE.pt <- summary(lm((Y_test-yhat.pt)^2~1))$coef[1:2]\n", + "R2.pt <- 1-MSE.pt[1]/var(Y_test)\n", + "\n", + "# R^2 of the pruned tree\n", + "cat(\"R^2 of the pruned tree:\",R2.pt)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "03cG01y1WVGX", + "papermill": { + "duration": 0.052287, + "end_time": "2021-02-13T18:20:11.566330", + "exception": false, + "start_time": "2021-02-13T18:20:11.514043", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Random Forest and Boosted Trees" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7nRCkm1WWVGX", + "papermill": { + "duration": 0.050794, + "end_time": "2021-02-13T18:20:11.667980", + "exception": false, + "start_time": "2021-02-13T18:20:11.617186", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "In the next step, we apply the more advanced tree-based methods random forest and boosted trees." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZbLiUr0Lh4Le" + }, + "outputs": [], + "source": [ + "# random forest\n", + "fit.rf <- randomForest(model_X_basic_train, Y_train, ntree=2000, nodesize=20, data = data_train)\n", + "\n", + "## Evaluating the method\n", + "yhat.rf <- predict(fit.rf, newdata=model_X_basic_test) # prediction\n", + "\n", + "MSE.rf = summary(lm((Y_test-yhat.rf)^2~1))$coef[1:2]\n", + "R2.rf <- 1-MSE.rf[1]/var(Y_test)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "p1Q7NNZaWVGX", + "papermill": { + "duration": 56.677891, + "end_time": "2021-02-13T18:21:08.396363", + "exception": false, + "start_time": "2021-02-13T18:20:11.718472", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# boosting\n", + "fit.boost <- gbm(formula_basic, data=data_train, distribution= \"gaussian\", bag.fraction = .5, interaction.depth=2, n.trees=1000, shrinkage=.01)\n", + "best.boost <- gbm.perf(fit.boost, plot.it = FALSE) # cross-validation to determine when to stop\n", + "\n", + "## Evaluating the method\n", + "yhat.boost <- predict(fit.boost, newdata=data_test, n.trees=best.boost)\n", + "\n", + "MSE.boost = summary(lm((Y_test-yhat.boost)^2~1))$coef[1:2]\n", + "R2.boost <- 1-MSE.boost[1]/var(Y_test)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WkzBr2OOi9GC" + }, + "outputs": [], + "source": [ + "# printing R^2\n", + "cat(\"R^2 of the random forest and boosted trees:\", R2.rf, R2.boost)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tyLLLoTCunpl" + }, + "source": [ + "## NNets\n", + "\n", + "First, we need to determine the structure of our network. We are using the R package *keras* to build a simple sequential neural network with three dense layers -- 2 hidden and one output layer." 
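One practical caveat before the network is defined in the next cell: neural nets are sensitive to the scale of their inputs, and here the raw design matrix is passed to keras directly. A minimal sketch of an optional standardization step (not part of this notebook's pipeline; training-set means and standard deviations are reused on the test set):

```r
# Optional sketch: standardize the design matrix columns using training-set statistics.
X_mean <- colMeans(model_X_basic_train)
X_sd <- apply(model_X_basic_train, 2, sd)
X_sd[X_sd == 0] <- 1  # leave constant columns (e.g. the intercept) unscaled
X_basic_train_scaled <- scale(model_X_basic_train, center = X_mean, scale = X_sd)
X_basic_test_scaled <- scale(model_X_basic_test, center = X_mean, scale = X_sd)
```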
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "hKNFcGgwt3gm" + }, + "outputs": [], + "source": [ + "# Define the neural network architecture\n", + "model <- keras_model_sequential() %>%\n", + " layer_dense(units = 50, activation = 'relu', input_shape = dim(model_X_basic_train)[2]) %>%\n", + " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dense(units = 1) # Output layer with 1 unit for regression task\n", + "\n", + "# Compile the model\n", + "model %>% compile(\n", + " optimizer = optimizer_adam(lr = 0.01),\n", + " loss = \"mse\",\n", + " metrics = c(\"mae\"),\n", + ")\n", + "\n", + "summary(model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c3guqZeeyDd3" + }, + "outputs": [], + "source": [ + "num_epochs <- 100\n", + "\n", + "# Define early stopping based on validation set (20%) performance\n", + "early_stopping <- callback_early_stopping(monitor = 'val_loss', patience = 5) # Patience set to 5 epochs (default in skorch is 5)\n", + "\n", + "# Train the model\n", + "model %>% fit(\n", + " model_X_basic_train, Y_train,\n", + " epochs = num_epochs,\n", + " batch_size = 10,\n", + " validation_split = 0.2, # 20% validation set\n", + " verbose = 0,\n", + " callbacks = list(early_stopping)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oFRmau4lzDoa" + }, + "outputs": [], + "source": [ + "# evaluating the performance\n", + "model %>% evaluate(model_X_basic_test, Y_test, verbose = 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UZP6ytgUzAlz" + }, + "outputs": [], + "source": [ + "# Calculating the performance measures\n", + "yhat.nn <- model %>% predict(model_X_basic_test)\n", + "MSE.nn = summary(lm((Y_test-yhat.nn)^2~1))$coef[1:2]\n", + "R2.nn <- 1-MSE.nn[1]/var(Y_test)\n", + "# printing R^2\n", + "cat(\"R^2 of the neural network:\",R2.nn)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2KLGpmUTWVGX", + "papermill": { + "duration": 0.051225, + "end_time": "2021-02-13T18:21:08.500313", + "exception": false, + "start_time": "2021-02-13T18:21:08.449088", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "To conclude, let us have a look at our results." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rz0O_d-qWVGX", + "papermill": { + "duration": 0.052403, + "end_time": "2021-02-13T18:21:08.603976", + "exception": false, + "start_time": "2021-02-13T18:21:08.551573", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gtM59T07WVGY", + "papermill": { + "duration": 0.167847, + "end_time": "2021-02-13T18:21:08.823485", + "exception": false, + "start_time": "2021-02-13T18:21:08.655638", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "table<- matrix(0, 16, 3)\n", + "table[1,1:2] <- MSE.lm.basic\n", + "table[2,1:2] <- MSE.lm.flex\n", + "table[3,1:2] <- MSE.lasso\n", + "table[4,1:2] <- MSE.lasso.post\n", + "table[5,1:2] <- MSE.lasso.flex\n", + "table[6,1:2] <- MSE.lasso.post.flex\n", + "table[7,1:2] <- MSE.lasso.cv\n", + "table[8,1:2] <- MSE.ridge\n", + "table[9,1:2] <- MSE.elnet\n", + "table[10,1:2] <- MSE.lasso.cv.flex\n", + "table[11,1:2] <- MSE.ridge.flex\n", + "table[12,1:2] <- MSE.elnet.flex\n", + "table[13,1:2] <- MSE.rf\n", + "table[14,1:2] <- MSE.boost\n", + "table[15,1:2] <- MSE.pt\n", + "table[16,1:2] <- MSE.nn\n", + "\n", + "\n", + "\n", + "table[1,3] <- R2.lm.basic\n", + "table[2,3] <- R2.lm.flex\n", + "table[3,3] <- R2.lasso\n", + "table[4,3] <- R2.lasso.post\n", + "table[5,3] <- R2.lasso.flex\n", + "table[6,3] <- R2.lasso.post.flex\n", + "table[7,3] <- R2.lasso.cv\n", + "table[8,3] <- R2.ridge\n", + "table[9,3] <- R2.elnet\n", + "table[10,3] <- R2.lasso.cv.flex\n", + "table[11,3] <- R2.ridge.flex\n", + "table[12,3] <- R2.elnet.flex\n", + "table[13,3] <- R2.rf\n", + "table[14,3] <- R2.boost\n", + "table[15,3] <- R2.pt\n", + "table[16,3] <- R2.nn\n", + "\n", + "\n", + "\n", + "\n", + "colnames(table)<- c(\"MSE\", \"S.E. for MSE\", \"R-squared\")\n", + "rownames(table)<- c(\"Least Squares (basic)\",\"Least Squares (flexible)\", \"Lasso\", \"Post-Lasso\",\"Lasso (flexible)\",\"Post-Lasso (flexible)\",\n", + " \"Cross-Validated lasso\", \"Cross-Validated ridge\",\"Cross-Validated elnet\",\"Cross-Validated lasso (flexible)\",\"Cross-Validated ridge (flexible)\",\"Cross-Validated elnet (flexible)\",\n", + " \"Random Forest\",\"Boosted Trees\", \"Pruned Tree\", \"Neural Net (Early)\")\n", + "tab <- xtable(table, digits =3)\n", + "print(tab,type=\"latex\") # set type=\"latex\" for printing table in LaTeX\n", + "tab" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-A-sNNZmWVGY", + "papermill": { + "duration": 0.052897, + "end_time": "2021-02-13T18:21:08.930888", + "exception": false, + "start_time": "2021-02-13T18:21:08.877991", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Above, we displayed the results for a single split of data into the training and testing part. The table shows the test MSE in column 1 as well as the standard error in column 2 and the test $R^2$\n", + "in column 3. We see that most models perform similarly. For most of these methods, test MSEs are within one standard error of each other. Remarkably, OLS with just the basic variables performs extremely well. However, OLS on a flexible model with many regressors performs very poorly giving nearly the highest test MSE. It is worth noticing that, as this is just a simple illustration meant to be relatively quick, the nonlinear models are not tuned. Thus, there is potential to improve the performance of the nonlinear methods we used in the analysis." 
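To make the last point concrete, here is a minimal sketch of how one of the untuned nonlinear learners could be tuned, using the random forest's out-of-bag (OOB) error over a small grid of *mtry* values. The grid and the reduced number of trees are illustrative assumptions, not part of the original analysis:

```r
# Sketch: pick mtry by out-of-bag MSE (illustrative grid and ntree; not run in the notebook)
mtry_grid <- c(5, 10, 20, 30)
oob_mse <- sapply(mtry_grid, function(m) {
  rf <- randomForest(model_X_basic_train, Y_train, ntree = 500, mtry = m, nodesize = 20)
  tail(rf$mse, 1)  # OOB mean squared error after the last tree
})
mtry_grid[which.min(oob_mse)]
```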
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "G7vGZPhPWVGY", + "papermill": { + "duration": 0.052594, + "end_time": "2021-02-13T18:21:09.036009", + "exception": false, + "start_time": "2021-02-13T18:21:08.983415", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### Ensemble learning" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-py__sTwWVGY", + "papermill": { + "duration": 0.053134, + "end_time": "2021-02-13T18:21:09.146558", + "exception": false, + "start_time": "2021-02-13T18:21:09.093424", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "In the final step, we can build a prediction model by combining the strengths of the models we considered so far. This ensemble method is of the form\n", + "\t$$ f(x) = \\sum_{k=1}^K \\alpha_k f_k(x) $$\n", + "where the $f_k$'s denote our prediction rules from the table above and the $\\alpha_k$'s are the corresponding weights.\n", + "\n", + "We first estimate the weights without penalization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "amnVg2qMWVGY", + "papermill": { + "duration": 0.079851, + "end_time": "2021-02-13T18:21:09.388686", + "exception": false, + "start_time": "2021-02-13T18:21:09.308835", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ensemble.ols <- summary(lm(Y_test~ yhat.lm.basic + yhat.lm.flex + yhat.rlasso + yhat.rlasso.flex + yhat.rlasso.post + yhat.rlasso.post.flex + yhat.lasso.cv + yhat.lasso.cv.flex + yhat.ridge + yhat.ridge.flex + yhat.elnet + yhat.elnet.flex + yhat.pt + yhat.rf + yhat.boost + yhat.nn))\n", + "ensemble.ols\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9dNbETxxWVGY", + "papermill": { + "duration": 0.054822, + "end_time": "2021-02-13T18:21:09.498067", + "exception": false, + "start_time": "2021-02-13T18:21:09.443245", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Alternatively, we can determine the weights via lasso regression." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tUNMMhdVWVGZ", + "papermill": { + "duration": 0.175196, + "end_time": "2021-02-13T18:21:09.727077", + "exception": false, + "start_time": "2021-02-13T18:21:09.551881", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ensemble.lasso <- summary(rlasso(Y_test~ yhat.lm.basic + yhat.lm.flex + yhat.rlasso + yhat.rlasso.flex + yhat.rlasso.post + yhat.rlasso.post.flex + yhat.lasso.cv + yhat.lasso.cv.flex + yhat.ridge + yhat.ridge.flex + yhat.elnet + yhat.elnet.flex + yhat.pt + yhat.rf + yhat.boost + yhat.nn))\n", + "ensemble.lasso" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Xv9P__WOWVGZ", + "papermill": { + "duration": 0.055874, + "end_time": "2021-02-13T18:21:09.838636", + "exception": false, + "start_time": "2021-02-13T18:21:09.782762", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "The estimated weights are shown in the following table."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tHp5cn8rWVGZ", + "papermill": { + "duration": 0.094431, + "end_time": "2021-02-13T18:21:09.988946", + "exception": false, + "start_time": "2021-02-13T18:21:09.894515", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "table<- matrix(0, 17, 2)\n", + "table[1:17,1] <- ensemble.ols$coef[1:17]\n", + "table[1:17,2] <- ensemble.lasso$coef[1:17]\n", + "\n", + "\n", + "colnames(table)<- c(\"Weight OLS\", \"Weight Lasso\")\n", + "\n", + "\n", + "rownames(table)<- c(\"Constant\",\"Least Squares (basic)\", \"Least Squares (flexible)\", \"Lasso (basic)\",\n", + " \"Lasso (flexible)\", \"Post-Lasso (basic)\", \"Post-Lasso (flexible)\", \"LassoCV (basic)\",\n", + " \"Lasso CV (flexible)\", \"Ridge CV (basic)\", \"Ridge CV (flexible)\", \"ElNet CV (basic)\",\n", + " \"ElNet CV (flexible)\", \"Pruned Tree\", \"Random Forest\",\"Boosted Trees\", \"Neural Net\")\n", + "tab <- xtable(table, digits =3)\n", + "print(tab,type=\"latex\") # set type=\"latex\" for printing table in LaTeX\n", + "tab\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ac-Nynd9WVGZ", + "papermill": { + "duration": 0.056002, + "end_time": "2021-02-13T18:21:10.101284", + "exception": false, + "start_time": "2021-02-13T18:21:10.045282", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We note the superior $R^2$ performance of the ensembles. Though for more unbiased performance evaluation, we should have left out a third sample to validate the performance of the stacked model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pcyQsL5xmKxR" + }, + "outputs": [], + "source": [ + "# print ensemble R^2\n", + "cat(\"R^2 of stacking with LS weights:\",ensemble.ols$adj.r.squared,\"\\n\")\n", + "cat(\"R^2 of stacking with Lasso weights:\",ensemble.lasso$adj.r.squared,\"\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0G7F6n_2ELZJ" + }, + "source": [ + "# Automatic Machine Learning with H20 AutoML\n", + "\n", + "We illustrate how to predict an outcome variable Y in a high-dimensional setting, using the AutoML package *H2O* that covers the complete pipeline from the raw dataset to the deployable machine learning model. In last few years, AutoML or automated machine learning has become widely popular among data science community. Again, we reanalyze the wage prediction problem using data from the U.S. March Supplement of the Current Population Survey (CPS) in 2015." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "E7yQsmgqEPsz" + }, + "source": [ + "We can use AutoML as a benchmark and compare it to the methods that we used previously where we applied one machine learning method after the other." 
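Before moving on to AutoML, one way to make the ensemble $f(x) = \sum_k \hat\alpha_k f_k(x)$ concrete is to assemble the stacked prediction explicitly from the estimated OLS weights and check its fit on the test sample. This is a minimal sketch reusing objects already in memory (`ensemble.ols`, `Y_test`, and the `yhat.*` prediction vectors); note it evaluates the stack on the same test data used to estimate the weights, which is exactly the optimism the third-sample caveat above refers to.

```r
# Sketch: form the stacked prediction from the estimated OLS weights.
# Column order must match the regressor order used when fitting ensemble.ols above.
w <- ensemble.ols$coef[, 1]  # intercept followed by the 16 model weights
pred.mat <- cbind(1, yhat.lm.basic, yhat.lm.flex, yhat.rlasso, yhat.rlasso.flex,
                  yhat.rlasso.post, yhat.rlasso.post.flex, yhat.lasso.cv, yhat.lasso.cv.flex,
                  yhat.ridge, yhat.ridge.flex, yhat.elnet, yhat.elnet.flex,
                  yhat.pt, yhat.rf, yhat.boost, yhat.nn)
yhat.stack <- drop(pred.mat %*% w)

# MSE and R^2 of the stacked predictor on the test sample used for stacking
mse.stack <- mean((Y_test - yhat.stack)^2)
c(MSE = mse.stack, R2 = 1 - mse.stack / var(Y_test))
```

The sketch assumes all 17 coefficients were estimated; if `lm` dropped a weight because of collinearity among the predictions, `ensemble.ols$coef` has fewer rows and the columns of `pred.mat` would need to be subset accordingly.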
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NPz9qeg2EPAN" + }, + "outputs": [], + "source": [ + "# load the H2O package\n", + "install.packages(\"h2o\")\n", + "library(h2o)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fxz49VSXEZDC" + }, + "outputs": [], + "source": [ + "# start h2o cluster\n", + "h2o.init()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "orzSZz_eEnWg" + }, + "outputs": [], + "source": [ + "# convert data as h2o type\n", + "train_h = as.h2o(data_train)\n", + "test_h = as.h2o(data_test)\n", + "\n", + "# have a look at the data\n", + "h2o.describe(train_h)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5PohiG13EqTn" + }, + "outputs": [], + "source": [ + "y_name = 'lwage'\n", + "X_names = setdiff(names(data), c('lwage','wage','occ', 'ind'))\n", + "\n", + "# run AutoML for 10 base models and a maximal runtime of 100 seconds\n", + "aml = h2o.automl(x=X_names, y=y_name,\n", + " training_frame = train_h,\n", + " leaderboard_frame = test_h,\n", + " max_models = 10,\n", + " seed = 1,\n", + " max_runtime_secs = 100\n", + " )\n", + "# AutoML Leaderboard\n", + "lb = aml@leaderboard\n", + "print(lb, n = nrow(lb))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b6-CATAVHQtV" + }, + "source": [ + "We see that two Stacked Ensembles are at the top of the leaderboard. Stacked Ensembles often outperform a single model. The out-of-sample (test) MSE of the leading model is given by" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fIBhP8LSGpA6" + }, + "outputs": [], + "source": [ + "aml@leaderboard$mse[1]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BikxGH4kHWDh" + }, + "source": [ + "The in-sample performance can be evaluated by" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MDYChZcXHVgf" + }, + "outputs": [], + "source": [ + "aml@leader" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EEXG6snKHaXY" + }, + "source": [ + "This is in line with our previous results. To understand how the ensemble works, let's take a peek inside the Stacked Ensemble \"All Models\" model. The \"All Models\" ensemble is an ensemble of all of the individual models in the AutoML run. This is often the top performing model on the leaderboard." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4mnpHT3wHYq9" + }, + "outputs": [], + "source": [ + "model_ids <- as.data.frame(aml@leaderboard$model_id)[,1]\n", + "# Get the \"All Models\" Stacked Ensemble model\n", + "se <- h2o.getModel(grep(\"StackedEnsemble_AllModels\", model_ids, value = TRUE)[1])\n", + "# Get the Stacked Ensemble metalearner model\n", + "metalearner <- se@model$metalearner_model\n", + "h2o.varimp(metalearner)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6m1zg2e-HhQV" + }, + "source": [ + "The table above gives us the variable importance of the metalearner in the ensemble. 
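As a side check, the metalearner is an H2O model in its own right, so its coefficients can be pulled directly. A minimal sketch, assuming the `metalearner` object from the previous cell; the calls below apply when the metalearner is a GLM, which is the AutoML default as noted next:

```r
# Sketch: inspect the metalearner's coefficients directly (GLM metalearners only).
h2o.coef(metalearner)       # coefficients on the original scale
h2o.coef_norm(metalearner)  # standardized coefficients, comparable to the importances above
```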
The AutoML Stacked Ensembles use the default metalearner algorithm (GLM with non-negative weights), so the variable importance of the metalearner is actually the standardized coefficient magnitudes of the GLM.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "J6azOyskHcMu" + }, + "outputs": [], + "source": [ + "h2o.varimp_plot(metalearner)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QDyCj4RdH-PK" + }, + "source": [ + "## Generating Predictions Using Leader Model\n", + "\n", + "We can also generate predictions on a test sample using the leader model object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c6mkVpADH-hB" + }, + "outputs": [], + "source": [ + "pred <- as.matrix(h2o.predict(aml@leader,test_h)) # make prediction using x data from the test sample\n", + "head(pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GwLL8pywIBBI" + }, + "outputs": [], + "source": [ + "y_test <- as.matrix(test_h$lwage)\n", + "R2_test <- 1-summary(lm((y_test-pred)^2~1))$coef[1]/var(y_test)\n", + "cat(\"MSE, SE, R^2:\" , summary(lm((y_test-pred)^2~1))$coef[1:2], R2_test)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "brC7ST6qInti" + }, + "source": [ + "We observe both a similar MSE and $R^2$ relative to the better performing models in our previous results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yoJihU54Ioxs" + }, + "outputs": [], + "source": [ + "h2o.shutdown(prompt = F) # shut down the h20 automatically without prompting user" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.6.3" + }, + "papermill": { + "default_parameters": {}, + "duration": 90.376935, + "end_time": "2021-02-13T18:21:10.266455", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2021-02-13T18:19:39.889520", + "version": "2.2.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } From 15fb3e85d569e1ce86597c585eb9716c21130faa Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 15 Jul 2024 13:17:23 +0000 Subject: [PATCH 071/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in CM3 --- CM3/r-dagitty.Rmd | 204 ++++++ CM3/r-dagitty.irnb | 1573 ++++++++++++++----------------------------- CM3/r-dosearch.Rmd | 205 ++++++ CM3/r-dosearch.irnb | 874 +++++++++++------------- 4 files changed, 1304 insertions(+), 1552 deletions(-) create mode 100644 CM3/r-dagitty.Rmd create mode 100644 CM3/r-dosearch.Rmd diff --git a/CM3/r-dagitty.Rmd b/CM3/r-dagitty.Rmd new file mode 100644 index 00000000..87d99014 --- /dev/null +++ b/CM3/r-dagitty.Rmd @@ -0,0 +1,204 @@ +--- +title: An R Markdown document converted from "CM3/r-dagitty.irnb" +output: html_document +--- + +There are system packages that some of the R packages need. We install them here. 
+ +```{r} +system('sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable') +system('sudo apt-get update') +system('sudo apt-get install libglpk-dev libgmp-dev libxml2-dev') +``` + +# Causal Identification in DAGs using Backdoor and Swigs, Equivalence Classes, Falsifiability Tests + +```{r} +#install and load package +install.packages("dagitty") +install.packages("ggdag") +library(dagitty) +library(ggdag) +``` + +# Graph Generation and Plotting + +The following DAG is due to Judea Pearl + +```{r} +#generate a couple of DAGs and plot them + +G = dagitty('dag{ +Z1 [pos="-2,-1.5"] +X1 [pos="-2,0"] +Z2 [pos="1.5,-1.5"] +X3 [pos="1.5, 0"] +Y [outcome,pos="1.5,1.5"] +D [exposure,pos="-2,1.5"] +M [mediator, pos="0,1.5"] +X2 [pos="0,0"] +Z1 -> X1 +X1 -> D +Z1 -> X2 +Z2 -> X3 +X3 -> Y +Z2 -> X2 +D -> Y +X2 -> Y +X2 -> D +M->Y +D->M +}') + + +ggdag(G)+ theme_dag() +``` + +# Report Relatives of X2 + +```{r} +print(parents(G, "X2")) +print(children(G, "X2")) +print(ancestors(G, "X2")) +print(descendants(G, "X2")) + +``` + +# Find Paths Between D and Y + + +```{r} +paths(G, "D", "Y") +``` + +# List All Testable Implications of the Model + +```{r} +print( impliedConditionalIndependencies(G) ) +``` + +# Identification by Backdoor: List minimal adjustment sets to identify causal effecs $D \to Y$ + +```{r} +print( adjustmentSets( G, "D", "Y" ) ) +``` + +# Identification via SWIG and D-separation + +```{r} +SWIG = dagitty('dag{ +Z1 [pos="-2,-1.5"] +X1 [pos="-2,0"] +Z2 [pos="1.5,-1.5"] +X3 [pos="1.5, 0"] +Yd [outcome,pos="1.5,1.5"] +D [exposure,pos="-2,1.5"] +d [pos="-1, 1.5"] +Md [mediator, pos="0,1.5"] +X2 [pos="0,0"] +Z1 -> X1 +X1 -> D +Z1 -> X2 +Z2 -> X3 +X3 -> Yd +Z2 -> X2 +X2 -> Yd +X2 -> D +X3-> Yd +Md-> Yd +d-> Md +}') + +ggdag(SWIG)+ theme_dag() +``` + + +# Deduce Conditional Exogeneity or Ignorability by D-separation + +```{r} +print( impliedConditionalIndependencies(SWIG)[5:8] ) +``` + +This coincides with the backdoor criterion for this graph. + +# Print All Average Effects Identifiable by Conditioning + +```{r} +for( n in names(G) ){ + for( m in children(G,n) ){ + a <- adjustmentSets( G, n, m ) + if( length(a) > 0 ){ + cat("The effect ",n,"->",m, + " is identifiable by controlling for:\n",sep="") + print( a, prefix=" * " ) + } + } +} +``` + +# Equivalence Classes + +```{r} +P=equivalenceClass(G) +plot(P) +#equivalentDAGs(G,10) +``` + +Next Consider the elemntary Triangular Model: +$$ +D \to Y, \quad X \to (D,Y). +$$ +This model has no testable implications and is Markov-equivalent to any other DAG difined on names $(X, D, Y)$. + +```{r} +G3<- dagitty('dag{ +D -> Y +X -> D +X -> Y +} +') + +ggdag(G3)+ theme_dag() + +print(impliedConditionalIndependencies(G3)) + +``` + +```{r} +P=equivalenceClass(G3) +plot(P) +equivalentDAGs(G3,10) + +``` + +# Example of Testing DAG Validity + +Next we simulate the data from a Linear SEM associated to DAG G, and perform a test of conditional independence restrictions, exploting linearity. + + +There are many other options for nonlinear models and discrete categorical variabales. Type help(localTests). + +```{r} +set.seed(1) +x <- simulateSEM(G) +head(x) +#cov(x) +localTests(G, data = x, type = c("cis")) + +``` + +Next we replaced $D$ by $\bar D$ generated differently: +$$ +\bar D= (D + Y)/2. +$$ +$\bar D$ is an average of $D$ and $Y$ generated by $D$. We then test if the resulting collection of random variables satisifes conditional independence restrictions, exploiting linearity. 
We end up rejectiong these restrictions and therefore the validity of this model for the data generated in this way. This makes sense, because the new data no longer obeys the previous DAG structure. + + +```{r} +x.R = x +x.R$D = (x$D+ x$Y)/2 + +localTests(G, data = x.R, type = c("cis")) + +``` + diff --git a/CM3/r-dagitty.irnb b/CM3/r-dagitty.irnb index b5a9e8fd..12e7fa72 100644 --- a/CM3/r-dagitty.irnb +++ b/CM3/r-dagitty.irnb @@ -1,1073 +1,506 @@ { - "cells": [ - { - "cell_type": "markdown", - "source": [ - "There are system packages that some of the R packages need. We install them here." - ], - "metadata": { - "id": "POIGemzqP6P9" - } - }, - { - "cell_type": "code", - "source": [ - "system('sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable')\n", - "system('sudo apt-get update')\n", - "system('sudo apt-get install libglpk-dev libgmp-dev libxml2-dev')" - ], - "metadata": { - "id": "9TqXQzIlOelc" - }, - "execution_count": 12, - "outputs": [] - }, - { - "metadata": { - "id": "mll4gH73k2-A" - }, - "cell_type": "markdown", - "source": [ - "# Causal Identification in DAGs using Backdoor and Swigs, Equivalence Classes, Falsifiability Tests\n" - ] - }, - { - "metadata": { - "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", - "_execution_state": "idle", - "trusted": true, - "id": "md3VArZXk2-G", - "outputId": "2b1a6a60-18a4-4de3-cec6-de65a2516641", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "cell_type": "code", - "source": [ - "#install and load package\n", - "install.packages(\"dagitty\")\n", - "install.packages(\"ggdag\")\n", - "library(dagitty)\n", - "library(ggdag)\n" - ], - "execution_count": 14, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "also installing the dependencies ‘graphlayouts’, ‘ggraph’, ‘tidygraph’\n", - "\n", - "\n", - "\n", - "Attaching package: ‘ggdag’\n", - "\n", - "\n", - "The following object is masked from ‘package:stats’:\n", - "\n", - " filter\n", - "\n", - "\n" - ] - } - ] - }, - { - "metadata": { - "id": "5DaJmPrbk2-J" - }, - "cell_type": "markdown", - "source": [ - "# Graph Generation and Plotting" - ] - }, - { - "metadata": { - "id": "eeldklHqk2-K" - }, - "cell_type": "markdown", - "source": [ - "The following DAG is due to Judea Pearl" - ] - }, - { - "metadata": { - "trusted": true, - "id": "1fgBFtRxk2-K", - "outputId": "88af1fad-7fbe-46a6-986c-637462e8e86d", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 437 - } - }, - "cell_type": "code", - "source": [ - "#generate a couple of DAGs and plot them\n", - "\n", - "G = dagitty('dag{\n", - "Z1 [pos=\"-2,-1.5\"]\n", - "X1 [pos=\"-2,0\"]\n", - "Z2 [pos=\"1.5,-1.5\"]\n", - "X3 [pos=\"1.5, 0\"]\n", - "Y [outcome,pos=\"1.5,1.5\"]\n", - "D [exposure,pos=\"-2,1.5\"]\n", - "M [mediator, pos=\"0,1.5\"]\n", - "X2 [pos=\"0,0\"]\n", - "Z1 -> X1\n", - "X1 -> D\n", - "Z1 -> X2\n", - "Z2 -> X3\n", - "X3 -> Y\n", - "Z2 -> X2\n", - "D -> Y\n", - "X2 -> Y\n", - "X2 -> D\n", - "M->Y\n", - "D->M\n", - "}')\n", - "\n", - "\n", - "ggdag(G)+ theme_dag()" - ], - "execution_count": 15, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - "plot without title" - ], - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA0gAAANICAMAAADKOT/pAAADAFBMVEUAAAABAQECAgIDAwME\nBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUW\nFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJyco\nKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6\nOjo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tM\nTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1e\nXl5fX19gYGBhYWFiYmJjY2NkZGRlZWVmZmZnZ2doaGhpaWlqampra2tsbGxtbW1ubm5vb29w\ncHBxcXFycnJzc3N0dHR1dXV2dnZ3d3d4eHh5eXl6enp7e3t8fHx9fX1+fn5/f3+AgICBgYGC\ngoKDg4OEhISFhYWGhoaHh4eIiIiJiYmKioqLi4uMjIyNjY2Ojo6Pj4+QkJCRkZGSkpKTk5OU\nlJSVlZWWlpaXl5eYmJiZmZmampqbm5ucnJydnZ2enp6fn5+goKChoaGioqKjo6OkpKSlpaWm\npqanp6eoqKipqamqqqqrq6usrKytra2urq6vr6+wsLCxsbGysrKzs7O0tLS1tbW2tra3t7e4\nuLi5ubm6urq7u7u8vLy9vb2+vr6/v7/AwMDBwcHCwsLDw8PExMTFxcXGxsbHx8fIyMjJycnK\nysrLy8vMzMzNzc3Ozs7Pz8/Q0NDR0dHS0tLT09PU1NTV1dXW1tbX19fY2NjZ2dna2trb29vc\n3Nzd3d3e3t7f39/g4ODh4eHi4uLj4+Pk5OTl5eXm5ubn5+fo6Ojp6enq6urr6+vs7Ozt7e3u\n7u7v7+/w8PDx8fHy8vLz8/P09PT19fX29vb39/f4+Pj5+fn6+vr7+/v8/Pz9/f3+/v7////i\nsF19AAAACXBIWXMAABJ0AAASdAHeZh94AAAgAElEQVR4nO3deYAV1Zk28NMNTaNA001iFBQa\n3GKcxCigoMYdiYo4aiSZuKNiNKsjGtAxAddgvmQGjQoouICJiXG+uKCgKEjilogbsqigKFtj\niMjSNEgvNbe6eqnbt+pWnVNv1dme3x82SFHnUFVP973vfesUcwAgMSZ7AgAmQJAACCBIAAQQ\nJAACCBIAAQQJgACCBEAAQQIggCABEECQAAggSAAEECQAAggSAAEECYAAggRAAEECIIAgARBA\nkAAIIEgABBAkAAIIEgABBAmAAIIEQABBAiCAIAEQQJAACCBIAAQQJAACCBIAAQQJgACCBEAA\nQQIggCABEECQAAggSAAEECQAAggSAAEECYAAggRAAEECIIAgARBAkAAIIEgABBAkAAIIEgAB\nBAmAAIIEQIA0SFv+et+1F58z7JRRl0145I1dlHuGuOpee+j6S0YNP3nUJdc/9Fqd7Nko6YtF\nf/jlZaO+PWzUxT+f/tJWop3SBem1cUd0Yj7dT/n1J2Q7h1iW3nhcuf8clB9341LZc1LMR5OG\nd/Mfos5Dr3udYr9EQfrstq+yQqXHz6qn2T9E23734IBzwAbdtV32zJSx68FjSwMO0ddu/zzx\nrkmCtOHaHkGn0DVgyhcUI0CUrbd9Jewc7HEb1esXve38Xb+wQ1Rx3caEOycIUsPveobNrznu\n85MPAVH+tHexc7D3n2TPTwHPHlDsEPWa1pho78mD9P6gYvNzjd6WeBAoasMpUefg1E9lz1Gy\nLedGHaIjP0qy/8RB+n3oq7p2By1OOgoU80Lv6HPQ2+4XBm/sH32IKh9LMEDCIDVdHz2/nG7P\nJBsGirk31jkov1f2PCV6fLdYx+gm8RGSBanhsljzy53FmYnGgSJujXkO2M2yZyrNjPLoo9Ps\nR8JvlBIFqSlujnIeSTIQhPtV/HNwm+y5SvJQ/EP0I9ExEgXpuvgTZOVzkowEYWZynAN2n+zZ\nSjE77s8jl+iruyRBepjnHLKeKxMMBSFe4rlIWPnfZM9XguXdua5TwYpDgiCtqOCaIDtkh/hY\nEGxTNd852Cfp54762XEo3yGq/FBoGPEgNUR+ftTRtcJjQYjv8J6Ds2XPOHM/5T1ERwoVHMSD\nNJl3gqzsbeHBINBc7nPAnpI954y9UcZ9iIQ+JxAOUg3nCzvXMaKDQaAd+/Kfg33teoHdNIT/\nEH3pXwIDCQfpav4JMjZXdDQIco/IObhb9qwz9YTIIfovgYFEg/QvvlJIi6MER4MguwaInIN+\nVrXjHyVyiCoE7qoQDdLNIhNk7DXB4SDAI2Ln4Pey552hv4odol/zjyQapAPFZvhDweEgwLfF\nzsG3Zc87Q5eIHaKD+UcSDNJLYhNkvXaKjQeF1nWKPt5BOq2VPfPM1BW9U66IN7mHEgzSWMEJ\nsufExoNCU0TPwRTZM8/Mk6KH6AbuoQSDNFB0hteJjQeFvit6Dr4re+aZ+U/RQ3Q091BiQdoU\ntIREzkvuH25bfm/gKhzNjhAaDwo1hazRkDsHG5tf9Lk/ePYK2mKPJtlzz0pId9AYx7nQ/dq3\nznk8eIsy7pu6xYI0PyQmL7X8edPtYUEqbxAaEJzlb+X/fnWRc3Cc+4s1TkiQ2Mcy5p+Blzv0\nJHzROfgQlS5yVnfNfZ0V/pH2y7xjiwUp7OX5S87qysoBo1c5zlVhSRJrCYT6HiXnrvH/j3mh\n52CL8z+5r4OcrWFBMvWNahnbI+9mnWVhF+HQJmd87gg1OTeFbfEA79hiQQp77fmS87H75Svr\nnE1h9/Y+LTQg1OaO3e43+ZZOvTv0HDzrfJT7OtF5ISxId8r7Z6SqJPdvG7S6/fchr9tyHnK2\nfJktdFbvHrbBeN6xxYIUtiJLS5DcPu9TQzZ5QGhAqG0+etXt62r9MvQc/G6D8023gjshLEj8\nNSk9uEFipRe0LUo6LSwmbK8tzp1nOc6o0A1G844tFqSRoSfx4+avR4f/0Gr9bvjRLOAxveX4\nHXT/Ik/4N7MpM3Mp28fZcnJYkL7fsouXZP+jiJV4/7wuN7ZcY78JzQkb6+z6JPStfs452QTp\nhNCT+HHz16+Hf8O8tWUXoW+iIJmXnKnnOG+yK51Hjg8LkvHu8K6xieFblC3Pve38evifD88m\nSMNChm8N0gmO85OQTSa17GJ8CXBpPYClPauahd1jngtStx3O3nOc74YGqdzbQ1WF7H8TsdZ/\nX8mT3jUWWkrIOaPYKz/GTs0mSGeGnsSPm7/+ynGOCdnEri5+Ot57JFb20y0t/+Oa0HMwlT3l\nXL5zZ4/QIF0t9V+SnpYk7dv6BI7/KZKU3Ium8UX++Hu8Y4sF6YLQk/ix+6XvdmdNSAmfYYE7\nMV
6QzmxfQebG0HMwlV3ifOg8zUKDNEHePyNVzUGqfLTt99MD//WeiCBdxju2WJDC1uFq/hyp\n93dXhUeNLRAaENwgfX2e73/MCD0HU9mXGxxnTHiQpkv7V6QrF6TO1/h+X+xO/Igg3Rg6SAix\nID0YehI9TRNDZ7heaEBoPKDXXXkPmwprwHeDxBY6jXuGB8nURbl6smF5t+StKpKUiCD9kXds\nsSC9FnoSc+rX/mFo6AQrhMYDx/miwx0oG0PPQS5IV7k5Cw3SP+X8C1K347P83zcWWfE7Ikhv\nBY8QTixIO+KtSR6Au6wIYWI8XyHYfrJnnpnjRA9Rd+4nIAveRhH2QVKkSdH7hnjGiJ6DMbJn\nnpmJoofoNO6hBIN0i+gMSR58C64/ip4D7tf/2vqb6CH6LfdQgkFaITjB/ay5FSZ9tUILOTHW\nzZ4HKDb2FTtEpauj992B6OInQsscmfsJhhShHzEUd77seWeoWD2hiBP4RxINUrHPusJ1TvSY\nTsj3othVYtMzMN8XO0Sz+EcSDdKu0CetF2PTN8MMHC1yDobInnWmuJ8y4Nq3PnrHHQkvWcy/\nhn7OEtHRIMhskXNg1yr6b4ocoqkCAwkHaafAEpGXig4GwU7kPwfHWFbuCbttq4iDuT9EcpI8\n1mUO9wR72feUq5Qt7cJ7Drosjd6rUWr414hcIDJOgif2fZ93gveLjwXBbuA9B6beZR6Oex1N\n7rvMmyUI0lbOF3fct3hApPpv8Z2DoSKvWjTH+Q3/gK1CoyR5GPMiro67g+z5HDBDa/bgOQd7\nrIneo3E278dziLotFhslSZC4llbusyrJSBDmdY7+ht1fkT1bKVZyrFzR9RnBQRIFybk39gQr\n8fzYlDzbNe45KJ8TvTcjLYr/mNaHRMdIFiTngbAlODrojRylZl6PeOegu6kLrEZbFLJOekfl\nfxAeImGQnKdivbI4CK1BKXo91kuXvWxuvP8g1vukigTfapIGyVn+jegJnttWCNm6UOSJ0VDc\n+hi3h51YI3uWUm2O0Ss0cGX0fkIlDpJTF7aCXateM9o3/h5j/c6+5ZlPE48Kfg23RhRQd7vN\n+qeATIn4ZLbLNYmeJpk8SI7zxpBiE7zYv0JAa+f/PmcnST8U+HBEsXMwAs8AcZya84odom+9\nk2zvFEFymp4Mi1Ln77+bt2X70je3UwwM7f52Stg1coqpqwbxemtU2GqLR4tWvduQBCnnr5cG\n/OQc8IuOP3eaWu++OPjzwN1AAm9ftWfhOdjzKrsLpovu8L8i+uD6gNt/qi7nfqxYIaogOc6O\np68+zBf4bifd+lpAo3HLU5x7m/rQOLnq599wlO8DifKjbpgvcGuNOVbedFDHRx01vnLzCb6n\nIpUNvGZOovdGreiC5Nr13hPTfjPhljtmzPskpFv/H978uReyhLgaPnzmvv++6ab/vu+ZlXYX\nGGru8N5w3FzwJ00fPzdj8i0TfzPtyffJeg9pgxStyX1oZwkruTV6UwBxn57aqeWHzl8zGS/r\nIDUvGz7lMsbGWP2aA9L2l9YXbyWbMxkv8yC9zdh1jjO5lJ28JXpjAEENV7YEad9sxss8SM7E\nie7bp0e7sm/wLx4GENtk77Xdd7IZLfsgtZhfyfpwr1QOENsGr1P1pmxGkxYkZ0k162FrXz+k\nr+5ItocbpSeyGS6FIM1fHm+7dYexLgIr8QHE0DiK9Xh79SGMhb2BWLqAdDz6ID1eUvFZ9Fau\nraegDA4pGc86PZ27xC66IuTPN3YvSdwW5EcepPr9GftJ3G3HMHY5yuBAbzpjdxXd4ArGDqL8\nwJo8SHfnXpZ2jr2iKsrgkIYXu7Crim7wtlvSu5dwROogbfqyWyk5Mfb2s7qwgXisLNBaWslG\nFv9xc7x7me5B2DlNHaSWu/wej/0X5leyaiwJDpQ+HcAGFl/87VHvMr2abkziIC0ra/k4OX5H\nrVsGn0s7C7DajiNZn+If9u/o39L7/R7ZoMRBGt7a4cRx3976QSiDA52mc1n3iI/6257cyv+s\n2DC0QWp/ZmcFx00e20awkgmk8wCLjWOdIj6E3d6+9NWrVKPSBunD/aqqKlhJVVXVCY0cf63h\nCsZGW7gqNaRgBmO/i9ik/tjcFVrCKqqq9l9FNSz9B7KLWBX/X0IZHGhEFr5bdWcJlzvJp0iQ\n3DL4YevI5wK2WVrJzoj3OauhQXJeqGT9UAaHZDYMYANr421qapBQBofE3MJ33CfXGBskdIND\nQs0d33E3NjdI6AaHZLyO75gMDpJTj0VRQNz06MK3j8lBcpzJJSiDg5jYhW+P2UFyZqIbHIRE\nd3znMzxIbhkc3eDALbrjuwPTg4QyOAiI7vjuyPggOesORRkc+MTo+O7I/CChDA68oju+C1gQ\nJHSDA58YHd8FbAiS49xcwk7leusIFptXxn7G/ZfsCBLK4BBb/I5vP0uChDI4xPRp/I5vP1uC\nhDI4xMLT8e1nTZCwKArEwNXx7WdPkLAoCkTj6vj2syhIKINDFJHCt8emIDnOLSiDQxFChW+P\nXUHC2uBQhFjh22NZkJrXBl9KuUMwhmDh22NbkJyl1axyAekewQyihW+PdUFyF0Upf5h2l2AA\n4cK3x74goRscgggXvj0WBgllcCgkXvj22BgkLIoCHSUofHvsDJLzaFf2DfE3lmCaJIVvj6VB\ncsvge3PeTAzGSlT49tgapOZu8Dmp7Bl0k6zw7bE2SFgbHFokLHx77A1Scxn8lpT2DRoZzzrN\nTrwTi4OEMji4kha+PTYHCYuiAPca32HsDpK7KMph6Aa32bIqdlqywrfH8iChDG65jfuzw0he\nk9geJOfdfqxyfqojgLrqhrB9aB7abX2QUAa3WOM5BIVvD4KEMri9xiXr+PZDkFAGtxZN4duD\nILlQBrdR4o5vPwSpGcrg9kne8e2HIHlQBrcNWeHbgyC1QBncLnSFbw+C1AplcJsQFr49CFIb\nlMEtQlj49iBI7VAGtwZl4duDIPlhbXA7PE9Z+PYgSHlmdWGHUb4FBRUtrWQj6QrfHgQpH8rg\n5iMufHsQpA5QBjcddeHbgyB1hDK42cgL3x4EqQDK4EYjL3x7EKRCKIMbjL7w7UGQgqAMbqoU\nCt8eBCnQLHSDGymNwrcHQQrmlsFTeEsKUqVS+PYgSCFQBjdPOoVvD4IUZt2hrAsekWkSkjW+\nwyBIofCITMOMY52eSG3nCFI4lMGNcn9KhW8PglTM5FI8ItMUqRW+PQhSUe4jMldLHB+opFf4\n9iBIxc2vZH3QDa6/FAvfHgQpwhKUwQ1QR/Bwy+IQpCjoBtdf0/dZj5RfVyBIkWpHsJIJkucA\niaRa+PYgSNHqL2PssnrZswBh9zF2V9pjIEhxoBtcZ/PKSB5uWRyCFAu6wfVFu8Z3GAQpHnSD\n6yr1wrcHQYoJ3eB6qhuaXse3H4IUl9sNjjK4blLt+PZDkGLbhjK4fjIofHsQpPjcMvgYlMF1\nMj2Dw
rcHQeKBMrheUu749kOQuKAMrpO0O779ECQ+WBtcHxkVvj0IEieUwXWxI/WObz8EiRe6\nwfXQdG7qHd9+CBI3lMG1kFnh24Mg8cOiKBpIa43vMAiSiMmlbBgWRVHZgi4ZdHz7IUhCsCiK\n2pZVstOyKnx7ECQx8ytZvyWyJwEhMi18exAkQVgURV11R7C+WT9SG0EStW4g1gZXU0oPtywO\nQRJWezrK4Er6ebaFbw+CJK5+DGOXoxtcNdMzLnx7EKQk0A2unufL2E8lDIsgJeKWwbPr54Jo\ny6qyLnx7EKRk0A2uFgmFbw+ClNCSatZjruxJQAu341vOB+UIUlLrB6EbXBUZd3z7IUiJoRtc\nGRl3fPshSMlhbXBFZN3x7YcgUUAZXAWZd3z7IUgkUAaXL/uObz8EicYL6AaXbEN/NrBW3vAI\nEhEsiiJX3ZDsO779ECQq7qIo6AaXRUrHtx+CRAZlcImkdHz7IUh0sDa4NNMZu1PuDBAkSjej\nDC6FpI5vPwSJFMrgMsjq+PZDkGihDJ49aR3ffggSMZTBsyav49sPQaKGMni2ms5l3RW4IwxB\nIrf1FFZyq+xJ2ENix7cfgkQPZfAMyez49kOQ0oAyeFZelNnx7YcgpQJl8GyoUPj2IEjpwKIo\nWZDc8e2HIKXEXRRljuxJGE52x7cfgpQWd1GUmbInYbSm70vu+PZDkFLjlsFvkT0Jk41jnZ6W\nPYc2CFJ6UAZP1XTG7pI9h3YIUppuLWGnoQyejufL2M9kz8EHQUrVo13ZISiDp0GdwrcHQUrX\n/EpWjW5wep8OYAOV+lmPIKUMj8hMQ91Qto8qhW8PgpS29QOxNjg1RTq+/RCk1GFRFHKKdHz7\nIUjpa7iCsdG7ZM/CIKp0fPshSFmYXMpO3iJ7EsZQpuPbD0HKhNsNLv92aDOoVvj2IEjZwKIo\nVBTq+PZDkDKCMjiNuiGqFb49CFJW8IhMCu7DLZXp+PZDkDKDbnAC41Xq+PZDkLKDbvDE1Or4\n9kOQsoRFUZJRsvDtQZAyNasLO0zFt8p6ULPw7UGQsoUyuLgNA5QsfHsQpIyhDC5KvY5vPwQp\nayiDi3EL34p1fPshSJlDGVyIsoVvD4KUPZTBBahb+PYgSDKgG5yXwoVvD4IkxcwubOB62ZPQ\nyNJKdoaqhW8PgiTHC1gUhYPShW8PgiQJHpEZn9qFbw+CJAvK4HEpXvj2IEjSYFGUmBQvfHsQ\nJHmwKEos0xVc6qQQgiQTyuDRlC98exAkqdxucJTBi1layUaqXfj2IEhyuWuDL5U9CYV9qn7h\n24MgSbakmlUtkD0JZelQ+PYgSLLVoAweSovCtwdBkm7bKazkVtmTUJMWhW8PgiQfusFD6FH4\n9iBIKphcgjJ4IU0K3x4ESQnu2uB4RGY+XQrfHgRJDfMr2d6avK3OiDaFbw+CpIgl1azHXNmT\nUMiOI1kfnX5GI0iqWHcYyuDtFHy4ZXEIkjKwKIqPRoVvD4KkDpTB2+hU+PYgSCpBN7hHq8K3\nB0FSCrrBXSqv8R0GQVILFkVxnI37s8O0e2YHgqQYtww+R/YkpKrTrPDtQZBUY3sZvPEcRR9u\nWRyCpBzLy+DjdCt8exAk9VhdBld9je8wCJKK7C2Da1j49iBISrK1DK7+Gt9hECQ12VkG16zj\n2w9BUpSN3eC6dXz7IUiqWnco6/Kw7ElkqmmUloVvD4KkrK3ftmxRlPGs02zZcxCGIKmr/lKr\nyuD6dXz7IUgqs2lRFG0L3x4ESWn2LIqiY8e3H4KkNlsWRdGy49sPQVKcHd3genZ8+yFIqnO7\nwU0vg2va8e2HIClvq/lrg2va8e2HIKmvfgxjl5tcBp+hdeHbgyDp4JYSdprWb8WL0rzw7UGQ\ntDCrCxtoaje4Xmt8h0GQ9PBCT1O7wTXu+PZDkDRhaje4zh3ffgiSLtYPNHFRFLfj24gPnBEk\nbWwbwUomyJ4ENa07vv0QJH00XMHY6F2yZ0HqfsbulD0HGgiSTkxbFOXFLuxnsudABEHSilnd\n4Lp3fPshSHoxqRtc+45vPwRJM+Z0g5tS+PYgSLoxZW3wpnMNKXx7ECTtGLI2+DjW6QnZcyCE\nIOmn4UoDyuAmdHz7IUg6uqWEnar32/R5ZQZ0fPshSFrSfW1wfdf4DoMg6UnvtcE39Dei49sP\nQdKUzt3g+i91UghB0pW+i6I06rzGdxgESVvalsHH67/USSEESV+adoPfb1jh24Mg6exmDcvg\nz5cZ0/HthyBpTb8yuBlLnRRCkPSmWxncqI5vPwRJc+/2Y5XzZU8iNrfje7XsSaQCQdKdTt3g\nhnV8+yFI2tOoDG5Yx7cfgqQ/bcrgpnV8+yFIJtBjUZQFJqzxHQZBMoK7KIrqb+KXVZqz1Ekh\nBMkM8ytZH7Xfxhtb+PYgSIZYongZvG4I67tO9iRShCCZQu1u8MbvGNjx7YcgGWPbqQo/IvPn\nJnZ8+yFI5nAXRblEzTK4yYVvD4JkElUfkfl8Gfup7DmkDEEyipqPyFxayU43t/DtQZDMouLa\n4IYXvj0IkmHUWxTF3I5vPwTJNOsHqdUN3nQu667az8gUIEjGUewRmQZ3fPshSOZRqhvc/MK3\nB0EykTprg79ocse3H4JkJHdRFBU620x6uGVxCJKZ1CiDW1H49iBIhlKhG9z0jm8/BMlU8hdF\nMb7j2w9BMpa7KIrUbvCf21H49iBI5qq/jLEx9dKGn8HYndIGzxyCZDKZa4Nb0PHthyAZbaa0\ntcFNXeM7DIJkNlllcIsK3x4EyXBy1ga3o+PbD0EynYwyuCUd334IkvEkdINb0vHthyCZL/My\nuC0d334Ikg2yLYNb0/HthyBZIcsyuD0d334Ikh2yK4NbV/j2IEiWcBdFmZPBOHVD2D62dHz7\nIUi2yGZRlMZzLOr49kOQrJFJGdzCwrcHQbKHWwa/LN0y+HTG7kp1AGUhSDZJe1GU58vYz1Lc\nvcoQJKukuza4bR3ffgiSXV7+Ets7rWKApYVvD4JkGXdRlAWp7LluqJ2Fbw+CZJu0HpHZOMrS\nwrcHQbJOSmVwawvfHgTJPqmsDW5jx7cfgmSjyaXs5C2ke7Sy49sPQbLSo13ZN9YQ7s/Ojm8/\nBMlOtN3gVhe+PcoH6YNhZ5LvE2gXRbG149tv5LAPKXdHHyRICd2iKNZ2fKcIQdIHWRnc8sJ3\nKhAkjRAtimJvx3eKECStUCyKMs/eju8UIUh6ccvgyVZEReE7FQiSZl6oZP2WJPj7GwawgbVk\ns4FWCJJukj0iE4XvlNAH6fM1Gyz/qC9lMcrgW9avC24osrzj22dbzdrPKfdHGaQlv7vk6C8z\nV0n1yT/5vZzH/FigWBn8w+k/OKF38zlgvU/4wfSOnznWndbp6ZRnp7x1s348rF9J8yHa41uX\n3r2MZq9kQXr5ij1ZBwff+BHV3sEvrAy++Jr+Hc9B/2sW523S8GJGc1TUigkHdTxEvX/0KsGO\naYK0c9qB
HafXrPTEeST7hw4CyuCNjw4OPAds0KONciapnjnHBx+ig2Z8kXTXFEFqnLJ38Pxc\nRywkGAE6mtmFDcx7jf9YwXda33Xyv7KmqZTnB4Yfon4zmpLtnCBIr4d8J2x14afJx4CO5leO\n8v2gWXFK8XNw6kp5M1XE+v8ofoiOSlaESRykxlu7FJ8gY195NukgUOi9He2/frBb1Dno/pC8\nmSrhqS9FHaLy3yb5oZQ0SJ8Nj5qf64aEPzehmJ0XxzkHF++UPU+JGq+Nc4hGJrgJOWGQ1v5b\nnAkydh71igPQZssJ8c7BCbT3qutk56h4h+jQGuEhkgVpZXW8CTJ2is3fD1P1WZG30PkO+5fs\nuUpSNyzuIdr3Y9ExEgWpZr+4E2TsHDRKpmL7kfHPwdDtsmcrRf0Z8Q/RVzcKDpIkSNsOjT9B\nxn6YYCQI0zCC5xzY2fZ9Kc8hOqJObJAkQTqfZ4KMzUwwFISYyHcOJsierwT38R2iy8RGSRCk\nGXwTZN2Xi48FweZzngP2guwZZ+7d3TkP0e+FhhEP0toevCfxKBTBiW3vz3sOqm17m9QQ0S5Q\nqEqogUA8SN/lnSBjM4QHg0DX85+D62XPOWN38R+ii0TGEQ7Si/wTZHtsFR0NgnxUzn8Oyu3q\nyP+8UuA6fU1gIOEgnSQwQfZr0dEgyA9EzsEPZM86UzeJHKLTBQYSDdLfRSbI9hSsLUKQmq4i\n56B8rex5Z6h2D6Hr9A3+kUSDdKHQBBnNSqHQTOi7LWMTZc87Q9PFDtEY/pEEg1TbXWyGw8WG\ngyBfFTsH+1lUPD1O7BD15H/lJBikWWITZJ2wkAOZVwXPAXtF9swz83Gp4CH6M/dQgkE6T/Qk\n2n5bDKEJoufgl7Jnnpl7RQ/RpdxDCQapyL3lxV0sNh4UOkb0HBwje+aZibgpNty+3EOJBemD\nwqH/7QvnIffreY7zs9yXCz93lgTNsL/QeFBoZ+GdyfnnYNQrtZvf+kXAR01drKmd9ok4ROe/\nsmnn0l/uFnCdfsI7lFiQ/hIw9C+b39r1rHHe6MR6P5XbKDBIJVg8ksi7EefgAm+zPwVstrj4\nno3xecQhusXbbG7AZnN4xxIL0qSAocvecZaWsbuchkGMPes893pwkNibQgNCgceKn4PSjc5b\nB/TJfUPbt3Az/rfSegoqx/gO0YAm55G9+811nL6Fm03mHUssSGOCIjKowRk/qNGZnPvlny5m\ns0OC9KjQgFDg9uLnoO/cuacwdobjnFy41a9kzz0jDxc/RP2vuOIruTchjvPNwq1+zDuWWJCC\nb4G/3dn+rrPGbQovY6FBuldoQChwXcQ5aDbecQ4u3Gic7LlnJLhhNe8QlR74krO8c+FG5/OO\nJRakUwNn2PX93B+d2fKbsCD9VmhAKPCT6HPAhtQ5CwI2ulL23DPyq8hDdI7jbHmwd8BGZ/CO\nJRakkA+MH3echgMignSz0IBQIOQGav85GLbF2RjwFknsPgEN/TLyEOWCVP/XoFUvhvGORfkT\n6czcrJznIoKEn0hEfhx5Di7Y5az5WtBGdv9E8h+iThWHznV2DijcKKOfSIHvkXquc567uu1D\nMLxHSlngeyT/ObjUcV7dK/AcWP0eyX+I3A6iAYGHMqP3SIFVu3udnQd0fttZX1E0SKjaEQms\n2vnOwfAG54mgjxqZ5VW79nYFhzgAABmxSURBVEP0gw2NuRe+vR3n14VbZVS1C/oc6dgmd0mb\noU3OnUWDhM+RiAR9juQ7B7utclZV75UT0Kdv8+dIvkN0lOM8/7UBfwhcNSGjz5ECOhu6vu+s\ncG80m+Y0DGQvrl27w6lfu7awSRydDVQCOhv85+B7rdtNLNzOls6GTRGXact6QYsCyt8ZdTYE\n9Nrd5jjN6+lXfer8o/S9lu0Kmy/Qa0cloNfOfw4ubN2uMEhl1vTaFRa28y7T0p+8WffF+5OC\nlsPKqNcuoBswpovExoNC3xI9B9+SPfPMfE/0EFVzDyUYpHNFZ4j7kciEfEoSzZ77kaaJHqLM\n7keaKThB3CFL5xXRq8SiO2RFD1Fmd8hizQYFBD8AO5JNazYcK3aIsluzwblAbIZYSJ/QjWLn\nYILseWdIcBUhgYX0RYP0mtAEsa4dpXUCC60yVr5G9rwztC3yybGB3uIfKduVVm8XHQ2CYKXV\nSMqvtIq1vxWAtb8jbVJ+7W88jUIBeBpFJOWfRoHnIykAz0eKpP7zkfDEPgXgiX2RFoe0wIfK\n+ol9eIasCibynYMJsucrAedyq9k/QxZPNVdAw2k858DOp5pfwnOIZDzV3KkJWhAgxNlWnsP0\n1Q6Nfw6G1MqerRT1p8c/RAf+U3CQREFyVgQsrRds+I5EA0Gof8V+XfDNjbLnKsn2E+MeogGr\nRMdIFiRndcCqaUHO/SLZOBBu8/HxzsHxm2XPVJqd58Q7RN8U76lOGCTns4CFPAv9V2PCYaCI\nnbEen3jRTtnzlKhhbJxDdHqCbzVJg+Q03hw5vz3mJh0EOvrnnKvPa+8Iu3/3qHPQ7QF5c1XC\nk72iDlH5/0vyOWfiIDnOPwYVn+D5G5KPAe02zL7p35vfm/pWuvlgePFz8O0V8uariHURt8sO\nTbYsD0GQnIZ7itx5fvgCghGg3eS2Q5v3wdyfi9yddKAtywYVN++w8EPU976E7z4ogpR7lX7P\nfsHzOx6v6qi1LYXWI78Q2vjHkOvk0EfwyYOn6emQpxweeG/iYhhNkHJz/NvlX+k4va9NXEm0\nd2jX0LpE/gUFf/T22OqO56D66rf9W7wz8PQXrC78vP+Lgp/ce135MsGOqYKU07R48kVDqrzJ\n7XPSD2euo9s1+NQN8Y7x00F/uOLeMce2fEfb45jLpnV8a3RH7v/3+c/XM5ilutY8dMWJLc9A\n7jV09J1LaPZKGCRP/aaP1nxOvVPwqfE+ga0KfzXStOmTjzcFlqAWtryUmfh+atPTxOdrVm2i\nfMlLHiRI2bL+rLu7vPolIn95S0nL65mSx6jnZTkESTMLq9g+77zc1X1Or4jWolDpbOJ52Q5B\n0sufu7Kvr859Ke1TL/T3v+PlqOsjxPOyHoKklUkl7OQt7i8WfSy2A28xkG7/oJwUOAiSVhqu\nZOziXYl28aT3E2kq0YygFYKkj9qRjI1LuO7F6lyKhv+YdXqKZkrQij5IOy/6Nfk+IadmMCt7\nIPFeerPL6xvPYhXvJp+Q3m4bTXprD32QxjL2KvlOwVk+gFU8l3w3bz6Z+8/2way6Jvm+dPY3\nxsZT7o88SDXdGBtkdRdKOhb2Yvu8Q7a3mmo2yM77zls0DmasG+WjUciD1HyTGZ6CRM0te3M/\nRa6IpZXsdJt7WZtX1xf6TDsEdZAWuU9cZ3tuId6t7SaXspNo7xSf25ldQ7pDrWzdq/lTacJP\nAYiD1HS0V161bWHcdDX8iLGLkpW9C+W+J99NvEt9XONdpkfSrf1LHKSHWjpQunxAu1+rUZS9\nA4y1twi+svX
pA38g2yVtkGr3br3J42zS/VqNpuxdqPFs1oOufKGVka2XaV+ypdBpgzS77W6p\nEjxSjMiy/qxiXip7rhvK9rbpqWNttrTf1Ud2BzdtkHbcOm7cRazruHHjULcj8sqXWZ9ky3KE\nq6lmA60sgt+fu0K7sEvGjbuNbI0y+g9kF7Eq8n3a65Fydkh6PzXerWBn2vqZX3dG+roWQVJa\nW7d3Sp7tzK5OcfcqQ5DsQdDtHWUqY1NSHUBZCJI1Uip757va1iI4gmSLtMre+RrPtLQTHEGy\nhFv2Juj2jlRraSc4gmSHNMve+SztBEeQrJBu2TufnZ3gCJIN0i5757OyExxBMl8GZe98NnaC\nI0jGy6Tsnc/CTnAEyXTZlL3zWdgJjiAZLquydz63E3xt5qPKhCCZLbuydz7riuAIktGoFzmJ\n790KdpZNneAIksnoFzmJ79nObKykoWVAkMzV8MMUFjmJz64iOIJkrNoRrGRitmXvfFYVwREk\nU20YzMoelDoDq9YER5AMtXwA65nOIifxbT/cnk5wBMlMbtn7LdmTsKkIjiAZ6ZFy9k0VPhBd\nbE0RHEEy0e2ZdnsXY00nOIJknlTW9hZlSxEcQTKOhG7vYiwpgiNIpqkZzDpPkz0Jn8azrOgE\nR5AMs2J/1v0Z2ZPIY0cnOIJklpcldXsXY0URHEEySpaLnMRnQyc4gmSSbBc5iW+u+Z3gCJI5\nlCp75zO/CI4gGaP2dMnd3sVcwzqTPYRLSQiSKWQschKf8Z3gCJIhlg+QschJfKZ3giNIZpC1\nyEl8hhfBESQjqFn2zre4gp1tbhEcQTKBqmXvfEZ3giNI+lO47J3P5CI4gqQ9pcve+QzuBEeQ\ndKd22TufwZ3gCJLm5KztLcrcTnAESW8Le7F9dPoeb2wRHEHSmg5l73ymdoIjSDrTo+ydz9BO\ncARJX9qUvfOZWQRHkLSl2CIn8Y01sRMcQdKVaoucxGdkJziCpCn1FjmJz8QiOIKkp4VVepW9\n89X0Y4dvlz0JWgiSltyyt87f080rgiNIOrq9hA3Xreydz7giOIKkH7fsfbF+Ze98phXBESTt\n1I5kJTfKnkRyhhXBESTd6NTtXYxhRXAESTMrDtC37J3PrCI4gqQX9Rc5ic+oTnAESSt/7sq+\n/onsSZBZWslGNMieBBEESSeTS9lJm2VPgpBBy6EgSPrQtNu7GHOK4AiSNrTt9i7GmDXBESRd\n6NvtXUzj2axisexJUECQNKFzt3cxphTBESQ9vFjF9jHiO3eBmmojOsERJC38QfNu72LM6ARH\nkHSg4yIn8RlRBEeQ1NdwBWOjzSp75zOhCI4gKU+jtb1FGVAER5BUZ0q3dzFuJ7jmpRQESXGm\nlr3z6V8ER5DUptva3qK0L4IjSErTb21vUboXwREklZnW7V2M5kVwBEldbtlb+0VO4pvK2FTZ\ncxCHICnLgrJ3Pq2L4AiSqmwoe+fTugiOICnKjrJ3Pp2L4AiSmmwpe+fTeE1wBElJ9pS98y3W\ntgiOIKnIprJ3Pm2L4AiSeozv9i5G1yI4gqSc2hGWlb3zaVoER5BUY1/ZO5+ma4IjSIqxseyd\nT88iOIKkFjvL3vm0LIIjSEqxteydT8ciOIKkEnvL3vk0LIIjSOpouNKqbu9i9CuCI0jKcLu9\nJ9hb9s6nXREcQVKF7WXvfI1nsx5aFV0QJEWg7J1PtyI4gqQGkx5pSUOzIjiCpASzHmlJQ68i\nOIKkApS9g2hVBEeQ5LO627sYndYER5Cks7zbu5ixrNNTsucQE4Ikm5mPtKShUREcQZJsWX9W\nMU/2JJS1/XDWb73sScSCIMm1sArd3sXUVLPBWhTBESSp0O0dRZc1wREkmVD2jqZJERxBkgfd\n3rHo0QmOIElj3dreorToBEeQZEHZO67GszQogiNIkrhl7+dkT0IT2w9n1TWyJxEBQZIDZW8e\nGnSCI0hSoOzNR/1OcARJBpS9eSlfBEeQsoeytwDVO8ERpMyh7C1E8U5wBClrKHuLUbwTHEHK\nGLq9RandCY4gZQtlb3FKd4IjSJlC2TuJpZVsRIPsSYRAkLI0qYSdvEX2JDSmcBEcQcoOyt6J\nqdsJjiBlBmVvAmNV7QRHkLKCsjcFZTvBEaSMLB+Abm8K2wer2QmOIGUDj7Sksl7NTnAEKRNY\n25vO0kp2unpFcAQpC+j2pqRkERxBSl/Djxi7CGVvOip2giNIqasdydg4lL0pKdgJjiClDWVv\negp2giNIKUO3dxrUezAmgpQudHuno6aaDaqVPQk/BClVbtl7texJGEm1TnAEKU3o9k7Ps53Z\nWNlz8EGQ0uM+0hLd3qmZxtg9sufQDkFKDbq9UzaWdZ4jew5tEKS0oOydNqU6wRGklCxDt3fq\nVOoER5DSgW7vLCjUCY4gpQLd3tlQpxMcQUoDyt5ZUaYTHEGih7J3hlTpBEeQyKHsnSlFOsER\nJGo1g1nZA7InYRFFiuAIEjE80jJranSCI0i0UPbOnhKd4AgSKZS9ZVChExxBooRFTuSYK78T\nHEGi0/BDLHIiyVTGpsidAYJEpnYkK5mAsrcc0ovgCBKVmkEoe8vTeBareFfmBBAkIssHsJ5Y\n5EQe98GYMjvBESQaf+3F+i6WPQmr1cjtBEeQSDy2GxY5kU1uJziCROH2EjYc3d6ySe0ER5CS\ncx9pORplb/mmSHwwJoKUGNb2VobEIjiClBQWOVGHxE5wBCmhFfuz7s/IngS0kNcJjiAl88qX\nWZ83ZU8C2kjrBEeQEkG3t2pkdYIjSElgkRP1SOoER5DEuYucoOytnKlSiuAIkjCUvRUlpQiO\nIInCIieqktIJjiAJwiMt1bX98OzXBEeQxGCRE5VJ6ARHkIQ8hrK30rLvBEeQRNyOsrfiMu8E\nR5D4ud3eWNtbcVl3giNI3FD21kLGRXAEiRfK3nrIuBMcQeKEtb11kW0nOILEB93e+si0ExxB\n4vJIOTtkjexJQEzvVrCzGjMaC0HigW5vvWTYCY4gxYeyt3ay6wRHkGJD2VtDmRXBEaS4sMiJ\njjLrBEeQYsIiJ3raPjibTnAEKZ6XUfbWVEZFcAQpFpS99ZVNERxBigPd3jrLpAiOIEVzH2l5\nCcre+sqiCI4gRWp+pKXkOUAiGRTBEaQoGw5nZQ/KnQIklEERHEGKsHwAFjnRX/pFcASpOHR7\nmyH1IjiCVBTW9jZF2kVwBKmYyaXspM3yhgdCKRfBEaRwDT9i7CKUvU2RbhEcQQqFbm/DpFoE\nR5DCYJET06RaBEeQQmBtb/OkWQRHkIKh7G2iFIvgCFIgdHubKb01wRGkIFjkxFSprQmOIBVC\n2dtg0xm7O439IkgFak9nJRNR9jbVNazz3BR2iyB1hLK32dwi+GL63SJIHaDsbbrth7N+9EVw\nBCkfyt7mS6UIjiDlQdnbBml0giNIfih72yGFIjiC1A5lb2vQF8ERpDZu2XsCyt52GMs6P0u6\nQwSpFcreNiHvBE
eQWiwbgEda2oS6ExxB8qDsbRviIjiC1AyLnNiHthMcQXJhkRMbkRbBESRv\nbW+UvS1EWQRHkJzaEej2ttRYuk5wBAllb3sRdoJbHyR0e9uMrhPc9iCh7G03siK45UFCt7ft\nqDrB7Q4Sur2BaE1wm4OEbm9wqIrgFgcJi5xAM5LlUOwNEsre4CEpglsbJLfsjW5vcNUNZXuv\nTbgPW4OEsje0q+nHDt+ebBeWBgnd3uC3OHER3M4godsb8iUugtsYJHR7Q4GkRXALg4RubwiQ\n8MGY9gXJLXs/SLtLMEDj2axHgixYF6QV+7Puz5DuEczgFsHF+y5tC9LCKrZPCs8iAAPUVLPB\nwkVwy4KEbm8Il6QT3K4godsbinlWvAhuU5DQ7Q0RxIvgFgUJ3d4QSbgT3J4godsbogmvCW5N\nkFD2hjhEO8FtCZJb9ib9l4KhBDvBLQkSyt4Ql1gnuB1BQtkb4hNaE9yGIKHsDVxEiuAWBAll\nb+Ak0AlufpBQ9gZeAp3gxgcJZW/gx18ENz1IL2ORExDAvSa44UFyFzlZTTYXsMfSSjaC58GY\nZgcJi5yAKM7lUEwOUsMVjI1G2RvETGfsrvhbGxwkt+x9I+lcwCrXsM5zYm9sbpA2DGZdsMgJ\niONaE9zYIK04gHWP//0EoFDdkPjLoZgapIW9sMgJJLU+fie4oUFyu72TPl8AIH4nuJlBQtkb\naMTuBDcxSOj2BjJxO8ENDFLtSMbGodsbaMTsBDcvSDWDWedp5NMAWzWeFasT3LggodsbaMXr\nBDctSFjkBKjF6gQ3LEgoewO9OJ3gZgUJZW9IQ4wiuElBQrc3pGQqY1OKb6F8kDY9+njMLbHI\nCaQmshP8/z9K+lKIPkixYZETSA9XJzgBeUFC2RvS5HaCZ1jGkhYkt9sbZW9ID0cnOAFZQcLa\n3pC2xRXsbNEHY3KTFCSUvSF9czuza7MaS0qQ3LL3xSh7Q9qmMDY1o6FkBAllb8jIWNEHY3Kj\nDVL9itkz7pg06Z6H5q8L3whlb8iKWwQPKN2teeGhuydNunPG0yt5lpQsii5IO565dlAX1qbH\n8F/9PfCnzrL+rOI5slEBiqk7ckL+/2h69daTurdfpl2OGDd3J8VAVEF6eUwlK7DvhI8KNkTZ\nG7JUn/e7lb/oX3iZVl3xavJxSILU9NSRhdNr1vm8JfmbouwN0rzzvc4h1+kxid9JUQTpzaEh\n02t2yUbfpu91wiMtQY4N5xe7TI99N9nekwep7mdlxSbI2Jf8y6dOwiInIMW0gPcefl1+/kWS\n3ScO0nuHFJ+f6/xtSUcBSGTzOdGX6eAPEwyQNEizu0dPkLGDViUcBiCJFfvHuUx7zhMfIWGQ\nHiqPM0HG+mA1YpDnzT3jXaZd/yg8RLIgTY83v5wqJAlkeaNn7Ot0lugYiYL0ZMyfR64+q5KM\nBCBs5V7xL9OuorfIJQnSot3iTzD3PgkVB5Bhc6z3R626Cb50ShCkbV/lmSBj3xMfCkDY9/ku\n0wO2Co2SIEicE2TsfvGxAARN5b1MRwsNIx6kObwTZL02Ru8VgFRN/EJDqwUi4wgHaeeB3BNk\nl4oOBiDoXP7L9ECRdnDhIE3mnyBjS6L3C0DoTZHLVOSuWtEg7eonMsPzBUcDEBOjM6jQvvXR\nO+5INEgzRCbIOhfenwSQnvfD7pso7mH+kUSDdJTQBNlEweEARFwndpmeyD+SYJBWiE2Q7Yc1\nTyA7jX3FLtNOq7mHEgzSrYJBYq+LjQcg4G+il+lvuYcSDNKJojOcJDYegICJopfpCO6hxIK0\nk6vLzu/bQuMBiDhO9DLtwX0ft1iQXgsY+6XWP/yP3G8u/NxZEjjDCqHxAAQ07h5xnY56pXbz\nW78IuofhLd6xxIL0YLEJfpf1fir3JThIbL3QgAD8VgVdgL7r9ALvF38K2Ir7Dj+xIAVVFXvt\nlfPV9c7i3dizznOvhwVpgdCAAPzmBl2A7ddp6UbnrQP65L7p71u41Y28Y4kF6cLgkDD2v803\nV/zpYjY7LEgzhQYE4Bd+A3fzddp37txTGDvDcU4u3OAy3rHEgnRmyPyu8t4hlbHwIN0tNCAA\nv/8Jy1HLddpsvOMcXLgF981zYkEaFjy/Ibuce1p+GRok1L8hKzeH5Mh3nbIhdc6CgE1O5R1L\nLEgnBM6v1yfOG60VkNAg3SI0IAC/CcGXoP86HbbF2RjwFokN5x1LLEgjg+ZX8rSzuW1OoUG6\nU2hAAH6/CbwC/dfpBbucNV8L2ugc3rHEghR4t9R1jnN2229Cg/SA0IAA/ILvMvddp5c6zqvB\nSwxx328uFqSrAoY+rsF/s19okJ4WGhCA31+CLkDfdTq8wXkipEdnHO9YYkG6p3DkPdc7S/u6\nJfq9qooGaaXQgAD8lgZcf77rdLdVzqpq91cBy25zL9QjFqT5hSNf3faHc9mLa9fucOrXrp1V\nuFk52bMGASJ8EXBbn+86bStxTyzc7GXescSC9FlpwcjXtE+Qvdf2qwKHC40HIOKbhReg7zpt\nW/igMEhl3IvbCd5GcVjhDOMZLzYegICg9/KxHMU9lGCQxorO8Fmx8QAEPCl6md7APZRgkERv\nPawieYI0QCx1/KtDet7gHkp08ROB5SFdVwoOByBitNhlejD/SKJBuklshgTPYQeIbaHYZXo7\n/0iiQfpXrEdednSk4GgAYo4UuUwrNvEPJLxk8dUiM5wrOhqAkMdFLtPrBQYSDlJNBf8EjxEd\nDEBM0xH8l+mX/iUwkPhjXfhX0S97W3gwADFv8AfpXpFxxIPUMJB3gtcKjwUg6ie8l+mRjSLD\nJHhi3wecL+4O2SE+FoCgHYfyXaaVHwoNk+RhzA9zTbAn+r5BhuV8BebHxEZJEiSutf7LUbED\nOWYHrQAZ5ibBQRIFqemS+BP8fZKBABJ4IP5lKtx6kyhITkPcJJU/mGgcgCTujZ0j4dvlkgXJ\naYr36q7b7GTDACTyl3hPfeBeX7VdwiA5zsMx3sp99Z2kowAksmi/6Mu0558TDJA4SM57kZ8n\nXbgt8SAAyWz+j6jLdIhY3btF8iA5DXcUvevjoOeTDwGQ2Jz9i12mVVOEPodtQxAkx9kwNvT1\nXfXdX1CMAJDYjjtCHynbY9w/E+6cJEiOs/HmAwKmV3rsg9xPPgNIzRczjg580XTbZ4l3TRSk\nnFd+PriTf3a7D799FdnOAWh8+KtheRW8TkeM/zvFfumClPP5gmlXn/fvw4Z95+IbHv4HXtKB\nmnb+feZ/XfSdYcP+/fyx017cTLRT0iAB2ApBAiCAIAEQQJAACCBIAAQQJAACCBIAAQQJgACC\nBEAAQQIggCABEECQAAggSAAEECQAAggSAAEECYAAggRAAEECIIAgARBAkAAIIEgABBAkAAII\nEgABBAmAAIIEQABBAiCAIAEQQJAACCBIAAQQJAACCBIAAQQJgACCBEAAQQIggCABEECQAAgg\nSAAEECQAAggSAAEEC
YAAggRAAEECIIAgARBAkAAIIEgABBAkAAIIEgABBAmAAIIEQABBAiCA\nIAEQQJAACCBIAAT+D9+b0UrtI1DrAAAAAElFTkSuQmCC" - }, - "metadata": { - "image/png": { - "width": 420, - "height": 420 - } - } - } - ] - }, - { - "metadata": { - "id": "SfZKYPLgk2-K" - }, - "cell_type": "markdown", - "source": [ - "# Report Relatives of X2" - ] - }, - { - "metadata": { - "trusted": true, - "id": "PDE3ROjfk2-P", - "outputId": "0890ac94-4658-4ec7-b984-e7d0b2913483", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "cell_type": "code", - "source": [ - "print(parents(G, \"X2\"))\n", - "print(children(G, \"X2\"))\n", - "print(ancestors(G, \"X2\"))\n", - "print(descendants(G, \"X2\"))\n", - "\n" - ], - "execution_count": 16, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[1] \"Z1\" \"Z2\"\n", - "[1] \"D\" \"Y\"\n", - "[1] \"X2\" \"Z2\" \"Z1\"\n", - "[1] \"X2\" \"Y\" \"D\" \"M\" \n" - ] - } - ] - }, - { - "metadata": { - "id": "LEAIT0-_k2-R" - }, - "cell_type": "markdown", - "source": [ - "# Find Paths Between D and Y\n", - "\n" - ] - }, - { - "metadata": { - "trusted": true, - "id": "5JhG_60wk2-R", - "outputId": "10ada2ca-f201-4a3a-ff84-89b2948775d6", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 114 - } - }, - "cell_type": "code", - "source": [ - "paths(G, \"D\", \"Y\")" - ], - "execution_count": 17, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "
\n", - "\t
$paths
\n", - "\t\t
\n", - "
  1. 'D -> M -> Y'
  2. 'D -> Y'
  3. 'D <- X1 <- Z1 -> X2 -> Y'
  4. 'D <- X1 <- Z1 -> X2 <- Z2 -> X3 -> Y'
  5. 'D <- X2 -> Y'
  6. 'D <- X2 <- Z2 -> X3 -> Y'
\n", - "
\n", - "\t
$open
\n", - "\t\t
\n", - "
  1. TRUE
  2. TRUE
  3. TRUE
  4. FALSE
  5. TRUE
  6. TRUE
\n", - "
\n", - "
\n" - ], - "text/markdown": "$paths\n: 1. 'D -> M -> Y'\n2. 'D -> Y'\n3. 'D <- X1 <- Z1 -> X2 -> Y'\n4. 'D <- X1 <- Z1 -> X2 <- Z2 -> X3 -> Y'\n5. 'D <- X2 -> Y'\n6. 'D <- X2 <- Z2 -> X3 -> Y'\n\n\n\n$open\n: 1. TRUE\n2. TRUE\n3. TRUE\n4. FALSE\n5. TRUE\n6. TRUE\n\n\n\n\n\n", - "text/latex": "\\begin{description}\n\\item[\\$paths] \\begin{enumerate*}\n\\item 'D -> M -> Y'\n\\item 'D -> Y'\n\\item 'D <- X1 <- Z1 -> X2 -> Y'\n\\item 'D <- X1 <- Z1 -> X2 <- Z2 -> X3 -> Y'\n\\item 'D <- X2 -> Y'\n\\item 'D <- X2 <- Z2 -> X3 -> Y'\n\\end{enumerate*}\n\n\\item[\\$open] \\begin{enumerate*}\n\\item TRUE\n\\item TRUE\n\\item TRUE\n\\item FALSE\n\\item TRUE\n\\item TRUE\n\\end{enumerate*}\n\n\\end{description}\n", - "text/plain": [ - "$paths\n", - "[1] \"D -> M -> Y\" \n", - "[2] \"D -> Y\" \n", - "[3] \"D <- X1 <- Z1 -> X2 -> Y\" \n", - "[4] \"D <- X1 <- Z1 -> X2 <- Z2 -> X3 -> Y\"\n", - "[5] \"D <- X2 -> Y\" \n", - "[6] \"D <- X2 <- Z2 -> X3 -> Y\" \n", - "\n", - "$open\n", - "[1] TRUE TRUE TRUE FALSE TRUE TRUE\n" - ] - }, - "metadata": {} - } - ] - }, - { - "metadata": { - "id": "i8h9AIZYk2-S" - }, - "cell_type": "markdown", - "source": [ - "# List All Testable Implications of the Model" - ] - }, - { - "metadata": { - "trusted": true, - "id": "B5LnSoCik2-T", - "outputId": "a03a58e3-5ae1-4d22-beea-ff8f97f6c64b", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "cell_type": "code", - "source": [ - "print( impliedConditionalIndependencies(G) )" - ], - "execution_count": 18, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "D _||_ X3 | Z2\n", - "D _||_ X3 | X2, Z1\n", - "D _||_ X3 | X1, X2\n", - "D _||_ Z1 | X1, X2\n", - "D _||_ Z2 | X2, Z1\n", - "D _||_ Z2 | X1, X2\n", - "M _||_ X1 | D\n", - "M _||_ X2 | D\n", - "M _||_ X3 | Z2\n", - "M _||_ X3 | X2, Z1\n", - "M _||_ X3 | X1, X2\n", - "M _||_ X3 | D\n", - "M _||_ Z1 | X1, X2\n", - "M _||_ Z1 | D\n", - "M _||_ Z2 | X2, Z1\n", - "M _||_ Z2 | X1, X2\n", - "M _||_ Z2 | D\n", - "X1 _||_ X2 | Z1\n", - "X1 _||_ X3\n", - "X1 _||_ Y | D, X2, X3\n", - "X1 _||_ Y | D, X2, Z2\n", - "X1 _||_ Y | D, X2, Z1\n", - "X1 _||_ Z2\n", - "X2 _||_ X3 | Z2\n", - "X3 _||_ Z1\n", - "Y _||_ Z1 | X1, X2, Z2\n", - "Y _||_ Z1 | D, X2, Z2\n", - "Y _||_ Z1 | X1, X2, X3\n", - "Y _||_ Z1 | D, X2, X3\n", - "Y _||_ Z2 | X2, X3, Z1\n", - "Y _||_ Z2 | X1, X2, X3\n", - "Y _||_ Z2 | D, X2, X3\n", - "Z1 _||_ Z2\n" - ] - } - ] - }, - { - "metadata": { - "id": "TKxYwKyuk2-U" - }, - "cell_type": "markdown", - "source": [ - "# Identification by Backdoor: List minimal adjustment sets to identify causal effecs $D \\to Y$" - ] - }, - { - "metadata": { - "trusted": true, - "id": "dXKGvXgTk2-V", - "outputId": "daefd9dd-eba8-4057-b2d8-e5347ee30c79", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "cell_type": "code", - "source": [ - "print( adjustmentSets( G, \"D\", \"Y\" ) )" - ], - "execution_count": 19, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "{ X2, X3 }\n", - "{ X2, Z2 }\n", - "{ X2, Z1 }\n", - "{ X1, X2 }\n" - ] - } - ] - }, - { - "metadata": { - "id": "4QqmWahfk2-W" - }, - "cell_type": "markdown", - "source": [ - "# Identification via SWIG and D-separation" - ] - }, - { - "metadata": { - "trusted": true, - "id": "Zv3rbjEuk2-W", - "outputId": "ce658fd8-b475-4deb-c809-602de1686b60", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 437 - } - }, - "cell_type": "code", - "source": [ - "SWIG = dagitty('dag{\n", - "Z1 [pos=\"-2,-1.5\"]\n", - "X1 [pos=\"-2,0\"]\n", - "Z2 
[pos=\"1.5,-1.5\"]\n", - "X3 [pos=\"1.5, 0\"]\n", - "Yd [outcome,pos=\"1.5,1.5\"]\n", - "D [exposure,pos=\"-2,1.5\"]\n", - "d [pos=\"-1, 1.5\"]\n", - "Md [mediator, pos=\"0,1.5\"]\n", - "X2 [pos=\"0,0\"]\n", - "Z1 -> X1\n", - "X1 -> D\n", - "Z1 -> X2\n", - "Z2 -> X3\n", - "X3 -> Yd\n", - "Z2 -> X2\n", - "X2 -> Yd\n", - "X2 -> D\n", - "X3-> Yd\n", - "Md-> Yd\n", - "d-> Md\n", - "}')\n", - "\n", - "ggdag(SWIG)+ theme_dag()" - ], - "execution_count": 20, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - "plot without title" - ], - "image/png": "iVBORw0KGgoAAAANSUhEUgAAA0gAAANICAMAAADKOT/pAAADAFBMVEUAAAABAQECAgIDAwME\nBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUW\nFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJyco\nKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6\nOjo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tM\nTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1e\nXl5fX19gYGBhYWFiYmJjY2NkZGRlZWVmZmZnZ2doaGhpaWlqampra2tsbGxtbW1ubm5vb29w\ncHBxcXFycnJzc3N0dHR1dXV2dnZ3d3d4eHh5eXl6enp7e3t8fHx9fX1+fn5/f3+AgICBgYGC\ngoKDg4OEhISFhYWGhoaHh4eIiIiJiYmKioqLi4uMjIyNjY2Ojo6Pj4+QkJCRkZGSkpKTk5OU\nlJSVlZWWlpaXl5eYmJiZmZmampqbm5ucnJydnZ2enp6fn5+goKChoaGioqKjo6OkpKSlpaWm\npqanp6eoqKipqamqqqqrq6usrKytra2urq6vr6+wsLCxsbGysrKzs7O0tLS1tbW2tra3t7e4\nuLi5ubm6urq7u7u8vLy9vb2+vr6/v7/AwMDBwcHCwsLDw8PExMTFxcXGxsbHx8fIyMjJycnK\nysrLy8vMzMzNzc3Ozs7Pz8/Q0NDR0dHS0tLT09PU1NTV1dXW1tbX19fY2NjZ2dna2trb29vc\n3Nzd3d3e3t7f39/g4ODh4eHi4uLj4+Pk5OTl5eXm5ubn5+fo6Ojp6enq6urr6+vs7Ozt7e3u\n7u7v7+/w8PDx8fHy8vLz8/P09PT19fX29vb39/f4+Pj5+fn6+vr7+/v8/Pz9/f3+/v7////i\nsF19AAAACXBIWXMAABJ0AAASdAHeZh94AAAgAElEQVR4nO3dd4AV1b0H8LPAAkrbJRoVpamo\nMYmFIqixIxbEp0ZMYkfBYEyiT1TQmIA1mpe8oLEAgj0xGl9iQQFRij0RG1JUUJS2GCJSl7Jl\n3p2dLXP3zsydc+Y3c9r380fQcJ1zpnx37/3d35xhDgAkxmRPAMAECBIAAQQJgACCBEAAQQIg\ngCABEECQAAggSAAEECQAAggSAAEECYAAggRAAEECIIAgARBAkAAIIEgABBAkAAIIEgABBAmA\nAIIEQABBAiCAIAEQQJAACCBIAAQQJAACCBIAAQQJgACCBEAAQQIggCABEECQAAggSAAEECQA\nAggSAAEECYAAggRAAEECIIAgARBAkAAIIEgABBAkAAIIEgABBAmAAIIEQABBAiCAIAEQQJAA\nCJAGacOrD1x78dkDTx46fOwT7+6g3HJWNr354OhhQweeNPTSGx9/Z5vs2YiofPuRGy4ZOujE\noZfc8MjblbJno6Tt8/7ym+FDTxo49OLrJr++kWijdEF6e/RhLZlP+5N/9yXZxjPx/o1Hlvr3\nYKcTblsie058Ft50TBv/HrQ55qaFsuekmM/vGNTOf4haDbj+HYrtEgXp69v3Z4VaHPtYFc32\n07fhD98P2AN2xAPa/F7acm/foD3oc88W2TNTxo6Hj24RcIi+c+c3iTdNEqQ113YIOoWunvdv\npxghbV//ujxsD/b8oxZvkDbe/u2wPdj1dqr3L3rb9qduYYeo4/VrE26cIEjVf+oUNr+6uM9K\nPkTKah/cNWoP9p4qe4LFPbln1B7s+aTs+SlgRq+oQ9R5Yk2irScP0id9oubnGrYp8SCp+vKo\nYnswNPmv/lStObnYHpzylew5Srbh3GKH6PDPk2w/cZD+HPqurskB85OOkqbnOhffgx7/lD3L\nKK/sUXwP9lD/jUGa3t23+CEqezrBAAmDVHtD8fnltHsx2TBp+p9Ye9BW4TdHk2LtQZtJsucp\n0TM7xTpGN4uPkCxI1cNjzS93Fh9NNE56aq+JuQfsT7KnGua2uHtwi+yZSjOlTfGjU+cK4Q9K\niYJUGzdHOU8kGSg918bfg/tlzzXYb+Pvwe2y5yrJI/EP0RWiYyQK0vXxJ8jaTEsyUlru5tgD\n9pTs2QZ5lGcPHpA9Wymmxv195BJ9d5ckSI/znEPWaWmCoVLyAtce7PSh7PkWep3nImFtXpM9\nXwkWt+c6y4IVhwRBWtKRa4LsoK3iY6VjxS58e9Brg+wZN7euO98e7JX0e0f9bD2E7xCVfSY0\njHiQqot+f9TctcJjpaP2eN49GC57ys39kHcPzpI948z9kvcQHS5UcBAP0njeCbLSD4QHSwXX\nxwvPXNlzzjedfw+elz3njL1bWvyYNCP0PYFwkCo439i5jhIdLBXrd+Pfg4OrZc/ab+ve/Huw\nt3JvsFNV25//EH3rPwIDCQfpav4JMjZddLQ03CqyB0pV8e8T2YN7Zc86U8+KHKJfCQwkGqT/\n8JVC6h0hOFoatkQ2qob5brLWRlI7eorsQTct2vGpHCFyiDoKtFaKBukWkQky9rbgcCm4V2wP\nFPo67AmxPfiz7Hln6FWxQ/Q7/pFEg7Sf2Ax/JjhcCvqJ7cFPZM+7yUlie3CS7Hln6BKxQ3Qg\n/0iCQXpdbIKsszL3m34iuAdt18ueeYNVLYvPNkjLlbJnnpnKyDvlIrzHPZRgkEYJTpC9JDYe\nvditns0p0wZ+v+geKNo0mILnRA/RjdxDCQapt+gMrxcbj95A0T0YKXvmDc4R3YNzZM88M/8t\neoiO5B5KLEjrg
paQyHnd/ctNiycFrsJR5zCh8eht2znyQI53Nof9VS/ZU69XW7BGQ+7or617\nu+f+ytk9fE92rZU996wUdgeNcJwL3T+7VjrPRJzrUu6busWCNCvkInu9/u9r7wy7DNso8o3m\nB2ETDD+49UokrSSy+P38f18eePSPcf9hhRMZJPaFjPln4Is38/99e6uCXW8xz1neNvfnY03f\nZgee6zd4xxYLUtjb89ed5WVlPYctc5yrwq5DsZZAck+GzS/i4NabJ2XCVR1Kzl3h/z9mBhz9\nDc4fc3/2cTZGB0mZD6rE9mWnfuz/90UBZ29ArTMmd4hqnZsjz/VDvGOLBSnsvefrzhfuH99e\n5awLu7f3BaEByYV/Dzb8k20LL4gK0uNSJuzOZ+ebfSuDFX4P9rozw/k89+c455W6IIXuyd1S\n9iB9Zbn3ZFf7qqrPFOx5ziPOhl3YXGf5zpHnegzv2GJBCluRpT5Ibp/3KSEveUhoQHI/D5ke\nOy/3QePVLQsigvS/Uibszad7U83wNwFH/09rnIPd2u1YN0jhe8Jfk9JDmbtz357U2HwyMej0\n7b7BuftMxxkafa6H8Y4tFqQhIddYQ5CODP+lpchPw4tCpseWOCt3YQdtiwjSuPpNbJiXpdfq\nR+/d8Dmg8AC/7tz/aC5fezkbTnSDFL4nV9ZvYkume5C++gWt9m941/P7wPM3ytnxpfcpP+Jc\nn817QYkF6biQa6whSN8L+oHpuU1oQHJhN/Ls5i0r9HxEkEbVb6LwQ0omWi32hh9R8DevOxPO\ndt5jlztPHJsLUsSeNPy4fTeD6cpQMsfbv3GBf1uaO4BV32PR53oQ7wUlFqSwL2EagnSc4/wi\n5CV3CA1ILuxLmN6Oczlz376FB+m6+k3MKc9SWcPwu1V4w/804OhPaLfV2XOac44bpIg9abg/\n8cNM9yB9JfX716H+yQE3F+x5ndPr3/RFnetTeC8osSCdEXKNNQTpt44TtnqpIl38YU1YfR3n\nShZdtbtVyoTr51P6y4bb3QvXEcsFKffj9bJt2zq4QYrYk6ul7EH6vB82LS5oWFX2j8EnMPd+\naUyxc/0j3rHFgnRByDVWH6SuW5wVhSV8jyIL3F0ZMr0ujnNX7o+ZEUG6S8qEvfmc0bSCzE0B\nR38Cu8T5zHmBuUGK2JOxUvYgfXVBOrbpPuzJwSewIUhR55p7UQGxIIWtw1X3PdIe5ywLjxqb\nLTQgudD1VRc7a7qwY6sjgvR3KRN25/O9mb7/Y0rA0Z/Adql2Pz25QYrYk8lS9iB9uSD19C8C\nFHInfkOQos71TbxjiwXp4ZBrrLGzYVzoZbhaaEByoe2MIxxn43vb33C2hu6BnEd31fTqfE/e\nw6YKG/DdILG5Ts1uXpDC98TURbmObndr3o30y4JPYGOQIs71X3nHFgvS2yHXWF2Qqlb+ZUDo\nVdhRaDx6n4bOcNTybR+dc4NTG9JOyFpKuhNke7Nx1wYc/VyQrnITVhek8D35t5w9SF1VsydZ\n1QR3BTQGKeJcvx88QjixIG2NtyZ5AO6yYkpqOZe0a9JP9tQbxHi+QrB9ZM88M8eIHqL23E9A\nFryNIuyLpKIUqX4LrAjXYLTsmTco/CIpphGyZ56ZcaKH6FTuoQSDJLQCj4vkwbcUhFbgcSnT\n8flX0T3gfv+vrdeKH4xgf+AeSjBISwQnuI8yt8J8xb9wYJ1duX/pp2Wz0EJOjLVT/AGKhGq6\nih2iFsu5hxJd/ERomSOlvsE4TWwPfil73k1Cv2KIdr7seWdojNghOo5/JNEghXzXVUSrRI/p\npPV3sWPMXc5JzxyxPbDpGZiCS9w8xj+SaJB2hD5pPYpKPwxrviuyB4NlT9vvSJE96C971pkS\nqintXVV8w80JL1nMv4Z+zgLR0dIgsIa+wC3IaZoqsgd2raL/nsghmiAwkHCQtgksEXmp6GCp\nqD6Ufw8UeyoK93NpGDtKmXJPNsLuQY1woEg9SfyxLtO4J9hZsadcvcW9B+2+lD3nfAtb8+5B\nazkNTvJU8K8ROVtknARP7PsJ7wQfFB8rHVfw7gH/1wspu5F3D0y9yzwc9zqa3HeZ10kQpI2c\nb+64b/FI3VbON3enKPe2qOoHfHswQJlvwbLD+QO/l9hya0kexjyPq+PuAAW/B/yY62lp3UQe\nQJWyFVwPp9l1RfEtGmf9PjyHqN18sVGSBIlraeUuy5KMlJbZbePvQScFH2ruOO9w9Dfs/Gbx\n7Rlo6e7FD02Dti8KDpIoSM6k2BMsU+z5sQ2ein8VKlX5bjIj9s+CNgo92ylT8+K/8XhEdIxk\nQXIeahNvfnsomiPH+XvM96edlf1pPrNDvD1or0y7bebmFayTHqzNX4SHSBgk5/lY7ywOUKg1\nqLk55XH2oMci2fMM906sty67K9N4L8GnsT4ndUzwoyZpkJzF3y8+wXMbCyEb56r3gX1ZjAdf\nn/617FlGWR3j9rDjK2TPUqr1MXqFei8tvp1QiYPkVIatYNeg85SmF/+IsW5n3friV+Gbk2D7\n9UW+1+xwt3J173zVtxV5g7rT7Yo8BUSe+4t8M9v6mkRrCCQPkuO8G/kj/WL/CgENnf97nZUk\n/eQWRv5IP0eDZ0V+NjhqDwYr8gwQqSrOizpEP0hYk6UIklP7XFiUWv3ko7xXNi19cyfFwHRe\nCr29/wxNPlu8dnLYHpxs6qpBvN4fGrba4pGiVe9GJEHKefXSgN+cPX/d/PdObcPdFwd+QzQw\nmX/+7FuFe7DXdQoXGZr74KrdCvdgt6uULZhmYt5d/ndEn94QcPtP+WUE32xQBclxtr5w9aG+\nwLc74ba3Az5Z1D/FeQ8VHxq3fcbofr5PSzsdPe61muL/lUqqZt14hO8LiTZH3DhL4NYacyy9\n+YDmjzqqefOW43yPPS3tfc00kvXV6ILk2vHxsxN/P/bWu6bM/DLk4/m/vPlzL2SZlapPp076\nw023/HHyjGWahahB9WcvPvC/N9/8vw+8uNTuAkPFXd4HjlsK/qb2i5emjL913O8nPvcJWe8h\nbZCKq3Wf3FnCShR5uguY6qtTWtb/0nk1k/GyDlLdsuH3D2dshNXvOSBt/2h481ayvviLCWQe\npA8Yu95xxrdgJ24o/mIAQdWX1wdp72zGyzxIzrhx7senp9qy7/MvHgYQ23jvvd0Psxkt+yDV\nm1XGuii0tBUYZ43XqXpzNqNJC5KzoDvrYGtfP6Sv8nC2qxulZ7MZLoUgzVoc73WrDmWtBVbi\nA4ihZijr8MHygxgL+wCxcDbpePRBeqakY8xW6Y0nowwOKRnDWr6Qu8QuGhny92vblyRuC/Ij\nD1LVvoz9Iu5rRzB2GcrgQG8yY/dEvmAkYwdQfmFNHqR7c29LW8VeURVlcEjDnNbsqsgXfOCW\n9CYRjkgdpHV1T8I7PvbrH2vNeivyWFkwxsIyNiT6182x7mW6K2HnNHWQ6u/yeyb2fzCrjHVX\naklw0N5XPVnv6MXf
6te8uZpuTOIgLap/fNfe8Ttq3TL4dNpZgNW2Hs66RH/Zv7VHfe/3x2SD\nEgdpUEOHE8d9e6v7oAwOdGrPZe2LfNXf+ORW/mfFhqENUtMzOzty3OSxaTArGUs6D7DYaNay\nyJewW5qWvnqLalTaIH22T3l5R1ZSXl5+HM/dPNUjGRtm4arUkIIpjP2pyEuqjs5doSWsY3n5\nvsuohqX/QnYeK+f/j1AGBxpFC98N2jPSJagVCZJbBj90FflcwDYLy9jp8b5nNTRIzitlrBvK\n4JDMmp6s9+Z4LzU1SCiDQ2Ju4Tvuk2uMDRK6wSGhuo7vuC82N0joBodkvI7vmAwOklOFRVFA\n3OTihW8fk4PkOONLUAYHMbEL3x6zg+Q8im5wEFK84zuf4UFyy+DoBgduxTu+mzE9SCiDg4Di\nHd/NGR8kZ9UhKIMDnxgd382ZHySUwYFX8Y7vAhYECd3gwCdGx3cBG4LkOLeUsFO4PjqCxWaW\nsiu5/yM7goQyOMQWv+Pbz5IgoQwOMX0Vv+Pbz5YgoQwOsfB0fPtZEyQsigIxcHV8+9kTJCyK\nAsVxdXz7WRQklMGhGJHCt8emIDnOrSiDQwShwrfHriBhbXCIIFb49lgWpLq1wRdSbhCMIVj4\n9tgWJGdhd1Y2m3SLYAbRwrfHuiC5i6K0eZx2k2AA4cK3x74goRscgggXvj0WBgllcCgkXvj2\n2BgkLIoCzSUofHvsDJLzVFv2ffEPlmCaJIVvj6VBcsvge3LeTAzGSlT49tgapLpu8GmpbBl0\nk6zw7bE2SFgbHOolLHx77A1SXRn81pS2DRoZw1pOTbwRi4OEMji4kha+PTYHCYuiAPca32Hs\nDpK7KMqh6Aa32aJydmqywrfH8iChDG65tfuyQ0nek9geJOejbqxsVqojgLoq+7O9aB7abX2Q\nUAa3WM3ZBIVvD4KEMri9Rifr+PZDkFAGtxZN4duDILlQBrdR4o5vPwSpDsrg9kne8e2HIHlQ\nBrcNWeHbgyDVQxncLnSFbw+C1ABlcJsQFr49CFIjlMEtQlj49iBITVAGtwZl4duDIPlhbXA7\nvExZ+PYgSHkea80OpfwICipaWMaG0BW+PQhSPpTBzUdc+PYgSM2gDG466sK3B0FqDmVws5EX\nvj0IUgGUwY1GXvj2IEiFUAY3GH3h24MgBUEZ3FQpFL49CFKgx9ANbqQ0Ct8eBCmYWwZP4SMp\nSJVK4duDIIVAGdw86RS+PQhSmFWHsNZ4RKZJSNb4DoMghcIjMg0zmrV8NrWNI0jhUAY3yoMp\nFb49CFKU8S3wiExTpFb49iBIkdxHZC6XOD5QSa/w7UGQos0qY13QDa6/FAvfHgSpiAUogxug\nkuDhltEQpGLQDa6/2p+wDim/r0CQito8mJWMlTwHSCTVwrcHQSquajhjw6tkzwKEPcDYPWmP\ngSDFgW5wnc0sJXm4ZTQEKRZ0g+uLdo3vMAhSPOgG11XqhW8PghQTusH1VDkgvY5vPwQpLrcb\nHGVw3aTa8e2HIMW2CWVw/WRQ+PYgSPG5ZfARKIPrZHIGhW8PgsQDZXC9pNzx7YcgcUEZXCdp\nd3z7IUh8sDa4PjIqfHsQJE4og+tia+od334IEi90g+uh9tzUO779ECRuKINrIbPCtwdB4odF\nUTSQ1hrfYRAkEeNbsIFYFEVls1tn0PHthyAJwaIoaltUxk7NqvDtQZDEzCpj3RbIngSEyLTw\n7UGQBGFRFHVVHsa6Zv1IbQRJ1KreWBtcTSk93DIagiRs82kogyvpumwL3x4ESVzVCMYuQze4\naiZnXPj2IEhJoBtcPS+Xsl9KGBZBSsQtg2fXzwXFLSrPuvDtQZCSQTe4WiQUvj0IUkILurMO\n02VPAuq5Hd9yvihHkJJa3Qfd4KrIuOPbD0FKDN3gysi449sPQUoOa4MrIuuObz8EiQLK4CrI\nvOPbD0EigTK4fNl3fPshSDReQTe4ZGt6sN6b5Q2PIBHBoihyVfbPvuPbD0Gi4i6Kgm5wWaR0\nfPshSGRQBpdISse3H4JEB2uDSzOZsbvlzgBBonQLyuBSSOr49kOQSKEMLoOsjm8/BIkWyuDZ\nk9bx7YcgEUMZPGvyOr79ECRqKINnq/Zc1l6BO8IQJHIbT2Ylt8mehD0kdnz7IUj0UAbPkMyO\nbz8EKQ0og2dljsyObz8EKRUog2dDhcK3B0FKBxZFyYLkjm8/BCkl7qIo02RPwnCyO779EKS0\nuIuiPCp7Ekar/Ynkjm8/BCk1bhn8VtmTMNlo1vIF2XNohCClB2XwVE1m7B7Zc2iCIKXpthJ2\nKsrg6Xi5lF0pew4+CFKqnmrLDkIZPA3qFL49CFK6ZpWx7ugGp/dVT9Zbqd/1CFLK8IjMNFQO\nYHupUvj2IEhpW90ba4NTU6Tj2w9BSh0WRSGnSMe3H4KUvuqRjA3bIXsWBlGl49sPQcrC+Bbs\nxA2yJ2EMZTq+/RCkTLjd4PJvhzaDaoVvD4KUDSyKQkWhjm8/BCkjKIPTqOyvWuHbgyBlBY/I\npOA+3FKZjm8/BCkz6AYnMEaljm8/BCk76AZPTK2Obz8EKUtYFCUZJQvfHgQpU4+1Zoeq+FFZ\nD2oWvj0IUrZQBhe3pqeShW8PgpQxlMFFqdfx7YcgZQ1lcDFu4Vuxjm8/BClzKIMLUbbw7UGQ\nsocyuAB1C98eBEkGdIPzUrjw7UGQpHi0Neu9WvYkNLKwjJ2uauHbgyDJ8QoWReGgdOHbgyBJ\ngkdkxqd24duDIMmCMnhcihe+PQiSNFgUJSbFC98eBEkeLIoSy2QFlzophCDJhDJ4ccoXvj0I\nklRuNzjK4FEWlrEhahe+PQiSXO7a4AtlT0JhX6lf+PYgSJIt6M7KZ8uehLJ0KHx7ECTZKlAG\nD6VF4duDIEm36WRWcpvsSahJi8K3B0GSD93gIfQofHsQJBWML0EZvJAmhW8PgqQEd21wPCIz\nny6Fbw+CpIZZZWxPTT5WZ0SbwrcHQVLEgu6sw3TZk1DI1sNZF51+RyNIqlh1KMrgTRR8uGU0\nBEkZWBTFR6PCtwdBUgfK4I10Knx7ECSVoBvco1Xh24MgKQXd4C6V1/gOgyCpBYuiOM7afdmh\n2j2zA0FSjFsGnyZ7ElJValb49iBIqrG9DF5ztqIPt4yGICnH8jL4aN0K3x4EST1Wl8FVX+M7\nDIKkInvL4BoWvj0IkpJsLYOrv8Z3GARJTXaWwTXr+PZDkBRlYze4bh3ffgiSqlYdwlo/LnsS\nmaodqmXh24MgKWvjSZYtijKGtZwqew7CECR1VV1qVRlcv45vPwRJZTYtiqJt4duDICnNnkVR\ndOz49kOQ1GbLoihadnz7IUiKs6MbXM+Obz8ESXVuN7jpZXBNO779ECTlbTR/bXBNO779ECT1\nVY1g7DKTy+BTtC58exAkHdxawk7V+qN4JM0L3x4ESQuPtWa9Te0G12uN7zAIkh5e6WRqN7jG\nHd9+CJImTO0G17nj2w9
[PNG plot output omitted (420 x 420)]

# Deduce Conditional Exogeneity or Ignorability by D-separation

print( impliedConditionalIndependencies(SWIG)[5:8] )

D _||_ Yd | X2, X3
D _||_ Yd | X2, Z2
D _||_ Yd | X2, Z1
D _||_ Yd | X1, X2

This coincides with the backdoor criterion for this graph.
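Each of these statements is an ordinary d-separation claim and can be checked one at a time. A minimal cross-check sketch, assuming the SWIG object and the node names D, Yd, X1, X2, X3 defined earlier in this notebook:

library(dagitty)

# Does the graph imply D _||_ Yd given the listed conditioning sets?
dseparated(SWIG, "D", "Yd", c("X2", "X3"))   # TRUE, matching the first implication above
dseparated(SWIG, "D", "Yd", c("X1", "X2"))   # TRUE, matching the last implication above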
# Print All Average Effects Identifiable by Conditioning

for( n in names(G) ){
  for( m in children(G,n) ){
    a <- adjustmentSets( G, n, m )
    if( length(a) > 0 ){
      cat("The effect ",n,"->",m,
          " is identifiable by controlling for:\n",sep="")
      print( a, prefix=" * " )
    }
  }
}

The effect D->M is identifiable by controlling for:
 * {}
The effect D->Y is identifiable by controlling for:
{ X2, X3 }
{ X2, Z2 }
{ X2, Z1 }
{ X1, X2 }
The effect M->Y is identifiable by controlling for:
{ D }
The effect X1->D is identifiable by controlling for:
{ X2 }
{ Z1 }
The effect X2->D is identifiable by controlling for:
{ X1 }
{ Z1 }
The effect X2->Y is identifiable by controlling for:
{ X1, X3 }
{ X1, Z2 }
{ X3, Z1 }
{ Z1, Z2 }
The effect X3->Y is identifiable by controlling for:
{ D, X2 }
{ X1, X2 }
{ X2, Z1 }
{ Z2 }
The effect Z1->X1 is identifiable by controlling for:
 * {}
The effect Z1->X2 is identifiable by controlling for:
 * {}
The effect Z2->X2 is identifiable by controlling for:
 * {}
The effect Z2->X3 is identifiable by controlling for:
 * {}
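The loop reports minimal adjustment sets for every edge. A candidate set can also be tested directly; a short sketch, assuming the same graph G as above and relying on dagitty's isAdjustmentSet:

library(dagitty)

# Check candidate adjustment sets for the effect D -> Y
isAdjustmentSet(G, c("X2", "X3"), exposure = "D", outcome = "Y")  # TRUE: listed above
isAdjustmentSet(G, c("X3"), exposure = "D", outcome = "Y")        # FALSE: no listed minimal set is a subset of {X3}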
"iVBORw0KGgoAAAANSUhEUgAAA0gAAANICAMAAADKOT/pAAAChVBMVEUAAAABAQECAgIDAwME\nBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUW\nFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMmJiYoKCgpKSkrKyst\nLS0uLi4vLy8xMTEyMjIzMzM0NDQ1NTU2NjY4ODg5OTk6Ojo7Ozs8PDw+Pj5BQUFDQ0NERERF\nRUVGRkZHR0dISEhJSUlKSkpLS0tNTU1OTk5PT09QUFBSUlJUVFRWVlZXV1dYWFhaWlpbW1tc\nXFxdXV1fX19gYGBhYWFiYmJkZGRlZWVmZmZnZ2doaGhpaWlqampra2tsbGxtbW1ubm5vb29w\ncHBxcXFycnJzc3N1dXV2dnZ3d3d4eHh5eXl7e3t8fHx9fX1+fn5/f3+AgICBgYGCgoKDg4OF\nhYWGhoaHh4eJiYmLi4uMjIyOjo6Pj4+QkJCRkZGSkpKTk5OUlJSWlpaYmJiZmZmcnJyfn5+h\noaGioqKjo6Ompqanp6epqamqqqqrq6utra2urq6vr6+xsbGysrKzs7O2tra4uLi5ubm7u7u9\nvb2+vr6/v7/AwMDBwcHCwsLDw8PExMTFxcXGxsbHx8fIyMjJycnKysrLy8vMzMzNzc3Ozs7P\nz8/Q0NDR0dHS0tLT09PU1NTV1dXW1tbX19fY2NjZ2dna2trb29vc3Nzd3d3e3t7f39/g4ODh\n4eHi4uLj4+Pk5OTl5eXm5ubn5+fo6Ojp6enq6urr6+vs7Ozt7e3u7u7v7+/w8PDx8fHy8vLz\n8/P09PT19fX29vb39/f4+Pj5+fn6+vr7+/v8/Pz9/f3+/v7///9z3YkZAAAACXBIWXMAABJ0\nAAASdAHeZh94AAAgAElEQVR4nO3d+WMkaX3f8dpdZmHNkRCDje3YLCYEE2MHYyBA4thcMXiJ\nCUlIQjbrhHhwAjgmxjl8OybrGGOCQ4iq+pRaR+u+75FmpJVGGh3T29Ot7qq/J5JmpkdHt9RV\n9VQ91/v1w6xmpK5+Vt0fqbvr28/HCQDE5sheAGACggQIQJAAAQgSIABBAgQgSIAABAkQgCAB\nAhAkQACCBAhAkAABCBIgAEECBCBIgAAECRCAIAECECRAAIIECECQAAEIEiAAQQIEIEiAAAQJ\nEIAgAQIQJEAAggQIQJAAAQgSIABBAgQgSIAABAkQgCABAhAkQICwQfq089j88d/qX376Z5JY\nFZCic3fq3S/9xLPPf6I37DHCBumbL536VefNu0Ew9d43EiRo7+ydeud555e+8sJrXjcW8hjR\nHtr5H3/qu0Gw/9zPzr+WIMEMD+/UX3T+4Pjjbzu/GPLS0YL0O86vH/+586V6QJBgiId36l/7\ncP34T/+5t4e8dKQgZV/zwaNHHxIkmOHMnToIHtz4QMiLRwnSKz/61tuPPyZIMMLZO3UQ/N7p\nA7wwIgSp+ZGnv9/6C0GCCc7dqQPv2TO/nboTIUhfdX7zyV8IEkxw7k79Z699707YA4QP0v99\n+iPNJ38jSDDA2Tu1/1XnFw5CHyF0kNbe8qNbZ/5KkKC/s3dq//POi43whwgbpKOff03u7N8J\nErR37k79kvPbUY4RNkj/zvmdc38nSNDe2Tv1t52XIh0jZJCyzg/8m4fzFC99K/Bu3rz5zNuO\n/7gb6aoBJZy7U7/TefHmqd1wBwkZpJdb433OF4JvnJ1fBTR17k7d+nA53EF4GwUgAEECBCBI\ngAAECRCAIAECECRAAIIECECQAAEIEiAAQQIEIEiAAAQJEIAgAQKIDlJ12Rd8READwoOUH43w\nPl1Ac8If2lV7B+uijwmoTvxzpFp/3wPhBwXUlsCLDUflYkX8UQGVJfGqXXM8t5fAYQF1JfLy\ntz+TCb1TJaCzyEEafiF3xWeXvFeiHhhInL/wart/zr0wHPWIkYP0KedDV316zb0V9chA0uZz\nbV8Q+5DzqahHjBykX3Ted+XnN92FqIcGkrXhtt+17n2hi/paEgtScDczy5ADVLTnrbf/hJJB\nCvZzY81rvgRI34PCdIfPqBmk4H5hOGxdE5C05uBQpx/wigYpqPYN1KIeH0jGZLHjnVLVIAX1\nwd62LzMCsqx4nVvElA1S0BjJH0a9BkC8u+4VZzjVDVLQnMgyLgRlVLLLV3xW4SAF/myG+iQo\not47ftWnVQ5SEKy6d6JeCSCSP9x/5dtO1Q5SsO6uRr0WQKDZ3NWvfSkepGDLm4t6NYAwd9x7\nV3+B6kEKdjNTjAtBsj3v9jVfoXyQggPGhSBZNX/t4yL1gxRUimXGhSBRc6B87c9yDYIU1PpL\n7IkCeSaK1+9upUOQgqOhIuNCkGUpc//6L9IiSEFjlHEhSLLlbnfxVXoEKfCnM+3fmAgk6zCz\n0s2XaRKkIFjwtqJeHxDZNZNBLdoEKVh1r3spHxDNL189GdSiT5CCDXcp6jUC0czku3zBWKMg\nBdveHEMOSNOad81kUItOQQruZSdJEtJzz+v63QdaBelkTxQqlJCWam6+66/VK0hUKCE9jdJI\n9w+ANAsS40JIiz/WF2LGU7cgUaGElCxmu5gMatEuSFQoIRVbbqgNQ/QLEhVKSMFhJtwmBxoG\niQolJK5WmAp3AS2DRIUSkuWXB0K+LVvPIFGhhERNdzsZ1KJpkKhQQoJuefthL6JrkIL93Dh7\noiARO+5m6MtoGyQqlJCQSnYx/IX0DdJJhRLjQhAu1GRQi8ZBokIJCfBHw0wGtegcJCqUIN5C\nNtJPZ62DRIUSRNsINxnUoneQqFCCWAfeWrQLah4kKpQgUujJoBbtg0SFEoRpDg5FPTmpf5CC\nLY9xIQgxWaxFvagBQaJCCWKseAeRL2tCkIJ9KpQQ344b4905RgSJCiXEV8nG2YDUjCAFD0rs\niYJYjnrH4jxBMCRIVCghHn+0FOtBjSlBokIJsczl4v0gNiZIgT+d7XafZuCCDTdm/5Y5QQqC\neSqUEM2etx7zCCYFiQolRPOgMBv3EEYFiQolRBFjMqjFrCBRoYQIJorx32ptWJCoUEJoy5kw\nm3x3YFqQgvuFUSqUEMJ2nMmgFuOCRIUSQqlkl0UcxrwgUaGEEOq940KOY2CQqFBC1/zhfjHP\nBEwM0kmFUugtZ2Gl2VxVzIGMDFLgT1OhhC7ccUWNlZkZpCBYoEIJ19rzhI3CmBqkYM2NOz0F\n01Vzc8KOZWyQgg0qlHClRv+wuJP35gYpuOtRoYQriJgMajE4SFQo4SpLIiaDWkwO0kmFEuNC\naG/L3RZ5OKODRIUSOjnMiN2g1+wgBbWBPkEn3GCUenFC7AENDxIVSmjHLw8IftBvepCoUEIb\nM3nRc83GBynwZ6hQwnlrnvBZTPODRIUSLthN4A5hQ5CoUMJZ1VwCMy9WBCnYZFwIjzVKIwlM\nvN
gRJCqU8Jg/1pdEc4klQaJCCY8sZEVOBrXYEiQqlHDqFTeZ13CtCRIVSjh2kLmVzIHtCRIV\nSghqhamEjmxRkBgXsp6ITb47sClIJ+NCVCjZbLqQ2KN7q4JEhZLdVsVPBrVYFiTGhSy2424m\nd3DbgkSFkrUq2cUEj25dkKhQstRRaTTJ292+IFGhZCV/NJHJoBYLgxQc5KlQss58LtmTiDYG\nKXiVCiXbbLi7yV6BlUGiQsk2e17SG1jbGaSTcSEqlOyR3GRQi6VBokLJJglOBrXYGiQqlCwy\nWawlfh3WBokKJWuseAfJX4nFQaJCyQ47bho/MG0OEhVKNqhkU5kJszpIwTYVSqar946nchPb\nHSQqlEznD/enM8VieZCoUDLcXMKTQS22Bymo9jEuZK4NN623RFsfJCqUDLbn3U7rqgjSyZ4o\niWwZCNmq+bnUrosgUaFkquZAOb1XkghSQIWSoSaKKT77JUin2BPFPEuZNB+xE6SH1tyEtrKF\nJFvudppXR5AeoULJLIeZlVSvjyA9RoWSSeq94+leIUFqoULJHH45pcmgFoL0RKVAhZIhZvJp\n78lBkM54UBpI/q2USN56apNBLQTprPpQLxVK+rvnpX8ygyCdQ4WSAaq5+fSvlCCdR4WS9hql\nEQkvvxKki6hQ0txEspt8d0CQLmFcSGuLqU4GtRCky+64q7KXgKi2XDnzxwSpDSqUtHWYkfRD\nkCC1s5tlXEhLKWzy3QFBausgP8q4kH788oCsm40gtUeFko7SnwxqIUgdUKGkn1uevIYRgtQJ\nFUq62ZF53oIgddQYzafQYgBRqjmZb80kSJ1RoaQTOZNBLQTpKlQoacMflTIZ1EKQrrTmprZV\nJ2JZyMp9AwxBuhoVSnrYlDQZ1EKQrkGFkg4OMmuSV0CQrnMvO0mSFCdvMqiFIF2LCiXVNQeH\npA90EaTrVRkXUtt0Qf4QCkHqQq2/T/4thU5WJU4GtRCkbjRGClQoqWrHVeFkH0HqSnM8R4WS\nmirZJdlLOEGQuuPPMC6kpKPeMSVeVCVI3VpiXEhBsieDWghS16hQUtBcTpGtcQlS96hQUs6G\nuyt7CY8QpBCoUFLMnrcuewmPEaQwqFBSyoPCtOwltBCkUKhQUogKk0EtBCmcah8VSqqYLCo0\nuUWQQqJCSRXLGZUaeAhSWFQoqWFbicmgFoIUWnMiy7iQdJXssuwlnEOQIqBCSbp677jsJZxH\nkKKgQkkyf7hfsTdbEqRI1qlQkmo2V5W9hAsIUjTb3pzsJVjsjqtc0S9BiogKJXn2PPV2GyRI\nUR0wLiRJNa/gowGCFBkVSnI0+ocVfCxAkKKjQkmKCZUmg1oIUgxHQ0XGhdK2lFFyHxqCFEdj\nlHGhlG2527KX0BZBioUKpZQdZlZkL6E9ghQTFUppUm4yqIUgxUWFUnr8smqTQS0EKTYqlFIz\nk1f2ZVKCFN+2N6fgiQ0DramwyXcHBEkAKpRSsavyzD1BEoEKpRRUcyo/hCZIQlChlLhGaUTl\nX/sESQwqlBLmjymyyXcHBEmQxjAVSklazKr97SVIolChlKQt967sJVyNIAlDhVJyDjOqN4EQ\nJIGoUEpIrTAlewnXIUgiUaGUCKU2+e6AIAlFhVISptWdDGohSGLtZmZVPtuhpVsKTwa1ECTB\nqFASbcfdlL2ELhAk0e4XhpU+c6ibSnZR9hK6QZCEo0JJpCO1J4NaCJJ49UEqlETxR9WeDGoh\nSAmgQkmY+ZwmP5MIUhKoUBJkw92VvYQuEaRkzGcUnw3TwoG3JnsJ3SJICaFCKT4NJoNaCFJS\nqFCKS4fJoBaClJgtKpTimSxqdBqBICWHCqVYVrwD2UsIgSAliAqlGHZcrd6TQpCSVCmW9Tid\nqJ5Kdkn2EkIhSImiQimieu+YXg+LCVKyqFCKxB8tafarnCAljAqlKOZ0mQxqIUhJo0IpPH0m\ng1oIUvIWvC3ZS9DLnrcuewmhEaQUrFKhFEY1Pyt7CeERpDRsuHq9litVc6Cs4dk3gpQKKpS6\nN1HUsY+AIKWDCqVuLWfU3uS7A4KUEiqUurOt12RQC0FKCxVK3bifXZG9hGgIUmqoULpevXdc\n9hIiIkjpOSoXK7LXoDZ/uF/Xx78EKUVUKF1jVoNNvjsgSGmiQulK6+492UuIjCCliwqlzu55\nGg+AEKSUUaHUSTU3L3sJMRCktFGh1F5Dk02+OyBIqbtLhVI7ek4GtRCk9O3nxjWcykzYkp6T\nQS0ESQIqlC7ZcjXf4pkgyVDtG9D6cYxwhxndt6UlSFJQoXROrTApewlxESQ5qFA6wy8P6DoZ\n1EKQJKFC6YkZfSeDWgiSLP4sFUoP3fL2ZS8hPoIkDxVKp3aN+DYQJImoUApOJoOMmPQgSDJR\noaT7ZFALQZJqN2N5hZI/2mfGuWmCJJftFUoLWUPOpxEkyeyuUNrUfTKohSDJZnOF0kHGmDdn\nESTp7K1QqhWmZC9BGIIkn60VSs3BIXOeHxIkBfjTGe36gASYLhj0mJYgKWHewgqlVRMmg1oI\nkhrsq1DacTdlL0EkgqQI2yqUKlmz/n8JkirsqlA66h0z6/+WICnDpgolYyaDWgiSOiyqUJrP\nmXbqjCApxJoKpQ3XuJf7CZJKLBkX2vPWZS9BOIKkFCsqlB4UpmUvQTyCpJbmeM6k05TtGDUZ\n1EKQFONPm16hNFmsyV5CAgiScgyvUFrOGDmhS5DUs+aa91y8Zds188cEQVKQwRVKleyy7CUk\ngyCpyNgKpXrvuOwlJIQgKcnQCiV/uN/U2Q2CpCYzK5RmjZsMaiFIijKxQumOe0/2EhJDkFRV\nH+yryl6DWHuewW9eJEjKMq1CqZo3eX9mgqQusyqUmgPDZr4S+RBBUphRFUoTReOe851FkJRm\nToXSUua+7CUkiiCpzZQKpS13W/YSkkWQFGfGuNBhZkX2EhJGkFRnQoWSuZNBLQRJefvaVyj5\nZWMng1oIkvq0r1CayZu/EwVB0sCDktZ7oqx55k4GtRAkHWhdoXTPM+Ul/KsQJC1oXKFUzc3L\nXkIaCJIempNZPR8fNUoj2r/o2A2CpAs9K5T8MdM2+e6AIGlDywqlxazZk0EtBEkfGlYobbnm\nTN1ejSBpRLsKpcPMLdlLSAtB0olmFUq1wpTsJaSGIGnlID+qz7BNszyg+WxTCARJL69qVKE0\nbcFkUAtB0ow+FUq3PNN7Nc4iSLrRpUJpx92UvYQ0ESTt6FGhVMkuyl5CqgiSfnSoUDqyZDKo\nhSDpaEH1CiV/1JLJoBaCpCXVK5QWzN3kuwOCpKcNpfdE2bBmMqiFIGnqrqduhdKBtyZ7Cakj\nSLpSt0LJpsmgFoKkrfuFYSXHhZqDQ4omPEkESV+KVihNFmuylyABQdJYbUDBCqUV70D2EmQg\nSDprjORVewPqjqv4Ka6EECStKVehVMlq9y5eMQiS3vwZpSqUjnrHlH1RPlkESXcqVSj5oyXL\nJoNaC
JL21lxlNkaYs24yqIUg6U+ZCqUNd1f2EqQhSAZQpEJpz1N7kjZRBMkESlQoVfOzspcg\nEUEyQqUgvUKpOWDjZFALQTKD/AqliaKK80qpIUiGqEuuUFrOqDZjkS6CZIrGiMwKpW1LJ4Na\nCJIxmhPyKpTuZ5dlXbUiCJJBpFUo1XvH5VyxOgiSSSSNC/nD/Uq+xTBNBMkociqUZm3a5LsD\ngmQWGRVK666e9bZCESTDpF+htOdpWMkpHEEyTdoVStXcfJpXpyqCZJx0K5QapWEVBmalI0jm\nSbVCyfLJoBaCZKCjodQqlJYsnwxqIUgmao7n09kTa8u+Tb47IEhGSqlC6TCzmsK1aIEgGSqN\nCqV6cTLx69AFQTLVmpv06R2/PGD9ZFALQTJW4hVKM0wGPUGQzLWdbIXSmqdBJ3RqCJLB9nMJ\njgvtKrQxpQIIkskSrFCq5hTZTE8RBMlo1aTGhRqlESaDziJIZqv1J1Kh5I/1yd7+SzEEyXCN\nkUICQzwLWWs3+e6AIJkuiQqlV5gMuoggGU98hdJBRpn+C2UQJAusepsiD1crTIk8nBkIkg3E\nVCitPZw5ag5avcl3BwTJCkIqlIYe7lA0XWAy6DKCZAcRFUq503nyVSaD2iFIlohfoVTrOXkd\nfccV+nzLGATJFrErlHbd4yRWsouC1mMYgmSNat9ALc7l1/qC4Kg0ymRQWwTJHvWh3jjzCLPj\ngT/KZFAHBMkiTyqU/LnwgSgvBfO50yQ2ednuEoJkk1aFkp/dCH3h3OaGu3v83/3Z7KDgdRmA\nINnlcYXSVOhCo1rPhrce1Nb6ewZus1XDJQTJMo8qlLYyYcNwzy1MbU+4hQXmvtshSLZZd0/2\nomtktkNebs0t5DJTu7xo156MIH3MeX/UiyK+bW/u+M/xsIOnQz2Dd3hI19H7nY9FvWjkIH3n\nXX8a9aIQYDc75Qcb2ZC/WypMBl3lT9/1nagXjRwkSHaQH2seebuyl4GHCJK2TiqUhmdlrwIP\nEST91MuzGyeNL7X+0kpB9mLwEEHSj39rLN+TG13eqQ4VenjSowaCpKfq5tyg5/YV3GnZK8Ep\ngqSv5t6tcY/HdmogSIAABAkQgCABAhAkQACCBAhAkAABCBIgAEECBCBIgAAECRCAIAECECRA\nAIIECECQAAFCBuno5576f6cfvPq3nh05/k/9y0//jPhFIZoLt87ul37i2ec/0St5Ucq78F1b\n/Gc/+exbPlEKe5Swv5EW3/Tjp/vlvuT8l+M/p977RoKkkHO3zs7zzi995YXXvG5M9qpUd+67\nNvODz372ay/cuFEMeZDQD+1edn7l+M/MU3/PD4L95352/rUESSFnb50vOn9w/PG3o+95aI2z\n37WPPpU5/vgvnV8OeYzwz5FecP48OHzHm9eOP9z5Uj0gSEo5c+v82ofrx3/6z71d9prUd+a7\n9h9/4+QfGjf+TshDhA/S/k++ef1fON96/FeCpJQLt04QPLjxAXmr0cWl79q688mQh4jwql3v\na97t/GrrbwRJLedvnSD4vdMHeLja+e9axX3PGwdCHiHKy983nTc82QSKICnm3K0TeM9+kHq+\nLpz9rv01x/ls6JLdCEGqvvtp509afyNIajl/6/zZa9+7I3Ex2jj3XfvyP//5pz8YNkkRgvQv\nnb/4qdcvPP4bQVLL2VvH/6rzCwdyl6OJ8/fpIHBf/55muCOED9L/dj4flG+8/3E5CEFSytlb\nx/+88yIdLt24cJ8+9itOyMac0EG6/YPPH/+Q+5rz9Ud/J0gqOXfrvOT8tuz16OHMd239Pf/4\n9J8+5YR8tSFskJoffvrkhFX9p2886vIlSAo5d+t823lJ9nr0cO679jef7Tv+ePYNb6iGO0jY\nIH3D+dLpf8eefdergXfz5s1n3nb8x92QR0Eyzt0673RevHmKDqWrnfuufeeZG5/5D597vfOH\nIQ8SMkj9N9794OFH/9l58XgFj8yHvFYk4vyt8/jGcZalLkp5579rQd8n3/rMX//Id8MehbdR\nAAIQJEAAggQIQJAAAQgSIABBAgQgSIAABAkQgCABAhAkQACCBAhAkAABCBIgAEECBCBIWvOn\nxmUvAacIks4ao/lD2WvAKYKksaOhYkX2GvAQQdJXrb/0QPYa8AhB0tarvYN12WvAYwRJVwf5\nsZB7GCJB0YPEs1ypdrNTfvhL1WviV2KS6HfqyEGq9/A8V6Jtby7KxZaLPBq8QqUn8rcncpBq\nPfejXhSxrburkS7XHBzi8WBn93si/8YmSDpade9EvGStEHJPa6sQJLvMe1uRL7vnrQlciWEI\nkk2aE9l7MS6+4bKFcScEySKNkZhjQfO5VwUtxTgEyR71od6YOfBH+2jDbI8gWaPaNxD7VNBR\naSTCKSgbECRbVAplAb9NKtnQXcN2IEiW2M+JGQvacTdFHMY4BMkOu5koY0HtrHr7Yg5kFoJk\nhU134fov6tJ0nndgXEaQbLDm3hJ3MGaF2iFIFlj1hD6vYVaoDYJkPH8mI7jx+jAj8BecIQiS\n6ZoT2T3Rx9xyKaO/gCAZrjFSSOC7vZjlJjyPIJmt1t9XTeCw/hizQucRJKNVk9rlpMGs0HkE\nyWT3C8ONhA5dzYk7M2UCgmSw/dxkcr82diO/1dZIBMlc295skg+/1pgVOoMgGWtD4FhQWzPM\nCj1BkEy15t5O+Br88kBSz8D0Q5AMteC9kvh11HsnEr8OXRAkI/nTmZ0UruYwE22PPAMRJBM1\nx/MHqVzRlrudyvWojyAZKMXyo6UMt+UpgmSeVMuPJtgS/BRBMk665UeN/mFmhQKCZJ6D/Giq\nL0pXc5HKLUxDkAxzL5vgWFBbe17SJ6x0QJDMsu3Npf5I644bZ0NxQxAko2y4SxKudTaXxHue\n9EKQTBK9/CgWf7jf+lkhgmSQOOVHsdR7x+VcsToIkjFilh/FUskuy7pqRRAkU8QuP4pl201+\nRlZpBMkQ9aGi1BKw5YzEGCuAIJnhQSnFsaC2LJ8VIkhGEFN+FIvlW4ITJBOIKj+K5UFhWvYS\nJCJIBhBXfhTLnrcuewnyECT9iSw/imXD3ZW9BGkIkvaElh/FM5eT+sqhTARJd5LGgtryR63d\nEpwg6U18+VEsR71jKjxbk4AgaS2J8qNYKlkZ4+cKIEg6a4zkVftW7lg6K0SQNFYbSKT8KJ5V\nL52dwBRDkPRV7Utzl5OuTRcj36U0RpC0lWD5USx2zgoRJF3t58YVvb/WClOyl5A+gqSpu8mW\nH8VykFmTvYTUESQ9JV5+FMumq9LJrVQQJC2tuWrPhy5kbZsVIkg6SqP8KBb7ZoUIkn5SKj+K\npVEaUfYpXCIIknaa4zkNWpCruUXZS0gVQdLNUTm18qNYdtxN2UtIE0HSTKrlR7Hc8jT4xSkM\nQdJLuuVH8UznNYm8CARJK2mXH8XilwcUnb1IAEHSSfrlR7HYNCtEkDQio/wolsPMquwlpIUg\n6UNO+VEs
W9bMChEkbay6GlZMLmYsua0Jki6klR/F4o9ZMitEkPTQnJRXfhSLLbNCBEkLjVGZ\n5UexVHPzspeQBoKkgyPJ5Uex3PPU2cEyOQRJA/LLj2JZc/V8VBoKQVJfpSi9/CieGQtmhQiS\n8pQoP4rFL/frM9gUEUFSnSLlR7HUe8dlLyFpBElxypQfxXKYWZG9hIQRJLWtu2ZMq22527KX\nkCyCpDSVyo/iWTJ8VoggKcyfVar8KJ6JojbvSIyCIKlLufKjWJoDZc1ffLwSQVJWY0TbsaC2\nqvk52UtIEEFSVX1QwfKjWPY8Dd8H0i2CpKhq34Bxzyk2DJ4VIkhqul8Y1nssqK25nL6zt9cg\nSEpSt/woFn/Y2FkhgqSiuxl1y49iMXdWiCApyIyxoLYq2WXZS0gGQVKP6uVHsey4ijfSRESQ\nlLOkevlRPCvegewlJIEgKUaH8qN4JouR73IKI0hq0aP8KJbm4JCBr0gSJKXoUn4US60wLXsJ\n4hEklehTfhTLnmfeqykESSFVjcqPYtlwd2UvQTSCpI77hWFTz/tfNG/crBBBUoZm5Uex+KOm\nbQlOkFShXflRLEelUbP+bwmSIjQsP4qlkl2UvQShCJIatCw/imXH3ZS9BJEIkhL0LD+KZ9Uz\n6dQzQVKAP50x7uXgLkwXDDppRpDk07j8KBajZoUIknRalx/FUitMyV6CMARJNkvGgto6yNyS\nvQRRCJJk2pcfxfKKa8pWsgRJrgPty4/iWcgacl8gSFKZUH4Uiz9myKwQQZJpyzN5F9+uNEoj\nRvwoIUgSmVJ+FEs1Z8SWSQRJHnPKj2LZNeLbQJBkMar8KJY1E2aFCJIkZpUfxTOT1/9MGkGS\nw7Tyo1j88oD2bw0mSFLUB3stHQtqq16ckL2EuAiSDCaWH8VymNH99UuCJIGZ5UexbOk+K0SQ\n0mdo+VE8Sxm97xQEKXXGlh/FM1HU+tEuQUqbweVHsTT6h3X++UKQUrbmGvMWHMGquXnZS4iB\nIKXL8PKjWPY8jXdSIkhp8mdMLz+K5Y57T/YSIiNIKWqO5xgLusqsvrNCBCk9VpQfxeIP9+s6\nK0SQUlPr79P2521a6r3jspcQEUFKizXlR7Hczy7LXkI0BCklFpUfxbLt6vmyJkFKh03lR/Es\n6zkrRJBSYVf5UTx6zgoRpDTYVn4US3NAxy3BCVIK7Cs/iqWan5W9hPAIUvIWLCw/imXPW5e9\nhNAIUtL8acaCwtpwteuLIkgJs7X8KJ65nG5bWhCkZNlbfhSLP1rS7M34BClRNpcfxXLUO6bX\n+QKClCS7y49iqWT1OmNAkBJke/lRLDt6zQoRpOTsZi0vP4pnxTuQvYQQCFJiKD+KabIY+b6Z\nPoKUFMqP4moOajQrRJASQvlRfLXClOwldI0gJWOe8iMBDrw12UvoFkFKAuVHgmxosyU4QUoA\n5UfCLGQ1GQwhSOJRfiSOP9qnxyltgiRctW9Ao5dtVdcojWhxMo4giUb5kViV7KLsJXSDIAm2\nzxrj448AAA2qSURBVFiQYDvupuwldIEgibVL+ZFwt7x92Uu4HkESivKjJExrsCU4QRKJ8qNE\n+OUB5R8uEySBKD9KiAazQgRJGMqPknOYUf1XPUEShfKjJG2pPitEkARpDBcM/L9Sx2JW7W8v\nQRKD8qOE+WNqzwoRJCEoP0qc4rNCBEkEyo9SUM3Ny17CFQiSAJQfpeKep/CbjglSfJQfpWTN\nuyd7CR0RpNg2GAtKy4y6s0IEKa41yo9S45f7VX0uSpBiWmAsKEX13nHZS+iAIMVC+VHKDjMr\nspfQHkGKg/Kj1G2527KX0BZBioHyIwmWMkreewhSdJQfSTFRVHGIhCBF9ipjQVI0B8oKvs2P\nIEVF+ZEs1byCNR8EKSLKj+TZ89Q7dUeQoqH8SKY7rnKzQgQpEsqP5JrNqfZyKUGKgvIjyfxh\n1WaFCFIE896W7CXYTrlZIYIUGuVHKqhkl2Uv4RyCFBblR2q46yo1LUyQQqoPUX6khhXvQPYS\nziBI4VB+pI7JokI3BUEKpVIoK70plFWag0PqzJYQpDAoP1LKg8K07CW0EKQQdjOMBSllz1uX\nvYTHCFL3KD9Szoa7K3sJjxCkrlF+pKB5VWaFCFK3KD9SkT+qyJbgBKk7lB8p6qg0psTzVoLU\nFcqPlFXJLslewgmC1I3GCOVHytpxN2UvISBIXaH8SGmr3r7sJRCkblB+pLjpgvyfcwTpWpQf\nqU6FWSGCdB3Kj9RXK0zJXgJBugblRzo4yMg+WU6Qrkb5kR423btyF0CQrkT5kS4WsnJnhQjS\nVSg/0obsWSGC1BnlRzpplEZkPpklSB01RvMqbQqAa1RzMp/OEqROjoaKFdlrQBi7MrftJEgd\nUH6kn1sSZ4UIUnuUH+loJi/thx9BausgPyp96ASh+eUBWdNcBKkdyo80VStMSrpmgtQGY0Ha\nOsxI6tshSJdRfqSxLUmzQgTpEsqPtLaUkXLvIkgXUX6kuYmijNdbCdJ5zYmscvWkCKVRGpbw\nDJcgnUP5kQGqufn0r5QgnUX5kRH2vPTf/EKQznhQovzICOtu6o/PCdITlB8ZYzb1WSGC1EL5\nkTn84f6UZ4UI0mOUH5mk3jue7hUSpEcoPzLL/exKutdHkE5RfmSabTfV/TYI0qlVT4WN2CHS\ncqqzQgQpOC0/krwrGhKQ6qwQQTodC6L8yEDNgXJ6L8MSpJOxIKXWA1Gq+dnUrosg1fr7qrLX\ngGTseetpXZX1Qar2scuJuTbc3ZSuyfYgUX5ktrlcSlPIlgdpPzfOWJDJ/NFSOvOTdgdp25tl\nLMhs9d6xVG5iq4NE+ZEFKtmlNK7G5iCtuam9pgN5dlKZFbI4SJQfWWLFS6FVxNogUX5kj8li\n8m98tjVIzfGcvOYCpKs5OJT4a7OWBonyI6vUClNJX4WdQaL8yDIH3lrC12BlkCg/ss5G0luC\n2xikg/woY0G2mU94VsjCIN3LTjLOYB1/tC/RWSH7gkT5kZ2OSiNJ3u7WBWnDTWViBMqpZBcT\nPLptQaL8yF47boI73FgWJMqPbHbLS+4kvFVBovzIctPJbQluU5AoP7JdszyQ1KyQRUGqDxUp\nP7JccrNC9gTpQYmxIBxmEtqa2pogUX6EE1sJzQrZEiTKj/DQYjaRe58lQaL8CI/4Y4nMCtkR\nJMqP0NJIZFbIiiCtu6vpXiFUVs0l8GPVhiAxFoRz7nni7xDmB4nyI1y0Jn5WyPggUX6Ey2aE\nzwqZHiTGgtCGX+4X/CZpw4NUG6D8CG3Ue8fFHtDsIFX7BtjlBO0cZlaEHs/oIFF+hI623G2R\nhzM5SJQf4QpLGZF3Q4ODdJfyI1xloijwcb+5QaL8CFdr9A+L+0lrbJAoP8J1qvk5YccyNUiU\nH+F6e95tUYcyM0iUH6Erd1xRu+EYGSTKj9ClWVFbgpsYpKMy5Ufojj8saFbIwCBRfoTuiZoV\nMi9IlB8hjEp
2WcRhjAsS5UcIZ9sV8QKvaUGi/AhhLWcEvNPGsCBRfoTwJouRQ9BiVpAoP0IE\nzcGh2NPNRgVp1RV2oho2eVCYjnsIk4JE+REi2vPiTmaaEyR/mvIjRLXh7sY7gDFBaoyyywmi\nm485K2RKkI4oP0Ic/mi8LcENCRLlR4jpqHcszpkTM4JUKVJ+hJgq2TjnTowIEuVHEGAnzqyQ\nCUGi/AhCrMbYEtyAIG157HICIaYLkZ9p6x8kyo8gSoxZIe2DRPkRxKkVpiJeUvMg+bOUH0Gg\ng8xatAvqHSTKjyDYphvtJ7PWQaL8CMItZCONyOgcpPpgL2NBECzirJDGQaL8CElolEYinJbU\nN0j3C8OMBSEB1VyEE5PaBonyIyRlJ8IpFV2DdDdD+RGSciv8rJCmQdqk/AgJmsmHnRXSM0hr\n7q2YRwCu4JcHQm4zqmWQlig/QrJqhclwF9AwSP4M5UdI2mEm3Cy0fkFqjucYC0LitsLNCmkX\nJMqPkI7FTJi7qW5BovwIaZkohpic0SxIVcqPkJZQs0J6Bel+YZjyI6Slmpvv+mu1ChLlR0jV\nPa/rVgadgkT5EVK27na7n7xGQaL8CKmb7XZWSJ8gUX6E9PnD/d09K9cmSAuUH0GCeu94V1+n\nSZD86UzM/hogkvvZla6+TIsgUX4Eaba72hJciyBRfgSJlruZFdIhSIwFQapuZoU0CBLlR5Cr\nOVC+dn8Q9YN0QPkRJKvmZ6/7EuWDRPkR5Nu7dlZI9SBteXNRrwYQZuO6WaF0g+SvnJwmfhgk\nf6WLR2yUH0ENc7lOLxw3V04eMj0MUmMlwsOnKEEqLAaPg3Qrd/3sBeVHUIQ/2mlWqJE72dLq\nYZAWC+kEKdj0Xn0UpHr22iIayo+gjnrveIeQrGVrj4JUjbS9VaTnSOXxR0GaKV2XXcqPoJJK\ntsO7D/zSzKMgjQ9FOXCkIB26O6dBOvnv1Sg/glp2Os0K7boHp0HadSO1okd71W665J8EqTx2\nzddRfgTVrHgH7T8xNnQSJL9/OtJhowWpll0/DtIr3jV7alX7BiK/nAgkY7L48E65eGEvh1e9\nreMgrWei3WUjnkdazd3vOSheswc+5UdQUHNw6PSczcrghU8sFPZ7KrmIp2oiBqnZN9Mzm786\nJvuMBUFFtcLUyX+2Mhf+vZGf7ZnujXifjTrZsO32ZK4+PUT5ERS1552ctan0XHw7wp1Mj7sd\n8ZiRR4TKPW1f+m79G+VHUNaGu3t8V/Uuvubsl3rKUQ8ZOUgHbbP74PFTNcqPoLD5k1mh0qW7\n6Lbb4RW960UOUtD2seRa4eF/KT+CyvzRvqNg4vIL3dGf1EcPUlvDp2PelB9BWbun52yOSiP+\ncqQRhg7EBql+8tiT8iMobKxn6PbRyazQ4lZW4GHFBmkj51N+BLXdX8h7E9v+jrt66WW7GMQG\naWzqZJeTPnY5gcr83Qm3sLDgXXrZLgahQWpktik/gg5qt0o9eXdZ3AGFBmkr06D8CJo4mHUj\nnzW6TGiQJscflx/5sSrPgTQ0BY6wiQySn50/KT+qbS2UPZckwSYig7TjurO3p0o9maG5V3ie\nBKuIDFK5x+3pm759n1lVWEdkkOYnd3j/EewkeEQIsBNBAgQgSIAABAkQgCABAhAkQACCBAhA\nkAABCBIgQKQgvewcu/HDH/1d3lEuz/Z42/chH98233v04UuOw6TJJc3sdLuhb++p9z365284\nX4xw2IhB+sDNm//2l3/E+eHvR7k4RKiOuPNtcvKy88xnHn509EPPEKQ2Rnsy7bpkv+D8/ul/\nl5778Sh7ckUM0tdO/tP4Hz/wXH+Uy0OI3VJu7dKA8MvO+1/3sCn1r5yfJkjtHPT1tNmdZ//H\n3njamvcPnf8T5aBxghQEf+H8XJTLQwx/Ldt3cduBl53/5Pzx6Uf/6Cc+Q5Da28z0DF4qnfiu\n88njP//c+WykQ8YLUvBeh75ymepz7uj5BqqXnb96x+lPt3uv/fefJkidzLnuzMV/+7Tzl8He\nj7w1WlNrzCD9hvM/I10tRDkoe3Nn43IcpN90Tu4jf+yME6TOauWezPr5f3rlb/zY/r9y/le0\n48UM0h86//XJP49kIIHX01MaaPkt579996nPHX/wU3974KNO35NPDBRlL1Qxx9+33vP36286\nf//pj0XLUdwg/Xfnd5/8851JpG8075bWb7f8vvPN2+9/29rtgvP12x93Vp984vay7JWqZTzX\n414sZv4Hzpsu/JbqWswg/WvnWxGvGEIcP0kaPnc+6fih3fFP1u8FX7mxHfDQrrNF1528dD7p\ne84/jXq8eEFqvsPZjHrNiO/kZbsLz41PglR542f85z8ZEKSONjNuf5uu2O87X4h6xHhB+iPn\n41GvGPHtlrKrF3+qngQp+CevzzrfIUidVPp7crvtPiEpSM0/evZNs1GvGHFVRt2Zy7uenQYp\n63zoLXWC1F5z3HVX2n8q/SB94ObNX//c250fykW9XsS17Q6324TzNEjBO50XA4LUVtPrufzk\n6JH0g3TiTX/3t9r+fkQqqu2rFB4G6evOYECQ2luodvxU2kECcB5BAgQgSIAABAkQgCABAhAk\nQACCBAhAkAABCBIgAEECBCBIgAAECRCAIAECECRAAIIECECQAAEIEiAAQQIEIEiAAAQJEIAg\nAQIQJEAAggQIQJAAAQgSIABBAgQgSIAABAkQgCABAhAkQACCBAhAkAABCBIgAEECBPj//KWj\nSgr/hzUAAAAASUVORK5CYII=" - }, - "metadata": { - "image/png": { - "width": 420, - "height": 420 - } - } - } - ] - }, - { - "metadata": { - "id": "mUAnGdrkk2-Z" - }, - "cell_type": "markdown", - "source": [ - "Next Consider the elemntary Triangular Model:\n", - "$$\n", - "D \\to Y, \\quad X \\to (D,Y).\n", - "$$\n", - "This model has no testable implications and is Markov-equivalent to any other DAG difined on names $(X, D, Y)$." 
G3 <- dagitty('dag{
D -> Y
X -> D
X -> Y
}
')

ggdag(G3) + theme_dag()

print(impliedConditionalIndependencies(G3))

[PNG plot output omitted: "plot without title"]

P=equivalenceClass(G3)
plot(P)
equivalentDAGs(G3,10)

Plot coordinates for graph not supplied! Generating coordinates, see ?coordinates for how to set your own.
[[1]]
dag {
D
X
Y
D -> Y
X -> D
X -> Y
}

[[2]]
dag {
D
X
Y
D -> X
D -> Y
X -> Y
}

[[3]]
dag {
D
X
Y
D -> X
D -> Y
Y -> X
}

[[4]]
dag {
D
X
Y
X -> D
X -> Y
Y -> D
}

[[5]]
dag {
D
X
Y
X -> D
Y -> D
Y -> X
}

[[6]]
dag {
D
X
Y
D -> X
Y -> D
Y -> X
}

[PNG plot output omitted: "plot without title"]
0hliwCJpMaMBh8EV+RF4QjpDDFgkPWY3By7jMLgCjwFzpDPEgEXSZOFe\nwDm2P9bpBeehrhf+gcQi6bJib+CELdIpvK8LjpGOEAsWSZu1BwIDN0qn8Lp1IfxbOkMsWCR9\nNg0CenMY3J23gWnSGWLBImlUcDLQzeo3n9rv7wj/KZ0hFiySTpFzOQzu0hHYWzpCTFgkraJX\nAU2/l07hYZFMXCSdISYskmZFw+BfSIfwru+AZ6QzxIRF0u3eEDLelw7hWQ8DP0lniAmLpN2E\nZKS9Jh3Cq0agQVQ6Q0xYJP0mpyHpKekQHtUBg6UjxIZFMuDDWgjdLR3Ck/4I4RbpDLFhkUz4\nuj6HwRMyBfhIOkNsWCQjfmwOXO6F2UvL3ICkTdIZYsMimbGgPYfBE3Ao9pOOECMWyZAVPYAT\nOQwen221cKl0hhixSKYUDYMfxmHwuMwEJkpniBGLZMymI4Dea6RTeMr9wK/SGWLEIplTMATo\ntlw6hZechcbSEWLFIhm0bSTQziv/iLVBO5wkHSFWLJJJ0Sygmf1vVrDFKiBbOkOsWCSjomOB\n+l9Kp/CK14FPpDPEikUyLDuEWp54dtoC1yMlXzpDrFgk0x4JI+116RDecDAOkI4QMxbJuBdT\nkPS0dAgv2FoTV0hniBmLZN7bNRG6RzqEB3wNvCCdIWYskoBP63AYPAbjgcXSGWLGIkmY1Ri4\nwhtPfgo6Hc2kI8SORRIxvxUwgsPgVWuFU6QjxI5FkrGkE3ASh8Grshy4QzpD7FgkIat7Aod7\n5KE1Ga8AHnqpNYskZf3BQB8Og1fuGqR66F/ZLJKY/GOA7hwGr1Rf9JWOEAcWSU7hMKD9AukU\ntipIw9XSGeLAIgmKXAg0+1E6haW+ACZJZ4gDiyQpej1Q/yvpFHa6E/hNOkMcWCRZ2UCtD6RD\nWOlUtJCOEA8WSdjDYaRNlg5hoxYYJh0hHiyStBdSkOyNN5cYtQTw1GAviyTurXSExkuHsM6L\ngKd+88giyfskk8Pge8hCWoF0hniwSBaY2QjI4jD4Lnqjv3SEuLBINpjXChjJYfByNqfiOukM\ncWGRrLC4I3DGVukUFvkU8NZbDlkkO6zqCRznmZU5+t0GLJPOEBcWyRLr+wMD/pROYY0haCsd\nIT4ski3yjwb2Xy2dwhZNcaZ0hPiwSNYoHAp0XiKdwg4LgPukM8SHRbJH5AKg9XzpFFZ4Dpgp\nnSE+LJJFotcBjb+VTmGDy5Duse8wWSSrZAN1cqVDWGB/HCIdIU4skl0eCqPmO9IhxOUl4wbp\nDHFikSzzfApSX5YOIe1jYIp0hjixSLZ5Mx1Jj0mHEPZfYJV0hjixSNaZnonQbdIhZJ2AvaQj\nxItFss/MhgF/rCLaCMOlM8SLRbLQ3JbAxdulU8j5GXhIOkO8WCQbLeoInOmxH6Qo9AzwP+kM\n8WKRrLRyX+D4zdIppFyEDM89nMUi2Wl9P+CQoA6D74PDpSPEjUWyVN5RwP6/S6cQsTEJ/yed\nIW4skq0KTwO6LJVOIWEa8JZ0hrixSNaKnA+0+Uk6hYCbEPpDOkPcWCR7Ra8Fmnju6yv3jkVn\n6QjxY5Fslg3U/Uw6hGnR+jhXOkP8WCSrPRBGzXelQxg2F3hUOkP8WCS7PZeMVC+9JkiBJ4Ef\npDPEj0WyXE4NJD0uHcKoC5AZkc4QPxbJdh/XRuh26RAmdceR0hESwCJZ75tgDYOvD+Of0hkS\nwCLZb24L4JLADIO/B3jx6xUWyQMWdQDOCsow+DiE1klnSACL5AUr9wEGB2QY/Ch0k46QCBbJ\nE9YdBBy6QTqFCdvr4ALpDIlgkbwh70igVxCGwX8EnpTOkAgWySMKTgW6/iadQr/HgLnSGRLB\nInlFZBTQ5mfpFNqdh7qe/IKSRfKM6NVAk++kU+jWGcdKR0gIi+QhRcPgn0uH0GtdCDdJZ0gI\ni+Ql94eR8Z50CK3eBqZJZ0gIi+QpE5OR+op0CJ3+jrA3V76wSN4ypQaSnpAOodHh2Ec6QmJY\nJI/5qDZCd0qH0CaSiYukMySGRfKaGQ18PAz+HfCMdIbEsEieM6cFcKknf9ZSvYcAj+5NYpG8\nZ+FewNn+HAYfjgZR6QyJYZE8aMXewAm+HAbvgMHSERLEInnRur7AQB8Og/8Rwi3SGRLEInnS\npkHAAd5bR1qdKcBH0hkSxCJ5U8EpQNdl0ilUuwFJm6QzJIhF8qjIuUDbX6RTKHYo9pOOkCgW\nyauiVwFNv5dOodS2WrhUOkOiWCTvygbqfSEdQqWZwETpDIlikTzsvhAypkqHUOh+YIF0hkSx\nSF72bDJSX5UOoc5ZaCwdIWEskqe9UQNJntwVUqF2OEk6QsJYJG/7sDZCd0mHUGQVkC2dIWEs\nksd97Z9h8NeBT6UzJIxF8rrZzYHLfDEMfj1S8qUzJIxF8ryiYfBztkmnUOBgHCAdIXEskvet\n6AGcuEU6hWtb05ElnSFxLJIPrD0QOGyjdAq3vgZelM6QOBbJD4qGwXt7fRj8HmCxdIbEsUi+\nUHAy0M3jw+DD0Fw6ggsskj9sGwm08/YweCucKh3BBRbJJ6JXAk1/kE7hwnLAy2vGWCTfKBoG\n/1I6ROImAV7ea84i+ce9IWS8Lx0iYVcjzctf4bNIPjIhGWmvSYdIVF/0lY7gBovkJ5PTkPSU\ndIjEFKThaukMbrBIvvJBLYTukQ6RkM+BSdIZ3GCR/OWr+h4dBr8T8PQbclkkn/mxGXCFB4fB\nT0Ur6QiusEh+s6A9MNx7w+AtMEw6gissku8s6Qyc5LVvkhcD3vy9XSkWyX9W7+e9YfAXga+k\nM7jCIvnQnwcDvddIp4hLFtIKpDO4wiL5Uf6xQPfl0ini0Rv9pSO4wyL5UuHpQLtfpVPEbnMq\nrpfO4A6L5E+R0UAz7wyDfwq8Lp3BHRbJp6JjgPqe+f37bYCnfiW6JxbJt7JDqDVNOkSMhqCt\ndASXWCT/ejiMNI/8gqkpzpKO4BKL5GMvpiD5aekQsVgA3C+dwSUWyc/eSvfGMPhzwEzpDC6x\nSL72aR1PDINfhvSt0hlcYpH8bVYjICsqnaI6++FQ6QhusUg+N68VMMLyYfC8ZNwgncEtFsnv\nlnQChtg9x/YRMEU6g1ssku+t6gkcvkk6RVVuAVZJZ3CLRfK/9f2BPmulU1RhMDpIR3CNRQqA\n/GOAHvaO4EQbYbh0BtdYpCAoHAa0XyCdojI/AQ9JZ3CNRQqEyIVAsx+lU1TiGeB/0hlcY5GC\nIXq9vcPgFyHD8u/nY8AiBUU2UOsD6RAV2geHS0dwj0UKjKJh8MnSISqwMQn/J53BPRYpOF5I\nQfIz0iH2NA14SzqDeyxSgLyVjvCj0iH2cBNCXn/9rcMiBcsnmQhlS4fY3bHoLB1BARYpUGY2\nAsbaNQwerY9zpTMowCIFy7yWwEVW7difC9j3y834sUgBs7gjcIZNT9E9Cdj6k+J4sEhBs2pf\n4LjN0inKXIDMiHQGBVi
kwFnfDxjwp3SKnbrhSOkIKrBIwZN3NLD/aukUO6wPY5x0BhVYpAAq\nHAp0XiqdosR7wHvSGVRgkYIocgHQ+ifpFMXGIbROOoMKLFIgRa8DGlvx7MKR6C4dQQkWKaCy\ngbq50iEcZ3sdXCCdQQkWKageDKPmu9IhnB+BJ6UzKMEiBdbzKUh9WTrEo8Bc6QxKsEjB9WY6\nkh4TznAu6lo1sJQwFinApmcidLtshM44VjaAKixSkH3TUHjH/toQbpK8Xx0WKdDmtgQuFvy1\n1VuAV14qWA0WKdgWdQTOlBsG/z+E7Zn6c4VFCriV+wLHiw2DH459pK5WjEUKunX9gEM2yNwd\nycRFMjcrxyIFXt5RQK/fRa7+H2DhWqOEsEhUeBrQRWQY/CHAjtFZ91gkciKjgDY/C1w8HA3s\n2sSSOBaJHCd6DdBEYBi8Awabv1QPFomKFA2Df2b60j9CuMX0nbqwSFTsgTBqmn5UdQrwseEr\ntWGRqMTEZKROMnvlDUiy+t228WCRaIecGkh63OiNh2I/o/fpxCJRqY9rI3SHwfu2ZeAyg9fp\nxSLRToaHwWcCz5m7TTMWicrMaQFcYmwY/H7A2hdEx41FonIWdQDOMjUMfhYaG7rJABaJylu5\nDzDY0DB4Wwwxc5EJLBLtYt1BwKFGhsFXAbeZuMcMFol2lXck0MvEuyhfAz41cI0hLBLtpuAU\noOtv+u+5Din5+m8xhUWi3UXOA9rqHwbvj97a7zCHRaI9RK8Gmn6n+ZKt6cjSfIVJLBJVIBuo\n97neK74CXtR7g1EsElXk/jAypmq94R5gsdYLzGKRqEJFw+Cv6LxgGJrrPN40FokqNqUGkp7Q\neH4rnKrxdONYJKrER7URulPb6csBfYcLYJGoMjMaaBwGnwRo/jbDLBaJKjW7OXCZpmHwq5G2\nRc/JMlgkqtzCvYCzt2k5ui/6ajlXCotEVVixN3CCjn9zFKThag3HymGRqCpr+wIDN6o/93PA\n8KYVzVgkqtKmQcAB6ofB7wQMzMUaxCJR1QpOBrotU33qqWil+khZLBJVI3Iu0PYXxYe2wDDF\nJwpjkag60auApt8rPXIxcI/SA8WxSFS9omHwL1Qe+CLwtcrz5LFIFIP7Qsh4X+F5WahRqPA4\nC7BIFIsJyUh7Vd1xB+BgdYdZgUWimLxRA0lPqjpscyquV3WWJVgkis2HtRC6W9FZnwKvKzrK\nFiwSxehrdcPg2cAKNSdZg0WiWBUNg1+uZBj8JLRTcYxNWCSK2YL2wDkqhsGb4iwFp1iFRaLY\nregBnOh+GHwBcL+CNFZhkSgOaw8EDnM9DD4RmKkijU1YJIrHpiOA3mtcHnIp0k29OsYYFoni\nUjDE/TD4fjhUSRabsEgUn20jgXa/ujkhLxk3qEpjDRaJ4hS9Emj2g4sDPgKmKEtjCxaJ4lY0\nDP5l4h+/BVilLowlWCSK3/gQak1L+NOD0UFhFkuwSJSAZ5KR9lqCn402wnClYazAIlEiXkxB\n0tOJffQn4CGlWazAIlFC3q6JUGJPiz8D6H6JmQAWiRLzaZ0Eh8EvQoae5a2iWCRK0KzGwBXR\n+D+3Dw5XH0Yci0SJmt8aGB73v1w2JuHvOtIIY5EoYUs6AyfFOww+DXhbSxpZLBIlbvV+wOFx\nDoPfhJD6DcjyWCRy4c+DgT7xDYMfi86awohikciN/GOB7svj+EC0Ps7TlkYQi0SuFA6Lbxh8\nLvCYvjRyWCRyJ3Ih0OzHmP/2J4DY/2YPYZHIpegYoP5Xsf7d5yMzojONFBaJXMsGan0Q49/b\nDUdpzSKFRSL3Hg4jLbbVqevDGKc3ixAWiRR4IQXJz8TyN74LvKc7jAgWiVR4Kx2h8TH8ff9E\naJ32MBJYJFLik9iGwY9Ed/1ZJLBIpMasRkBWtcPgHXGBiTDmsUikyLxWwIjqhsGnjFhoJIxx\nLBKpsrgTMKRAOoUQFomUWdUTOGKTdAoZLBKps74/cOBa6RQiWCRSKP8YoIffXsYXExaJVCoc\nCrRfIJ1CAItESkUuAFrPl05hHotEakWvBxp/K53COBaJVMsG6uRKhzCNRSLlHgqj5jvSIQxj\nkUi9iclIzZEOYRaLRBpMqYHB0hnMYpFIh9yzY3743B9YJCIFWCQiBVgkIgVYJCIFWCQiBVgk\nIgVYJCIFWCQiBVgk0m56qM/2kj+7FZfJRtGGRSL9LsJ9xX9cmN4qzvf7eQaLRPptaFH7t6I/\nHgvfDoWzSGRADob89Z8v4xzpINqwSGTC6Xjd+bNZo/heN+slLBKZsLp+iw2X4kXpGPqwSGTE\nBBwVPkE6hEYsEplxNDKXSWfQiEUiM97z63soSrBIZMY0XCQdQScWicxgkYgUYJGIFGCRiBRg\nkYioOiwSkQIsEpECLBKRAiwSkQIsEpECLBKRAiwSkQIsEpECLBKRAiwSkQIsEpECLBKRAiwS\nkQIsEpECLBKRAiwSkQIsEpECLBKRAiwSkQIsEpECLBKRAiwSkQIsEpECLBKRAiwSkQIsEpEC\nLBKRAiwSkQIsEpECLBKRAiwSkQIsEpECLBKRAv8P2M5Gm8tU1hkAAAAASUVORK5CYII=" - }, - "metadata": { - "image/png": { - "width": 420, - "height": 420 - } - } - } - ] - }, - { - "metadata": { - "id": "kjil8zIfk2-Z" - }, - "cell_type": "markdown", - "source": [ - "# Example of Testing DAG Validity\n", - "\n", - "Next we simulate the data from a Linear SEM associated to DAG G, and perform a test of conditional independence restrictions, exploting linearity.\n", - "\n", - "\n", - "There are many other options for nonlinear models and discrete categorical variabales. Type help(localTests)." 
- ] - }, - { - "metadata": { - "trusted": true, - "id": "MZ_4jxNdk2-a", - "outputId": "d91485d7-311d-4ae2-ea60-2e7f87a2fb96", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - } - }, - "cell_type": "code", - "source": [ - "set.seed(1)\n", - "x <- simulateSEM(G)\n", - "head(x)\n", - "#cov(x)\n", - "localTests(G, data = x, type = c(\"cis\"))\n", - "\n" - ], - "execution_count": 26, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\n", - "
[garbled HTML table rows (cell values fused after tag stripping) omitted; the same 6 × 8 data-frame preview appears intact in the text/markdown, text/latex, and text/plain renderings below]
\n" - ], - "text/markdown": "\nA data.frame: 6 × 8\n\n| | D <dbl> | M <dbl> | X1 <dbl> | X2 <dbl> | X3 <dbl> | Y <dbl> | Z1 <dbl> | Z2 <dbl> |\n|---|---|---|---|---|---|---|---|---|\n| 1 | -0.8898307 | -0.3510744 | -0.03219943 | 1.1341327 | -0.41242626 | 0.9395727 | -0.1083066 | -0.8836265 |\n| 2 | -1.9921451 | 1.6582758 | -1.58224061 | -0.4958387 | -0.30845926 | 0.2455178 | -0.9164407 | 1.0521252 |\n| 3 | 0.1191667 | 0.9196925 | -1.53587775 | 0.2432235 | 0.11882717 | -0.1880993 | -0.6970764 | 0.7663028 |\n| 4 | 2.5104719 | -2.3644762 | 0.14060947 | -1.4218221 | -1.41155875 | -1.8683361 | -1.7100113 | 0.8212191 |\n| 5 | 1.7576373 | 0.8129792 | 0.70682174 | -1.1490722 | -0.06638018 | -0.5181078 | -0.3312216 | -0.3964913 |\n| 6 | -2.1648894 | 0.1065553 | 0.25728137 | 1.4267134 | -1.31195356 | 0.5945293 | -0.5855693 | -0.1837865 |\n\n", - "text/latex": "A data.frame: 6 × 8\n\\begin{tabular}{r|llllllll}\n & D & M & X1 & X2 & X3 & Y & Z1 & Z2\\\\\n & & & & & & & & \\\\\n\\hline\n\t1 & -0.8898307 & -0.3510744 & -0.03219943 & 1.1341327 & -0.41242626 & 0.9395727 & -0.1083066 & -0.8836265\\\\\n\t2 & -1.9921451 & 1.6582758 & -1.58224061 & -0.4958387 & -0.30845926 & 0.2455178 & -0.9164407 & 1.0521252\\\\\n\t3 & 0.1191667 & 0.9196925 & -1.53587775 & 0.2432235 & 0.11882717 & -0.1880993 & -0.6970764 & 0.7663028\\\\\n\t4 & 2.5104719 & -2.3644762 & 0.14060947 & -1.4218221 & -1.41155875 & -1.8683361 & -1.7100113 & 0.8212191\\\\\n\t5 & 1.7576373 & 0.8129792 & 0.70682174 & -1.1490722 & -0.06638018 & -0.5181078 & -0.3312216 & -0.3964913\\\\\n\t6 & -2.1648894 & 0.1065553 & 0.25728137 & 1.4267134 & -1.31195356 & 0.5945293 & -0.5855693 & -0.1837865\\\\\n\\end{tabular}\n", - "text/plain": [ - " D M X1 X2 X3 Y \n", - "1 -0.8898307 -0.3510744 -0.03219943 1.1341327 -0.41242626 0.9395727\n", - "2 -1.9921451 1.6582758 -1.58224061 -0.4958387 -0.30845926 0.2455178\n", - "3 0.1191667 0.9196925 -1.53587775 0.2432235 0.11882717 -0.1880993\n", - "4 2.5104719 -2.3644762 0.14060947 -1.4218221 -1.41155875 -1.8683361\n", - "5 1.7576373 0.8129792 0.70682174 -1.1490722 -0.06638018 -0.5181078\n", - "6 -2.1648894 0.1065553 0.25728137 1.4267134 -1.31195356 0.5945293\n", - " Z1 Z2 \n", - "1 -0.1083066 -0.8836265\n", - "2 -0.9164407 1.0521252\n", - "3 -0.6970764 0.7663028\n", - "4 -1.7100113 0.8212191\n", - "5 -0.3312216 -0.3964913\n", - "6 -0.5855693 -0.1837865" - ] - }, - "metadata": {} - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\n", - "
[garbled HTML table rows (cell values fused after tag stripping) omitted; the same 33 × 4 table of localTests conditional-independence results appears intact in the text/markdown, text/latex, and text/plain renderings below]
\n" - ], - "text/markdown": "\nA data.frame: 33 × 4\n\n| | estimate <dbl> | p.value <dbl> | 2.5% <dbl> | 97.5% <dbl> |\n|---|---|---|---|---|\n| D _||_ X3 | Z2 | -0.002071358 | 0.96320552 | -0.08983390 | 0.085723024 |\n| D _||_ X3 | X2, Z1 | 0.023514097 | 0.60079885 | -0.06448586 | 0.111152233 |\n| D _||_ X3 | X1, X2 | 0.036845046 | 0.41214668 | -0.05118734 | 0.124311241 |\n| D _||_ Z1 | X1, X2 | -0.066981531 | 0.13557344 | -0.15394797 | 0.021008765 |\n| D _||_ Z2 | X2, Z1 | -0.073077584 | 0.10336020 | -0.15992443 | 0.014884642 |\n| D _||_ Z2 | X1, X2 | -0.032227822 | 0.47320617 | -0.11975701 | 0.055796867 |\n| M _||_ X1 | D | 0.009940088 | 0.82479380 | -0.07790657 | 0.097633980 |\n| M _||_ X2 | D | -0.036853910 | 0.41156228 | -0.12423250 | 0.051089874 |\n| M _||_ X3 | Z2 | 0.069216401 | 0.12259044 | -0.01867555 | 0.156053012 |\n| M _||_ X3 | X2, Z1 | 0.057114133 | 0.20333985 | -0.03090763 | 0.144261035 |\n| M _||_ X3 | X1, X2 | 0.058055858 | 0.19597115 | -0.02996364 | 0.145186241 |\n| M _||_ X3 | D | 0.059499571 | 0.18461277 | -0.02842739 | 0.146517386 |\n| M _||_ Z1 | X1, X2 | -0.024383873 | 0.58739526 | -0.11201172 | 0.063619153 |\n| M _||_ Z1 | D | -0.039992488 | 0.37284854 | -0.12732625 | 0.047954358 |\n| M _||_ Z2 | X2, Z1 | 0.030257618 | 0.50069599 | -0.05776264 | 0.117812572 |\n| M _||_ Z2 | X1, X2 | 0.015146292 | 0.73610975 | -0.07281740 | 0.102876800 |\n| M _||_ Z2 | D | 0.026033567 | 0.56196445 | -0.06188640 | 0.113553829 |\n| X1 _||_ X2 | Z1 | 0.017365139 | 0.69891990 | -0.07052096 | 0.104984454 |\n| X1 _||_ X3 | -0.010916013 | 0.80772207 | -0.09851292 | 0.076848315 |\n| X1 _||_ Y | D, X2, X3 | 0.077771818 | 0.08326238 | -0.01025340 | 0.164609088 |\n| X1 _||_ Y | D, X2, Z2 | 0.024235767 | 0.59004395 | -0.06385550 | 0.111953380 |\n| X1 _||_ Y | D, X2, Z1 | 0.034552049 | 0.44233066 | -0.05356585 | 0.122137776 |\n| X1 _||_ Z2 | -0.081131074 | 0.06988127 | -0.16763798 | 0.006606516 |\n| X2 _||_ X3 | Z2 | 0.012497661 | 0.78074322 | -0.07536366 | 0.100166937 |\n| X3 _||_ Z1 | -0.033463732 | 0.45548484 | -0.12080156 | 0.054386481 |\n| Y _||_ Z1 | X1, X2, Z2 | 0.019055134 | 0.67187741 | -0.06901572 | 0.106832097 |\n| Y _||_ Z1 | D, X2, Z2 | 0.013403526 | 0.76575956 | -0.07463970 | 0.101239966 |\n| Y _||_ Z1 | X1, X2, X3 | 0.040703777 | 0.36536644 | -0.04742109 | 0.128202210 |\n| Y _||_ Z1 | D, X2, X3 | 0.041202935 | 0.35950969 | -0.04692221 | 0.128694003 |\n| Y _||_ Z2 | X2, X3, Z1 | -0.007522454 | 0.86721422 | -0.09541493 | 0.080486104 |\n| Y _||_ Z2 | X1, X2, X3 | -0.011245710 | 0.80261910 | -0.09910341 | 0.076785498 |\n| Y _||_ Z2 | D, X2, X3 | -0.021516649 | 0.63243296 | -0.10926599 | 0.066564492 |\n| Z1 _||_ Z2 | 0.029331384 | 0.51305592 | -0.05850974 | 0.116723204 |\n\n", - "text/latex": "A data.frame: 33 × 4\n\\begin{tabular}{r|llll}\n & estimate & p.value & 2.5\\% & 97.5\\%\\\\\n & & & & \\\\\n\\hline\n\tD \\_\\textbar{}\\textbar{}\\_ X3 \\textbar{} Z2 & -0.002071358 & 0.96320552 & -0.08983390 & 0.085723024\\\\\n\tD \\_\\textbar{}\\textbar{}\\_ X3 \\textbar{} X2, Z1 & 0.023514097 & 0.60079885 & -0.06448586 & 0.111152233\\\\\n\tD \\_\\textbar{}\\textbar{}\\_ X3 \\textbar{} X1, X2 & 0.036845046 & 0.41214668 & -0.05118734 & 0.124311241\\\\\n\tD \\_\\textbar{}\\textbar{}\\_ Z1 \\textbar{} X1, X2 & -0.066981531 & 0.13557344 & -0.15394797 & 0.021008765\\\\\n\tD \\_\\textbar{}\\textbar{}\\_ Z2 \\textbar{} X2, Z1 & -0.073077584 & 0.10336020 & -0.15992443 & 0.014884642\\\\\n\tD \\_\\textbar{}\\textbar{}\\_ Z2 \\textbar{} X1, X2 & -0.032227822 & 0.47320617 & -0.11975701 & 
0.055796867\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ X1 \\textbar{} D & 0.009940088 & 0.82479380 & -0.07790657 & 0.097633980\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ X2 \\textbar{} D & -0.036853910 & 0.41156228 & -0.12423250 & 0.051089874\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ X3 \\textbar{} Z2 & 0.069216401 & 0.12259044 & -0.01867555 & 0.156053012\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ X3 \\textbar{} X2, Z1 & 0.057114133 & 0.20333985 & -0.03090763 & 0.144261035\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ X3 \\textbar{} X1, X2 & 0.058055858 & 0.19597115 & -0.02996364 & 0.145186241\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ X3 \\textbar{} D & 0.059499571 & 0.18461277 & -0.02842739 & 0.146517386\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ Z1 \\textbar{} X1, X2 & -0.024383873 & 0.58739526 & -0.11201172 & 0.063619153\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ Z1 \\textbar{} D & -0.039992488 & 0.37284854 & -0.12732625 & 0.047954358\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ Z2 \\textbar{} X2, Z1 & 0.030257618 & 0.50069599 & -0.05776264 & 0.117812572\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ Z2 \\textbar{} X1, X2 & 0.015146292 & 0.73610975 & -0.07281740 & 0.102876800\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ Z2 \\textbar{} D & 0.026033567 & 0.56196445 & -0.06188640 & 0.113553829\\\\\n\tX1 \\_\\textbar{}\\textbar{}\\_ X2 \\textbar{} Z1 & 0.017365139 & 0.69891990 & -0.07052096 & 0.104984454\\\\\n\tX1 \\_\\textbar{}\\textbar{}\\_ X3 & -0.010916013 & 0.80772207 & -0.09851292 & 0.076848315\\\\\n\tX1 \\_\\textbar{}\\textbar{}\\_ Y \\textbar{} D, X2, X3 & 0.077771818 & 0.08326238 & -0.01025340 & 0.164609088\\\\\n\tX1 \\_\\textbar{}\\textbar{}\\_ Y \\textbar{} D, X2, Z2 & 0.024235767 & 0.59004395 & -0.06385550 & 0.111953380\\\\\n\tX1 \\_\\textbar{}\\textbar{}\\_ Y \\textbar{} D, X2, Z1 & 0.034552049 & 0.44233066 & -0.05356585 & 0.122137776\\\\\n\tX1 \\_\\textbar{}\\textbar{}\\_ Z2 & -0.081131074 & 0.06988127 & -0.16763798 & 0.006606516\\\\\n\tX2 \\_\\textbar{}\\textbar{}\\_ X3 \\textbar{} Z2 & 0.012497661 & 0.78074322 & -0.07536366 & 0.100166937\\\\\n\tX3 \\_\\textbar{}\\textbar{}\\_ Z1 & -0.033463732 & 0.45548484 & -0.12080156 & 0.054386481\\\\\n\tY \\_\\textbar{}\\textbar{}\\_ Z1 \\textbar{} X1, X2, Z2 & 0.019055134 & 0.67187741 & -0.06901572 & 0.106832097\\\\\n\tY \\_\\textbar{}\\textbar{}\\_ Z1 \\textbar{} D, X2, Z2 & 0.013403526 & 0.76575956 & -0.07463970 & 0.101239966\\\\\n\tY \\_\\textbar{}\\textbar{}\\_ Z1 \\textbar{} X1, X2, X3 & 0.040703777 & 0.36536644 & -0.04742109 & 0.128202210\\\\\n\tY \\_\\textbar{}\\textbar{}\\_ Z1 \\textbar{} D, X2, X3 & 0.041202935 & 0.35950969 & -0.04692221 & 0.128694003\\\\\n\tY \\_\\textbar{}\\textbar{}\\_ Z2 \\textbar{} X2, X3, Z1 & -0.007522454 & 0.86721422 & -0.09541493 & 0.080486104\\\\\n\tY \\_\\textbar{}\\textbar{}\\_ Z2 \\textbar{} X1, X2, X3 & -0.011245710 & 0.80261910 & -0.09910341 & 0.076785498\\\\\n\tY \\_\\textbar{}\\textbar{}\\_ Z2 \\textbar{} D, X2, X3 & -0.021516649 & 0.63243296 & -0.10926599 & 0.066564492\\\\\n\tZ1 \\_\\textbar{}\\textbar{}\\_ Z2 & 0.029331384 & 0.51305592 & -0.05850974 & 0.116723204\\\\\n\\end{tabular}\n", - "text/plain": [ - " estimate p.value 2.5% 97.5% \n", - "D _||_ X3 | Z2 -0.002071358 0.96320552 -0.08983390 0.085723024\n", - "D _||_ X3 | X2, Z1 0.023514097 0.60079885 -0.06448586 0.111152233\n", - "D _||_ X3 | X1, X2 0.036845046 0.41214668 -0.05118734 0.124311241\n", - "D _||_ Z1 | X1, X2 -0.066981531 0.13557344 -0.15394797 0.021008765\n", - "D _||_ Z2 | X2, Z1 -0.073077584 0.10336020 -0.15992443 0.014884642\n", - "D _||_ Z2 | X1, X2 -0.032227822 0.47320617 -0.11975701 
0.055796867\n", - "M _||_ X1 | D 0.009940088 0.82479380 -0.07790657 0.097633980\n", - "M _||_ X2 | D -0.036853910 0.41156228 -0.12423250 0.051089874\n", - "M _||_ X3 | Z2 0.069216401 0.12259044 -0.01867555 0.156053012\n", - "M _||_ X3 | X2, Z1 0.057114133 0.20333985 -0.03090763 0.144261035\n", - "M _||_ X3 | X1, X2 0.058055858 0.19597115 -0.02996364 0.145186241\n", - "M _||_ X3 | D 0.059499571 0.18461277 -0.02842739 0.146517386\n", - "M _||_ Z1 | X1, X2 -0.024383873 0.58739526 -0.11201172 0.063619153\n", - "M _||_ Z1 | D -0.039992488 0.37284854 -0.12732625 0.047954358\n", - "M _||_ Z2 | X2, Z1 0.030257618 0.50069599 -0.05776264 0.117812572\n", - "M _||_ Z2 | X1, X2 0.015146292 0.73610975 -0.07281740 0.102876800\n", - "M _||_ Z2 | D 0.026033567 0.56196445 -0.06188640 0.113553829\n", - "X1 _||_ X2 | Z1 0.017365139 0.69891990 -0.07052096 0.104984454\n", - "X1 _||_ X3 -0.010916013 0.80772207 -0.09851292 0.076848315\n", - "X1 _||_ Y | D, X2, X3 0.077771818 0.08326238 -0.01025340 0.164609088\n", - "X1 _||_ Y | D, X2, Z2 0.024235767 0.59004395 -0.06385550 0.111953380\n", - "X1 _||_ Y | D, X2, Z1 0.034552049 0.44233066 -0.05356585 0.122137776\n", - "X1 _||_ Z2 -0.081131074 0.06988127 -0.16763798 0.006606516\n", - "X2 _||_ X3 | Z2 0.012497661 0.78074322 -0.07536366 0.100166937\n", - "X3 _||_ Z1 -0.033463732 0.45548484 -0.12080156 0.054386481\n", - "Y _||_ Z1 | X1, X2, Z2 0.019055134 0.67187741 -0.06901572 0.106832097\n", - "Y _||_ Z1 | D, X2, Z2 0.013403526 0.76575956 -0.07463970 0.101239966\n", - "Y _||_ Z1 | X1, X2, X3 0.040703777 0.36536644 -0.04742109 0.128202210\n", - "Y _||_ Z1 | D, X2, X3 0.041202935 0.35950969 -0.04692221 0.128694003\n", - "Y _||_ Z2 | X2, X3, Z1 -0.007522454 0.86721422 -0.09541493 0.080486104\n", - "Y _||_ Z2 | X1, X2, X3 -0.011245710 0.80261910 -0.09910341 0.076785498\n", - "Y _||_ Z2 | D, X2, X3 -0.021516649 0.63243296 -0.10926599 0.066564492\n", - "Z1 _||_ Z2 0.029331384 0.51305592 -0.05850974 0.116723204" - ] - }, - "metadata": {} - } - ] - }, - { - "metadata": { - "id": "aUQHK-vEk2-a" - }, - "cell_type": "markdown", - "source": [ - "Next we replaced $D$ by $\\bar D$ generated differently:\n", - "$$\n", - "\\bar D= (D + Y)/2.\n", - "$$\n", - "$\\bar D$ is an average of $D$ and $Y$ generated by $D$. We then test if the resulting collection of random variables satisifes conditional independence restrictions, exploiting linearity. We end up rejectiong these restrictions and therefore the validity of this model for the data generated in this way. This makes sense, because the new data no longer obeys the previous DAG structure.\n", - "\n" - ] - }, - { - "metadata": { - "trusted": true, - "id": "C0ND4GbEk2-a", - "outputId": "37236927-3ece-458f-fcd4-820422609d7b", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - } - }, - "cell_type": "code", - "source": [ - "x.R = x\n", - "x.R$D = (x$D+ x$Y)/2\n", - "\n", - "localTests(G, data = x.R, type = c(\"cis\"))\n", - "\n" - ], - "execution_count": 27, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\n", - "
[garbled HTML table rows (cell values fused after tag stripping) omitted; the same 33 × 4 localTests table for the modified data appears intact in the text/markdown, text/latex, and text/plain renderings below]
\n" - ], - "text/markdown": "\nA data.frame: 33 × 4\n\n| | estimate <dbl> | p.value <dbl> | 2.5% <dbl> | 97.5% <dbl> |\n|---|---|---|---|---|\n| D _||_ X3 | Z2 | 0.458096731 | 2.995215e-28 | 0.386436079 | 0.527754692 |\n| D _||_ X3 | X2, Z1 | 0.486464318 | 2.958117e-32 | 0.417288505 | 0.554643072 |\n| D _||_ X3 | X1, X2 | 0.543564432 | 7.516370e-42 | 0.480390426 | 0.608908119 |\n| D _||_ Z1 | X1, X2 | -0.060398792 | 1.784877e-01 | -0.147487432 | 0.027614410 |\n| D _||_ Z2 | X2, Z1 | -0.185051476 | 3.111874e-05 | -0.268647050 | -0.098791785 |\n| D _||_ Z2 | X1, X2 | -0.166353592 | 1.870798e-04 | -0.250625158 | -0.079651412 |\n| M _||_ X1 | D | -0.113744220 | 1.095239e-02 | -0.199551910 | -0.026227598 |\n| M _||_ X2 | D | 0.082049261 | 6.703474e-02 | -0.005770787 | 0.168622542 |\n| M _||_ X3 | Z2 | 0.069216401 | 1.225904e-01 | -0.018675553 | 0.156053012 |\n| M _||_ X3 | X2, Z1 | 0.057114133 | 2.033399e-01 | -0.030907627 | 0.144261035 |\n| M _||_ X3 | X1, X2 | 0.058055858 | 1.959711e-01 | -0.029963644 | 0.145186241 |\n| M _||_ X3 | D | 0.107175247 | 1.657018e-02 | 0.019580972 | 0.193154911 |\n| M _||_ Z1 | X1, X2 | -0.024383873 | 5.873953e-01 | -0.112011718 | 0.063619153 |\n| M _||_ Z1 | D | -0.037431938 | 4.042593e-01 | -0.124802395 | 0.050512540 |\n| M _||_ Z2 | X2, Z1 | 0.030257618 | 5.006960e-01 | -0.057762645 | 0.117812572 |\n| M _||_ Z2 | X1, X2 | 0.015146292 | 7.361097e-01 | -0.072817405 | 0.102876800 |\n| M _||_ Z2 | D | -0.032406196 | 4.703110e-01 | -0.119845435 | 0.055530285 |\n| X1 _||_ X2 | Z1 | 0.017365139 | 6.989199e-01 | -0.070520958 | 0.104984454 |\n| X1 _||_ X3 | -0.010916013 | 8.077221e-01 | -0.098512922 | 0.076848315 |\n| X1 _||_ Y | D, X2, X3 | -0.305955862 | 2.138011e-12 | -0.384173754 | -0.224067782 |\n| X1 _||_ Y | D, X2, Z2 | -0.404061952 | 1.670319e-21 | -0.476899981 | -0.328017298 |\n| X1 _||_ Y | D, X2, Z1 | -0.400948576 | 3.686103e-21 | -0.473963077 | -0.324683850 |\n| X1 _||_ Z2 | -0.081131074 | 6.988127e-02 | -0.167637985 | 0.006606516 |\n| X2 _||_ X3 | Z2 | 0.012497661 | 7.807432e-01 | -0.075363664 | 0.100166937 |\n| X3 _||_ Z1 | -0.033463732 | 4.554848e-01 | -0.120801562 | 0.054386481 |\n| Y _||_ Z1 | X1, X2, Z2 | 0.019055134 | 6.718774e-01 | -0.069015721 | 0.106832097 |\n| Y _||_ Z1 | D, X2, Z2 | -0.046391382 | 3.021468e-01 | -0.133803384 | 0.041733997 |\n| Y _||_ Z1 | X1, X2, X3 | 0.040703777 | 3.653664e-01 | -0.047421092 | 0.128202210 |\n| Y _||_ Z1 | D, X2, X3 | -0.008222698 | 8.549842e-01 | -0.096108819 | 0.079790305 |\n| Y _||_ Z2 | X2, X3, Z1 | -0.007522454 | 8.672142e-01 | -0.095414934 | 0.080486104 |\n| Y _||_ Z2 | X1, X2, X3 | -0.011245710 | 8.026191e-01 | -0.099103406 | 0.076785498 |\n| Y _||_ Z2 | D, X2, X3 | 0.015905495 | 7.236783e-01 | -0.072150652 | 0.103716284 |\n| Z1 _||_ Z2 | 0.029331384 | 5.130559e-01 | -0.058509741 | 0.116723204 |\n\n", - "text/latex": "A data.frame: 33 × 4\n\\begin{tabular}{r|llll}\n & estimate & p.value & 2.5\\% & 97.5\\%\\\\\n & & & & \\\\\n\\hline\n\tD \\_\\textbar{}\\textbar{}\\_ X3 \\textbar{} Z2 & 0.458096731 & 2.995215e-28 & 0.386436079 & 0.527754692\\\\\n\tD \\_\\textbar{}\\textbar{}\\_ X3 \\textbar{} X2, Z1 & 0.486464318 & 2.958117e-32 & 0.417288505 & 0.554643072\\\\\n\tD \\_\\textbar{}\\textbar{}\\_ X3 \\textbar{} X1, X2 & 0.543564432 & 7.516370e-42 & 0.480390426 & 0.608908119\\\\\n\tD \\_\\textbar{}\\textbar{}\\_ Z1 \\textbar{} X1, X2 & -0.060398792 & 1.784877e-01 & -0.147487432 & 0.027614410\\\\\n\tD \\_\\textbar{}\\textbar{}\\_ Z2 \\textbar{} X2, Z1 & -0.185051476 & 3.111874e-05 & -0.268647050 & 
-0.098791785\\\\\n\tD \\_\\textbar{}\\textbar{}\\_ Z2 \\textbar{} X1, X2 & -0.166353592 & 1.870798e-04 & -0.250625158 & -0.079651412\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ X1 \\textbar{} D & -0.113744220 & 1.095239e-02 & -0.199551910 & -0.026227598\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ X2 \\textbar{} D & 0.082049261 & 6.703474e-02 & -0.005770787 & 0.168622542\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ X3 \\textbar{} Z2 & 0.069216401 & 1.225904e-01 & -0.018675553 & 0.156053012\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ X3 \\textbar{} X2, Z1 & 0.057114133 & 2.033399e-01 & -0.030907627 & 0.144261035\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ X3 \\textbar{} X1, X2 & 0.058055858 & 1.959711e-01 & -0.029963644 & 0.145186241\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ X3 \\textbar{} D & 0.107175247 & 1.657018e-02 & 0.019580972 & 0.193154911\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ Z1 \\textbar{} X1, X2 & -0.024383873 & 5.873953e-01 & -0.112011718 & 0.063619153\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ Z1 \\textbar{} D & -0.037431938 & 4.042593e-01 & -0.124802395 & 0.050512540\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ Z2 \\textbar{} X2, Z1 & 0.030257618 & 5.006960e-01 & -0.057762645 & 0.117812572\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ Z2 \\textbar{} X1, X2 & 0.015146292 & 7.361097e-01 & -0.072817405 & 0.102876800\\\\\n\tM \\_\\textbar{}\\textbar{}\\_ Z2 \\textbar{} D & -0.032406196 & 4.703110e-01 & -0.119845435 & 0.055530285\\\\\n\tX1 \\_\\textbar{}\\textbar{}\\_ X2 \\textbar{} Z1 & 0.017365139 & 6.989199e-01 & -0.070520958 & 0.104984454\\\\\n\tX1 \\_\\textbar{}\\textbar{}\\_ X3 & -0.010916013 & 8.077221e-01 & -0.098512922 & 0.076848315\\\\\n\tX1 \\_\\textbar{}\\textbar{}\\_ Y \\textbar{} D, X2, X3 & -0.305955862 & 2.138011e-12 & -0.384173754 & -0.224067782\\\\\n\tX1 \\_\\textbar{}\\textbar{}\\_ Y \\textbar{} D, X2, Z2 & -0.404061952 & 1.670319e-21 & -0.476899981 & -0.328017298\\\\\n\tX1 \\_\\textbar{}\\textbar{}\\_ Y \\textbar{} D, X2, Z1 & -0.400948576 & 3.686103e-21 & -0.473963077 & -0.324683850\\\\\n\tX1 \\_\\textbar{}\\textbar{}\\_ Z2 & -0.081131074 & 6.988127e-02 & -0.167637985 & 0.006606516\\\\\n\tX2 \\_\\textbar{}\\textbar{}\\_ X3 \\textbar{} Z2 & 0.012497661 & 7.807432e-01 & -0.075363664 & 0.100166937\\\\\n\tX3 \\_\\textbar{}\\textbar{}\\_ Z1 & -0.033463732 & 4.554848e-01 & -0.120801562 & 0.054386481\\\\\n\tY \\_\\textbar{}\\textbar{}\\_ Z1 \\textbar{} X1, X2, Z2 & 0.019055134 & 6.718774e-01 & -0.069015721 & 0.106832097\\\\\n\tY \\_\\textbar{}\\textbar{}\\_ Z1 \\textbar{} D, X2, Z2 & -0.046391382 & 3.021468e-01 & -0.133803384 & 0.041733997\\\\\n\tY \\_\\textbar{}\\textbar{}\\_ Z1 \\textbar{} X1, X2, X3 & 0.040703777 & 3.653664e-01 & -0.047421092 & 0.128202210\\\\\n\tY \\_\\textbar{}\\textbar{}\\_ Z1 \\textbar{} D, X2, X3 & -0.008222698 & 8.549842e-01 & -0.096108819 & 0.079790305\\\\\n\tY \\_\\textbar{}\\textbar{}\\_ Z2 \\textbar{} X2, X3, Z1 & -0.007522454 & 8.672142e-01 & -0.095414934 & 0.080486104\\\\\n\tY \\_\\textbar{}\\textbar{}\\_ Z2 \\textbar{} X1, X2, X3 & -0.011245710 & 8.026191e-01 & -0.099103406 & 0.076785498\\\\\n\tY \\_\\textbar{}\\textbar{}\\_ Z2 \\textbar{} D, X2, X3 & 0.015905495 & 7.236783e-01 & -0.072150652 & 0.103716284\\\\\n\tZ1 \\_\\textbar{}\\textbar{}\\_ Z2 & 0.029331384 & 5.130559e-01 & -0.058509741 & 0.116723204\\\\\n\\end{tabular}\n", - "text/plain": [ - " estimate p.value 2.5% 97.5% \n", - "D _||_ X3 | Z2 0.458096731 2.995215e-28 0.386436079 0.527754692\n", - "D _||_ X3 | X2, Z1 0.486464318 2.958117e-32 0.417288505 0.554643072\n", - "D _||_ X3 | X1, X2 0.543564432 7.516370e-42 0.480390426 
0.608908119\n", - "D _||_ Z1 | X1, X2 -0.060398792 1.784877e-01 -0.147487432 0.027614410\n", - "D _||_ Z2 | X2, Z1 -0.185051476 3.111874e-05 -0.268647050 -0.098791785\n", - "D _||_ Z2 | X1, X2 -0.166353592 1.870798e-04 -0.250625158 -0.079651412\n", - "M _||_ X1 | D -0.113744220 1.095239e-02 -0.199551910 -0.026227598\n", - "M _||_ X2 | D 0.082049261 6.703474e-02 -0.005770787 0.168622542\n", - "M _||_ X3 | Z2 0.069216401 1.225904e-01 -0.018675553 0.156053012\n", - "M _||_ X3 | X2, Z1 0.057114133 2.033399e-01 -0.030907627 0.144261035\n", - "M _||_ X3 | X1, X2 0.058055858 1.959711e-01 -0.029963644 0.145186241\n", - "M _||_ X3 | D 0.107175247 1.657018e-02 0.019580972 0.193154911\n", - "M _||_ Z1 | X1, X2 -0.024383873 5.873953e-01 -0.112011718 0.063619153\n", - "M _||_ Z1 | D -0.037431938 4.042593e-01 -0.124802395 0.050512540\n", - "M _||_ Z2 | X2, Z1 0.030257618 5.006960e-01 -0.057762645 0.117812572\n", - "M _||_ Z2 | X1, X2 0.015146292 7.361097e-01 -0.072817405 0.102876800\n", - "M _||_ Z2 | D -0.032406196 4.703110e-01 -0.119845435 0.055530285\n", - "X1 _||_ X2 | Z1 0.017365139 6.989199e-01 -0.070520958 0.104984454\n", - "X1 _||_ X3 -0.010916013 8.077221e-01 -0.098512922 0.076848315\n", - "X1 _||_ Y | D, X2, X3 -0.305955862 2.138011e-12 -0.384173754 -0.224067782\n", - "X1 _||_ Y | D, X2, Z2 -0.404061952 1.670319e-21 -0.476899981 -0.328017298\n", - "X1 _||_ Y | D, X2, Z1 -0.400948576 3.686103e-21 -0.473963077 -0.324683850\n", - "X1 _||_ Z2 -0.081131074 6.988127e-02 -0.167637985 0.006606516\n", - "X2 _||_ X3 | Z2 0.012497661 7.807432e-01 -0.075363664 0.100166937\n", - "X3 _||_ Z1 -0.033463732 4.554848e-01 -0.120801562 0.054386481\n", - "Y _||_ Z1 | X1, X2, Z2 0.019055134 6.718774e-01 -0.069015721 0.106832097\n", - "Y _||_ Z1 | D, X2, Z2 -0.046391382 3.021468e-01 -0.133803384 0.041733997\n", - "Y _||_ Z1 | X1, X2, X3 0.040703777 3.653664e-01 -0.047421092 0.128202210\n", - "Y _||_ Z1 | D, X2, X3 -0.008222698 8.549842e-01 -0.096108819 0.079790305\n", - "Y _||_ Z2 | X2, X3, Z1 -0.007522454 8.672142e-01 -0.095414934 0.080486104\n", - "Y _||_ Z2 | X1, X2, X3 -0.011245710 8.026191e-01 -0.099103406 0.076785498\n", - "Y _||_ Z2 | D, X2, X3 0.015905495 7.236783e-01 -0.072150652 0.103716284\n", - "Z1 _||_ Z2 0.029331384 5.130559e-01 -0.058509741 0.116723204" - ] - }, - "metadata": {} - } - ] - } - ], - "metadata": { - "kernelspec": { - "name": "ir", - "display_name": "R", - "language": "R" - }, - "language_info": { - "name": "R", - "codemirror_mode": "r", - "pygments_lexer": "r", - "mimetype": "text/x-r-source", - "file_extension": ".r", - "version": "3.6.3" - }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "POIGemzqP6P9" + }, + "source": [ + "There are system packages that some of the R packages need. We install them here." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9TqXQzIlOelc" + }, + "outputs": [], + "source": [ + "system('sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable')\n", + "system('sudo apt-get update')\n", + "system('sudo apt-get install libglpk-dev libgmp-dev libxml2-dev')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mll4gH73k2-A" + }, + "source": [ + "# Causal Identification in DAGs using Backdoor and Swigs, Equivalence Classes, Falsifiability Tests\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "_execution_state": "idle", + "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "md3VArZXk2-G", + "outputId": "2b1a6a60-18a4-4de3-cec6-de65a2516641" + }, + "outputs": [], + "source": [ + "#install and load package\n", + "install.packages(\"dagitty\")\n", + "install.packages(\"ggdag\")\n", + "library(dagitty)\n", + "library(ggdag)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5DaJmPrbk2-J" + }, + "source": [ + "# Graph Generation and Plotting" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eeldklHqk2-K" + }, + "source": [ + "The following DAG is due to Judea Pearl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 437 + }, + "id": "1fgBFtRxk2-K", + "outputId": "88af1fad-7fbe-46a6-986c-637462e8e86d" + }, + "outputs": [], + "source": [ + "#generate a couple of DAGs and plot them\n", + "\n", + "G = dagitty('dag{\n", + "Z1 [pos=\"-2,-1.5\"]\n", + "X1 [pos=\"-2,0\"]\n", + "Z2 [pos=\"1.5,-1.5\"]\n", + "X3 [pos=\"1.5, 0\"]\n", + "Y [outcome,pos=\"1.5,1.5\"]\n", + "D [exposure,pos=\"-2,1.5\"]\n", + "M [mediator, pos=\"0,1.5\"]\n", + "X2 [pos=\"0,0\"]\n", + "Z1 -> X1\n", + "X1 -> D\n", + "Z1 -> X2\n", + "Z2 -> X3\n", + "X3 -> Y\n", + "Z2 -> X2\n", + "D -> Y\n", + "X2 -> Y\n", + "X2 -> D\n", + "M->Y\n", + "D->M\n", + "}')\n", + "\n", + "\n", + "ggdag(G)+ theme_dag()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SfZKYPLgk2-K" + }, + "source": [ + "# Report Relatives of X2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PDE3ROjfk2-P", + "outputId": "0890ac94-4658-4ec7-b984-e7d0b2913483" + }, + "outputs": [], + "source": [ + "print(parents(G, \"X2\"))\n", + "print(children(G, \"X2\"))\n", + "print(ancestors(G, \"X2\"))\n", + "print(descendants(G, \"X2\"))\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LEAIT0-_k2-R" + }, + "source": [ + "# Find Paths Between D and Y\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 114 + }, + "id": "5JhG_60wk2-R", + "outputId": "10ada2ca-f201-4a3a-ff84-89b2948775d6" + }, + "outputs": [], + "source": [ + "paths(G, \"D\", \"Y\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i8h9AIZYk2-S" + }, + "source": [ + "# List All Testable Implications of the Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "B5LnSoCik2-T", + "outputId": "a03a58e3-5ae1-4d22-beea-ff8f97f6c64b" + }, + "outputs": [], + "source": [ + "print( impliedConditionalIndependencies(G) )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": 
"TKxYwKyuk2-U" + }, + "source": [ + "# Identification by Backdoor: List minimal adjustment sets to identify causal effecs $D \\to Y$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dXKGvXgTk2-V", + "outputId": "daefd9dd-eba8-4057-b2d8-e5347ee30c79" + }, + "outputs": [], + "source": [ + "print( adjustmentSets( G, \"D\", \"Y\" ) )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4QqmWahfk2-W" + }, + "source": [ + "# Identification via SWIG and D-separation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 437 + }, + "id": "Zv3rbjEuk2-W", + "outputId": "ce658fd8-b475-4deb-c809-602de1686b60" + }, + "outputs": [], + "source": [ + "SWIG = dagitty('dag{\n", + "Z1 [pos=\"-2,-1.5\"]\n", + "X1 [pos=\"-2,0\"]\n", + "Z2 [pos=\"1.5,-1.5\"]\n", + "X3 [pos=\"1.5, 0\"]\n", + "Yd [outcome,pos=\"1.5,1.5\"]\n", + "D [exposure,pos=\"-2,1.5\"]\n", + "d [pos=\"-1, 1.5\"]\n", + "Md [mediator, pos=\"0,1.5\"]\n", + "X2 [pos=\"0,0\"]\n", + "Z1 -> X1\n", + "X1 -> D\n", + "Z1 -> X2\n", + "Z2 -> X3\n", + "X3 -> Yd\n", + "Z2 -> X2\n", + "X2 -> Yd\n", + "X2 -> D\n", + "X3-> Yd\n", + "Md-> Yd\n", + "d-> Md\n", + "}')\n", + "\n", + "ggdag(SWIG)+ theme_dag()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7Bdrp3mOk2-W" + }, + "source": [ + "\n", + "# Deduce Conditional Exogeneity or Ignorability by D-separation\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "provenance": [] - } + "base_uri": "https://localhost:8080/" + }, + "id": "KwJgFzoqk2-X", + "outputId": "b66ee434-d948-4c7e-cf45-0fbc6843c6cc" + }, + "outputs": [], + "source": [ + "print( impliedConditionalIndependencies(SWIG)[5:8] )\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wgaiPTysk2-X" + }, + "source": [ + "This coincides with the backdoor criterion for this graph." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iHHStLhqk2-X" + }, + "source": [ + "# Print All Average Effects Identifiable by Conditioning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OcBfUurBk2-X", + "outputId": "b2a8b5c2-3407-48e4-e68e-61f19ac9e598" + }, + "outputs": [], + "source": [ + "for( n in names(G) ){\n", + " for( m in children(G,n) ){\n", + " a <- adjustmentSets( G, n, m )\n", + " if( length(a) > 0 ){\n", + " cat(\"The effect \",n,\"->\",m,\n", + " \" is identifiable by controlling for:\\n\",sep=\"\")\n", + " print( a, prefix=\" * \" )\n", + " }\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gZRMiUymk2-Y" + }, + "source": [ + "# Equivalence Classes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 437 + }, + "id": "8vNiL5HWk2-Y", + "outputId": "fcd29ac8-675b-4fb1-8a95-8e62ccba8d44" + }, + "outputs": [], + "source": [ + "P=equivalenceClass(G)\n", + "plot(P)\n", + "#equivalentDAGs(G,10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mUAnGdrkk2-Z" + }, + "source": [ + "Next Consider the elemntary Triangular Model:\n", + "$$\n", + "D \\to Y, \\quad X \\to (D,Y).\n", + "$$\n", + "This model has no testable implications and is Markov-equivalent to any other DAG difined on names $(X, D, Y)$." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 437 + }, + "id": "pBHDnH7Fk2-Z", + "outputId": "ea6c8fd5-4e90-4ce6-e84b-449d9c49d8b5" + }, + "outputs": [], + "source": [ + "G3<- dagitty('dag{\n", + "D -> Y\n", + "X -> D\n", + "X -> Y\n", + "}\n", + "')\n", + "\n", + "ggdag(G3)+ theme_dag()\n", + "\n", + "print(impliedConditionalIndependencies(G3))\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "1cw47mOEk2-Z", + "outputId": "9e4f3af6-1f2b-4eac-ab68-841aaebaa87d" + }, + "outputs": [], + "source": [ + "P=equivalenceClass(G3)\n", + "plot(P)\n", + "equivalentDAGs(G3,10)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kjil8zIfk2-Z" + }, + "source": [ + "# Example of Testing DAG Validity\n", + "\n", + "Next we simulate the data from a Linear SEM associated to DAG G, and perform a test of conditional independence restrictions, exploting linearity.\n", + "\n", + "\n", + "There are many other options for nonlinear models and discrete categorical variabales. Type help(localTests)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "MZ_4jxNdk2-a", + "outputId": "d91485d7-311d-4ae2-ea60-2e7f87a2fb96" + }, + "outputs": [], + "source": [ + "set.seed(1)\n", + "x <- simulateSEM(G)\n", + "head(x)\n", + "#cov(x)\n", + "localTests(G, data = x, type = c(\"cis\"))\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aUQHK-vEk2-a" + }, + "source": [ + "Next we replaced $D$ by $\\bar D$ generated differently:\n", + "$$\n", + "\\bar D= (D + Y)/2.\n", + "$$\n", + "$\\bar D$ is an average of $D$ and $Y$ generated by $D$. We then test if the resulting collection of random variables satisifes conditional independence restrictions, exploiting linearity. We end up rejectiong these restrictions and therefore the validity of this model for the data generated in this way. This makes sense, because the new data no longer obeys the previous DAG structure.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "C0ND4GbEk2-a", + "outputId": "37236927-3ece-458f-fcd4-820422609d7b" + }, + "outputs": [], + "source": [ + "x.R = x\n", + "x.R$D = (x$D+ x$Y)/2\n", + "\n", + "localTests(G, data = x.R, type = c(\"cis\"))\n", + "\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/CM3/r-dosearch.Rmd b/CM3/r-dosearch.Rmd new file mode 100644 index 00000000..babe674c --- /dev/null +++ b/CM3/r-dosearch.Rmd @@ -0,0 +1,205 @@ +--- +title: An R Markdown document converted from "CM3/r-dosearch.irnb" +output: html_document +--- + +# Dosearch for Causal Identification in DAGs. + + +This a simple notebook for teaching that illustrates capabilites of the "dosearch" package, which is a great tool. + +NB. 
In my experience, the commands are sensitive to syntax (e.g., spacing around ->), so be careful when changing to other examples.
+
+```{r}
+install.packages("dosearch")
+library("dosearch")
+```
+
+We start with the simplest graph, with the simplest example
+where $D$ is policy, $Y$ is outcomes, $X$ is a confounder:
+$$
+D\to Y, \quad X \to (D,Y)
+$$
+
+Now suppose we want the conditional average policy effect.
+
+```{r}
+data <- "p(y,d,x)" #data structure
+
+query <- "p(y | do(d),x)" #query -- target parameter
+
+graph <- "x -> y
+ x -> d
+ d -> y"
+
+dosearch(data, query, graph)
+```
+
+This recovers the correct identification formula for the law of the counterfactual $Y(d)$ induced by $do(D=d)$:
+$$
+p_{Y(d)|X}(y|x) := p(y|do(d),x) = p(y|d,x).
+$$
+
+```{r}
+data <- "p(y,d,x)"
+
+query <- "p(y | do(d))"
+
+graph <- "x -> y
+ x -> d
+ d -> y"
+
+
+dosearch(data, query, graph)
+```
+
+This recovers the correct identification formula:
+$$
+p_{Y(d)}(y) := p(y | do(d)) = \sum_{x}\left(p(x)p(y|d,x)\right)
+$$
+We integrate out $x$ in the previous formula.
+
+Suppose we don't observe the confounder. The effect is generally not identified.
+
+```{r}
+data <- "p(y,d)"
+
+query <- "p(y | do(d))"
+
+graph <- "x -> y
+ x -> d
+ d -> y"
+
+dosearch(data, query, graph)
+```
+
+The next graph is an example from J. Pearl (in different notation), where the graph is considerably more complicated. See "Pearl's Example" in the book - e.g. Figure 7.14. We are interested in $D \to Y$.
+
+Here we try conditioning on $X_2$. This would block one backdoor path from $D$ to $Y$, but would open another path on which $X_2$ is a collider, so this shouldn't work. The application below gave the correct answer (after I put the spacings in carefully).
+
+```{r}
+
+data <- "p(y,d,x2)" #observed only (Y, D, X_2)
+
+query<- "p(y|do(d))" #target parameter
+
+graph<- "z1 -> x1
+z1 -> x2
+z2 -> x2
+z2 -> x3
+x2 -> d
+x2 -> y
+x3 -> y
+x1 -> d
+d -> m
+m -> y
+"
+
+dosearch(data, query, graph)
+```
+
+Intuitively, we should add more common causes. For example, adding $X_3$ and using $S = (X_2, X_3)$ should work.
+
+```{r}
+
+data <- "p(y,d,x2,x3)"
+
+conditional.query<- "p(y|do(d),x2, x3)" #can ID conditional average effect?
+query<- "p(y|do(d))" #can ID unconditional effect?
+
+graph<- "z1 -> x1
+z1 -> x2
+z2 -> x2
+z2 -> x3
+x2 -> d
+x2 -> y
+x3 -> y
+x1 -> d
+d -> m
+m -> y
+"
+
+print(dosearch(data, conditional.query, graph))
+print(dosearch(data, query, graph))
+```
+
+This retrieves the correct formulas for the counterfactual distributions of $Y(d)$ induced by $do(D=d)$:
+
+The conditional distribution is identified by
+$$
+p_{Y(d) \mid X_2, X_3}(y|x_2,x_3) := p(y | do(d), x_2, x_3) = p(y|x_2,x_3,d).
+$$
+
+The unconditional distribution is obtained by integrating out $x_2$ and $x_3$:
+
+$$
+p_{Y(d)}(y) := p(y | do(d)) = \sum_{x2,x3}\left(p(x_2,x_3)p(y|x_2,x_3,d)\right).
+$$
+
+
+Next we suppose that we observe only $(Y, D, M)$. Can we identify the effect $D \to Y$? Can we use the back-door criterion twice to get the $D \to M$ and $M \to Y$ effects? Yes, that's called the front-door criterion -- so we just need to remember only the back-door criterion and the fact that we can use it iteratively.
+ +```{r} +data <- "p(y,d, m)" + +query.dm<- "p(m|do(d))" +query.md<- "p(y|do(m))" +query<- "p(y|do(d))" + +graph<- "z1 -> x1 +z1 -> x2 +z2 -> x2 +z2 -> x3 +x2 -> d +x2 -> y +x3 -> y +x1 -> d +d -> m +m -> y +" +print(dosearch(data, query.dm, graph)) +print(dosearch(data, query.md, graph)) +print(dosearch(data, query, graph)) +``` + +So we get identification results: +First, +$$ +p_{M(d)}(m) := p(m|do(d)) = p(m|d). +$$ +Second, +$$ +p_{Y(m)}(y) := p(y|do(m)) = \sum_{d}\left(p(d)p(y|d,m)\right), +$$ +and the last by integrating the product of these two formulas: +$$ +p_{Y(d)}(y) := p(y|do(d)) = \sum_{m}\left(p(m|d)\sum_{d}\left(p(d)p(y|d,m)\right)\right) +$$ + +The package is very rich and allows identification analysis, when the data comes from multiple sources. Suppose we observe marginal distributions $(Y,D)$ and $(D,M)$ only. Can we identify the effect of $D \to Y$. The answer is (guess) and the package correctly recovers it. + +```{r} +data <- "p(y,m) + p(m,d)" + +query.dm<- "p(m|do(d))" +query.md<- "p(y|do(m))" +query<- "p(y|do(d))" + +graph<- "z1 -> x1 +z1 -> x2 +z2 -> x2 +z2 -> x3 +x2 -> d +x2 -> y +x3 -> y +x1 -> d +d -> m +m -> y +" +print(dosearch(data, query.dm, graph)) +print(dosearch(data, query.md, graph)) +print(dosearch(data, query, graph)) +``` + diff --git a/CM3/r-dosearch.irnb b/CM3/r-dosearch.irnb index 36cc8b06..6125719e 100644 --- a/CM3/r-dosearch.irnb +++ b/CM3/r-dosearch.irnb @@ -1,492 +1,402 @@ { - "cells": [ - { - "metadata": { - "id": "GckAu5W7kzXS" - }, - "cell_type": "markdown", - "source": [ - "# Dosearch for Causal Identification in DAGs.\n", - "\n", - "\n", - "This a simple notebook for teaching that illustrates capabilites of the \"dosearch\" package, which is a great tool.\n", - "\n", - "NB. In my experience, the commands are sensitive to syntax ( e.g. spacing when -> are used), so be careful when changing to other examples." - ] - }, - { - "metadata": { - "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", - "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", - "trusted": true, - "id": "_67DDyL8kzXV", - "outputId": "8c8ac5d4-b0b1-4a10-ed14-ddb50956add0", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "cell_type": "code", - "source": [ - "install.packages(\"dosearch\")\n", - "library(\"dosearch\")" - ], - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "also installing the dependency ‘Rcpp’\n", - "\n", - "\n" - ] - } - ] - }, - { - "metadata": { - "id": "8ov_tpHJkzXY" - }, - "cell_type": "markdown", - "source": [ - "We start with the simplest graph, with the simplest example\n", - "where $D$ is policy, $Y$ is outcomes, $X$ is a confounder:\n", - "$$\n", - "D\\to Y, \\quad X \\to (D,Y)\n", - "$$\n" - ] - }, - { - "metadata": { - "id": "YS75qAvjkzXY" - }, - "cell_type": "markdown", - "source": [ - "Now suppose we want conditional average policy effect." 
- ] - }, - { - "metadata": { - "trusted": true, - "id": "0vvfKbSzkzXZ", - "outputId": "2c8dd4d2-ba1b-42ea-fb02-a326b6cc62ec", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - } - }, - "cell_type": "code", - "source": [ - "data <- \"p(y,d,x)\" #data structure\n", - "\n", - "query <- \"p(y | do(d),x)\" #query -- target parameter\n", - "\n", - "graph <- \"x -> y\n", - " x -> d\n", - " d -> y\"\n", - "\n", - "dosearch(data, query, graph)" - ], - "execution_count": 2, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - "p(y|d,x) " - ] - }, - "metadata": {} - } - ] - }, - { - "metadata": { - "id": "LxrtjzV6kzXZ" - }, - "cell_type": "markdown", - "source": [ - "This recovers the correct identification formula for law of the counterfactual $Y(d)$ induced by $do(D=d)$:\n", - "$$\n", - "p_{Y(d)|X}(y|x) := p(y|do(d),x) = p(y|d,x).\n", - "$$" - ] - }, - { - "metadata": { - "trusted": true, - "id": "tXw8HnV2kzXZ", - "outputId": "29481c74-cc2e-4042-f39d-5612862bfb68", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - } - }, - "cell_type": "code", - "source": [ - "data <- \"p(y,d,x)\"\n", - "\n", - "query <- \"p(y | do(d))\"\n", - "\n", - "graph <- \"x -> y\n", - " x -> d\n", - " d -> y\"\n", - "\n", - "\n", - "dosearch(data, query, graph)\n" - ], - "execution_count": 3, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - "\\sum_{x}\\left(p(x)p(y|d,x)\\right) " - ] - }, - "metadata": {} - } - ] - }, - { - "metadata": { - "id": "8zR2Xks0kzXZ" - }, - "cell_type": "markdown", - "source": [ - "This recovers the correct identification formula:\n", - "$$\n", - "p_{Y(d)}(y) := p(y: do(d)) = \\sum_{x}\\left(p(x)p(y|d,x)\\right)\n", - "$$\n", - "We integrate out $x$ in the previous formula.\n" - ] - }, - { - "metadata": { - "id": "0CFswGBmkzXZ" - }, - "cell_type": "markdown", - "source": [ - "Suppose we don't observe the confounder. The effect is generally not identified.\n" - ] - }, - { - "metadata": { - "trusted": true, - "id": "cjis082KkzXa", - "outputId": "c803c856-acab-4cc7-9e20-f6e5c790ffc1", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - } - }, - "cell_type": "code", - "source": [ - "data <- \"p(y,d)\"\n", - "\n", - "query <- \"p(y | do(d))\"\n", - "\n", - "graph <- \"x -> y\n", - " x -> d\n", - " d -> y\"\n", - "\n", - "dosearch(data, query, graph)" - ], - "execution_count": 4, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - "The query p(y | do(d)) is non-identifiable. " - ] - }, - "metadata": {} - } - ] - }, - { - "metadata": { - "id": "BHgbhd8pkzXa" - }, - "cell_type": "markdown", - "source": [ - "The next graph is an example of J. Pearl (different notation), where the graph is considerably more complicated. See \"Pearl's Example\" in the book - e.g. Figure 7.14. We are interested in $D \\to Y$.\n", - "\n", - "Here we try conditioning on $X_2$. This would block one backdoor path from $D$ to $Y$, but would open another path on which $X_2$ is a collider, so this shouldn't work. 
The application below gave a correct answer (after I put the spacings carefully).\n" - ] - }, - { - "metadata": { - "trusted": true, - "id": "J4mz88VZkzXa", - "outputId": "30f73953-79de-448d-ddc2-aa470a97840e", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - } - }, - "cell_type": "code", - "source": [ - "\n", - "data <- \"p(y,d,x2)\" #observed only (Y, D, X_2)\n", - "\n", - "query<- \"p(y|do(d))\" #target parameter\n", - "\n", - "graph<- \"z1 -> x1\n", - "z1 -> x2\n", - "z2 -> x2\n", - "z2 -> x3\n", - "x2 -> d\n", - "x2 -> y\n", - "x3 -> y\n", - "x1 -> d\n", - "d -> m\n", - "m -> y\n", - "\"\n", - "\n", - "dosearch(data, query, graph)\n" - ], - "execution_count": 5, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - "The query p(y|do(d)) is non-identifiable. " - ] - }, - "metadata": {} - } - ] - }, - { - "metadata": { - "id": "w_JQY9sPkzXa" - }, - "cell_type": "markdown", - "source": [ - "Intuitively, we should add more common causes. For example, adding $X_3$ and using $S = (X_2, X_3)$ should work." - ] - }, - { - "metadata": { - "trusted": true, - "id": "wGIVqPAIkzXb", - "outputId": "0402b81b-ca16-4274-df93-33c0c2ae2327", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "cell_type": "code", - "source": [ - "\n", - "data <- \"p(y,d,x2,x3)\"\n", - "\n", - "conditional.query<- \"p(y|do(d),x2, x3)\" #can ID conditional average effect?\n", - "query<- \"p(y|do(d))\" #can ID unconditional effect?\n", - "\n", - "graph<- \"z1 -> x1\n", - "z1 -> x2\n", - "z2 -> x2\n", - "z2 -> x3\n", - "x2 -> d\n", - "x2 -> y\n", - "x3 -> y\n", - "x1 -> d\n", - "d -> m\n", - "m -> y\n", - "\"\n", - "\n", - "print(dosearch(data, conditional.query, graph))\n", - "print(dosearch(data, query, graph))\n" - ], - "execution_count": 6, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "p(y|x2,x3,d) \n", - "\\sum_{x2,x3}\\left(p(x2,x3)p(y|x2,x3,d)\\right) \n" - ] - } - ] - }, - { - "metadata": { - "id": "kAAC1aPokzXb" - }, - "cell_type": "markdown", - "source": [ - "This retrieves the correct formulas for counterfactual distributions of $Y(d)$ induced by $Do(D=d)$:\n", - "\n", - "The conditional distribution is identified by\n", - "$$\n", - "p_{Y(d) \\mid X_2, X_3}(y) := p(y |x_2, x_3: do(d)) = p(y|x_2,x_3,d).\n", - "$$\n", - "\n", - "The unconditional distribution is obtained by integration out over $x_2$ and $x_3$:\n", - "\n", - "$$\n", - "p_{Y(d) }(y) := p(y do(d)) = \\sum_{x2,x3}\\left(p(x_2,x_3)p(y|x_2,x_3,d)\\right).\n", - "$$\n", - "\n", - "\n" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "GckAu5W7kzXS" + }, + "source": [ + "# Dosearch for Causal Identification in DAGs.\n", + "\n", + "\n", + "This a simple notebook for teaching that illustrates capabilites of the \"dosearch\" package, which is a great tool.\n", + "\n", + "NB. In my experience, the commands are sensitive to syntax ( e.g. spacing when -> are used), so be careful when changing to other examples." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", + "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "metadata": { - "id": "6D9XRwyFkzXb" - }, - "cell_type": "markdown", - "source": [ - "Next we suppose that we observe only $(Y,D, M)$. Can we identify the effect $D \\to Y$? Can we use back-door-criterion twice to get $D \\to M$ and $M \\to Y$ affect? 
Yes, that's called front-door criterion -- so we just need to remember only the back-door and the fact that we can use it iteratively." - ] + "id": "_67DDyL8kzXV", + "outputId": "8c8ac5d4-b0b1-4a10-ed14-ddb50956add0" + }, + "outputs": [], + "source": [ + "install.packages(\"dosearch\")\n", + "library(\"dosearch\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8ov_tpHJkzXY" + }, + "source": [ + "We start with the simplest graph, with the simplest example\n", + "where $D$ is policy, $Y$ is outcomes, $X$ is a confounder:\n", + "$$\n", + "D\\to Y, \\quad X \\to (D,Y)\n", + "$$\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YS75qAvjkzXY" + }, + "source": [ + "Now suppose we want conditional average policy effect." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 }, - { - "metadata": { - "trusted": true, - "id": "eaPrMHWTkzXb", - "outputId": "4cc25d7c-75ba-44a6-93df-41359b55495b", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "cell_type": "code", - "source": [ - "data <- \"p(y,d, m)\"\n", - "\n", - "query.dm<- \"p(m|do(d))\"\n", - "query.md<- \"p(y|do(m))\"\n", - "query<- \"p(y|do(d))\"\n", - "\n", - "graph<- \"z1 -> x1\n", - "z1 -> x2\n", - "z2 -> x2\n", - "z2 -> x3\n", - "x2 -> d\n", - "x2 -> y\n", - "x3 -> y\n", - "x1 -> d\n", - "d -> m\n", - "m -> y\n", - "\"\n", - "print(dosearch(data, query.dm, graph))\n", - "print(dosearch(data, query.md, graph))\n", - "print(dosearch(data, query, graph))\n" - ], - "execution_count": 7, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "p(m|d) \n", - "\\sum_{d}\\left(p(d)p(y|d,m)\\right) \n", - "\\sum_{m}\\left(p(m|d)\\sum_{d}\\left(p(d)p(y|d,m)\\right)\\right) \n" - ] - } - ] + "id": "0vvfKbSzkzXZ", + "outputId": "2c8dd4d2-ba1b-42ea-fb02-a326b6cc62ec" + }, + "outputs": [], + "source": [ + "data <- \"p(y,d,x)\" #data structure\n", + "\n", + "query <- \"p(y | do(d),x)\" #query -- target parameter\n", + "\n", + "graph <- \"x -> y\n", + " x -> d\n", + " d -> y\"\n", + "\n", + "dosearch(data, query, graph)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LxrtjzV6kzXZ" + }, + "source": [ + "This recovers the correct identification formula for law of the counterfactual $Y(d)$ induced by $do(D=d)$:\n", + "$$\n", + "p_{Y(d)|X}(y|x) := p(y|do(d),x) = p(y|d,x).\n", + "$$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 }, - { - "metadata": { - "id": "Khy7jCIJkzXb" - }, - "cell_type": "markdown", - "source": [ - "So we get identification results:\n", - "First,\n", - "$$\n", - "p_{M(d)}(m) := p(m|do(d)) = p(m|d).\n", - "$$\n", - "Second,\n", - "$$\n", - "p_{Y(m)}(y) := p(y|do(m)) = \\sum_{d}\\left(p(d)p(y|d,m)\\right),\n", - "$$\n", - "and the last by integrating the product of these two formulas:\n", - "$$\n", - "p_{Y(d)}(y) := p(y|do(d)) = \\sum_{m}\\left(p(m|d)\\sum_{d}\\left(p(d)p(y|d,m)\\right)\\right)\n", - "$$\n" - ] + "id": "tXw8HnV2kzXZ", + "outputId": "29481c74-cc2e-4042-f39d-5612862bfb68" + }, + "outputs": [], + "source": [ + "data <- \"p(y,d,x)\"\n", + "\n", + "query <- \"p(y | do(d))\"\n", + "\n", + "graph <- \"x -> y\n", + " x -> d\n", + " d -> y\"\n", + "\n", + "\n", + "dosearch(data, query, graph)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8zR2Xks0kzXZ" + }, + "source": [ + "This recovers the correct identification formula:\n", 
+ "$$\n", + "p_{Y(d)}(y) := p(y: do(d)) = \\sum_{x}\\left(p(x)p(y|d,x)\\right)\n", + "$$\n", + "We integrate out $x$ in the previous formula.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0CFswGBmkzXZ" + }, + "source": [ + "Suppose we don't observe the confounder. The effect is generally not identified.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 }, - { - "metadata": { - "id": "hpL6jYtPkzXb" - }, - "cell_type": "markdown", - "source": [ - "The package is very rich and allows identification analysis, when the data comes from multiple sources. Suppose we observe marginal distributions $(Y,D)$ and $(D,M)$ only. Can we identify the effect of $D \\to Y$. The answer is (guess) and the package correctly recovers it." - ] + "id": "cjis082KkzXa", + "outputId": "c803c856-acab-4cc7-9e20-f6e5c790ffc1" + }, + "outputs": [], + "source": [ + "data <- \"p(y,d)\"\n", + "\n", + "query <- \"p(y | do(d))\"\n", + "\n", + "graph <- \"x -> y\n", + " x -> d\n", + " d -> y\"\n", + "\n", + "dosearch(data, query, graph)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BHgbhd8pkzXa" + }, + "source": [ + "The next graph is an example of J. Pearl (different notation), where the graph is considerably more complicated. See \"Pearl's Example\" in the book - e.g. Figure 7.14. We are interested in $D \\to Y$.\n", + "\n", + "Here we try conditioning on $X_2$. This would block one backdoor path from $D$ to $Y$, but would open another path on which $X_2$ is a collider, so this shouldn't work. The application below gave a correct answer (after I put the spacings carefully).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 }, - { - "metadata": { - "trusted": true, - "id": "x-ACaOv_kzXc", - "outputId": "e5ba4476-2eb2-4806-fa2d-71c73dac986d", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "cell_type": "code", - "source": [ - "data <- \"p(y,m)\n", - " p(m,d)\"\n", - "\n", - "query.dm<- \"p(m|do(d))\"\n", - "query.md<- \"p(y|do(m))\"\n", - "query<- \"p(y|do(d))\"\n", - "\n", - "graph<- \"z1 -> x1\n", - "z1 -> x2\n", - "z2 -> x2\n", - "z2 -> x3\n", - "x2 -> d\n", - "x2 -> y\n", - "x3 -> y\n", - "x1 -> d\n", - "d -> m\n", - "m -> y\n", - "\"\n", - "print(dosearch(data, query.dm, graph))\n", - "print(dosearch(data, query.md, graph))\n", - "print(dosearch(data, query, graph))" - ], - "execution_count": 8, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "p(m|d) \n", - "The query p(y|do(m)) is non-identifiable. \n", - "The query p(y|do(d)) is non-identifiable. \n" - ] - } - ] - } - ], - "metadata": { - "kernelspec": { - "name": "ir", - "display_name": "R", - "language": "R" + "id": "J4mz88VZkzXa", + "outputId": "30f73953-79de-448d-ddc2-aa470a97840e" + }, + "outputs": [], + "source": [ + "\n", + "data <- \"p(y,d,x2)\" #observed only (Y, D, X_2)\n", + "\n", + "query<- \"p(y|do(d))\" #target parameter\n", + "\n", + "graph<- \"z1 -> x1\n", + "z1 -> x2\n", + "z2 -> x2\n", + "z2 -> x3\n", + "x2 -> d\n", + "x2 -> y\n", + "x3 -> y\n", + "x1 -> d\n", + "d -> m\n", + "m -> y\n", + "\"\n", + "\n", + "dosearch(data, query, graph)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w_JQY9sPkzXa" + }, + "source": [ + "Intuitively, we should add more common causes. For example, adding $X_3$ and using $S = (X_2, X_3)$ should work." 
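A short, hedged illustration of what adjusting on $S = (X_2, X_3)$ amounts to in practice: the back-door formula can be estimated by regressing the outcome on the treatment and the adjustment set and then averaging predictions over the empirical distribution of the controls (regression standardization). The snippet below uses simulated data and hypothetical names (`sim`, `backdoor_mean`); it is not part of the dosearch notebook.

```r
# Regression-standardization estimate of E[Y | do(D = d)] under the back-door
# adjustment sum_{x2,x3} p(x2,x3) p(y|x2,x3,d).
backdoor_mean <- function(sim, d0) {
  fit <- lm(y ~ d + x2 + x3, data = sim)        # any flexible outcome regression would do
  mean(predict(fit, transform(sim, d = d0)))    # average over the empirical law of (x2, x3)
}

set.seed(123)
n  <- 5000
x2 <- rnorm(n)
x3 <- rnorm(n)
d  <- rbinom(n, 1, plogis(x2))                  # treatment depends on x2
y  <- 1 + 2 * d + x2 + x3 + rnorm(n)            # outcome depends on d, x2, x3
sim <- data.frame(y, d, x2, x3)

backdoor_mean(sim, 1) - backdoor_mean(sim, 0)   # close to the coefficient 2 used above
```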
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - "language_info": { - "name": "R", - "codemirror_mode": "r", - "pygments_lexer": "r", - "mimetype": "text/x-r-source", - "file_extension": ".r", - "version": "3.6.3" + "id": "wGIVqPAIkzXb", + "outputId": "0402b81b-ca16-4274-df93-33c0c2ae2327" + }, + "outputs": [], + "source": [ + "\n", + "data <- \"p(y,d,x2,x3)\"\n", + "\n", + "conditional.query<- \"p(y|do(d),x2, x3)\" #can ID conditional average effect?\n", + "query<- \"p(y|do(d))\" #can ID unconditional effect?\n", + "\n", + "graph<- \"z1 -> x1\n", + "z1 -> x2\n", + "z2 -> x2\n", + "z2 -> x3\n", + "x2 -> d\n", + "x2 -> y\n", + "x3 -> y\n", + "x1 -> d\n", + "d -> m\n", + "m -> y\n", + "\"\n", + "\n", + "print(dosearch(data, conditional.query, graph))\n", + "print(dosearch(data, query, graph))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kAAC1aPokzXb" + }, + "source": [ + "This retrieves the correct formulas for counterfactual distributions of $Y(d)$ induced by $Do(D=d)$:\n", + "\n", + "The conditional distribution is identified by\n", + "$$\n", + "p_{Y(d) \\mid X_2, X_3}(y) := p(y |x_2, x_3: do(d)) = p(y|x_2,x_3,d).\n", + "$$\n", + "\n", + "The unconditional distribution is obtained by integration out over $x_2$ and $x_3$:\n", + "\n", + "$$\n", + "p_{Y(d) }(y) := p(y do(d)) = \\sum_{x2,x3}\\left(p(x_2,x_3)p(y|x_2,x_3,d)\\right).\n", + "$$\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6D9XRwyFkzXb" + }, + "source": [ + "Next we suppose that we observe only $(Y,D, M)$. Can we identify the effect $D \\to Y$? Can we use back-door-criterion twice to get $D \\to M$ and $M \\to Y$ affect? Yes, that's called front-door criterion -- so we just need to remember only the back-door and the fact that we can use it iteratively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "eaPrMHWTkzXb", + "outputId": "4cc25d7c-75ba-44a6-93df-41359b55495b" + }, + "outputs": [], + "source": [ + "data <- \"p(y,d, m)\"\n", + "\n", + "query.dm<- \"p(m|do(d))\"\n", + "query.md<- \"p(y|do(m))\"\n", + "query<- \"p(y|do(d))\"\n", + "\n", + "graph<- \"z1 -> x1\n", + "z1 -> x2\n", + "z2 -> x2\n", + "z2 -> x3\n", + "x2 -> d\n", + "x2 -> y\n", + "x3 -> y\n", + "x1 -> d\n", + "d -> m\n", + "m -> y\n", + "\"\n", + "print(dosearch(data, query.dm, graph))\n", + "print(dosearch(data, query.md, graph))\n", + "print(dosearch(data, query, graph))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Khy7jCIJkzXb" + }, + "source": [ + "So we get identification results:\n", + "First,\n", + "$$\n", + "p_{M(d)}(m) := p(m|do(d)) = p(m|d).\n", + "$$\n", + "Second,\n", + "$$\n", + "p_{Y(m)}(y) := p(y|do(m)) = \\sum_{d}\\left(p(d)p(y|d,m)\\right),\n", + "$$\n", + "and the last by integrating the product of these two formulas:\n", + "$$\n", + "p_{Y(d)}(y) := p(y|do(d)) = \\sum_{m}\\left(p(m|d)\\sum_{d}\\left(p(d)p(y|d,m)\\right)\\right)\n", + "$$\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hpL6jYtPkzXb" + }, + "source": [ + "The package is very rich and allows identification analysis, when the data comes from multiple sources. Suppose we observe marginal distributions $(Y,D)$ and $(D,M)$ only. Can we identify the effect of $D \\to Y$. The answer is (guess) and the package correctly recovers it." 
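As a hedged aside on the multi-source analysis: the `data` argument of `dosearch()` simply lists one available distribution per line, as the notebook's own calls show, so it is easy to probe which additional sources would restore identification. The example below is illustrative only; the third source, the full joint `p(y,d,m)`, is hypothetical here, though the notebook already established that this joint alone yields the front-door formula.

```r
# Probe identification with an extra (hypothetical) data source appended.
library(dosearch)

graph <- "z1 -> x1
z1 -> x2
z2 -> x2
z2 -> x3
x2 -> d
x2 -> y
x3 -> y
x1 -> d
d -> m
m -> y
"

data_combined <- "p(y,m)
p(m,d)
p(y,d,m)"

print(dosearch(data_combined, "p(y|do(d))", graph))
```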
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "provenance": [] - } + "base_uri": "https://localhost:8080/" + }, + "id": "x-ACaOv_kzXc", + "outputId": "e5ba4476-2eb2-4806-fa2d-71c73dac986d" + }, + "outputs": [], + "source": [ + "data <- \"p(y,m)\n", + " p(m,d)\"\n", + "\n", + "query.dm<- \"p(m|do(d))\"\n", + "query.md<- \"p(y|do(m))\"\n", + "query<- \"p(y|do(d))\"\n", + "\n", + "graph<- \"z1 -> x1\n", + "z1 -> x2\n", + "z2 -> x2\n", + "z2 -> x3\n", + "x2 -> d\n", + "x2 -> y\n", + "x3 -> y\n", + "x1 -> d\n", + "d -> m\n", + "m -> y\n", + "\"\n", + "print(dosearch(data, query.dm, graph))\n", + "print(dosearch(data, query.md, graph))\n", + "print(dosearch(data, query, graph))" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 7adf2e6ac5c6be20565c3e601df3fb36226d8a05 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 15 Jul 2024 13:18:34 +0000 Subject: [PATCH 072/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in AC2 --- ...iased-ml-for-partially-linear-iv-model.Rmd | 271 + ...ased-ml-for-partially-linear-iv-model.irnb | 970 ++-- AC2/r-dml-401k-IV.Rmd | 1151 +++++ AC2/r-dml-401k-IV.irnb | 4384 ++++++++--------- AC2/r-weak-iv-experiments.Rmd | 74 + AC2/r-weak-iv-experiments.irnb | 323 +- 6 files changed, 4270 insertions(+), 2903 deletions(-) create mode 100644 AC2/r-debiased-ml-for-partially-linear-iv-model.Rmd create mode 100644 AC2/r-dml-401k-IV.Rmd create mode 100644 AC2/r-weak-iv-experiments.Rmd diff --git a/AC2/r-debiased-ml-for-partially-linear-iv-model.Rmd b/AC2/r-debiased-ml-for-partially-linear-iv-model.Rmd new file mode 100644 index 00000000..8bace899 --- /dev/null +++ b/AC2/r-debiased-ml-for-partially-linear-iv-model.Rmd @@ -0,0 +1,271 @@ +--- +title: An R Markdown document converted from "AC2/r-debiased-ml-for-partially-linear-iv-model.irnb" +output: html_document +--- + +# Double/Debiased ML for Partially Linear IV Model + +This is a simple implementation of Debiased Machine Learning for the Partially Linear +IV Regression Model, which provides an application of DML IV inference. + + +Reference: + +- https://arxiv.org/abs/1608.00060 +- https://www.amazon.com/Business-Data-Science-Combining-Accelerate/dp/1260452778 + +The code is based on the book. + + +# Partially Linear IV Model + +We consider the partially linear structural equation model: +\begin{eqnarray} + & Y - D\theta_0 = g_0(X) + \zeta, & E[\zeta \mid Z,X]= 0,\\ + & Z = m_0(X) + V, & E[V \mid X] = 0. +\end{eqnarray} + + +Note that this model is not a regression model unless $Z=D$. The model is a canonical +model in causal inference, going back to P. Wright's work on IV methods for estimating demand/supply equations, with the modern difference being that $g_0$ and $m_0$ are nonlinear, potentially complicated functions of high-dimensional $X$. + + +The idea of this model is that there is a structural or causal relation between $Y$ and $D$, captured by $\theta_0$, and $g_0(X) + \zeta$ is the stochastic error, partly explained by covariates $X$. $V$ and $\zeta$ are stochastic errors that are not explained by $X$. 
Since $Y$ and $D$ are jointly determined, we need an external factor, commonly referred to as an instrument, $Z$ to create exogenous variation
+in $D$. Note that $Z$ should affect $D$. The $X$ here serve again as confounding factors, so we can think of variation in $Z$ as being exogenous only conditional on $X$.
+
+
+The causal DAG this model corresponds to is given by:
+$$
+Z \to D, X \to (Y, Z, D), L \to (Y,D),
+$$
+where $L$ is the latent confounder affecting both $Y$ and $D$, but not $Z$.
+
+
+
+---
+
+# Example
+
+A simple contextual example is from biostatistics, where $Y$ is a health outcome and $D$ is an indicator of smoking. Thus, $\theta_0$ captures the effect of smoking on health. Health outcome $Y$ and smoking behavior $D$ are treated as being jointly determined. $X$ represents patient characteristics, and $Z$ could be a doctor's advice not to smoke (or another behavioral treatment) that may affect the outcome $Y$ only through shifting the behavior $D$, conditional on characteristics $X$.
+
+----
+
+
+
+# PLIVM in Residualized Form
+
+The PLIV model above can be rewritten in the following residualized form:
+$$
+ \tilde Y = \tilde D \theta_0 + \zeta, \quad E[\zeta \mid V,X]= 0,
+$$
+where
+$$
+ \tilde Y = (Y- \ell_0(X)), \quad \ell_0(X) = E[Y \mid X] \\
+ \tilde D = (D - r_0(X)), \quad r_0(X) = E[D \mid X] \\
+ \tilde Z = (Z- m_0(X)), \quad m_0(X) = E[Z \mid X].
+$$
+ The "tilde" variables (e.g. $\tilde Y$) above represent original variables after taking out or "partialling out"
+ the effect of $X$. Note that $\theta_0$ is identified from this equation if $\tilde Z$ (equivalently $V$)
+ and $\tilde D$ have non-zero correlation, which automatically means that both
+ must have non-zero variation.
+
+
+
+-----
+
+# DML for PLIV Model
+
+Given identification, DML proceeds as follows.
+
+Compute the estimates $\hat \ell_0$, $\hat r_0$, and $\hat m_0$, which amounts
+to solving the three problems of predicting $Y$, $D$, and $Z$ using
+$X$, using any generic ML method, giving us estimated residuals
+$$
+\tilde Y = Y - \hat \ell_0(X), \\ \tilde D= D - \hat r_0(X), \\ \tilde Z = Z- \hat m_0(X).
+$$
+The estimates should be of a cross-validated form, as detailed in the algorithm below.
+
+Estimate $\theta_0$ by the instrumental
+variable regression of $\tilde Y$ on $\tilde D$ using $\tilde Z$ as an instrument.
+Use the conventional inference for the IV regression estimator, ignoring
+the estimation error in these residuals.
+
+The reason we work with this residualized form is that it eliminates the bias
+arising when solving the prediction problem in stage 1. The role of cross-validation
+is to avoid another source of bias due to potential overfitting.
+
+The estimator is adaptive,
+in the sense that the first stage estimation errors do not affect the second
+stage errors.
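Before the notebook's own cross-fitted implementation (`DML2.for.PLIVM`, below), here is a minimal hedged sanity check of the residualized IV idea on simulated data, using plain linear regressions for the three prediction steps and no cross-fitting. All names and the data-generating process are illustrative, not part of the original notebook.

```r
# Simulated check that the residualized IV moment recovers theta0.
set.seed(1)
n      <- 5000
theta0 <- 1.5
x <- rnorm(n)                     # observed confounder
u <- rnorm(n)                     # latent factor driving both d and y
z <- x + rnorm(n)                 # instrument, exogenous given x
d <- 0.8 * z + x + u + rnorm(n)   # endogenous treatment
y <- theta0 * d + x - 2 * u + rnorm(n)

ytil <- resid(lm(y ~ x))          # partial out x from y
dtil <- resid(lm(d ~ x))          # partial out x from d
ztil <- resid(lm(z ~ x))          # partial out x from z

# IV estimate based on the moment E[(ytil - theta * dtil) * ztil] = 0
theta_hat <- sum(ztil * ytil) / sum(ztil * dtil)
theta_hat                         # should land close to theta0 = 1.5
```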
+ +```{r} +install.packages("hdm") +install.packages("AER") +install.packages("randomForest") +install.packages("lfe") +install.packages("glmnet") + +library(hdm) +library(AER) #applied econometrics library +library(randomForest) #random Forest library +library(lfe) #high-dimensional econometrics library +library(glmnet) #glm net + +set.seed(1) +``` + +```{r} +# DML for PLIVM + +DML2.for.PLIVM <- function(x, d, z, y, dreg, yreg, zreg, nfold=5) { + # this implements DML2 algorithm, where there moments are estimated via DML, before constructing + # the pooled estimate of theta randomly split data into folds + nobs <- nrow(x) + foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] + I <- split(1:nobs, foldid) + # create residualized objects to fill + ytil <- dtil <- ztil<- rep(NA, nobs) + # obtain cross-fitted residuals + cat("fold: ") + for(b in 1:length(I)){ + dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out + zfit <- zreg(x[-I[[b]],], z[-I[[b]]]) #take a fold out + yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a folot out + dhat <- predict(dfit, x[I[[b]],], type="response") #predict the fold out + zhat <- predict(zfit, x[I[[b]],], type="response") #predict the fold out + yhat <- predict(yfit, x[I[[b]],], type="response") #predict the fold out + dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual + ztil[I[[b]]] <- (z[I[[b]]] - zhat) #record residual + ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial + cat(b," ") + } + ivfit= tsls(y=ytil,d=dtil, x=NULL, z=ztil, intercept=FALSE) + coef.est <- ivfit$coef #extract coefficient + se <- ivfit$se #record standard error + cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) + return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil, ztil=ztil) ) +} + +``` + +----- + +# Emprical Example: Acemoglu, Johnson, Robinson (AER). 
+ + +* Y is log GDP; +* D is a measure of Protection from Expropriation, a proxy for +quality of insitutions; +* Z is the log of Settler's mortality; +* W are geographical variables (latitude, latitude squared, continent dummies as well as interactions) + + +```{r} +file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/AJR.csv" +AJR <- read.csv(file) +dim(AJR) +``` + +```{r} +y = AJR$GDP; +d = AJR$Exprop; +z = AJR$logMort +xraw= model.matrix(~ Latitude+ Africa+Asia + Namer + Samer, data=AJR) +x = model.matrix(~ -1 + (Latitude + Latitude2 + Africa + + Asia + Namer + Samer)^2, data=AJR) +dim(x) +``` + +```{r} +set.seed(1) +# DML with PostLasso +cat(sprintf("\n DML with Post-Lasso \n")) + +dreg <- function(x,d){ rlasso(x, d) } #ML method=lasso +yreg <- function(x,y){ rlasso(x, y) } #ML method=lasso +zreg<- function(x,z){ rlasso(x, z)} #ML method=lasso + +DML2.lasso = DML2.for.PLIVM(x, d, z, y, dreg, yreg, zreg, nfold=5) + + +# DML with Random Forest +cat(sprintf("\n DML with Random Forest \n")) + +dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest +yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest +zreg<- function(x,z){ randomForest(x, z)} #ML method=Forest + +DML2.RF = DML2.for.PLIVM(xraw, d, z, y, dreg, yreg, zreg, nfold=5) + +# Compare Forest vs Lasso +comp.tab= matrix(NA, 3, 2) +comp.tab[1,] = c( sqrt(mean((DML2.RF$ytil)^2)), sqrt(mean((DML2.lasso$ytil)^2)) ) +comp.tab[2,] = c( sqrt(mean((DML2.RF$dtil)^2)), sqrt(mean((DML2.lasso$dtil)^2)) ) +comp.tab[3,] = c( sqrt(mean((DML2.RF$ztil)^2)), sqrt(mean((DML2.lasso$ztil)^2)) ) +rownames(comp.tab) = c("RMSE for Y:", "RMSE for D:", "RMSE for Z:") +colnames(comp.tab) = c("RF", "LASSO") +print(comp.tab, digits=3) +``` + +# Weak Instruments? + +```{r} +# using lfe package +summary(felm(DML2.lasso$dtil~DML2.lasso$ztil), robust=T) +summary(felm(DML2.RF$dtil~DML2.RF$ztil), robust=T) +``` + +## Anderson-Rubin Idea for Inference with Weak Instruments + +As shown above, we may have weak instruments because the t-stat in the regression $\tilde D \sim \tilde Z$ is small relative to standard rules of thumb -- for example Stock and Yogo (2005) suggest accounting for weak instruments if the first stage F-statistic is less than 10 (and more recent work suggests even larger cutoffs). + + + Here, we consider one specific approach (from Anderson and Rubin (1949)) to doing valid inference under weak identification based upon the statistic: +$$ +C(\theta) = \frac{ |E_n [(\tilde Y - \theta \tilde D) \tilde Z]|^2}{ \mathbb{V}_n [(\tilde Y - \theta \tilde D) \tilde Z ]/n}. +$$ +The empirical variance $\mathbb{V}_n$ is defined as: +\begin{align*} +\mathbb{V}_{n}[ g(W_i)] &:= \mathbb{E}_{n}g(W_i)g(W_i)' - \mathbb{E}_{n}[ g(W_i)]\mathbb{E}_{n}[ g(W_i)]'. +\end{align*} + +If $\theta_0 = \theta$, then $C(\theta) \overset{a}{\sim} N(0,1)^2 = \chi^2(1)$. Therefore, we can reject the hypothesis $\theta_0 = \theta$ at level $a$ (for example $a = .05$ for a 5\% level test) if $C(\theta)> c(1-a)$ where $c(1-a)$ is the $(1-a)$- quantile of a $\chi^2(1)$ variable. The probability of falsely rejecting the true hypothesis is approximately $a \times 100\%$. +To construct a $(1-a) \times 100$\% confidence region for $\theta$, we can then simply invert the test by collecting all parameter values that are not rejected at the $a$ level: +$$ +CR(\theta) = \{ \theta \in \Theta: C(\theta) \leq c(1-a)\}. 
+$$ + + +In more complex settings with many controls or controls that enter with unknown functional form, we can simply replace the residuals $\tilde Y$, $\tilde D$, and $\tilde Z$ by machine +learned cross-fitted residuals $\check Y$, $\check D$, and $ \check Z$. Thanks to the orthogonality of the IV moment condition underlying the formulation outlined above, we can formally assert that the properties of $C(\theta)$ and the subsequent testing procedure and confidence region for $\theta$ continue to hold when using cross-fitted residuals. We will further be able to apply the general procedure to cases where $D$ +is a vector, with a suitable adjustment of the statistic $C(\theta)$. + +```{r} +# DML-AR (DML with Anderson-Rubin) + +DML.AR.PLIV<- function(rY, rD, rZ, grid, alpha=.05){ + n = length(rY) + Cstat = rep(0, length(grid)) + for (i in 1:length(grid)) { + Cstat[i] <- n* (mean( (rY - grid[i]*rD)*rZ) )^2/var ( (rY - grid[i]*rD) * rZ ) + }; + LB<- min(grid[ Cstat < qchisq(1-alpha,1)]); + UB <- max(grid[ Cstat < qchisq(1-alpha,1)]); + plot(range(grid),range(c( Cstat)) , type="n",xlab="Effect of institutions", ylab="Statistic", main=" "); + lines(grid, Cstat, lty = 1, col = 1); + abline(h=qchisq(1-alpha,1), lty = 3, col = 4); + abline(v=LB, lty = 3, col = 2); + abline(v=UB, lty = 3, col = 2); + return(list(UB=UB, LB=LB)) + } +``` + +```{r} +DML.AR.PLIV(rY = DML2.lasso$ytil, rD= DML2.lasso$dtil, rZ= DML2.lasso$ztil, + grid = seq(-2, 2, by =.01)) + + +DML.AR.PLIV(rY = DML2.RF$ytil, rD= DML2.RF$dtil, rZ= DML2.RF$ztil, + grid = seq(-2, 2, by =.01)) +``` + diff --git a/AC2/r-debiased-ml-for-partially-linear-iv-model.irnb b/AC2/r-debiased-ml-for-partially-linear-iv-model.irnb index ee4640c4..055b8070 100644 --- a/AC2/r-debiased-ml-for-partially-linear-iv-model.irnb +++ b/AC2/r-debiased-ml-for-partially-linear-iv-model.irnb @@ -1,517 +1,481 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.008106, - "end_time": "2021-04-23T10:41:33.911944", - "exception": false, - "start_time": "2021-04-23T10:41:33.903838", - "status": "completed" - }, - "tags": [], - "id": "qjaFmAyEzcLz" - }, - "source": [ - "# Double/Debiased ML for Partially Linear IV Model\n", - "\n", - "This is a simple implementation of Debiased Machine Learning for the Partially Linear\n", - "IV Regression Model, which provides an application of DML IV inference.\n", - "\n", - "\n", - "Reference:\n", - "\n", - "- https://arxiv.org/abs/1608.00060\n", - "- https://www.amazon.com/Business-Data-Science-Combining-Accelerate/dp/1260452778\n", - "\n", - "The code is based on the book.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.006963, - "end_time": "2021-04-23T10:41:33.926085", - "exception": false, - "start_time": "2021-04-23T10:41:33.919122", - "status": "completed" - }, - "tags": [], - "id": "G3YNyr-ezcL4" - }, - "source": [ - "\n", - "# Partially Linear IV Model\n", - "\n", - "We consider the partially linear structural equation model:\n", - "\\begin{eqnarray}\n", - " & Y - D\\theta_0 = g_0(X) + \\zeta, & E[\\zeta \\mid Z,X]= 0,\\\\\n", - " & Z = m_0(X) + V, & E[V \\mid X] = 0.\n", - "\\end{eqnarray}\n", - "\n", - "\n", - "Note that this model is not a regression model unless $Z=D$. The model is a canonical\n", - "model in causal inference, going back to P. 
Wright's work on IV methods for estimating demand/supply equations, with the modern difference being that $g_0$ and $m_0$ are nonlinear, potentially complicated functions of high-dimensional $X$. \n", - "\n", - "\n", - "The idea of this model is that there is a structural or causal relation between $Y$ and $D$, captured by $\\theta_0$, and $g_0(X) + \\zeta$ is the stochastic error, partly explained by covariates $X$. $V$ and $\\zeta$ are stochastic errors that are not explained by $X$. Since $Y$ and $D$ are jointly determined, we need an external factor, commonly referred to as an instrument, $Z$ to create exogenous variation\n", - "in $D$. Note that $Z$ should affect $D$. The $X$ here serve again as confounding factors, so we can think of variation in $Z$ as being exogenous only conditional on $X$.\n", - "\n", - "\n", - "The causal DAG this model corresponds to is given by:\n", - "$$\n", - "Z \\to D, X \\to (Y, Z, D), L \\to (Y,D),\n", - "$$\n", - "where $L$ is the latent confounder affecting both $Y$ and $D$, but not $Z$.\n", - "\n", - "\n", - "\n", - "---\n", - "\n", - "# Example\n", - "\n", - "A simple contextual example is from biostatistics, where $Y$ is a health outcome and $D$ is indicator of smoking. Thus, $\\theta_0$ is captures the effect of smoking on health. Health outcome $Y$ and smoking behavior $D$ are treated as being jointly determined. $X$ represents patient characteristics, and $Z$ could be a doctor's advice not to smoke (or another behavioral treatment) that may affect the outcome $Y$ only through shifting the behavior $D$, conditional on characteristics $X$. \n", - "\n", - "----\n", - "\n", - "\n", - "\n", - "# PLIVM in Residualized Form\n", - "\n", - "The PLIV model above can be rewritten in the following residualized form:\n", - "$$\n", - " \\tilde Y = \\tilde D \\theta_0 + \\zeta, \\quad E[\\zeta \\mid V,X]= 0,\n", - "$$\n", - "where\n", - "$$\n", - " \\tilde Y = (Y- \\ell_0(X)), \\quad \\ell_0(X) = E[Y \\mid X] \\\\\n", - " \\tilde D = (D - r_0(X)), \\quad r_0(X) = E[D \\mid X] \\\\\n", - " \\tilde Z = (Z- m_0(X)), \\quad m_0(X) = E[Z \\mid X].\n", - "$$\n", - " The \"tilde\" variables (e.g. $\\tilde Y$) above represent original variables after taking out or \"partialling out\"\n", - " the effect of $X$. Note that $\\theta_0$ is identified from this equation if $V$\n", - " and $U$ have non-zero correlation, which automatically means that $U$ and $V$\n", - " must have non-zero variation.\n", - "\n", - " \n", - "\n", - "-----\n", - "\n", - "# DML for PLIV Model\n", - "\n", - "Given identification, DML proceeds as follows\n", - "\n", - "Compute the estimates $\\hat \\ell_0$, $\\hat r_0$, and $\\hat m_0$ , which amounts\n", - "to solving the three problems of predicting $Y$, $D$, and $Z$ using\n", - "$X$, using any generic ML method, giving us estimated residuals\n", - "$$\n", - "\\tilde Y = Y - \\hat \\ell_0(X), \\\\ \\tilde D= D - \\hat r_0(X), \\\\ \\tilde Z = Z- \\hat m_0(X).\n", - "$$\n", - "The estimates should be of a cross-validated form, as detailed in the algorithm below.\n", - "\n", - "Estimate $\\theta_0$ by the the intstrumental\n", - "variable regression of $\\tilde Y$ on $\\tilde D$ using $\\tilde Z$ as an instrument.\n", - "Use the conventional inference for the IV regression estimator, ignoring\n", - "the estimation error in these residuals.\n", - "\n", - "The reason we work with this residualized form is that it eliminates the bias\n", - "arising when solving the prediction problem in stage 1. 
The role of cross-validation\n", - "is to avoid another source of bias due to potential overfitting.\n", - "\n", - "The estimator is adaptive,\n", - "in the sense that the first stage estimation errors do not affect the second\n", - "stage errors.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "_kg_hide-output": true, - "execution": { - "iopub.execute_input": "2021-04-23T10:41:33.973149Z", - "iopub.status.busy": "2021-04-23T10:41:33.971090Z", - "iopub.status.idle": "2021-04-23T10:42:08.602961Z", - "shell.execute_reply": "2021-04-23T10:42:08.601638Z" - }, - "papermill": { - "duration": 34.670095, - "end_time": "2021-04-23T10:42:08.603197", - "exception": false, - "start_time": "2021-04-23T10:41:33.933102", - "status": "completed" - }, - "tags": [], - "id": "yGW6JhG5zcL5" - }, - "outputs": [], - "source": [ - "install.packages(\"hdm\")\n", - "install.packages(\"AER\")\n", - "install.packages(\"randomForest\")\n", - "install.packages(\"lfe\")\n", - "install.packages(\"glmnet\")\n", - "\n", - "library(hdm)\n", - "library(AER) #applied econometrics library\n", - "library(randomForest) #random Forest library\n", - "library(lfe) #high-dimensional econometrics library\n", - "library(glmnet) #glm net\n", - "\n", - "set.seed(1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2021-04-23T10:42:08.664371Z", - "iopub.status.busy": "2021-04-23T10:42:08.629661Z", - "iopub.status.idle": "2021-04-23T10:42:10.458175Z", - "shell.execute_reply": "2021-04-23T10:42:10.456976Z" - }, - "papermill": { - "duration": 1.846109, - "end_time": "2021-04-23T10:42:10.458406", - "exception": false, - "start_time": "2021-04-23T10:42:08.612297", - "status": "completed" - }, - "tags": [], - "id": "j2WVUbBDzcL-" - }, - "outputs": [], - "source": [ - "# DML for PLIVM\n", - "\n", - "DML2.for.PLIVM <- function(x, d, z, y, dreg, yreg, zreg, nfold=5) {\n", - " # this implements DML2 algorithm, where there moments are estimated via DML, before constructing\n", - " # the pooled estimate of theta randomly split data into folds\n", - " nobs <- nrow(x)\n", - " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)]\n", - " I <- split(1:nobs, foldid)\n", - " # create residualized objects to fill\n", - " ytil <- dtil <- ztil<- rep(NA, nobs)\n", - " # obtain cross-fitted residuals\n", - " cat(\"fold: \")\n", - " for(b in 1:length(I)){\n", - " dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out\n", - " zfit <- zreg(x[-I[[b]],], z[-I[[b]]]) #take a fold out\n", - " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a folot out\n", - " dhat <- predict(dfit, x[I[[b]],], type=\"response\") #predict the fold out\n", - " zhat <- predict(zfit, x[I[[b]],], type=\"response\") #predict the fold out\n", - " yhat <- predict(yfit, x[I[[b]],], type=\"response\") #predict the fold out\n", - " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual\n", - " ztil[I[[b]]] <- (z[I[[b]]] - zhat) #record residual\n", - " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial\n", - " cat(b,\" \")\n", - " }\n", - " ivfit= tsls(y=ytil,d=dtil, x=NULL, z=ztil, intercept=FALSE)\n", - " coef.est <- ivfit$coef #extract coefficient\n", - " se <- ivfit$se #record standard error\n", - " cat(sprintf(\"\\ncoef (se) = %g (%g)\\n\", coef.est , se))\n", - " return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil, ztil=ztil) )\n", - "}\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.011698, - "end_time": 
"2021-04-23T10:42:10.482689", - "exception": false, - "start_time": "2021-04-23T10:42:10.470991", - "status": "completed" - }, - "tags": [], - "id": "x1g3XjsIzcL_" - }, - "source": [ - "-----\n", - "\n", - "# Emprical Example: Acemoglu, Johnson, Robinson (AER).\n", - "\n", - "\n", - "* Y is log GDP;\n", - "* D is a measure of Protection from Expropriation, a proxy for\n", - "quality of insitutions;\n", - "* Z is the log of Settler's mortality;\n", - "* W are geographical variables (latitude, latitude squared, continent dummies as well as interactions)\n", - "\n" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "qjaFmAyEzcLz", + "papermill": { + "duration": 0.008106, + "end_time": "2021-04-23T10:41:33.911944", + "exception": false, + "start_time": "2021-04-23T10:41:33.903838", + "status": "completed" }, - { - "cell_type": "code", - "source": [ - "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/AJR.csv\"\n", - "AJR <- read.csv(file)\n", - "dim(AJR)" - ], - "metadata": { - "id": "0Pc6OCp24rji" - }, - "execution_count": null, - "outputs": [] + "tags": [] + }, + "source": [ + "# Double/Debiased ML for Partially Linear IV Model\n", + "\n", + "This is a simple implementation of Debiased Machine Learning for the Partially Linear\n", + "IV Regression Model, which provides an application of DML IV inference.\n", + "\n", + "\n", + "Reference:\n", + "\n", + "- https://arxiv.org/abs/1608.00060\n", + "- https://www.amazon.com/Business-Data-Science-Combining-Accelerate/dp/1260452778\n", + "\n", + "The code is based on the book.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "G3YNyr-ezcL4", + "papermill": { + "duration": 0.006963, + "end_time": "2021-04-23T10:41:33.926085", + "exception": false, + "start_time": "2021-04-23T10:41:33.919122", + "status": "completed" }, - { - "cell_type": "code", - "source": [ - "y = AJR$GDP;\n", - "d = AJR$Exprop;\n", - "z = AJR$logMort\n", - "xraw= model.matrix(~ Latitude+ Africa+Asia + Namer + Samer, data=AJR)\n", - "x = model.matrix(~ -1 + (Latitude + Latitude2 + Africa +\n", - " Asia + Namer + Samer)^2, data=AJR)\n", - "dim(x)" - ], - "metadata": { - "id": "Z9uynanA9H8m" - }, - "execution_count": null, - "outputs": [] + "tags": [] + }, + "source": [ + "\n", + "# Partially Linear IV Model\n", + "\n", + "We consider the partially linear structural equation model:\n", + "\\begin{eqnarray}\n", + " & Y - D\\theta_0 = g_0(X) + \\zeta, & E[\\zeta \\mid Z,X]= 0,\\\\\n", + " & Z = m_0(X) + V, & E[V \\mid X] = 0.\n", + "\\end{eqnarray}\n", + "\n", + "\n", + "Note that this model is not a regression model unless $Z=D$. The model is a canonical\n", + "model in causal inference, going back to P. Wright's work on IV methods for estimating demand/supply equations, with the modern difference being that $g_0$ and $m_0$ are nonlinear, potentially complicated functions of high-dimensional $X$. \n", + "\n", + "\n", + "The idea of this model is that there is a structural or causal relation between $Y$ and $D$, captured by $\\theta_0$, and $g_0(X) + \\zeta$ is the stochastic error, partly explained by covariates $X$. $V$ and $\\zeta$ are stochastic errors that are not explained by $X$. Since $Y$ and $D$ are jointly determined, we need an external factor, commonly referred to as an instrument, $Z$ to create exogenous variation\n", + "in $D$. Note that $Z$ should affect $D$. 
The $X$ here serve again as confounding factors, so we can think of variation in $Z$ as being exogenous only conditional on $X$.\n", + "\n", + "\n", + "The causal DAG this model corresponds to is given by:\n", + "$$\n", + "Z \\to D, X \\to (Y, Z, D), L \\to (Y,D),\n", + "$$\n", + "where $L$ is the latent confounder affecting both $Y$ and $D$, but not $Z$.\n", + "\n", + "\n", + "\n", + "---\n", + "\n", + "# Example\n", + "\n", + "A simple contextual example is from biostatistics, where $Y$ is a health outcome and $D$ is indicator of smoking. Thus, $\\theta_0$ is captures the effect of smoking on health. Health outcome $Y$ and smoking behavior $D$ are treated as being jointly determined. $X$ represents patient characteristics, and $Z$ could be a doctor's advice not to smoke (or another behavioral treatment) that may affect the outcome $Y$ only through shifting the behavior $D$, conditional on characteristics $X$. \n", + "\n", + "----\n", + "\n", + "\n", + "\n", + "# PLIVM in Residualized Form\n", + "\n", + "The PLIV model above can be rewritten in the following residualized form:\n", + "$$\n", + " \\tilde Y = \\tilde D \\theta_0 + \\zeta, \\quad E[\\zeta \\mid V,X]= 0,\n", + "$$\n", + "where\n", + "$$\n", + " \\tilde Y = (Y- \\ell_0(X)), \\quad \\ell_0(X) = E[Y \\mid X] \\\\\n", + " \\tilde D = (D - r_0(X)), \\quad r_0(X) = E[D \\mid X] \\\\\n", + " \\tilde Z = (Z- m_0(X)), \\quad m_0(X) = E[Z \\mid X].\n", + "$$\n", + " The \"tilde\" variables (e.g. $\\tilde Y$) above represent original variables after taking out or \"partialling out\"\n", + " the effect of $X$. Note that $\\theta_0$ is identified from this equation if $V$\n", + " and $U$ have non-zero correlation, which automatically means that $U$ and $V$\n", + " must have non-zero variation.\n", + "\n", + " \n", + "\n", + "-----\n", + "\n", + "# DML for PLIV Model\n", + "\n", + "Given identification, DML proceeds as follows\n", + "\n", + "Compute the estimates $\\hat \\ell_0$, $\\hat r_0$, and $\\hat m_0$ , which amounts\n", + "to solving the three problems of predicting $Y$, $D$, and $Z$ using\n", + "$X$, using any generic ML method, giving us estimated residuals\n", + "$$\n", + "\\tilde Y = Y - \\hat \\ell_0(X), \\\\ \\tilde D= D - \\hat r_0(X), \\\\ \\tilde Z = Z- \\hat m_0(X).\n", + "$$\n", + "The estimates should be of a cross-validated form, as detailed in the algorithm below.\n", + "\n", + "Estimate $\\theta_0$ by the the intstrumental\n", + "variable regression of $\\tilde Y$ on $\\tilde D$ using $\\tilde Z$ as an instrument.\n", + "Use the conventional inference for the IV regression estimator, ignoring\n", + "the estimation error in these residuals.\n", + "\n", + "The reason we work with this residualized form is that it eliminates the bias\n", + "arising when solving the prediction problem in stage 1. 
The role of cross-validation\n", + "is to avoid another source of bias due to potential overfitting.\n", + "\n", + "The estimator is adaptive,\n", + "in the sense that the first stage estimation errors do not affect the second\n", + "stage errors.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "_kg_hide-output": true, + "id": "yGW6JhG5zcL5", + "papermill": { + "duration": 34.670095, + "end_time": "2021-04-23T10:42:08.603197", + "exception": false, + "start_time": "2021-04-23T10:41:33.933102", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2021-04-23T10:42:10.520943Z", - "iopub.status.busy": "2021-04-23T10:42:10.518743Z", - "iopub.status.idle": "2021-04-23T10:42:16.240485Z", - "shell.execute_reply": "2021-04-23T10:42:16.238577Z" - }, - "papermill": { - "duration": 5.743953, - "end_time": "2021-04-23T10:42:16.240693", - "exception": false, - "start_time": "2021-04-23T10:42:10.496740", - "status": "completed" - }, - "tags": [], - "id": "LwVXRiAqzcL_" - }, - "outputs": [], - "source": [ - "set.seed(1)\n", - "# DML with PostLasso\n", - "cat(sprintf(\"\\n DML with Post-Lasso \\n\"))\n", - "\n", - "dreg <- function(x,d){ rlasso(x, d) } #ML method=lasso\n", - "yreg <- function(x,y){ rlasso(x, y) } #ML method=lasso\n", - "zreg<- function(x,z){ rlasso(x, z)} #ML method=lasso\n", - "\n", - "DML2.lasso = DML2.for.PLIVM(x, d, z, y, dreg, yreg, zreg, nfold=5)\n", - "\n", - "\n", - "# DML with Random Forest\n", - "cat(sprintf(\"\\n DML with Random Forest \\n\"))\n", - "\n", - "dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest\n", - "yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest\n", - "zreg<- function(x,z){ randomForest(x, z)} #ML method=Forest\n", - "\n", - "DML2.RF = DML2.for.PLIVM(xraw, d, z, y, dreg, yreg, zreg, nfold=5)\n", - "\n", - "# Compare Forest vs Lasso\n", - "comp.tab= matrix(NA, 3, 2)\n", - "comp.tab[1,] = c( sqrt(mean((DML2.RF$ytil)^2)), sqrt(mean((DML2.lasso$ytil)^2)) )\n", - "comp.tab[2,] = c( sqrt(mean((DML2.RF$dtil)^2)), sqrt(mean((DML2.lasso$dtil)^2)) )\n", - "comp.tab[3,] = c( sqrt(mean((DML2.RF$ztil)^2)), sqrt(mean((DML2.lasso$ztil)^2)) )\n", - "rownames(comp.tab) = c(\"RMSE for Y:\", \"RMSE for D:\", \"RMSE for Z:\")\n", - "colnames(comp.tab) = c(\"RF\", \"LASSO\")\n", - "print(comp.tab, digits=3)" - ] + "tags": [] + }, + "outputs": [], + "source": [ + "install.packages(\"hdm\")\n", + "install.packages(\"AER\")\n", + "install.packages(\"randomForest\")\n", + "install.packages(\"lfe\")\n", + "install.packages(\"glmnet\")\n", + "\n", + "library(hdm)\n", + "library(AER) #applied econometrics library\n", + "library(randomForest) #random Forest library\n", + "library(lfe) #high-dimensional econometrics library\n", + "library(glmnet) #glm net\n", + "\n", + "set.seed(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "j2WVUbBDzcL-", + "papermill": { + "duration": 1.846109, + "end_time": "2021-04-23T10:42:10.458406", + "exception": false, + "start_time": "2021-04-23T10:42:08.612297", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.013703, - "end_time": "2021-04-23T10:42:16.269251", - "exception": false, - "start_time": "2021-04-23T10:42:16.255548", - "status": "completed" - }, - "tags": [], - "id": "r7l6cFi2zcMA" - }, - "source": [ - "# Weak Instruments?" 
- ] + "tags": [] + }, + "outputs": [], + "source": [ + "# DML for PLIVM\n", + "\n", + "DML2.for.PLIVM <- function(x, d, z, y, dreg, yreg, zreg, nfold=5) {\n", + " # this implements DML2 algorithm, where there moments are estimated via DML, before constructing\n", + " # the pooled estimate of theta randomly split data into folds\n", + " nobs <- nrow(x)\n", + " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)]\n", + " I <- split(1:nobs, foldid)\n", + " # create residualized objects to fill\n", + " ytil <- dtil <- ztil<- rep(NA, nobs)\n", + " # obtain cross-fitted residuals\n", + " cat(\"fold: \")\n", + " for(b in 1:length(I)){\n", + " dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out\n", + " zfit <- zreg(x[-I[[b]],], z[-I[[b]]]) #take a fold out\n", + " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a folot out\n", + " dhat <- predict(dfit, x[I[[b]],], type=\"response\") #predict the fold out\n", + " zhat <- predict(zfit, x[I[[b]],], type=\"response\") #predict the fold out\n", + " yhat <- predict(yfit, x[I[[b]],], type=\"response\") #predict the fold out\n", + " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual\n", + " ztil[I[[b]]] <- (z[I[[b]]] - zhat) #record residual\n", + " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial\n", + " cat(b,\" \")\n", + " }\n", + " ivfit= tsls(y=ytil,d=dtil, x=NULL, z=ztil, intercept=FALSE)\n", + " coef.est <- ivfit$coef #extract coefficient\n", + " se <- ivfit$se #record standard error\n", + " cat(sprintf(\"\\ncoef (se) = %g (%g)\\n\", coef.est , se))\n", + " return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil, ztil=ztil) )\n", + "}\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "x1g3XjsIzcL_", + "papermill": { + "duration": 0.011698, + "end_time": "2021-04-23T10:42:10.482689", + "exception": false, + "start_time": "2021-04-23T10:42:10.470991", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2021-04-23T10:42:16.301968Z", - "iopub.status.busy": "2021-04-23T10:42:16.300698Z", - "iopub.status.idle": "2021-04-23T10:42:32.390351Z", - "shell.execute_reply": "2021-04-23T10:42:32.388488Z" - }, - "papermill": { - "duration": 16.107321, - "end_time": "2021-04-23T10:42:32.390552", - "exception": false, - "start_time": "2021-04-23T10:42:16.283231", - "status": "completed" - }, - "tags": [], - "id": "rsUnPDfpzcMB" - }, - "outputs": [], - "source": [ - "# using lfe package\n", - "summary(felm(DML2.lasso$dtil~DML2.lasso$ztil), robust=T)\n", - "summary(felm(DML2.RF$dtil~DML2.RF$ztil), robust=T)" - ] + "tags": [] + }, + "source": [ + "-----\n", + "\n", + "# Emprical Example: Acemoglu, Johnson, Robinson (AER).\n", + "\n", + "\n", + "* Y is log GDP;\n", + "* D is a measure of Protection from Expropriation, a proxy for\n", + "quality of insitutions;\n", + "* Z is the log of Settler's mortality;\n", + "* W are geographical variables (latitude, latitude squared, continent dummies as well as interactions)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0Pc6OCp24rji" + }, + "outputs": [], + "source": [ + "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/AJR.csv\"\n", + "AJR <- read.csv(file)\n", + "dim(AJR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z9uynanA9H8m" + }, + "outputs": [], + "source": [ + "y = AJR$GDP;\n", + "d = AJR$Exprop;\n", + "z = AJR$logMort\n", + "xraw= model.matrix(~ 
Latitude+ Africa+Asia + Namer + Samer, data=AJR)\n", + "x = model.matrix(~ -1 + (Latitude + Latitude2 + Africa +\n", + " Asia + Namer + Samer)^2, data=AJR)\n", + "dim(x)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LwVXRiAqzcL_", + "papermill": { + "duration": 5.743953, + "end_time": "2021-04-23T10:42:16.240693", + "exception": false, + "start_time": "2021-04-23T10:42:10.496740", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.015919, - "end_time": "2021-04-23T10:42:32.423865", - "exception": false, - "start_time": "2021-04-23T10:42:32.407946", - "status": "completed" - }, - "tags": [], - "id": "L3fDDOmfzcMD" - }, - "source": [ - "## Anderson-Rubin Idea for Inference with Weak Instruments\n", - "\n", - "As shown above, we may have weak instruments because the t-stat in the regression $\\tilde D \\sim \\tilde Z$ is small relative to standard rules of thumb -- for example Stock and Yogo (2005) suggest accounting for weak instruments if the first stage F-statistic is less than 10 (and more recent work suggests even larger cutoffs).\n", - "\n", - "\n", - " Here, we consider one specific approach (from Anderson and Rubin (1949)) to doing valid inference under weak identification based upon the statistic:\n", - "$$\n", - "C(\\theta) = \\frac{ |E_n [(\\tilde Y - \\theta \\tilde D) \\tilde Z]|^2}{ \\mathbb{V}_n [(\\tilde Y - \\theta \\tilde D) \\tilde Z ]/n}.\n", - "$$\n", - "The empirical variance $\\mathbb{V}_n$ is defined as:\n", - "\\begin{align*}\n", - "\\mathbb{V}_{n}[ g(W_i)] &:= \\mathbb{E}_{n}g(W_i)g(W_i)' - \\mathbb{E}_{n}[ g(W_i)]\\mathbb{E}_{n}[ g(W_i)]'.\n", - "\\end{align*}\n", - "\n", - "If $\\theta_0 = \\theta$, then $C(\\theta) \\overset{a}{\\sim} N(0,1)^2 = \\chi^2(1)$. Therefore, we can reject the hypothesis $\\theta_0 = \\theta$ at level $a$ (for example $a = .05$ for a 5\\% level test) if $C(\\theta)> c(1-a)$ where $c(1-a)$ is the $(1-a)$- quantile of a $\\chi^2(1)$ variable. The probability of falsely rejecting the true hypothesis is approximately $a \\times 100\\%$. \n", - "To construct a $(1-a) \\times 100$\\% confidence region for $\\theta$, we can then simply invert the test by collecting all parameter values that are not rejected at the $a$ level:\n", - "$$\n", - "CR(\\theta) = \\{ \\theta \\in \\Theta: C(\\theta) \\leq c(1-a)\\}.\n", - "$$\n", - "\n", - "\n", - "In more complex settings with many controls or controls that enter with unknown functional form, we can simply replace the residuals $\\tilde Y$, $\\tilde D$, and $\\tilde Z$ by machine\n", - "learned cross-fitted residuals $\\check Y$, $\\check D$, and $ \\check Z$. Thanks to the orthogonality of the IV moment condition underlying the formulation outlined above, we can formally assert that the properties of $C(\\theta)$ and the subsequent testing procedure and confidence region for $\\theta$ continue to hold when using cross-fitted residuals. 
We will further be able to apply the general procedure to cases where $D$\n", - "is a vector, with a suitable adjustment of the statistic $C(\\theta)$.\n" - ] + "tags": [] + }, + "outputs": [], + "source": [ + "set.seed(1)\n", + "# DML with PostLasso\n", + "cat(sprintf(\"\\n DML with Post-Lasso \\n\"))\n", + "\n", + "dreg <- function(x,d){ rlasso(x, d) } #ML method=lasso\n", + "yreg <- function(x,y){ rlasso(x, y) } #ML method=lasso\n", + "zreg<- function(x,z){ rlasso(x, z)} #ML method=lasso\n", + "\n", + "DML2.lasso = DML2.for.PLIVM(x, d, z, y, dreg, yreg, zreg, nfold=5)\n", + "\n", + "\n", + "# DML with Random Forest\n", + "cat(sprintf(\"\\n DML with Random Forest \\n\"))\n", + "\n", + "dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest\n", + "yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest\n", + "zreg<- function(x,z){ randomForest(x, z)} #ML method=Forest\n", + "\n", + "DML2.RF = DML2.for.PLIVM(xraw, d, z, y, dreg, yreg, zreg, nfold=5)\n", + "\n", + "# Compare Forest vs Lasso\n", + "comp.tab= matrix(NA, 3, 2)\n", + "comp.tab[1,] = c( sqrt(mean((DML2.RF$ytil)^2)), sqrt(mean((DML2.lasso$ytil)^2)) )\n", + "comp.tab[2,] = c( sqrt(mean((DML2.RF$dtil)^2)), sqrt(mean((DML2.lasso$dtil)^2)) )\n", + "comp.tab[3,] = c( sqrt(mean((DML2.RF$ztil)^2)), sqrt(mean((DML2.lasso$ztil)^2)) )\n", + "rownames(comp.tab) = c(\"RMSE for Y:\", \"RMSE for D:\", \"RMSE for Z:\")\n", + "colnames(comp.tab) = c(\"RF\", \"LASSO\")\n", + "print(comp.tab, digits=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "r7l6cFi2zcMA", + "papermill": { + "duration": 0.013703, + "end_time": "2021-04-23T10:42:16.269251", + "exception": false, + "start_time": "2021-04-23T10:42:16.255548", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2021-04-23T10:42:32.493480Z", - "iopub.status.busy": "2021-04-23T10:42:32.491831Z", - "iopub.status.idle": "2021-04-23T10:42:32.507653Z", - "shell.execute_reply": "2021-04-23T10:42:32.506463Z" - }, - "papermill": { - "duration": 0.036055, - "end_time": "2021-04-23T10:42:32.507790", - "exception": false, - "start_time": "2021-04-23T10:42:32.471735", - "status": "completed" - }, - "tags": [], - "id": "zi5Gkqw2zcMD" - }, - "outputs": [], - "source": [ - "# DML-AR (DML with Anderson-Rubin)\n", - "\n", - "DML.AR.PLIV<- function(rY, rD, rZ, grid, alpha=.05){\n", - " n = length(rY)\n", - " Cstat = rep(0, length(grid))\n", - " for (i in 1:length(grid)) {\n", - " Cstat[i] <- n* (mean( (rY - grid[i]*rD)*rZ) )^2/var ( (rY - grid[i]*rD) * rZ )\n", - " };\n", - " LB<- min(grid[ Cstat < qchisq(1-alpha,1)]);\n", - " UB <- max(grid[ Cstat < qchisq(1-alpha,1)]);\n", - " plot(range(grid),range(c( Cstat)) , type=\"n\",xlab=\"Effect of institutions\", ylab=\"Statistic\", main=\" \");\n", - " lines(grid, Cstat, lty = 1, col = 1);\n", - " abline(h=qchisq(1-alpha,1), lty = 3, col = 4);\n", - " abline(v=LB, lty = 3, col = 2);\n", - " abline(v=UB, lty = 3, col = 2);\n", - " return(list(UB=UB, LB=LB))\n", - " }\n" - ] + "tags": [] + }, + "source": [ + "# Weak Instruments?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rsUnPDfpzcMB", + "papermill": { + "duration": 16.107321, + "end_time": "2021-04-23T10:42:32.390552", + "exception": false, + "start_time": "2021-04-23T10:42:16.283231", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2021-04-23T10:42:32.545479Z", - "iopub.status.busy": "2021-04-23T10:42:32.543976Z", - "iopub.status.idle": "2021-04-23T10:42:33.002321Z", - "shell.execute_reply": "2021-04-23T10:42:33.001039Z" - }, - "papermill": { - "duration": 0.478528, - "end_time": "2021-04-23T10:42:33.002469", - "exception": false, - "start_time": "2021-04-23T10:42:32.523941", - "status": "completed" - }, - "tags": [], - "id": "k9bB2O13zcME" - }, - "outputs": [], - "source": [ - "DML.AR.PLIV(rY = DML2.lasso$ytil, rD= DML2.lasso$dtil, rZ= DML2.lasso$ztil,\n", - " grid = seq(-2, 2, by =.01))\n", - "\n", - "\n", - "DML.AR.PLIV(rY = DML2.RF$ytil, rD= DML2.RF$dtil, rZ= DML2.RF$ztil,\n", - " grid = seq(-2, 2, by =.01))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" + "tags": [] + }, + "outputs": [], + "source": [ + "# using lfe package\n", + "summary(felm(DML2.lasso$dtil~DML2.lasso$ztil), robust=T)\n", + "summary(felm(DML2.RF$dtil~DML2.RF$ztil), robust=T)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L3fDDOmfzcMD", + "papermill": { + "duration": 0.015919, + "end_time": "2021-04-23T10:42:32.423865", + "exception": false, + "start_time": "2021-04-23T10:42:32.407946", + "status": "completed" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "3.6.3" + "tags": [] + }, + "source": [ + "## Anderson-Rubin Idea for Inference with Weak Instruments\n", + "\n", + "As shown above, we may have weak instruments because the t-stat in the regression $\\tilde D \\sim \\tilde Z$ is small relative to standard rules of thumb -- for example Stock and Yogo (2005) suggest accounting for weak instruments if the first stage F-statistic is less than 10 (and more recent work suggests even larger cutoffs).\n", + "\n", + "\n", + " Here, we consider one specific approach (from Anderson and Rubin (1949)) to doing valid inference under weak identification based upon the statistic:\n", + "$$\n", + "C(\\theta) = \\frac{ |E_n [(\\tilde Y - \\theta \\tilde D) \\tilde Z]|^2}{ \\mathbb{V}_n [(\\tilde Y - \\theta \\tilde D) \\tilde Z ]/n}.\n", + "$$\n", + "The empirical variance $\\mathbb{V}_n$ is defined as:\n", + "\\begin{align*}\n", + "\\mathbb{V}_{n}[ g(W_i)] &:= \\mathbb{E}_{n}g(W_i)g(W_i)' - \\mathbb{E}_{n}[ g(W_i)]\\mathbb{E}_{n}[ g(W_i)]'.\n", + "\\end{align*}\n", + "\n", + "If $\\theta_0 = \\theta$, then $C(\\theta) \\overset{a}{\\sim} N(0,1)^2 = \\chi^2(1)$. Therefore, we can reject the hypothesis $\\theta_0 = \\theta$ at level $a$ (for example $a = .05$ for a 5\\% level test) if $C(\\theta)> c(1-a)$ where $c(1-a)$ is the $(1-a)$- quantile of a $\\chi^2(1)$ variable. The probability of falsely rejecting the true hypothesis is approximately $a \\times 100\\%$. 
\n", + "To construct a $(1-a) \\times 100$\\% confidence region for $\\theta$, we can then simply invert the test by collecting all parameter values that are not rejected at the $a$ level:\n", + "$$\n", + "CR(\\theta) = \\{ \\theta \\in \\Theta: C(\\theta) \\leq c(1-a)\\}.\n", + "$$\n", + "\n", + "\n", + "In more complex settings with many controls or controls that enter with unknown functional form, we can simply replace the residuals $\\tilde Y$, $\\tilde D$, and $\\tilde Z$ by machine\n", + "learned cross-fitted residuals $\\check Y$, $\\check D$, and $ \\check Z$. Thanks to the orthogonality of the IV moment condition underlying the formulation outlined above, we can formally assert that the properties of $C(\\theta)$ and the subsequent testing procedure and confidence region for $\\theta$ continue to hold when using cross-fitted residuals. We will further be able to apply the general procedure to cases where $D$\n", + "is a vector, with a suitable adjustment of the statistic $C(\\theta)$.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zi5Gkqw2zcMD", + "papermill": { + "duration": 0.036055, + "end_time": "2021-04-23T10:42:32.507790", + "exception": false, + "start_time": "2021-04-23T10:42:32.471735", + "status": "completed" }, + "tags": [] + }, + "outputs": [], + "source": [ + "# DML-AR (DML with Anderson-Rubin)\n", + "\n", + "DML.AR.PLIV<- function(rY, rD, rZ, grid, alpha=.05){\n", + " n = length(rY)\n", + " Cstat = rep(0, length(grid))\n", + " for (i in 1:length(grid)) {\n", + " Cstat[i] <- n* (mean( (rY - grid[i]*rD)*rZ) )^2/var ( (rY - grid[i]*rD) * rZ )\n", + " };\n", + " LB<- min(grid[ Cstat < qchisq(1-alpha,1)]);\n", + " UB <- max(grid[ Cstat < qchisq(1-alpha,1)]);\n", + " plot(range(grid),range(c( Cstat)) , type=\"n\",xlab=\"Effect of institutions\", ylab=\"Statistic\", main=\" \");\n", + " lines(grid, Cstat, lty = 1, col = 1);\n", + " abline(h=qchisq(1-alpha,1), lty = 3, col = 4);\n", + " abline(v=LB, lty = 3, col = 2);\n", + " abline(v=UB, lty = 3, col = 2);\n", + " return(list(UB=UB, LB=LB))\n", + " }\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "k9bB2O13zcME", "papermill": { - "duration": 62.79319, - "end_time": "2021-04-23T10:42:34.038582", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2021-04-23T10:41:31.245392", - "version": "2.1.3" + "duration": 0.478528, + "end_time": "2021-04-23T10:42:33.002469", + "exception": false, + "start_time": "2021-04-23T10:42:32.523941", + "status": "completed" }, - "colab": { - "provenance": [] - } + "tags": [] + }, + "outputs": [], + "source": [ + "DML.AR.PLIV(rY = DML2.lasso$ytil, rD= DML2.lasso$dtil, rZ= DML2.lasso$ztil,\n", + " grid = seq(-2, 2, by =.01))\n", + "\n", + "\n", + "DML.AR.PLIV(rY = DML2.RF$ytil, rD= DML2.RF$dtil, rZ= DML2.RF$ztil,\n", + " grid = seq(-2, 2, by =.01))" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.6.3" }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "papermill": { + "duration": 62.79319, + "end_time": "2021-04-23T10:42:34.038582", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + 
"output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2021-04-23T10:41:31.245392", + "version": "2.1.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/AC2/r-dml-401k-IV.Rmd b/AC2/r-dml-401k-IV.Rmd new file mode 100644 index 00000000..a85c9e24 --- /dev/null +++ b/AC2/r-dml-401k-IV.Rmd @@ -0,0 +1,1151 @@ +--- +title: An R Markdown document converted from "AC2/r-dml-401k-IV.irnb" +output: html_document +--- + +# Inference on Predictive and Causal Effects in High-Dimensional Nonlinear Models + +## Impact of 401(k) on Financial Wealth + +We consider estimation of the effect of 401(k) participation +on accumulated assets. 401(k) plans are pension accounts sponsored by employers. The key problem in determining the effect of participation in 401(k) plans on accumulated assets is saver heterogeneity coupled with the fact that the decision to enroll in a 401(k) is non-random. It is generally recognized that some people have a higher preference for saving than others. It also seems likely that those individuals with high unobserved preference for saving would be most likely to choose to participate in tax-advantaged retirement savings plans and would tend to have otherwise high amounts of accumulated assets. The presence of unobserved savings preferences with these properties then implies that conventional estimates that do not account for saver heterogeneity and endogeneity of participation will be biased upward, tending to overstate the savings effects of 401(k) participation. + +One can argue that eligibility for enrolling in a 401(k) plan in this data can be taken as exogenous after conditioning on a few observables of which the most important for their argument is income. The basic idea is that, at least around the time 401(k)’s initially became available, people were unlikely to be basing their employment decisions on whether an employer offered a 401(k) but would instead focus on income and other aspects of the job. + +```{r} +install.packages("xtable") +install.packages("hdm") +install.packages("sandwich") +install.packages("ggplot2") +install.packages("randomForest") +install.packages("glmnet") +install.packages("rpart") +install.packages("gbm") + +library(xtable) +library(hdm) +library(sandwich) +library(ggplot2) +library(randomForest) +library(data.table) +library(glmnet) +library(rpart) +library(gbm) + +set.seed(123) +``` + +### Data + +The raw dataset can be found [here](https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/401k.csv). +The data set can be loaded from the `hdm` package for R directly by typing: + + +```{r} +data(pension) +data <- pension +dim(data) +``` + +See the "Details" section on the description of the data set, which can be accessed by + +```{r} +help(pension) +``` + +The data consist of 9,915 observations at the household level drawn from the 1991 Survey of Income and Program Participation (SIPP). All the variables are referred to 1990. We use net financial assets (*net\_tfa*) as the outcome variable, $Y$, in our analysis. The net financial assets are computed as the sum of IRA balances, 401(k) balances, checking accounts, saving bonds, other interest-earning accounts, other interest-earning assets, stocks, and mutual funds less non mortgage debts. + +Among the $9915$ individuals, $3682$ are eligible to participate in the program. The variable *e401* indicates eligibility and *p401* indicates participation, respectively. 
+ +```{r} +hist_e401 = ggplot(data, aes(x = e401, fill = factor(e401))) + geom_bar() +hist_e401 +``` + +Eligibility is highly associated with financial wealth: + +```{r} +dens_net_tfa = ggplot(data, aes(x = net_tfa, color = factor(e401), fill = factor(e401)) ) + + geom_density() + xlim(c(-20000, 150000)) + + facet_wrap(.~e401) + +dens_net_tfa +``` + +The unconditional APE of e401 is about $19559$: + +```{r} +e1 <- data[data$e401==1,] +e0 <- data[data$e401==0,] +round(mean(e1$net_tfa)-mean(e0$net_tfa),0) +``` + +Among the $3682$ individuals that are eligible, $2594$ decided to participate in the program. The unconditional APE of p401 is about $27372$: + +```{r} +p1 <- data[data$p401==1,] +p0 <- data[data$p401==0,] +round(mean(p1$net_tfa)-mean(p0$net_tfa),0) +``` + +As discussed, these estimates are biased since they do not account for saver heterogeneity and endogeneity of participation. + +```{r} +# instrument variable +Z <- data[,'e401'] +# treatment variable +D <- data[, 'p401'] +# outcome variable +y <- data[,'net_tfa'] +``` + +### We construct the engineered features for controls + +```{r} +# Constructing the controls +X_formula = "~ poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown" +X = as.data.table(model.frame(X_formula, pension)) +head(X) +``` + +# Instrumental Variables: Effect of 401k Participation on Financial Assets + +## Double ML IV under Partial Linearity + +Now, we consider estimation of average treatment effects of participation in 401k, i.e. `p401`, with the binary instrument being eligibility in 401k, i.e. `e401`. As before, $Y$ denotes the outcome `net_tfa`, and $X$ is the vector of covariates. We consider a partially linear structural equation model: +\begin{eqnarray*} +Y & := & g_Y(\epsilon_Y) D + f_Y(A, X, \epsilon_Y), \\ +D & := & f_D(Z, X, A, \epsilon_D), \\ +Z & := & f_Z(X, \epsilon_Z),\\ +A & : = & f_A(X, \epsilon_A), \\ +X & := & \epsilon_X, +\end{eqnarray*} +where $A$ is a vector of un-observed confounders. + +Under this structural equation model, the average treatment effect: +\begin{align} +\alpha = E[Y(1) - Y(0)] +\end{align} +can be identified by the moment restriction: +\begin{align} +E[(\tilde{Y} - \alpha \tilde{D}) \tilde{Z}] = 0 +\end{align} +where for any variable $V$, we denote with $\tilde{V} = V - E[V|X]$. + +```{r} +set.seed(1) +yfit.lasso.cv <- cv.glmnet(as.matrix(X), y, family="gaussian", alpha=1) # family gaussian means that we'll be using square loss +Dfit.lasso.cv <- cv.glmnet(as.matrix(X), D, family="gaussian", alpha=1) # family gaussian means that we'll be using square loss +Zfit.lasso.cv <- cv.glmnet(as.matrix(X), Z, family="gaussian", alpha=1) # family gaussian means that we'll be using square loss + + +yhat.lasso.cv <- predict(yfit.lasso.cv, newx = as.matrix(X)) # predictions +Dhat.lasso.cv <- predict(Dfit.lasso.cv, newx = as.matrix(X)) # predictions +Zhat.lasso.cv <- predict(Zfit.lasso.cv, newx = as.matrix(X)) # predictions + +resy <- y-yhat.lasso.cv +resD <- D-Dhat.lasso.cv +resZ <- Z-Zhat.lasso.cv + +# Estimate +mean(resy * resZ) / mean(resZ*resD) +``` + +Recall if we want to do inference, we need to either use the theoretically driven penalty paramter for Lasso or perform cross-fitting. 
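+
+As a minimal sketch of the first option (no cross-fitting), we can residualize $Y$, $D$ and $Z$ with `rlasso` from the `hdm` package, which uses a theoretically driven penalty level, and then plug the residuals into the same `tsls` step used throughout this notebook. The object names `resy.rlasso`, `resD.rlasso`, `resZ.rlasso` and `ivfit.rlasso` are ours for illustration; `hdm` also provides `rlassoIV` for a more automated version of this estimator.
+
+```{r}
+# Residualize outcome, treatment and instrument using the plug-in (theory-driven) Lasso penalty
+resy.rlasso <- rlasso(as.matrix(X), y)$residuals
+resD.rlasso <- rlasso(as.matrix(X), D)$residuals
+resZ.rlasso <- rlasso(as.matrix(X), Z)$residuals
+
+# IV estimate of alpha from the residual moment E[(Y.tilde - alpha * D.tilde) * Z.tilde] = 0
+ivfit.rlasso <- tsls(y = resy.rlasso, d = resD.rlasso, x = NULL, z = resZ.rlasso, intercept = FALSE)
+cat(sprintf("\ncoef (se) = %g (%g)\n", ivfit.rlasso$coef, ivfit.rlasso$se))
+```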
+ +### DML with Non-Linear ML Models and Cross-fitting + +```{r} +# DML for PLIVM with D and Z as classifiers or regressors +DML2.for.PLIVM <- function(x, d, z, y, dreg, yreg, zreg, nfold=5, method="regression") { + nobs <- nrow(x) + foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] + I <- split(1:nobs, foldid) + # create residualized objects to fill + ytil <- dtil <- ztil<- rep(NA, nobs) + # obtain cross-fitted residuals + cat("fold: ") + for(b in 1:length(I)){ + if (method == "randomforest"){ + # take a fold out + dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) + zfit <- zreg(x[-I[[b]],], z[-I[[b]]]) + yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) + # predict the fold out + dhat <- predict(dfit, x[I[[b]],], type="prob")[,2] # type = "prob" is like predict_proba in scikitlearn + zhat <- predict(zfit, x[I[[b]],], type="prob")[,2] + yhat <- predict(yfit, x[I[[b]],]) # default type = "response" for regression for RF, type = "vector" for regression for Decision Trees + # record residual + dtil[I[[b]]] <- (as.numeric(d[I[[b]]])-1 - dhat) # as.numeric will turn d = as.factor(d) from 0,1 to 1,2 so subtract 1! + ztil[I[[b]]] <- (as.numeric(z[I[[b]]])-1 - zhat) + ytil[I[[b]]] <- (y[I[[b]]] - yhat) + } else if (method == "regression") { # works for both boosted trees and glmnet + # take a fold out + dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) + zfit <- zreg(x[-I[[b]],], z[-I[[b]]]) + yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) + # predict the fold out + dhat <- predict(dfit, x[I[[b]],], type="response") + zhat <- predict(zfit, x[I[[b]],], type="response") + yhat <- predict(yfit, x[I[[b]],], type="response") + # record residual + dtil[I[[b]]] <- (d[I[[b]]] - dhat) + ztil[I[[b]]] <- (z[I[[b]]] - zhat) + ytil[I[[b]]] <- (y[I[[b]]] - yhat) + } else if (method == "decisiontrees"){ + # take a fold out + dfit <- dreg(x[-I[[b]],], as.factor(d)[-I[[b]]]) + zfit <- zreg(x[-I[[b]],], as.factor(z)[-I[[b]]]) + yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) + # predict the fold out + dhat <- predict(dfit, x[I[[b]],])[,2] + zhat <- predict(zfit, x[I[[b]],])[,2] + yhat <- predict(yfit, x[I[[b]],]) + # record residual + dtil[I[[b]]] <- (d[I[[b]]] - dhat) + ztil[I[[b]]] <- (z[I[[b]]] - zhat) + ytil[I[[b]]] <- (y[I[[b]]] - yhat) + } + + cat(b," ") + } + ivfit = tsls(y=ytil,d=dtil, x=NULL, z=ztil, intercept=FALSE) + coef.est <- ivfit$coef #extract coefficient + se <- ivfit$se #record standard error + cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) + + return( list(coef.est=coef.est, se=se, dtil=dtil, ytil=ytil, ztil=ztil) ) +} +``` + +```{r} +summary <- function(point, stderr, resy, resD, resZ, name) { + data <- data.frame( + estimate = point, # point estimate + stderr = stderr, # standard error + lower = point - 1.96 * stderr, # lower end of 95% confidence interval + upper = point + 1.96 * stderr, # upper end of 95% confidence interval + `rmse y` = sqrt(mean(resy^2)), # RMSE of model that predicts outcome y + `rmse D` = sqrt(mean(resD^2)), # RMSE of model that predicts treatment D + `rmse Z` = sqrt(mean(resZ^2)), # RMSE of model that predicts treatment D + `accuracy D` = mean(abs(resD) < 0.5), # binary classification accuracy of model for D + `accuracy Z` = mean(abs(resZ) < 0.5) # binary classification accuracy of model for Z + ) + rownames(data) <- name + return(data) +} +``` + +#### Double Lasso with Cross-Fitting + +```{r} +# DML with LassoCV +set.seed(123) +cat(sprintf("\nDML with Lasso CV \n")) + +dreg.lasso.cv <- function(x,d){ cv.glmnet(x, d, family="gaussian", alpha=1, nfolds=5)} +yreg.lasso.cv <- function(x,y){ 
cv.glmnet(x, y, family="gaussian", alpha=1, nfolds=5)} +zreg.lasso.cv <- function(x,z){ cv.glmnet(x, z, family="gaussian", alpha=1, nfolds=5)} + +DML2.results <- DML2.for.PLIVM(as.matrix(X), D, Z, y, dreg.lasso.cv, yreg.lasso.cv, zreg.lasso.cv, nfold=5, method="regression") +sum.lasso.cv <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'LassoCV') +tableplr <- data.frame() +tableplr <- rbind(sum.lasso.cv) +tableplr + +ytil.lasso <- DML2.results$ytil +dtil.lasso <- DML2.results$dtil +ztil.lasso <- DML2.results$ztil +``` + +#### Using a $\ell_2$ Penalized Logistic Regression for D and Z + +```{r} +# DML with Lasso/Ridge +set.seed(123) +cat(sprintf("\nDML with Lasso/Logistic \n")) + +dreg.ridge.cv <- function(x,d){cv.glmnet(x, d, family="binomial", alpha=0, nfolds=5)} +yreg.ridge.cv <- function(x,y){cv.glmnet(x, y, family="gaussian", alpha=1, nfolds=5)} +zreg.ridge.cv <- function(x,z){cv.glmnet(x, z, family="binomial", alpha=0, nfolds=5)} + +DML2.results <- DML2.for.PLIVM(as.matrix(X), D, Z, y, dreg.ridge.cv, yreg.ridge.cv, zreg.ridge.cv, nfold=5, method="regression") +sum.lasso_ridge.cv <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'LassoCV/LogisticCV') +tableplr <- rbind(tableplr, sum.lasso_ridge.cv) +tableplr + +ytil.ridge <- DML2.results$ytil +dtil.ridge <- DML2.results$dtil +ztil.ridge <- DML2.results$ztil +``` + +### Random Forests + +```{r} +# DML with Random Forest +set.seed(123) +cat(sprintf("\nDML with Random Forest \n")) + +dreg.rf <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest +yreg.rf <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest +zreg.rf <- function(x,z){randomForest(x, z, ntree=1000, nodesize=10)} #ML method=Forest + +DML2.results = DML2.for.PLIVM(as.matrix(X), as.factor(D), as.factor(Z), y, dreg.rf, yreg.rf, zreg.rf, nfold=5, method="randomforest") +sum.rf <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'RF') +tableplr <- rbind(tableplr, sum.rf) +tableplr + +ytil.rf <- DML2.results$ytil +dtil.rf <- DML2.results$dtil +ztil.rf <- DML2.results$ztil +``` + +### Decision Trees + +```{r} +# DML with Decision Trees +set.seed(123) +cat(sprintf("\nDML with Decision Trees \n")) + +dreg.tr <- function(x,d){rpart(as.formula("D~."), cbind(data.frame(D=d),x), method = "class", minbucket=10, cp = 0.001)} +yreg.tr <- function(x,y){rpart(as.formula("y~."), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)} +zreg.tr <- function(x,z){rpart(as.formula("Z~."), cbind(data.frame(Z=z),x), method = "class", minbucket=10, cp = 0.001)} + +DML2.results = DML2.for.PLIVM(X, D, Z, y, dreg.tr, yreg.tr, zreg.tr, nfold=5, method="decisiontrees") # decision tree takes in X as dataframe, not matrix/array +sum.tr <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'Decision Trees') +tableplr <- rbind(tableplr, sum.tr) +tableplr + +ytil.tr <- DML2.results$ytil +dtil.tr <- DML2.results$dtil +ztil.tr <- DML2.results$ztil +``` + +### Boosted Trees + +```{r} +# DML with Boosted Trees +set.seed(123) +cat(sprintf("\nDML with Boosted Trees \n")) + +# NB: early stopping cannot easily be implemented with gbm +## set n.trees = best, where best <- gbm.perf(dreg.boost, plot.it = FALSE) +dreg.boost <- function(x,d){gbm(as.formula("D~."), cbind(data.frame(D=d),x), distribution= "bernoulli", 
interaction.depth=2, n.trees=100, shrinkage=.1)} +yreg.boost <- function(x,y){gbm(as.formula("y~."), cbind(data.frame(y=y),x), distribution= "gaussian", interaction.depth=2, n.trees=100, shrinkage=.1)} +zreg.boost <- function(x,z){gbm(as.formula("Z~."), cbind(data.frame(Z=z),x), distribution= "bernoulli", interaction.depth=2, n.trees=100, shrinkage=.1)} + +# passing these through regression as type="response", and D and Z should not be factors! +DML2.results = DML2.for.PLIVM(X, D, Z, y, dreg.boost, yreg.boost, zreg.boost, nfold=5, method = "regression") +sum.boost <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'Boosted Trees') +tableplr <- rbind(tableplr, sum.boost) +tableplr + +ytil.boost <- DML2.results$ytil +dtil.boost <- DML2.results$dtil +ztil.boost <- DML2.results$ztil +``` + +## Ensembles + +Boosted trees give the best RMSE for Y, D, and Z, so the ensemble based on choosing the best performing prediction rule is identical to boosting in this case. + +```{r} +# Best fit is boosted trees for D, Z, Y + +sum.best <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$dtil, name = 'Best') +tableplr <- rbind(tableplr, sum.best) +tableplr +``` + +We'll form a model average with unconstrained least squares weights. + +```{r} +# Least squares model average + +dhat.lasso <- D - dtil.lasso +dhat.ridge <- D - dtil.ridge +dhat.rf <- D - dtil.rf +dhat.tr <- D - dtil.tr +dhat.boost <- D - dtil.boost + +yhat.lasso <- y - ytil.lasso +yhat.ridge <- y - ytil.ridge +yhat.rf <- y - ytil.rf +yhat.tr <- y - ytil.tr +yhat.boost <- y - ytil.boost + +zhat.lasso <- Z - ztil.lasso +zhat.ridge <- Z - ztil.ridge +zhat.rf <- Z - ztil.rf +zhat.tr <- Z - ztil.tr +zhat.boost <- Z - ztil.boost + +ma.dtil <- lm(D~dhat.lasso+dhat.ridge+dhat.rf+dhat.tr+dhat.boost)$residuals +ma.ytil <- lm(y~yhat.lasso+yhat.ridge+yhat.rf+yhat.tr+yhat.boost)$residuals +ma.ztil <- lm(Z~zhat.lasso+zhat.ridge+zhat.rf+zhat.tr+zhat.boost)$residuals + +ivfit = tsls(y=ma.ytil,d=ma.dtil, x=NULL, z=ma.ztil, intercept=FALSE) +coef.est <- ivfit$coef #extract coefficient +se <- ivfit$se #record standard error + +sum.ma <- summary(coef.est, se, ma.ytil, ma.dtil, ma.ztil, name = 'Model Average') +tableplr <- rbind(tableplr, sum.ma) +tableplr +``` + +## Inference Robust to Weak Identification + +Now we turn toward robustness when the instrument is weak. + +Ideally, we would do (semi) cross-fitting with AutoML in order to find good first-stage models and re-run DML with these models. Unfortunately this is not easy to do in R. In the case of semi cross-fitting, we can use R's H20 AutoML trained on the entire training set $y\sim X$, $D \sim X$, $Z\sim X$ to determine the best model (eg ensemble), but H20 does not allow you to extract the best model so we can re-use that in DML. + +Thus, in the below analysis of robust inference, we choose Boosted Trees as they perform well. + +```{r} +robust_inference <- function(point, stderr, resD, resy, resZ, grid, alpha = 0.05) { + # Inference in the partially linear IV model that is robust to weak identification. 
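+  # For each candidate theta in grid, the Anderson-Rubin statistic
+  #   C(theta) = n * mean((resy - theta * resD) * resZ)^2 / var((resy - theta * resD) * resZ)
+  # is compared to the chi-squared(1) critical value; the accepted values of theta
+  # collected below form the (1 - alpha) confidence region.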
+ # grid: grid of theta values to search over when trying to identify the confidence region + # alpha: confidence level + + n <- dim(X)[1] + thr <- qchisq(1 - alpha, df = 1) + accept <- c() + + for (theta in grid) { + moment <- (resy - theta * resD) * resZ + test <- n * mean(moment)^2 / var(moment) + if (test <= thr) { + accept <- c(accept, theta) + } + } + + return(accept) +} +``` + +```{r} +grid <- seq(0, 20000, length.out = 10000) +region <- robust_inference(DML2.results$coef.est, DML2.results$stderr, DML2.results$dtil, DML2.results$ytil, DML2.results$ztil, grid=grid) +``` + +```{r} +grid <- seq(0, 20000, length.out = 10000) +region <- robust_inference(DML2.results$coef.est, DML2.results$stderr, DML2.results$dtil, DML2.results$ytil, DML2.results$ztil, grid=grid)# Calculate min and max +min_region <- min(region) +max_region <- max(region) + +print(min_region) +print(max_region) +``` + +# Interactive IV Model and LATE + +Now, we consider estimation of local average treatment effects (LATE) of participation `p401`, with the binary instrument `e401`. As before, $Y$ denotes the outcome `net_tfa`, and $X$ is the vector of covariates. Here the structural equation model is: +\begin{eqnarray} +Y &:=& f_Y (D, X, A, \epsilon_Y) \\ +D &:= & f_D(Z, X, A, \epsilon_D) \in \{0,1\}, \\ +Z &:= & f_Z(X,\epsilon_Z) \in \{0,1\}, \\ +X &:=& \epsilon_X, \quad A = \epsilon_A, +\end{eqnarray} +where $\epsilon$'s are all exogenous and independent, +and +$$ +z \mapsto f_D(z , A, X, \epsilon_D) \text{ is weakly increasing (weakly monotone)}. +$$ +and $A$ is a vector of unobserved confounders. Note that in our setting monotonicity is satisfied, since participation is only feasible when it is eligible. Thus we have that $D=0$ whenever $Z=0$. Thus it can only be that $f_D(1, A, X, \epsilon_D) \geq 0 = f_D(0, A, X, \epsilon_D)$. + +In this case, we can estimate the local average treatment effect (LATE): +$$ +\alpha = E[Y(1) - Y(0) | D(1) > D(0)] +$$ +This can be identified using the Neyman orthogonal moment equation: +\begin{align} +E\left[g(1, X) - g(0, X) + H(Z) (Y - g(Z, X)) - \alpha \cdot (m(1, X) - m(0, X) + H(Z) (D - m(Z, X))\right] = 0 +\end{align} +where +\begin{align} +g(Z,X) =~& E[Y|Z,X],\\ +m(Z,X) =~& E[D|Z,X],\\ +H(Z) =~& \frac{Z}{Pr(Z=1|X)} - \frac{1 - Z}{1 - Pr(Z=1|X)} +\end{align} + +```{r} +# DML for IIVM with D and Z as classifiers or regressors +DML2.for.IIVM <- function(x, d, z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method="classification", dt=0, bt=0) { + # this implements DML2 algorithm, where there moments are estimated via DML, before constructing + # the pooled estimate of theta randomly split data into folds + + ## NB This method has many if statements to accommodate the various estimators we will use. + ## Unlike Python's sklearn, all methods have idfferent default arguments in their predict functions. + ## See official R documentation for details. 
+ + yhat0 <- rep(0, length(y)) + yhat1 <- rep(0, length(y)) + dhat0 <- rep(0, length(d)) + dhat1 <- rep(0, length(d)) + zhat <- rep(0, length(Z)) + + nobs <- nrow(X) + foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] + I <- split(1:nobs, foldid) + # create residualized objects to fill + ytil <- dtil <- ztil<- rep(NA, nobs) + + # obtain cross-fitted residuals + cat("fold: ") + for(b in 1:length(I)){ + + # define helpful variables + Xb = X[I[[b]],] + Xnotb = X[-I[[b]],] + Znotb = Z[-I[[b]]] + + # training dfs subsetted on the -I[[b]] fold + XZ0 = X[-I[[b]],][Z[-I[[b]]]==0] + yZ0 = y[-I[[b]]][Z[-I[[b]]]==0] + XZ1 = X[-I[[b]],][Z[-I[[b]]]==1] + yZ1 = y[-I[[b]]][Z[-I[[b]]]==1] + DZ0 = d[-I[[b]]][Z[-I[[b]]]==0] + DZ1 = d[-I[[b]]][Z[-I[[b]]]==1] + + + if (method == "regression") { + XZ0 = as.matrix(XZ0) + XZ1 = as.matrix(XZ1) + Xb = as.matrix(Xb) + Xnotb = as.matrix(Xnotb) + + # Train an outcome model on training data that received Z=0 and predict outcome on all data in the test set + yfit0 <- yreg0((XZ0), yZ0) + yhat0[I[[b]]] <- predict(yfit0, (Xb)) # default type = "response" + + # train an outcome model on training data that received Z=1 and predict outcome on all data in test set + yfit1 <- yreg1((XZ1), yZ1) + yhat1[I[[b]]] <- predict(yfit1, (Xb)) + + # train a treatment model on training data that received Z=0 and predict treatment on all data in test set + if (mean(DZ0) > 0) { # it could be that D=0, whenever Z=0 deterministically + dreg0_ <- dreg0 + dfit0 <- dreg0_((XZ0), DZ0) + dhat0[I[[b]]] <- predict(dfit0, (Xb), type="response") # default type = "response", but for family binomial it's logg odds + } + # train a treamtent model on training data that received Z=1 and predict treatment on all data in test set + if (mean(DZ1) < 1) { # it could be that D=1, whenever Z=1 deterministically + dreg1_ <- dreg1 + dfit1 <- dreg1_((XZ1), DZ1) + dhat1[I[[b]]] <- predict(dfit1, (Xb), type="response") + } else { + dhat1[I[[b]]] <- 1 + } + + } else if (method == "randomforest") { + DZ0factor = as.factor(D)[-I[[b]]][Z[-I[[b]]]==0] + DZ1factor = as.factor(D)[-I[[b]]][Z[-I[[b]]]==1] + Znotb = as.factor(Znotb) + + yfit0 <- yreg0((XZ0), yZ0) + yhat0[I[[b]]] <- predict(yfit0, (Xb), type="response") + yfit1 <- yreg1((XZ1), yZ1) + yhat1[I[[b]]] <- predict(yfit1, (Xb), type="response") + + if (mean(DZ0) > 0) { + dreg0_ <- dreg0 + dfit0 <- dreg0_((XZ0), DZ0factor) + dhat0[I[[b]]] <- predict(dfit0, (Xb), type="prob")[,2] # get second column because type = "prob" + } + if (mean(DZ1) < 1) { + dreg1_ <- dreg1 + dfit1 <- dreg1_((XZ1), DZ1factor) + dhat1[I[[b]]] <- predict(dfit1, (Xb), type="prob")[,2] + } else { + dhat1[I[[b]]] <- 1 + } + + } else if (method == "decisiontrees") { + XZ0 = as.data.frame(XZ0) + XZ1 = as.data.frame(XZ1) + Xb = as.data.frame(Xb) + Xnotb = as.data.frame(Xnotb) + + yfit0 <- yreg0((XZ0), yZ0) + yhat0[I[[b]]] <- predict(yfit0, (Xb)) # default type = "response" for decision trees for continuous response + + yfit1 <- yreg1((XZ1), yZ1) + yhat1[I[[b]]] <- predict(yfit1, (Xb)) + + if (mean(DZ0) > 0) { + dreg0_ <- dreg0 + dfit0 <- dreg0_((XZ0), as.factor(DZ0)) + dhat0[I[[b]]] <- predict(dfit0, (Xb))[,2] # for decision trees, default = "prob" for decision trees with factor responses + } + + if (mean(DZ1) < 1) { + dreg1_ <- dreg1 + dfit1 <- dreg1_((XZ1), as.factor(DZ1)) + dhat1[I[[b]]] <- predict(dfit1, (Xb))[,2] + } else { + dhat1[I[[b]]] <- 1 + } + + } else if (method == "boostedtrees") { + XZ0 = as.data.frame(XZ0) + XZ1 = as.data.frame(XZ1) + Xb = as.data.frame(Xb) + Xnotb 
= as.data.frame(Xnotb) + + yfit0 <- yreg0((XZ0), yZ0) + yhat0[I[[b]]] <- predict(yfit0, (Xb)) # default type = "response" for boosted trees + yfit1 <- yreg1((XZ1), yZ1) + yhat1[I[[b]]] <- predict(yfit1, (Xb)) + + if (mean(DZ0) > 0) { + dreg0_ <- dreg0 + dfit0 <- dreg0_((XZ0), DZ0) + dhat0[I[[b]]] <- predict(dfit0, (Xb), type = "response") # default for boosted trees is log odds. + } + if (mean(DZ1) < 1) { + dreg1_ <- dreg1 + dfit1 <- dreg1_((XZ1), DZ1) + dhat1[I[[b]]] <- predict(dfit1, (Xb), type = "response") + } else { + dhat1[I[[b]]] <- 1 + } + + } + + # propensity scores: + if (method == "regression"){ + zfit_b <- zreg((Xnotb), Znotb) + zhat_b <- predict(zfit_b, (Xb), type="response") + } else if (method == "randomforest"){ + zfit_b <- zreg((Xnotb), Znotb) + zhat_b <- predict(zfit_b, (Xb), type = "prob")[,2] + } else if (method == "decisiontrees"){ + zfit_b <- zreg((Xnotb), as.factor(Znotb)) + zhat_b <- predict(zfit_b, (Xb)) # default is prob, so get second column + zhat_b = zhat_b[,2] + } else if (method == "boostedtrees"){ + zfit_b <- zreg((Xnotb), Znotb) + zhat_b <- predict(zfit_b, (Xb), type = "response") + } + zhat_b <- pmax(pmin(zhat_b, 1 - trimming), trimming) # trimming so scores are between [trimming, (1-trimming)] + zhat[I[[b]]] <- zhat_b + + cat(b," ") + } + + + # Prediction of treatment and outcome for observed instrument + yhat <- yhat0 * (1 - Z) + yhat1 * Z + dhat <- dhat0 * (1 - Z) + dhat1 * Z + + # residuals + ytil <- y-yhat + dtil <- D-dhat + ztil <- Z-zhat + + # doubly robust quantity for every sample + HZ <- Z / zhat - (1 - Z) / (1 - zhat) + drZ <- yhat1 - yhat0 + (y - yhat) * HZ + drD <- dhat1 - dhat0 + (D - dhat) * HZ + coef.est <- mean(drZ) / mean(drD) + cat("point", coef.est) + psi <- drZ - coef.est * drD + Jhat <- mean(drD) + variance <- mean(psi^2) / Jhat^2 + se <- sqrt(variance / nrow(X)) + cat("se", se) + + return(list(coef.est = coef.est, se = se, yhat = yhat, dhat = dhat, zhat = zhat, ytil = ytil, dtil = dtil, ztil = ztil, drZ = drZ, drD = drD, yhat0 = yhat0, yhat1 = yhat1, dhat0 = dhat0, dhat1 = dhat1)) +} +``` + +```{r} +summary <- function(coef.est, se, yhat, dhat, zhat, ytil, dtil, ztil, drZ, drD, X, Z, D, y, name) { + summary_data <- data.frame(estimate = coef.est, # point estimate + se = se, # standard error + lower = coef.est - 1.96 * se, # lower end of 95% confidence interval + upper = coef.est + 1.96 * se, # upper end of 95% confidence interval + rmse_y = sqrt(mean(ytil^2)), # res of model that predicts outcome y + rmse_D = sqrt(mean(dtil^2)), # res of model that predicts treatment D + rmse_Z = sqrt(mean(ztil^2)), # res of model that predicts instrument Z + accuracy_D = mean(abs(dtil) < 0.5), # binary classification accuracy of model for D + accuracy_Z = mean(abs(ztil) < 0.5) # binary classification accuracy of model for Z + ) + row.names(summary_data) <- name + return(summary_data) +} +``` + +```{r} +# DML with Lasso/Ridge +set.seed(123) +cat(sprintf("\nDML with Lasso/Logistic \n")) +# DML with Lasso/Ridge +dreg0 <- function(x,d){cv.glmnet(x, d, family="binomial", alpha=0, nfolds=5)} +dreg1 <- function(x,d){cv.glmnet(x, d, family="binomial", alpha=0, nfolds=5)} +yreg0 <- function(x,y){cv.glmnet(x, y, family="gaussian", alpha=1, nfolds=5)} +yreg1 <- function(x,y){cv.glmnet(x, y, family="gaussian", alpha=1, nfolds=5)} +zreg <- function(x,z){cv.glmnet(x, z, family="binomial", alpha=0, nfolds=5)} + +DML2.results <- DML2.for.IIVM(as.matrix(X), D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method="regression") +sum.lasso_ridge.cv 
<-summary(DML2.results$coef.est, DML2.results$se, DML2.results$yhat, DML2.results$dhat, DML2.results$zhat, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, DML2.results$drZ, DML2.results$drD, name = 'LassoCV/LogisticCV') +table <- data.frame() +table <- rbind(table, sum.lasso_ridge.cv) +table + +yhat.lasso = DML2.results$yhat +dhat.lasso = DML2.results$dhat +yhat0.lasso = DML2.results$yhat0 +yhat1.lasso = DML2.results$yhat1 +dhat0.lasso = DML2.results$dhat0 +dhat1.lasso = DML2.results$dhat1 +zhat.lasso = DML2.results$zhat +``` + +```{r} +# DML with Random Forest +set.seed(123) +cat(sprintf("\nDML with Random Forest \n")) + +dreg0 <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest +dreg1 <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest +yreg0 <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest +yreg1 <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest +zreg <- function(x,z){randomForest(x, z, ntree=1000, nodesize=10)} #ML method=Forest + +DML2.results <- DML2.for.IIVM(X,D,Z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method="randomforest") +sum.rf <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$yhat, DML2.results$dhat, DML2.results$zhat, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, DML2.results$drZ, DML2.results$drD, name = 'RF') +table <- rbind(table, sum.rf) +table + +yhat.rf = DML2.results$yhat +dhat.rf = DML2.results$dhat +yhat0.rf = DML2.results$yhat0 +yhat1.rf = DML2.results$yhat1 +dhat0.rf = DML2.results$dhat0 +dhat1.rf = DML2.results$dhat1 +zhat.rf = DML2.results$zhat +``` + +```{r} +# DML with Decision Trees +set.seed(123) +cat(sprintf("\nDML with Decision Trees \n")) + +dreg0 <- function(x,d){rpart(as.formula("D~."), cbind(data.frame(D=d),x), method = "class", minbucket=10, cp = 0.001)} +dreg1 <- function(x,d){rpart(as.formula("D~."), cbind(data.frame(D=d),x), method = "class", minbucket=10, cp = 0.001)} +yreg0 <- function(x,y){rpart(as.formula("y~."), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)} +yreg1 <- function(x,y){rpart(as.formula("y~."), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)} +zreg <- function(x,z){rpart(as.formula("Z~."), cbind(data.frame(Z=z),x), method = "class", minbucket=10, cp = 0.001)} + +DML2.results <- DML2.for.IIVM(X, D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method="decisiontrees") +sum.tr <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$yhat, DML2.results$dhat, DML2.results$zhat, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, DML2.results$drZ, DML2.results$drD, name = 'Decision Trees') +table <- rbind(table, sum.tr) +table + +yhat.tr = DML2.results$yhat +dhat.tr = DML2.results$dhat +yhat0.tr = DML2.results$yhat0 +yhat1.tr = DML2.results$yhat1 +dhat0.tr = DML2.results$dhat0 +dhat1.tr = DML2.results$dhat1 +zhat.tr = DML2.results$zhat +``` + +```{r} +# DML with Boosted Trees +set.seed(123) +cat(sprintf("\nDML with Boosted Trees \n")) + +# NB: early stopping cannot easily be implemented with gbm +## set n.trees = best, where best <- gbm.perf(dreg.boost, plot.it = FALSE) +dreg0 <- function(x,d){gbm(as.formula("D~."), cbind(data.frame(D=d),x), distribution= "bernoulli", interaction.depth=2, n.trees=100, shrinkage=.1)} +dreg1 <- function(x,d){gbm(as.formula("D~."), cbind(data.frame(D=d),x), distribution= "bernoulli", interaction.depth=2, n.trees=100, shrinkage=.1)} +yreg0 <- function(x,y){gbm(as.formula("y~."), 
cbind(data.frame(y=y),x), distribution= "gaussian", interaction.depth=2, n.trees=100, shrinkage=.1)} +yreg1 <- function(x,y){gbm(as.formula("y~."), cbind(data.frame(y=y),x), distribution= "gaussian", interaction.depth=2, n.trees=100, shrinkage=.1)} +zreg <- function(x,z){gbm(as.formula("Z~."), cbind(data.frame(Z=z),x), distribution= "bernoulli", interaction.depth=2, n.trees=100, shrinkage=.1)} + +# passing these through regression as type="response", and D and Z should not be factors! +DML2.results <- DML2.for.IIVM(X, D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method="boostedtrees") +sum.boost <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$yhat, DML2.results$dhat, DML2.results$zhat, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, DML2.results$drZ, DML2.results$drD, name = 'Boosted Trees') +table <- rbind(table, sum.boost) +table + +yhat.boost = DML2.results$yhat +dhat.boost = DML2.results$dhat +yhat0.boost = DML2.results$yhat0 +yhat1.boost = DML2.results$yhat1 +dhat0.boost = DML2.results$dhat0 +dhat1.boost = DML2.results$dhat1 +zhat.boost = DML2.results$zhat +``` + +## Ensembles + +Boosted trees give the best RMSE for D and Z and random forests give the best RMSE for Y. + +```{r} +# Best fit is boosted trees for D, Z and random forests for Y + +best.yhat0 <- yhat0.rf +best.yhat1 <- yhat1.rf +best.yhat <- yhat.rf + +best.dhat0 <- dhat0.boost +best.dhat1 <- dhat1.boost +best.dhat <- dhat.boost + +best.zhat <- zhat.boost + +ytil.best <- y - best.yhat +dtil.best <- D - best.dhat +ztil.best <- Z - best.zhat + +# doubly robust quantity for every sample +HZ <- Z / best.zhat - (1 - Z) / (1 - best.zhat) +drZ <- best.yhat1 - best.yhat0 + (y - best.yhat) * HZ +drD <- best.dhat1 - best.dhat0 + (D - best.dhat) * HZ +coef.est <- mean(drZ) / mean(drD) +psi <- drZ - coef.est * drD +Jhat <- mean(drD) +variance <- mean(psi^2) / Jhat^2 +se <- sqrt(variance / nrow(X)) + +sum.best <- summary(coef.est, se, best.yhat, best.dhat, best.zhat, ytil.best, dtil.best, ztil.best, drZ, drD, name = 'Best') +table <- rbind(table, sum.best) +table +``` + +We'll form a model average with unconstrained least squares weights. 
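+
+Concretely, for each nuisance the weights solve the unconstrained least squares problem
+$$
+\hat{w} = \arg\min_{w} \sum_{i=1}^n \Big(V_i - \sum_{k} w_k \hat{V}_{k,i}\Big)^2,
+$$
+for $V \in \{Y, D, Z\}$, where $\hat{V}_{k,i}$ is learner $k$'s prediction for observation $i$; the ensemble prediction is then $\sum_k \hat{w}_k \hat{V}_{k,i}$, as implemented in the chunk below.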
+ +```{r} +# Least squares model average +ma.dcoef <- lm(D~dhat.lasso+dhat.rf+dhat.tr+dhat.boost-1)$coef +ma.ycoef <- lm(y~yhat.lasso+yhat.rf+yhat.tr+yhat.boost-1)$coef +ma.zcoef <- lm(Z~zhat.lasso+zhat.rf+zhat.tr+zhat.boost-1)$coef + +ma.yhat0 <- cbind(yhat0.lasso,yhat0.rf,yhat0.tr,yhat0.boost)%*%as.matrix(ma.ycoef) +ma.yhat1 <- cbind(yhat1.lasso,yhat1.rf,yhat1.tr,yhat1.boost)%*%as.matrix(ma.ycoef) +ma.dhat0 <- cbind(dhat0.lasso,dhat0.rf,dhat0.tr,dhat0.boost)%*%as.matrix(ma.dcoef) +ma.dhat1 <- cbind(dhat1.lasso,dhat1.rf,dhat1.tr,dhat1.boost)%*%as.matrix(ma.dcoef) +ma.zhat <- cbind(zhat.lasso,zhat.rf,zhat.tr,zhat.boost)%*%as.matrix(ma.zcoef) + +# Prediction of treatment and outcome for observed instrument +ma.yhat <- ma.yhat0 * (1 - Z) + ma.yhat1 * Z +ma.dhat <- ma.dhat0 * (1 - Z) + ma.dhat1 * Z + +# residuals +ma.ytil <- y-ma.yhat +ma.dtil <- D-ma.dhat +ma.ztil <- Z-ma.zhat + +# doubly robust quantity for every sample +HZ <- Z / ma.zhat - (1 - Z) / (1 - ma.zhat) +drZ <- ma.yhat1 - ma.yhat0 + (y - ma.yhat) * HZ +drD <- ma.dhat1 - ma.dhat0 + (D - ma.dhat) * HZ +coef.est <- mean(drZ) / mean(drD) +psi <- drZ - coef.est * drD +Jhat <- mean(drD) +variance <- mean(psi^2) / Jhat^2 +se <- sqrt(variance / nrow(X)) + +sum.ma <- summary(coef.est, se, ma.yhat, ma.dhat, ma.zhat, ma.ytil, ma.dtil, ma.ztil, drZ, drD, name = 'Model Average') +table <- rbind(table, sum.ma) +table +``` + +Comparing with the PLR model + +```{r} +tableplr +``` + +We find that the PLR model overestimates the effect by around 1k; though both sets of results have overlapping confidence intervals. + + +Again as before, ideally we would do (semi) cross-fitting with AutoML in order to find good first-stage models and re-run DML with these models. Unfortunately this is not easy to do in R. + +As before, in the below analysis of robust inference, we choose Boosted Trees as they perform well in RMSE and accuracy on first-stage models. + +```{r} +iivm_robust_inference <- function(point, stderr, yhat, Dhat, Zhat, resy, resD, resZ, drZ, drD, X, Z, D, y, grid, alpha = 0.05) { + # Inference in the partially linear IV model that is robust to weak identification. + # grid: grid of theta values to search over when trying to identify the confidence region + # alpha: confidence level + + n <- dim(X)[1] + thr <- qchisq(1 - alpha, df = 1) + accept <- c() + + for (theta in grid) { + moment <- drZ - theta * drD + test <- n * mean(moment)^2 / var(moment) + if (test <= thr) { + accept <- c(accept, theta) + } + } + + return(accept) +} +``` + +```{r} +grid <- seq(0, 20000, length.out = 10000) +region <- iivm_robust_inference(point = DML2.results$coef.est, stderr = DML2.results$se, yhat = DML2.results$yhat, Dhat = DML2.results$dhat, Zhat = DML2.results$zhat, resy = DML2.results$ytil, resD = DML2.results$dtil, resZ = DML2.results$ztil, drZ = DML2.results$drZ, drD = DML2.results$drD, X=X, Z=Z, D=D, y=y, grid=grid) + +# Calculate min and max +min_region <- min(region) +max_region <- max(region) + +print(min_region) +print(max_region) +``` + +We find again that the robust inference confidence region is almost identical to the normal based inference. We are most probably in the strong instrument regime. We can check the t-statistic for the effect of the instrument on the treatment to verify this. + +# DoubleML package + +There exist nice packages out there that can help us do our estimation with the simple call of a function. Such packages include `EconML` (Python) and `DoubleML` (Python and R). + +We run through IIVM using `DoubleML` below to illustrate. 
The `DoubleML` package internally builds on `mlr3`. We use the meta package `mlr3` to generate predictions with machine learning methods. A comprehensive introduction and description of the `mlr3` package is provided in the [mlr3book](https://mlr3book.mlr-org.com/). A list of all learners that you can use in `mlr3` can be found [here](https://mlr3extralearners.mlr-org.com/articles/learners/list_learners.html). The entry in the columns *mlr3 Package* and *Packages* indicate which packages must be installed/loaded in your R session. + +You find additional information about `DoubleML` on the package on the package website https://docs.doubleml.org/ and the R documentation page https://docs.doubleml.org/r/stable/. + +```{r} +install.packages("DoubleML") +install.packages("mlr3learners") +install.packages("mlr3") +install.packages("data.table") +install.packages("ranger") + +library(DoubleML) +library(mlr3learners) +library(mlr3) +library(data.table) +library(ranger) +``` + +## Local Average Treatment Effects of 401(k) Participation on Net Financial Assets + +## Interactive IV Model (IIVM) + +Now, we consider estimation of local average treatment effects (LATE) of participation with the binary instrument `e401`. As before, $Y$ denotes the outcome `net_tfa`, and $X$ is the vector of covariates. Here the structural equation model is: + +\begin{eqnarray} +& Y = g_0(Z,X) + U, &\quad E[U\mid Z,X] = 0,\\ +& D = r_0(Z,X) + V, &\quad E[V\mid Z, X] = 0,\\ +& Z = m_0(X) + \zeta, &\quad E[\zeta \mid X] = 0. +\end{eqnarray} + +```{r} +# Constructing the data (as DoubleMLData) +formula_flex2 = "net_tfa ~ p401+ e401 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown" +model_flex2 = as.data.table(model.frame(formula_flex2, data)) +x_cols = colnames(model_flex2)[-c(1,2,3)] +data_IV = DoubleMLData$new(model_flex2, y_col = "net_tfa", d_cols = "p401", z_cols ="e401",x_cols=x_cols) +``` + +```{r} +lgr::get_logger("mlr3")$set_threshold("warn") +lasso <- lrn("regr.cv_glmnet",nfolds = 5, s = "lambda.min") +lasso_class <- lrn("classif.cv_glmnet", nfolds = 5, s = "lambda.min") +dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = lasso, + ml_m = lasso_class, ml_r = lasso_class,n_folds=5, subgroups = list(always_takers = FALSE, + never_takers = TRUE)) +dml_MLIIVM$fit(store_predictions=TRUE) +dml_MLIIVM$summary() +lasso_MLIIVM <- dml_MLIIVM$coef +lasso_std_MLIIVM <- dml_MLIIVM$se +``` + +The confidence interval for the local average treatment effect of participation is given by + +```{r} +dml_MLIIVM$confint(level = 0.95) +``` + +Here we can also check the accuracy of the model: + +```{r} +# variables +y <- as.matrix(pension$net_tfa) # true observations +d <- as.matrix(pension$p401) +z <- as.matrix(pension$e401) + +# predictions +dml_MLIIVM$params_names() +g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(z=0, X) +g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(z=1, X) +g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0 +r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(z=0, X) +r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(z=1, X) +r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0 +m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o +``` + +```{r} +# cross-fitted RMSE: outcome +lasso_y_MLIIVM <- sqrt(mean((y-g_hat)^2)) +lasso_y_MLIIVM + +# cross-fitted RMSE: treatment +lasso_d_MLIIVM <- sqrt(mean((d-r_hat)^2)) +lasso_d_MLIIVM 
+ +# cross-fitted RMSE: instrument +lasso_z_MLIIVM <- sqrt(mean((z-m_hat)^2)) +lasso_z_MLIIVM +``` + +Again, we repeat the procedure for the other machine learning methods: + +```{r} +# needed to run boosting +remotes::install_github("mlr-org/mlr3extralearners") +install.packages("mlr3extralearners") +install.packages("mboost") +library(mlr3extralearners) +library(mboost) +``` + +```{r} +# Forest +randomForest <- lrn("regr.ranger") +randomForest_class <- lrn("classif.ranger") + +# Trees +trees <- lrn("regr.rpart") +trees_class <- lrn("classif.rpart") + +# Boosting +boost <- lrn("regr.glmboost") +boost_class <- lrn("classif.glmboost") +``` + +```{r} +### random forest ### + +lgr::get_logger("mlr3")$set_threshold("warn") +dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = randomForest, + ml_m = randomForest_class, ml_r = randomForest_class,n_folds=3, subgroups = list(always_takers = FALSE, + never_takers = TRUE)) +dml_MLIIVM$fit(store_predictions=TRUE) +dml_MLIIVM$summary() +forest_MLIIVM <- dml_MLIIVM$coef +forest_std_MLIIVM <- dml_MLIIVM$se + +# predictions +g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(Z=0, X) +g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(Z=1, X) +g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0 +r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(Z=0, X) +r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(Z=1, X) +r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0 +m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o + +# cross-fitted RMSE: outcome +forest_y_MLIIVM <- sqrt(mean((y-g_hat)^2)) +forest_y_MLIIVM + +# cross-fitted RMSE: treatment +forest_d_MLIIVM <- sqrt(mean((d-r_hat)^2)) +forest_d_MLIIVM + +# cross-fitted RMSE: instrument +forest_z_MLIIVM <- sqrt(mean((z-m_hat)^2)) +forest_z_MLIIVM + +### trees ### + +dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = trees, + ml_m = trees_class, ml_r = trees_class,n_folds=3, subgroups = list(always_takers = FALSE, + never_takers = TRUE)) +dml_MLIIVM$fit(store_predictions=TRUE) +dml_MLIIVM$summary() +tree_MLIIVM <- dml_MLIIVM$coef +tree_std_MLIIVM <- dml_MLIIVM$se + +# predictions +g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(Z=0, X) +g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(Z=1, X) +g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0 +r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(Z=0, X) +r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(Z=1, X) +r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0 +m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o + +# cross-fitted RMSE: outcome +tree_y_MLIIVM <- sqrt(mean((y-g_hat)^2)) +tree_y_MLIIVM + +# cross-fitted RMSE: treatment +tree_d_MLIIVM <- sqrt(mean((d-r_hat)^2)) +tree_d_MLIIVM + +# cross-fitted RMSE: instrument +tree_z_MLIIVM <- sqrt(mean((z-m_hat)^2)) +tree_z_MLIIVM + + +### boosting ### +dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = boost, + ml_m = boost_class, ml_r = boost_class,n_folds=3, subgroups = list(always_takers = FALSE, + never_takers = TRUE)) +dml_MLIIVM$fit(store_predictions=TRUE) +dml_MLIIVM$summary() +boost_MLIIVM <- dml_MLIIVM$coef +boost_std_MLIIVM <- dml_MLIIVM$se + +# predictions +g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(Z=0, X) +g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(Z=1, X) +g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0 +r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) 
# predictions of r_0(Z=0, X) +r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(Z=1, X) +r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0 +m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o + +# cross-fitted RMSE: outcome +boost_y_MLIIVM <- sqrt(mean((y-g_hat)^2)) +boost_y_MLIIVM + +# cross-fitted RMSE: treatment +boost_d_MLIIVM <- sqrt(mean((d-r_hat)^2)) +boost_d_MLIIVM + +# cross-fitted RMSE: instrument +boost_z_MLIIVM <- sqrt(mean((z-m_hat)^2)) +boost_z_MLIIVM +``` + +```{r} +table <- matrix(0, 5, 4) +table[1,1:4] <- c(lasso_MLIIVM,forest_MLIIVM,tree_MLIIVM,boost_MLIIVM) +table[2,1:4] <- c(lasso_std_MLIIVM,forest_std_MLIIVM,tree_std_MLIIVM,boost_std_MLIIVM) +table[3,1:4] <- c(lasso_y_MLIIVM,forest_y_MLIIVM,tree_y_MLIIVM,boost_y_MLIIVM) +table[4,1:4] <- c(lasso_d_MLIIVM,forest_d_MLIIVM,tree_d_MLIIVM,boost_d_MLIIVM) +table[5,1:4] <- c(lasso_z_MLIIVM,forest_z_MLIIVM,tree_z_MLIIVM,boost_z_MLIIVM) +rownames(table) <- c("Estimate","Std.Error","RMSE Y","RMSE D","RMSE Z") +colnames(table) <- c("Lasso","Random Forest","Trees","Boosting") +tab<- xtable(table, digits = 2) +tab +``` + +We report results based on four ML methods for estimating the nuisance functions used in +forming the orthogonal estimating equations. We find again that the estimates of the treatment effect are stable across ML methods. The estimates are highly significant, hence we would reject the hypothesis +that the effect of 401(k) participation has no effect on financial health. + +We might rerun the model using the best ML method for each equation to get a final estimate for the treatment effect of participation: + +```{r} +lgr::get_logger("mlr3")$set_threshold("warn") +dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = randomForest, + ml_m = lasso_class, ml_r = lasso_class,n_folds=5, subgroups = list(always_takers = FALSE, + never_takers = TRUE)) +dml_MLIIVM$fit(store_predictions=TRUE) +dml_MLIIVM$summary() +best_MLIIVM <- dml_MLIIVM$coef +best_std_MLIIVM <- dml_MLIIVM$se +``` + diff --git a/AC2/r-dml-401k-IV.irnb b/AC2/r-dml-401k-IV.irnb index 72328deb..b2bd7540 100644 --- a/AC2/r-dml-401k-IV.irnb +++ b/AC2/r-dml-401k-IV.irnb @@ -1,2240 +1,2150 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "f02fa044", - "metadata": { - "papermill": { - "duration": 0.012988, - "end_time": "2022-04-19T09:06:48.772902", - "exception": false, - "start_time": "2022-04-19T09:06:48.759914", - "status": "completed" - }, - "tags": [], - "id": "f02fa044" - }, - "source": [ - "# Inference on Predictive and Causal Effects in High-Dimensional Nonlinear Models" - ] - }, - { - "cell_type": "markdown", - "id": "23154404", - "metadata": { - "papermill": { - "duration": 0.009437, - "end_time": "2022-04-19T09:06:48.791895", - "exception": false, - "start_time": "2022-04-19T09:06:48.782458", - "status": "completed" - }, - "tags": [], - "id": "23154404" - }, - "source": [ - "## Impact of 401(k) on Financial Wealth\n", - "\n", - "We consider estimation of the effect of 401(k) participation\n", - "on accumulated assets. 401(k) plans are pension accounts sponsored by employers. The key problem in determining the effect of participation in 401(k) plans on accumulated assets is saver heterogeneity coupled with the fact that the decision to enroll in a 401(k) is non-random. It is generally recognized that some people have a higher preference for saving than others. 
It also seems likely that those individuals with high unobserved preference for saving would be most likely to choose to participate in tax-advantaged retirement savings plans and would tend to have otherwise high amounts of accumulated assets. The presence of unobserved savings preferences with these properties then implies that conventional estimates that do not account for saver heterogeneity and endogeneity of participation will be biased upward, tending to overstate the savings effects of 401(k) participation.\n", - "\n", - "One can argue that eligibility for enrolling in a 401(k) plan in this data can be taken as exogenous after conditioning on a few observables of which the most important for their argument is income. The basic idea is that, at least around the time 401(k)’s initially became available, people were unlikely to be basing their employment decisions on whether an employer offered a 401(k) but would instead focus on income and other aspects of the job." - ] - }, - { - "cell_type": "code", - "source": [ - "install.packages(\"xtable\")\n", - "install.packages(\"hdm\")\n", - "install.packages(\"sandwich\")\n", - "install.packages(\"ggplot2\")\n", - "install.packages(\"randomForest\")\n", - "install.packages(\"glmnet\")\n", - "install.packages(\"rpart\")\n", - "install.packages(\"gbm\")\n", - "\n", - "library(xtable)\n", - "library(hdm)\n", - "library(sandwich)\n", - "library(ggplot2)\n", - "library(randomForest)\n", - "library(data.table)\n", - "library(glmnet)\n", - "library(rpart)\n", - "library(gbm)\n", - "\n", - "set.seed(123)" - ], - "metadata": { - "id": "KmAkbDiVE7wm" - }, - "id": "KmAkbDiVE7wm", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "id": "7e23cba0", - "metadata": { - "papermill": { - "duration": 0.009588, - "end_time": "2022-04-19T09:06:48.810853", - "exception": false, - "start_time": "2022-04-19T09:06:48.801265", - "status": "completed" - }, - "tags": [], - "id": "7e23cba0" - }, - "source": [ - "### Data\n", - "\n", - "The raw dataset can be found [here](https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/401k.csv).\n", - "The data set can be loaded from the `hdm` package for R directly by typing:\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c442abdc", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:06:48.833250Z", - "iopub.status.busy": "2022-04-19T09:06:48.831101Z", - "iopub.status.idle": "2022-04-19T09:06:49.281559Z", - "shell.execute_reply": "2022-04-19T09:06:49.279778Z" - }, - "papermill": { - "duration": 0.46397, - "end_time": "2022-04-19T09:06:49.283933", - "exception": false, - "start_time": "2022-04-19T09:06:48.819963", - "status": "completed" - }, - "tags": [], - "id": "c442abdc" - }, - "outputs": [], - "source": [ - "data(pension)\n", - "data <- pension\n", - "dim(data)" - ] - }, - { - "cell_type": "markdown", - "id": "e47fa9d3", - "metadata": { - "papermill": { - "duration": 0.009462, - "end_time": "2022-04-19T09:06:49.302928", - "exception": false, - "start_time": "2022-04-19T09:06:49.293466", - "status": "completed" - }, - "tags": [], - "id": "e47fa9d3" - }, - "source": [ - "See the \"Details\" section on the description of the data set, which can be accessed by\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "00e04b82", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:06:49.394579Z", - "iopub.status.busy": "2022-04-19T09:06:49.323826Z", - "iopub.status.idle": 
"2022-04-19T09:06:49.662556Z", - "shell.execute_reply": "2022-04-19T09:06:49.660433Z" - }, - "papermill": { - "duration": 0.35227, - "end_time": "2022-04-19T09:06:49.664810", - "exception": false, - "start_time": "2022-04-19T09:06:49.312540", - "status": "completed" - }, - "tags": [], - "id": "00e04b82" - }, - "outputs": [], - "source": [ - "help(pension)" - ] - }, - { - "cell_type": "markdown", - "id": "24b41e4a", - "metadata": { - "papermill": { - "duration": 0.009357, - "end_time": "2022-04-19T09:06:49.683784", - "exception": false, - "start_time": "2022-04-19T09:06:49.674427", - "status": "completed" - }, - "tags": [], - "id": "24b41e4a" - }, - "source": [ - "The data consist of 9,915 observations at the household level drawn from the 1991 Survey of Income and Program Participation (SIPP). All the variables are referred to 1990. We use net financial assets (*net\\_tfa*) as the outcome variable, $Y$, in our analysis. The net financial assets are computed as the sum of IRA balances, 401(k) balances, checking accounts, saving bonds, other interest-earning accounts, other interest-earning assets, stocks, and mutual funds less non mortgage debts." - ] - }, - { - "cell_type": "markdown", - "id": "ed9d4e82", - "metadata": { - "papermill": { - "duration": 0.009242, - "end_time": "2022-04-19T09:06:49.702401", - "exception": false, - "start_time": "2022-04-19T09:06:49.693159", - "status": "completed" - }, - "tags": [], - "id": "ed9d4e82" - }, - "source": [ - "Among the $9915$ individuals, $3682$ are eligible to participate in the program. The variable *e401* indicates eligibility and *p401* indicates participation, respectively." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "63519184", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:06:49.724951Z", - "iopub.status.busy": "2022-04-19T09:06:49.723401Z", - "iopub.status.idle": "2022-04-19T09:06:50.327963Z", - "shell.execute_reply": "2022-04-19T09:06:50.326306Z" - }, - "papermill": { - "duration": 0.618528, - "end_time": "2022-04-19T09:06:50.330218", - "exception": false, - "start_time": "2022-04-19T09:06:49.711690", - "status": "completed" - }, - "tags": [], - "id": "63519184" - }, - "outputs": [], - "source": [ - "hist_e401 = ggplot(data, aes(x = e401, fill = factor(e401))) + geom_bar()\n", - "hist_e401" - ] - }, - { - "cell_type": "markdown", - "id": "823d2628", - "metadata": { - "papermill": { - "duration": 0.009686, - "end_time": "2022-04-19T09:06:50.349766", - "exception": false, - "start_time": "2022-04-19T09:06:50.340080", - "status": "completed" - }, - "tags": [], - "id": "823d2628" - }, - "source": [ - "Eligibility is highly associated with financial wealth:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d8faf9c", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:06:50.372330Z", - "iopub.status.busy": "2022-04-19T09:06:50.370847Z", - "iopub.status.idle": "2022-04-19T09:06:50.912011Z", - "shell.execute_reply": "2022-04-19T09:06:50.910336Z" - }, - "papermill": { - "duration": 0.554613, - "end_time": "2022-04-19T09:06:50.914133", - "exception": false, - "start_time": "2022-04-19T09:06:50.359520", - "status": "completed" - }, - "tags": [], - "id": "5d8faf9c" - }, - "outputs": [], - "source": [ - "dens_net_tfa = ggplot(data, aes(x = net_tfa, color = factor(e401), fill = factor(e401)) ) +\n", - " geom_density() + xlim(c(-20000, 150000)) +\n", - " facet_wrap(.~e401)\n", - "\n", - "dens_net_tfa" - ] - }, - { - "cell_type": "markdown", - "id": 
"0f4f86a7", - "metadata": { - "papermill": { - "duration": 0.010335, - "end_time": "2022-04-19T09:06:50.935024", - "exception": false, - "start_time": "2022-04-19T09:06:50.924689", - "status": "completed" - }, - "tags": [], - "id": "0f4f86a7" - }, - "source": [ - "The unconditional APE of e401 is about $19559$:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "836c6af7", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:06:50.959110Z", - "iopub.status.busy": "2022-04-19T09:06:50.957519Z", - "iopub.status.idle": "2022-04-19T09:06:50.981194Z", - "shell.execute_reply": "2022-04-19T09:06:50.979530Z" - }, - "papermill": { - "duration": 0.038096, - "end_time": "2022-04-19T09:06:50.983602", - "exception": false, - "start_time": "2022-04-19T09:06:50.945506", - "status": "completed" - }, - "tags": [], - "id": "836c6af7" - }, - "outputs": [], - "source": [ - "e1 <- data[data$e401==1,]\n", - "e0 <- data[data$e401==0,]\n", - "round(mean(e1$net_tfa)-mean(e0$net_tfa),0)" - ] - }, - { - "cell_type": "markdown", - "id": "22b09926", - "metadata": { - "papermill": { - "duration": 0.01047, - "end_time": "2022-04-19T09:06:51.004618", - "exception": false, - "start_time": "2022-04-19T09:06:50.994148", - "status": "completed" - }, - "tags": [], - "id": "22b09926" - }, - "source": [ - "Among the $3682$ individuals that are eligible, $2594$ decided to participate in the program. The unconditional APE of p401 is about $27372$:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e78aaa58", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:06:51.029140Z", - "iopub.status.busy": "2022-04-19T09:06:51.027462Z", - "iopub.status.idle": "2022-04-19T09:06:51.052361Z", - "shell.execute_reply": "2022-04-19T09:06:51.050591Z" - }, - "papermill": { - "duration": 0.039305, - "end_time": "2022-04-19T09:06:51.054616", - "exception": false, - "start_time": "2022-04-19T09:06:51.015311", - "status": "completed" - }, - "tags": [], - "id": "e78aaa58" - }, - "outputs": [], - "source": [ - "p1 <- data[data$p401==1,]\n", - "p0 <- data[data$p401==0,]\n", - "round(mean(p1$net_tfa)-mean(p0$net_tfa),0)" - ] - }, - { - "cell_type": "markdown", - "id": "e0af3c81", - "metadata": { - "papermill": { - "duration": 0.010831, - "end_time": "2022-04-19T09:06:51.076114", - "exception": false, - "start_time": "2022-04-19T09:06:51.065283", - "status": "completed" - }, - "tags": [], - "id": "e0af3c81" - }, - "source": [ - "As discussed, these estimates are biased since they do not account for saver heterogeneity and endogeneity of participation." 
- ] - }, - { - "cell_type": "code", - "source": [ - "# instrument variable\n", - "Z <- data[,'e401']\n", - "# treatment variable\n", - "D <- data[, 'p401']\n", - "# outcome variable\n", - "y <- data[,'net_tfa']" - ], - "metadata": { - "id": "A03YWrvUW0Sm" - }, - "id": "A03YWrvUW0Sm", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### We construct the engineered features for controls" - ], - "metadata": { - "id": "RVUbOMRRWwBm" - }, - "id": "RVUbOMRRWwBm" - }, - { - "cell_type": "code", - "source": [ - "# Constructing the controls\n", - "X_formula = \"~ poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown\"\n", - "X = as.data.table(model.frame(X_formula, pension))\n", - "head(X)" - ], - "metadata": { - "id": "7vt1hbdBG8cb" - }, - "id": "7vt1hbdBG8cb", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Instrumental Variables: Effect of 401k Participation on Financial Assets" - ], - "metadata": { - "id": "yzNigd7YYVuA" - }, - "id": "yzNigd7YYVuA" - }, - { - "cell_type": "markdown", - "source": [ - "## Double ML IV under Partial Linearity" - ], - "metadata": { - "id": "FI2u5KU7YWIF" - }, - "id": "FI2u5KU7YWIF" - }, - { - "cell_type": "markdown", - "source": [ - "Now, we consider estimation of average treatment effects of participation in 401k, i.e. `p401`, with the binary instrument being eligibility in 401k, i.e. `e401`. As before, $Y$ denotes the outcome `net_tfa`, and $X$ is the vector of covariates. We consider a partially linear structural equation model:\n", - "\\begin{eqnarray*}\n", - "Y & := & g_Y(\\epsilon_Y) D + f_Y(A, X, \\epsilon_Y), \\\\\n", - "D & := & f_D(Z, X, A, \\epsilon_D), \\\\\n", - "Z & := & f_Z(X, \\epsilon_Z),\\\\\n", - "A & : = & f_A(X, \\epsilon_A), \\\\\n", - "X & := & \\epsilon_X,\n", - "\\end{eqnarray*}\n", - "where $A$ is a vector of un-observed confounders.\n", - "\n", - "Under this structural equation model, the average treatment effect:\n", - "\\begin{align}\n", - "\\alpha = E[Y(1) - Y(0)]\n", - "\\end{align}\n", - "can be identified by the moment restriction:\n", - "\\begin{align}\n", - "E[(\\tilde{Y} - \\alpha \\tilde{D}) \\tilde{Z}] = 0\n", - "\\end{align}\n", - "where for any variable $V$, we denote with $\\tilde{V} = V - E[V|X]$." 
- ], - "metadata": { - "id": "uhDK6Em_YWSm" - }, - "id": "uhDK6Em_YWSm" - }, - { - "cell_type": "code", - "source": [ - "set.seed(1)\n", - "yfit.lasso.cv <- cv.glmnet(as.matrix(X), y, family=\"gaussian\", alpha=1) # family gaussian means that we'll be using square loss\n", - "Dfit.lasso.cv <- cv.glmnet(as.matrix(X), D, family=\"gaussian\", alpha=1) # family gaussian means that we'll be using square loss\n", - "Zfit.lasso.cv <- cv.glmnet(as.matrix(X), Z, family=\"gaussian\", alpha=1) # family gaussian means that we'll be using square loss\n", - "\n", - "\n", - "yhat.lasso.cv <- predict(yfit.lasso.cv, newx = as.matrix(X)) # predictions\n", - "Dhat.lasso.cv <- predict(Dfit.lasso.cv, newx = as.matrix(X)) # predictions\n", - "Zhat.lasso.cv <- predict(Zfit.lasso.cv, newx = as.matrix(X)) # predictions\n", - "\n", - "resy <- y-yhat.lasso.cv\n", - "resD <- D-Dhat.lasso.cv\n", - "resZ <- Z-Zhat.lasso.cv\n", - "\n", - "# Estimate\n", - "mean(resy * resZ) / mean(resZ*resD)" - ], - "metadata": { - "id": "bdUGB53AYf3S" - }, - "id": "bdUGB53AYf3S", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Recall if we want to do inference, we need to either use the theoretically driven penalty paramter for Lasso or perform cross-fitting." - ], - "metadata": { - "id": "Fw1ZxeBKZcRm" - }, - "id": "Fw1ZxeBKZcRm" - }, - { - "cell_type": "markdown", - "source": [ - "### DML with Non-Linear ML Models and Cross-fitting" - ], - "metadata": { - "id": "jnBOtXXuZnkz" - }, - "id": "jnBOtXXuZnkz" - }, - { - "cell_type": "code", - "source": [ - "# DML for PLIVM with D and Z as classifiers or regressors\n", - "DML2.for.PLIVM <- function(x, d, z, y, dreg, yreg, zreg, nfold=5, method=\"regression\") {\n", - " nobs <- nrow(x)\n", - " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)]\n", - " I <- split(1:nobs, foldid)\n", - " # create residualized objects to fill\n", - " ytil <- dtil <- ztil<- rep(NA, nobs)\n", - " # obtain cross-fitted residuals\n", - " cat(\"fold: \")\n", - " for(b in 1:length(I)){\n", - " if (method == \"randomforest\"){\n", - " # take a fold out\n", - " dfit <- dreg(x[-I[[b]],], d[-I[[b]]])\n", - " zfit <- zreg(x[-I[[b]],], z[-I[[b]]])\n", - " yfit <- yreg(x[-I[[b]],], y[-I[[b]]])\n", - " # predict the fold out\n", - " dhat <- predict(dfit, x[I[[b]],], type=\"prob\")[,2] # type = \"prob\" is like predict_proba in scikitlearn\n", - " zhat <- predict(zfit, x[I[[b]],], type=\"prob\")[,2]\n", - " yhat <- predict(yfit, x[I[[b]],]) # default type = \"response\" for regression for RF, type = \"vector\" for regression for Decision Trees\n", - " # record residual\n", - " dtil[I[[b]]] <- (as.numeric(d[I[[b]]])-1 - dhat) # as.numeric will turn d = as.factor(d) from 0,1 to 1,2 so subtract 1!\n", - " ztil[I[[b]]] <- (as.numeric(z[I[[b]]])-1 - zhat)\n", - " ytil[I[[b]]] <- (y[I[[b]]] - yhat)\n", - " } else if (method == \"regression\") { # works for both boosted trees and glmnet\n", - " # take a fold out\n", - " dfit <- dreg(x[-I[[b]],], d[-I[[b]]])\n", - " zfit <- zreg(x[-I[[b]],], z[-I[[b]]])\n", - " yfit <- yreg(x[-I[[b]],], y[-I[[b]]])\n", - " # predict the fold out\n", - " dhat <- predict(dfit, x[I[[b]],], type=\"response\")\n", - " zhat <- predict(zfit, x[I[[b]],], type=\"response\")\n", - " yhat <- predict(yfit, x[I[[b]],], type=\"response\")\n", - " # record residual\n", - " dtil[I[[b]]] <- (d[I[[b]]] - dhat)\n", - " ztil[I[[b]]] <- (z[I[[b]]] - zhat)\n", - " ytil[I[[b]]] <- (y[I[[b]]] - yhat)\n", - " } else if (method == \"decisiontrees\"){\n", - 
" # take a fold out\n", - " dfit <- dreg(x[-I[[b]],], as.factor(d)[-I[[b]]])\n", - " zfit <- zreg(x[-I[[b]],], as.factor(z)[-I[[b]]])\n", - " yfit <- yreg(x[-I[[b]],], y[-I[[b]]])\n", - " # predict the fold out\n", - " dhat <- predict(dfit, x[I[[b]],])[,2]\n", - " zhat <- predict(zfit, x[I[[b]],])[,2]\n", - " yhat <- predict(yfit, x[I[[b]],])\n", - " # record residual\n", - " dtil[I[[b]]] <- (d[I[[b]]] - dhat)\n", - " ztil[I[[b]]] <- (z[I[[b]]] - zhat)\n", - " ytil[I[[b]]] <- (y[I[[b]]] - yhat)\n", - " }\n", - "\n", - " cat(b,\" \")\n", - " }\n", - " ivfit = tsls(y=ytil,d=dtil, x=NULL, z=ztil, intercept=FALSE)\n", - " coef.est <- ivfit$coef #extract coefficient\n", - " se <- ivfit$se #record standard error\n", - " cat(sprintf(\"\\ncoef (se) = %g (%g)\\n\", coef.est , se))\n", - "\n", - " return( list(coef.est=coef.est, se=se, dtil=dtil, ytil=ytil, ztil=ztil) )\n", - "}" - ], - "metadata": { - "id": "K_vQlMYmz91I" - }, - "id": "K_vQlMYmz91I", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "summary <- function(point, stderr, resy, resD, resZ, name) {\n", - " data <- data.frame(\n", - " estimate = point, # point estimate\n", - " stderr = stderr, # standard error\n", - " lower = point - 1.96 * stderr, # lower end of 95% confidence interval\n", - " upper = point + 1.96 * stderr, # upper end of 95% confidence interval\n", - " `rmse y` = sqrt(mean(resy^2)), # RMSE of model that predicts outcome y\n", - " `rmse D` = sqrt(mean(resD^2)), # RMSE of model that predicts treatment D\n", - " `rmse Z` = sqrt(mean(resZ^2)), # RMSE of model that predicts treatment D\n", - " `accuracy D` = mean(abs(resD) < 0.5), # binary classification accuracy of model for D\n", - " `accuracy Z` = mean(abs(resZ) < 0.5) # binary classification accuracy of model for Z\n", - " )\n", - " rownames(data) <- name\n", - " return(data)\n", - "}" - ], - "metadata": { - "id": "puSCLNvofQxA" - }, - "id": "puSCLNvofQxA", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#### Double Lasso with Cross-Fitting" - ], - "metadata": { - "id": "1Z5vrvrlbuPj" - }, - "id": "1Z5vrvrlbuPj" - }, - { - "cell_type": "code", - "source": [ - "# DML with LassoCV\n", - "set.seed(123)\n", - "cat(sprintf(\"\\nDML with Lasso CV \\n\"))\n", - "\n", - "dreg.lasso.cv <- function(x,d){ cv.glmnet(x, d, family=\"gaussian\", alpha=1, nfolds=5)}\n", - "yreg.lasso.cv <- function(x,y){ cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", - "zreg.lasso.cv <- function(x,z){ cv.glmnet(x, z, family=\"gaussian\", alpha=1, nfolds=5)}\n", - "\n", - "DML2.results <- DML2.for.PLIVM(as.matrix(X), D, Z, y, dreg.lasso.cv, yreg.lasso.cv, zreg.lasso.cv, nfold=5, method=\"regression\")\n", - "sum.lasso.cv <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'LassoCV')\n", - "tableplr <- data.frame()\n", - "tableplr <- rbind(sum.lasso.cv)\n", - "tableplr\n", - "\n", - "ytil.lasso <- DML2.results$ytil\n", - "dtil.lasso <- DML2.results$dtil\n", - "ztil.lasso <- DML2.results$ztil\n" - ], - "metadata": { - "id": "vBJm7BkUYgsG" - }, - "id": "vBJm7BkUYgsG", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#### Using a $\\ell_2$ Penalized Logistic Regression for D and Z" - ], - "metadata": { - "id": "pyrem2YniNls" - }, - "id": "pyrem2YniNls" - }, - { - "cell_type": "code", - "source": [ - "# DML with Lasso/Ridge\n", - "set.seed(123)\n", - "cat(sprintf(\"\\nDML with Lasso/Logistic 
\\n\"))\n", - "\n", - "dreg.ridge.cv <- function(x,d){cv.glmnet(x, d, family=\"binomial\", alpha=0, nfolds=5)}\n", - "yreg.ridge.cv <- function(x,y){cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", - "zreg.ridge.cv <- function(x,z){cv.glmnet(x, z, family=\"binomial\", alpha=0, nfolds=5)}\n", - "\n", - "DML2.results <- DML2.for.PLIVM(as.matrix(X), D, Z, y, dreg.ridge.cv, yreg.ridge.cv, zreg.ridge.cv, nfold=5, method=\"regression\")\n", - "sum.lasso_ridge.cv <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'LassoCV/LogisticCV')\n", - "tableplr <- rbind(tableplr, sum.lasso_ridge.cv)\n", - "tableplr\n", - "\n", - "ytil.ridge <- DML2.results$ytil\n", - "dtil.ridge <- DML2.results$dtil\n", - "ztil.ridge <- DML2.results$ztil" - ], - "metadata": { - "id": "FM6WvQXKYgxL" - }, - "id": "FM6WvQXKYgxL", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Random Forests" - ], - "metadata": { - "id": "yfTdX3__jcwI" - }, - "id": "yfTdX3__jcwI" - }, - { - "cell_type": "code", - "source": [ - "# DML with Random Forest\n", - "set.seed(123)\n", - "cat(sprintf(\"\\nDML with Random Forest \\n\"))\n", - "\n", - "dreg.rf <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest\n", - "yreg.rf <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest\n", - "zreg.rf <- function(x,z){randomForest(x, z, ntree=1000, nodesize=10)} #ML method=Forest\n", - "\n", - "DML2.results = DML2.for.PLIVM(as.matrix(X), as.factor(D), as.factor(Z), y, dreg.rf, yreg.rf, zreg.rf, nfold=5, method=\"randomforest\")\n", - "sum.rf <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'RF')\n", - "tableplr <- rbind(tableplr, sum.rf)\n", - "tableplr\n", - "\n", - "ytil.rf <- DML2.results$ytil\n", - "dtil.rf <- DML2.results$dtil\n", - "ztil.rf <- DML2.results$ztil" - ], - "metadata": { - "id": "mMvJT6NZHW1_" - }, - "id": "mMvJT6NZHW1_", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Decision Trees" - ], - "metadata": { - "id": "4I1oVQutjeqE" - }, - "id": "4I1oVQutjeqE" - }, - { - "cell_type": "code", - "source": [ - "# DML with Decision Trees\n", - "set.seed(123)\n", - "cat(sprintf(\"\\nDML with Decision Trees \\n\"))\n", - "\n", - "dreg.tr <- function(x,d){rpart(as.formula(\"D~.\"), cbind(data.frame(D=d),x), method = \"class\", minbucket=10, cp = 0.001)}\n", - "yreg.tr <- function(x,y){rpart(as.formula(\"y~.\"), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)}\n", - "zreg.tr <- function(x,z){rpart(as.formula(\"Z~.\"), cbind(data.frame(Z=z),x), method = \"class\", minbucket=10, cp = 0.001)}\n", - "\n", - "DML2.results = DML2.for.PLIVM(X, D, Z, y, dreg.tr, yreg.tr, zreg.tr, nfold=5, method=\"decisiontrees\") # decision tree takes in X as dataframe, not matrix/array\n", - "sum.tr <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'Decision Trees')\n", - "tableplr <- rbind(tableplr, sum.tr)\n", - "tableplr\n", - "\n", - "ytil.tr <- DML2.results$ytil\n", - "dtil.tr <- DML2.results$dtil\n", - "ztil.tr <- DML2.results$ztil" - ], - "metadata": { - "id": "ayrnTPeBHW88" - }, - "id": "ayrnTPeBHW88", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Boosted Trees" - ], - "metadata": { - "id": "h7Jo_WXUjgjb" - }, - "id": "h7Jo_WXUjgjb" - }, - { - "cell_type": 
"code", - "source": [ - "# DML with Boosted Trees\n", - "set.seed(123)\n", - "cat(sprintf(\"\\nDML with Boosted Trees \\n\"))\n", - "\n", - "# NB: early stopping cannot easily be implemented with gbm\n", - "## set n.trees = best, where best <- gbm.perf(dreg.boost, plot.it = FALSE)\n", - "dreg.boost <- function(x,d){gbm(as.formula(\"D~.\"), cbind(data.frame(D=d),x), distribution= \"bernoulli\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", - "yreg.boost <- function(x,y){gbm(as.formula(\"y~.\"), cbind(data.frame(y=y),x), distribution= \"gaussian\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", - "zreg.boost <- function(x,z){gbm(as.formula(\"Z~.\"), cbind(data.frame(Z=z),x), distribution= \"bernoulli\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", - "\n", - "# passing these through regression as type=\"response\", and D and Z should not be factors!\n", - "DML2.results = DML2.for.PLIVM(X, D, Z, y, dreg.boost, yreg.boost, zreg.boost, nfold=5, method = \"regression\")\n", - "sum.boost <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'Boosted Trees')\n", - "tableplr <- rbind(tableplr, sum.boost)\n", - "tableplr\n", - "\n", - "ytil.boost <- DML2.results$ytil\n", - "dtil.boost <- DML2.results$dtil\n", - "ztil.boost <- DML2.results$ztil" - ], - "metadata": { - "id": "nzlszy9zjiSy" - }, - "id": "nzlszy9zjiSy", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Ensembles" - ], - "metadata": { - "id": "oQpoYedAc4Ic" - }, - "id": "oQpoYedAc4Ic" - }, - { - "cell_type": "markdown", - "source": [ - "Boosted trees give the best RMSE for Y, D, and Z, so the ensemble based on choosing the best performing prediction rule is identical to boosting in this case." - ], - "metadata": { - "id": "_LLsorarc8Mh" - }, - "id": "_LLsorarc8Mh" - }, - { - "cell_type": "code", - "source": [ - "# Best fit is boosted trees for D, Z, Y\n", - "\n", - "sum.best <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$dtil, name = 'Best')\n", - "tableplr <- rbind(tableplr, sum.best)\n", - "tableplr" - ], - "metadata": { - "id": "kAePILCadEVh" - }, - "id": "kAePILCadEVh", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "We'll form a model average with unconstrained least squares weights." 
- ], - "metadata": { - "id": "KaaDX4kkdIMx" - }, - "id": "KaaDX4kkdIMx" - }, - { - "cell_type": "code", - "source": [ - "# Least squares model average\n", - "\n", - "dhat.lasso <- D - dtil.lasso\n", - "dhat.ridge <- D - dtil.ridge\n", - "dhat.rf <- D - dtil.rf\n", - "dhat.tr <- D - dtil.tr\n", - "dhat.boost <- D - dtil.boost\n", - "\n", - "yhat.lasso <- y - ytil.lasso\n", - "yhat.ridge <- y - ytil.ridge\n", - "yhat.rf <- y - ytil.rf\n", - "yhat.tr <- y - ytil.tr\n", - "yhat.boost <- y - ytil.boost\n", - "\n", - "zhat.lasso <- Z - ztil.lasso\n", - "zhat.ridge <- Z - ztil.ridge\n", - "zhat.rf <- Z - ztil.rf\n", - "zhat.tr <- Z - ztil.tr\n", - "zhat.boost <- Z - ztil.boost\n", - "\n", - "ma.dtil <- lm(D~dhat.lasso+dhat.ridge+dhat.rf+dhat.tr+dhat.boost)$residuals\n", - "ma.ytil <- lm(y~yhat.lasso+yhat.ridge+yhat.rf+yhat.tr+yhat.boost)$residuals\n", - "ma.ztil <- lm(Z~zhat.lasso+zhat.ridge+zhat.rf+zhat.tr+zhat.boost)$residuals\n", - "\n", - "ivfit = tsls(y=ma.ytil,d=ma.dtil, x=NULL, z=ma.ztil, intercept=FALSE)\n", - "coef.est <- ivfit$coef #extract coefficient\n", - "se <- ivfit$se #record standard error\n", - "\n", - "sum.ma <- summary(coef.est, se, ma.ytil, ma.dtil, ma.ztil, name = 'Model Average')\n", - "tableplr <- rbind(tableplr, sum.ma)\n", - "tableplr" - ], - "metadata": { - "id": "mCsyY3iJdHm_" - }, - "id": "mCsyY3iJdHm_", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Inference Robust to Weak Identification\n", - "\n", - "Now we turn toward robustness when the instrument is weak.\n", - "\n", - "Ideally, we would do (semi) cross-fitting with AutoML in order to find good first-stage models and re-run DML with these models. Unfortunately this is not easy to do in R. In the case of semi cross-fitting, we can use R's H20 AutoML trained on the entire training set $y\\sim X$, $D \\sim X$, $Z\\sim X$ to determine the best model (eg ensemble), but H20 does not allow you to extract the best model so we can re-use that in DML.\n", - "\n", - "Thus, in the below analysis of robust inference, we choose Boosted Trees as they perform well." 
- ], - "metadata": { - "id": "8OUusM2BpZH4" - }, - "id": "8OUusM2BpZH4" - }, - { - "cell_type": "code", - "source": [ - "robust_inference <- function(point, stderr, resD, resy, resZ, grid, alpha = 0.05) {\n", - " # Inference in the partially linear IV model that is robust to weak identification.\n", - " # grid: grid of theta values to search over when trying to identify the confidence region\n", - " # alpha: confidence level\n", - "\n", - " n <- dim(X)[1]\n", - " thr <- qchisq(1 - alpha, df = 1)\n", - " accept <- c()\n", - "\n", - " for (theta in grid) {\n", - " moment <- (resy - theta * resD) * resZ\n", - " test <- n * mean(moment)^2 / var(moment)\n", - " if (test <= thr) {\n", - " accept <- c(accept, theta)\n", - " }\n", - " }\n", - "\n", - " return(accept)\n", - "}\n" - ], - "metadata": { - "id": "UeNF5j1ApYYy" - }, - "id": "UeNF5j1ApYYy", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "grid <- seq(0, 20000, length.out = 10000)\n", - "region <- robust_inference(DML2.results$coef.est, DML2.results$stderr, DML2.results$dtil, DML2.results$ytil, DML2.results$ztil, grid=grid)" - ], - "metadata": { - "id": "X21PuuUnsa25" - }, - "id": "X21PuuUnsa25", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "grid <- seq(0, 20000, length.out = 10000)\n", - "region <- robust_inference(DML2.results$coef.est, DML2.results$stderr, DML2.results$dtil, DML2.results$ytil, DML2.results$ztil, grid=grid)# Calculate min and max\n", - "min_region <- min(region)\n", - "max_region <- max(region)\n", - "\n", - "print(min_region)\n", - "print(max_region)" - ], - "metadata": { - "id": "x-ZSzMkVqI45" - }, - "id": "x-ZSzMkVqI45", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Interactive IV Model and LATE" - ], - "metadata": { - "id": "nKQGPfXWIKmh" - }, - "id": "nKQGPfXWIKmh" - }, - { - "cell_type": "markdown", - "source": [ - "Now, we consider estimation of local average treatment effects (LATE) of participation `p401`, with the binary instrument `e401`. As before, $Y$ denotes the outcome `net_tfa`, and $X$ is the vector of covariates. Here the structural equation model is:\n", - "\\begin{eqnarray}\n", - "Y &:=& f_Y (D, X, A, \\epsilon_Y) \\\\\n", - "D &:= & f_D(Z, X, A, \\epsilon_D) \\in \\{0,1\\}, \\\\\n", - "Z &:= & f_Z(X,\\epsilon_Z) \\in \\{0,1\\}, \\\\\n", - "X &:=& \\epsilon_X, \\quad A = \\epsilon_A,\n", - "\\end{eqnarray}\n", - "where $\\epsilon$'s are all exogenous and independent,\n", - "and\n", - "$$\n", - "z \\mapsto f_D(z , A, X, \\epsilon_D) \\text{ is weakly increasing (weakly monotone)}.\n", - "$$\n", - "and $A$ is a vector of unobserved confounders. Note that in our setting monotonicity is satisfied, since participation is only feasible when it is eligible. Thus we have that $D=0$ whenever $Z=0$. 
Thus it can only be that $f_D(1, A, X, \\epsilon_D) \\geq 0 = f_D(0, A, X, \\epsilon_D)$.\n", - "\n", - "In this case, we can estimate the local average treatment effect (LATE):\n", - "$$\n", - "\\alpha = E[Y(1) - Y(0) | D(1) > D(0)]\n", - "$$\n", - "This can be identified using the Neyman orthogonal moment equation:\n", - "\\begin{align}\n", - "E\\left[g(1, X) - g(0, X) + H(Z) (Y - g(Z, X)) - \\alpha \\cdot (m(1, X) - m(0, X) + H(Z) (D - m(Z, X))\\right] = 0\n", - "\\end{align}\n", - "where\n", - "\\begin{align}\n", - "g(Z,X) =~& E[Y|Z,X],\\\\\n", - "m(Z,X) =~& E[D|Z,X],\\\\\n", - "H(Z) =~& \\frac{Z}{Pr(Z=1|X)} - \\frac{1 - Z}{1 - Pr(Z=1|X)}\n", - "\\end{align}" - ], - "metadata": { - "id": "bCayhAlaINjL" - }, - "id": "bCayhAlaINjL" - }, - { - "cell_type": "code", - "source": [ - "# DML for IIVM with D and Z as classifiers or regressors\n", - "DML2.for.IIVM <- function(x, d, z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method=\"classification\", dt=0, bt=0) {\n", - " # this implements DML2 algorithm, where there moments are estimated via DML, before constructing\n", - " # the pooled estimate of theta randomly split data into folds\n", - "\n", - " ## NB This method has many if statements to accommodate the various estimators we will use.\n", - " ## Unlike Python's sklearn, all methods have idfferent default arguments in their predict functions.\n", - " ## See official R documentation for details.\n", - "\n", - " yhat0 <- rep(0, length(y))\n", - " yhat1 <- rep(0, length(y))\n", - " dhat0 <- rep(0, length(d))\n", - " dhat1 <- rep(0, length(d))\n", - " zhat <- rep(0, length(Z))\n", - "\n", - " nobs <- nrow(X)\n", - " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)]\n", - " I <- split(1:nobs, foldid)\n", - " # create residualized objects to fill\n", - " ytil <- dtil <- ztil<- rep(NA, nobs)\n", - "\n", - " # obtain cross-fitted residuals\n", - " cat(\"fold: \")\n", - " for(b in 1:length(I)){\n", - "\n", - " # define helpful variables\n", - " Xb = X[I[[b]],]\n", - " Xnotb = X[-I[[b]],]\n", - " Znotb = Z[-I[[b]]]\n", - "\n", - " # training dfs subsetted on the -I[[b]] fold\n", - " XZ0 = X[-I[[b]],][Z[-I[[b]]]==0]\n", - " yZ0 = y[-I[[b]]][Z[-I[[b]]]==0]\n", - " XZ1 = X[-I[[b]],][Z[-I[[b]]]==1]\n", - " yZ1 = y[-I[[b]]][Z[-I[[b]]]==1]\n", - " DZ0 = d[-I[[b]]][Z[-I[[b]]]==0]\n", - " DZ1 = d[-I[[b]]][Z[-I[[b]]]==1]\n", - "\n", - "\n", - " if (method == \"regression\") {\n", - " XZ0 = as.matrix(XZ0)\n", - " XZ1 = as.matrix(XZ1)\n", - " Xb = as.matrix(Xb)\n", - " Xnotb = as.matrix(Xnotb)\n", - "\n", - " # Train an outcome model on training data that received Z=0 and predict outcome on all data in the test set\n", - " yfit0 <- yreg0((XZ0), yZ0)\n", - " yhat0[I[[b]]] <- predict(yfit0, (Xb)) # default type = \"response\"\n", - "\n", - " # train an outcome model on training data that received Z=1 and predict outcome on all data in test set\n", - " yfit1 <- yreg1((XZ1), yZ1)\n", - " yhat1[I[[b]]] <- predict(yfit1, (Xb))\n", - "\n", - " # train a treatment model on training data that received Z=0 and predict treatment on all data in test set\n", - " if (mean(DZ0) > 0) { # it could be that D=0, whenever Z=0 deterministically\n", - " dreg0_ <- dreg0\n", - " dfit0 <- dreg0_((XZ0), DZ0)\n", - " dhat0[I[[b]]] <- predict(dfit0, (Xb), type=\"response\") # default type = \"response\", but for family binomial it's logg odds\n", - " }\n", - " # train a treamtent model on training data that received Z=1 and predict treatment on all data in test set\n", - " if (mean(DZ1) < 1) { 
# it could be that D=1, whenever Z=1 deterministically\n", - " dreg1_ <- dreg1\n", - " dfit1 <- dreg1_((XZ1), DZ1)\n", - " dhat1[I[[b]]] <- predict(dfit1, (Xb), type=\"response\")\n", - " } else {\n", - " dhat1[I[[b]]] <- 1\n", - " }\n", - "\n", - " } else if (method == \"randomforest\") {\n", - " DZ0factor = as.factor(D)[-I[[b]]][Z[-I[[b]]]==0]\n", - " DZ1factor = as.factor(D)[-I[[b]]][Z[-I[[b]]]==1]\n", - " Znotb = as.factor(Znotb)\n", - "\n", - " yfit0 <- yreg0((XZ0), yZ0)\n", - " yhat0[I[[b]]] <- predict(yfit0, (Xb), type=\"response\")\n", - " yfit1 <- yreg1((XZ1), yZ1)\n", - " yhat1[I[[b]]] <- predict(yfit1, (Xb), type=\"response\")\n", - "\n", - " if (mean(DZ0) > 0) {\n", - " dreg0_ <- dreg0\n", - " dfit0 <- dreg0_((XZ0), DZ0factor)\n", - " dhat0[I[[b]]] <- predict(dfit0, (Xb), type=\"prob\")[,2] # get second column because type = \"prob\"\n", - " }\n", - " if (mean(DZ1) < 1) {\n", - " dreg1_ <- dreg1\n", - " dfit1 <- dreg1_((XZ1), DZ1factor)\n", - " dhat1[I[[b]]] <- predict(dfit1, (Xb), type=\"prob\")[,2]\n", - " } else {\n", - " dhat1[I[[b]]] <- 1\n", - " }\n", - "\n", - " } else if (method == \"decisiontrees\") {\n", - " XZ0 = as.data.frame(XZ0)\n", - " XZ1 = as.data.frame(XZ1)\n", - " Xb = as.data.frame(Xb)\n", - " Xnotb = as.data.frame(Xnotb)\n", - "\n", - " yfit0 <- yreg0((XZ0), yZ0)\n", - " yhat0[I[[b]]] <- predict(yfit0, (Xb)) # default type = \"response\" for decision trees for continuous response\n", - "\n", - " yfit1 <- yreg1((XZ1), yZ1)\n", - " yhat1[I[[b]]] <- predict(yfit1, (Xb))\n", - "\n", - " if (mean(DZ0) > 0) {\n", - " dreg0_ <- dreg0\n", - " dfit0 <- dreg0_((XZ0), as.factor(DZ0))\n", - " dhat0[I[[b]]] <- predict(dfit0, (Xb))[,2] # for decision trees, default = \"prob\" for decision trees with factor responses\n", - " }\n", - "\n", - " if (mean(DZ1) < 1) {\n", - " dreg1_ <- dreg1\n", - " dfit1 <- dreg1_((XZ1), as.factor(DZ1))\n", - " dhat1[I[[b]]] <- predict(dfit1, (Xb))[,2]\n", - " } else {\n", - " dhat1[I[[b]]] <- 1\n", - " }\n", - "\n", - " } else if (method == \"boostedtrees\") {\n", - " XZ0 = as.data.frame(XZ0)\n", - " XZ1 = as.data.frame(XZ1)\n", - " Xb = as.data.frame(Xb)\n", - " Xnotb = as.data.frame(Xnotb)\n", - "\n", - " yfit0 <- yreg0((XZ0), yZ0)\n", - " yhat0[I[[b]]] <- predict(yfit0, (Xb)) # default type = \"response\" for boosted trees\n", - " yfit1 <- yreg1((XZ1), yZ1)\n", - " yhat1[I[[b]]] <- predict(yfit1, (Xb))\n", - "\n", - " if (mean(DZ0) > 0) {\n", - " dreg0_ <- dreg0\n", - " dfit0 <- dreg0_((XZ0), DZ0)\n", - " dhat0[I[[b]]] <- predict(dfit0, (Xb), type = \"response\") # default for boosted trees is log odds.\n", - " }\n", - " if (mean(DZ1) < 1) {\n", - " dreg1_ <- dreg1\n", - " dfit1 <- dreg1_((XZ1), DZ1)\n", - " dhat1[I[[b]]] <- predict(dfit1, (Xb), type = \"response\")\n", - " } else {\n", - " dhat1[I[[b]]] <- 1\n", - " }\n", - "\n", - " }\n", - "\n", - " # propensity scores:\n", - " if (method == \"regression\"){\n", - " zfit_b <- zreg((Xnotb), Znotb)\n", - " zhat_b <- predict(zfit_b, (Xb), type=\"response\")\n", - " } else if (method == \"randomforest\"){\n", - " zfit_b <- zreg((Xnotb), Znotb)\n", - " zhat_b <- predict(zfit_b, (Xb), type = \"prob\")[,2]\n", - " } else if (method == \"decisiontrees\"){\n", - " zfit_b <- zreg((Xnotb), as.factor(Znotb))\n", - " zhat_b <- predict(zfit_b, (Xb)) # default is prob, so get second column\n", - " zhat_b = zhat_b[,2]\n", - " } else if (method == \"boostedtrees\"){\n", - " zfit_b <- zreg((Xnotb), Znotb)\n", - " zhat_b <- predict(zfit_b, (Xb), type = \"response\")\n", - " }\n", - " zhat_b <- 
pmax(pmin(zhat_b, 1 - trimming), trimming) # trimming so scores are between [trimming, (1-trimming)]\n", - " zhat[I[[b]]] <- zhat_b\n", - "\n", - " cat(b,\" \")\n", - " }\n", - "\n", - "\n", - " # Prediction of treatment and outcome for observed instrument\n", - " yhat <- yhat0 * (1 - Z) + yhat1 * Z\n", - " dhat <- dhat0 * (1 - Z) + dhat1 * Z\n", - "\n", - " # residuals\n", - " ytil <- y-yhat\n", - " dtil <- D-dhat\n", - " ztil <- Z-zhat\n", - "\n", - " # doubly robust quantity for every sample\n", - " HZ <- Z / zhat - (1 - Z) / (1 - zhat)\n", - " drZ <- yhat1 - yhat0 + (y - yhat) * HZ\n", - " drD <- dhat1 - dhat0 + (D - dhat) * HZ\n", - " coef.est <- mean(drZ) / mean(drD)\n", - " cat(\"point\", coef.est)\n", - " psi <- drZ - coef.est * drD\n", - " Jhat <- mean(drD)\n", - " variance <- mean(psi^2) / Jhat^2\n", - " se <- sqrt(variance / nrow(X))\n", - " cat(\"se\", se)\n", - "\n", - " return(list(coef.est = coef.est, se = se, yhat = yhat, dhat = dhat, zhat = zhat, ytil = ytil, dtil = dtil, ztil = ztil, drZ = drZ, drD = drD, yhat0 = yhat0, yhat1 = yhat1, dhat0 = dhat0, dhat1 = dhat1))\n", - "}" - ], - "metadata": { - "id": "rQYifUnFIt5z" - }, - "id": "rQYifUnFIt5z", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "summary <- function(coef.est, se, yhat, dhat, zhat, ytil, dtil, ztil, drZ, drD, X, Z, D, y, name) {\n", - " summary_data <- data.frame(estimate = coef.est, # point estimate\n", - " se = se, # standard error\n", - " lower = coef.est - 1.96 * se, # lower end of 95% confidence interval\n", - " upper = coef.est + 1.96 * se, # upper end of 95% confidence interval\n", - " rmse_y = sqrt(mean(ytil^2)), # res of model that predicts outcome y\n", - " rmse_D = sqrt(mean(dtil^2)), # res of model that predicts treatment D\n", - " rmse_Z = sqrt(mean(ztil^2)), # res of model that predicts instrument Z\n", - " accuracy_D = mean(abs(dtil) < 0.5), # binary classification accuracy of model for D\n", - " accuracy_Z = mean(abs(ztil) < 0.5) # binary classification accuracy of model for Z\n", - " )\n", - " row.names(summary_data) <- name\n", - " return(summary_data)\n", - "}\n" - ], - "metadata": { - "id": "iArB2WQHBXuV" - }, - "id": "iArB2WQHBXuV", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# DML with Lasso/Ridge\n", - "set.seed(123)\n", - "cat(sprintf(\"\\nDML with Lasso/Logistic \\n\"))\n", - "# DML with Lasso/Ridge\n", - "dreg0 <- function(x,d){cv.glmnet(x, d, family=\"binomial\", alpha=0, nfolds=5)}\n", - "dreg1 <- function(x,d){cv.glmnet(x, d, family=\"binomial\", alpha=0, nfolds=5)}\n", - "yreg0 <- function(x,y){cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", - "yreg1 <- function(x,y){cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", - "zreg <- function(x,z){cv.glmnet(x, z, family=\"binomial\", alpha=0, nfolds=5)}\n", - "\n", - "DML2.results <- DML2.for.IIVM(as.matrix(X), D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method=\"regression\")\n", - "sum.lasso_ridge.cv <-summary(DML2.results$coef.est, DML2.results$se, DML2.results$yhat, DML2.results$dhat, DML2.results$zhat, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, DML2.results$drZ, DML2.results$drD, name = 'LassoCV/LogisticCV')\n", - "table <- data.frame()\n", - "table <- rbind(table, sum.lasso_ridge.cv)\n", - "table\n", - "\n", - "yhat.lasso = DML2.results$yhat\n", - "dhat.lasso = DML2.results$dhat\n", - "yhat0.lasso = DML2.results$yhat0\n", - "yhat1.lasso = DML2.results$yhat1\n", - "dhat0.lasso = 
DML2.results$dhat0\n", - "dhat1.lasso = DML2.results$dhat1\n", - "zhat.lasso = DML2.results$zhat" - ], - "metadata": { - "id": "Tj-8FFF3BXxV" - }, - "id": "Tj-8FFF3BXxV", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# DML with Random Forest\n", - "set.seed(123)\n", - "cat(sprintf(\"\\nDML with Random Forest \\n\"))\n", - "\n", - "dreg0 <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest\n", - "dreg1 <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest\n", - "yreg0 <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest\n", - "yreg1 <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest\n", - "zreg <- function(x,z){randomForest(x, z, ntree=1000, nodesize=10)} #ML method=Forest\n", - "\n", - "DML2.results <- DML2.for.IIVM(X,D,Z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method=\"randomforest\")\n", - "sum.rf <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$yhat, DML2.results$dhat, DML2.results$zhat, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, DML2.results$drZ, DML2.results$drD, name = 'RF')\n", - "table <- rbind(table, sum.rf)\n", - "table\n", - "\n", - "yhat.rf = DML2.results$yhat\n", - "dhat.rf = DML2.results$dhat\n", - "yhat0.rf = DML2.results$yhat0\n", - "yhat1.rf = DML2.results$yhat1\n", - "dhat0.rf = DML2.results$dhat0\n", - "dhat1.rf = DML2.results$dhat1\n", - "zhat.rf = DML2.results$zhat" - ], - "metadata": { - "id": "sXjbvMbEkYJd" - }, - "id": "sXjbvMbEkYJd", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# DML with Decision Trees\n", - "set.seed(123)\n", - "cat(sprintf(\"\\nDML with Decision Trees \\n\"))\n", - "\n", - "dreg0 <- function(x,d){rpart(as.formula(\"D~.\"), cbind(data.frame(D=d),x), method = \"class\", minbucket=10, cp = 0.001)}\n", - "dreg1 <- function(x,d){rpart(as.formula(\"D~.\"), cbind(data.frame(D=d),x), method = \"class\", minbucket=10, cp = 0.001)}\n", - "yreg0 <- function(x,y){rpart(as.formula(\"y~.\"), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)}\n", - "yreg1 <- function(x,y){rpart(as.formula(\"y~.\"), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)}\n", - "zreg <- function(x,z){rpart(as.formula(\"Z~.\"), cbind(data.frame(Z=z),x), method = \"class\", minbucket=10, cp = 0.001)}\n", - "\n", - "DML2.results <- DML2.for.IIVM(X, D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method=\"decisiontrees\")\n", - "sum.tr <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$yhat, DML2.results$dhat, DML2.results$zhat, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, DML2.results$drZ, DML2.results$drD, name = 'Decision Trees')\n", - "table <- rbind(table, sum.tr)\n", - "table\n", - "\n", - "yhat.tr = DML2.results$yhat\n", - "dhat.tr = DML2.results$dhat\n", - "yhat0.tr = DML2.results$yhat0\n", - "yhat1.tr = DML2.results$yhat1\n", - "dhat0.tr = DML2.results$dhat0\n", - "dhat1.tr = DML2.results$dhat1\n", - "zhat.tr = DML2.results$zhat" - ], - "metadata": { - "id": "ZZRXpY8YkYNN" - }, - "id": "ZZRXpY8YkYNN", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# DML with Boosted Trees\n", - "set.seed(123)\n", - "cat(sprintf(\"\\nDML with Boosted Trees \\n\"))\n", - "\n", - "# NB: early stopping cannot easily be implemented with gbm\n", - "## set n.trees = best, where best <- gbm.perf(dreg.boost, plot.it = FALSE)\n", - "dreg0 <- 
function(x,d){gbm(as.formula(\"D~.\"), cbind(data.frame(D=d),x), distribution= \"bernoulli\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", - "dreg1 <- function(x,d){gbm(as.formula(\"D~.\"), cbind(data.frame(D=d),x), distribution= \"bernoulli\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", - "yreg0 <- function(x,y){gbm(as.formula(\"y~.\"), cbind(data.frame(y=y),x), distribution= \"gaussian\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", - "yreg1 <- function(x,y){gbm(as.formula(\"y~.\"), cbind(data.frame(y=y),x), distribution= \"gaussian\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", - "zreg <- function(x,z){gbm(as.formula(\"Z~.\"), cbind(data.frame(Z=z),x), distribution= \"bernoulli\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", - "\n", - "# passing these through regression as type=\"response\", and D and Z should not be factors!\n", - "DML2.results <- DML2.for.IIVM(X, D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method=\"boostedtrees\")\n", - "sum.boost <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$yhat, DML2.results$dhat, DML2.results$zhat, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, DML2.results$drZ, DML2.results$drD, name = 'Boosted Trees')\n", - "table <- rbind(table, sum.boost)\n", - "table\n", - "\n", - "yhat.boost = DML2.results$yhat\n", - "dhat.boost = DML2.results$dhat\n", - "yhat0.boost = DML2.results$yhat0\n", - "yhat1.boost = DML2.results$yhat1\n", - "dhat0.boost = DML2.results$dhat0\n", - "dhat1.boost = DML2.results$dhat1\n", - "zhat.boost = DML2.results$zhat" - ], - "metadata": { - "id": "RYqykjPskYQJ" - }, - "id": "RYqykjPskYQJ", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Ensembles" - ], - "metadata": { - "id": "29vcyCYsktQ3" - }, - "id": "29vcyCYsktQ3" - }, - { - "cell_type": "markdown", - "source": [ - "Boosted trees give the best RMSE for D and Z and random forests give the best RMSE for Y." - ], - "metadata": { - "id": "vuyfd9UJkw9G" - }, - "id": "vuyfd9UJkw9G" - }, - { - "cell_type": "code", - "source": [ - "# Best fit is boosted trees for D, Z and random forests for Y\n", - "\n", - "best.yhat0 <- yhat0.rf\n", - "best.yhat1 <- yhat1.rf\n", - "best.yhat <- yhat.rf\n", - "\n", - "best.dhat0 <- dhat0.boost\n", - "best.dhat1 <- dhat1.boost\n", - "best.dhat <- dhat.boost\n", - "\n", - "best.zhat <- zhat.boost\n", - "\n", - "ytil.best <- y - best.yhat\n", - "dtil.best <- D - best.dhat\n", - "ztil.best <- Z - best.zhat\n", - "\n", - "# doubly robust quantity for every sample\n", - "HZ <- Z / best.zhat - (1 - Z) / (1 - best.zhat)\n", - "drZ <- best.yhat1 - best.yhat0 + (y - best.yhat) * HZ\n", - "drD <- best.dhat1 - best.dhat0 + (D - best.dhat) * HZ\n", - "coef.est <- mean(drZ) / mean(drD)\n", - "psi <- drZ - coef.est * drD\n", - "Jhat <- mean(drD)\n", - "variance <- mean(psi^2) / Jhat^2\n", - "se <- sqrt(variance / nrow(X))\n", - "\n", - "sum.best <- summary(coef.est, se, best.yhat, best.dhat, best.zhat, ytil.best, dtil.best, ztil.best, drZ, drD, name = 'Best')\n", - "table <- rbind(table, sum.best)\n", - "table" - ], - "metadata": { - "id": "Y9_T5SMUk3Rd" - }, - "id": "Y9_T5SMUk3Rd", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "We'll form a model average with unconstrained least squares weights." 
- ], - "metadata": { - "id": "RyRS9zNUlMCF" - }, - "id": "RyRS9zNUlMCF" - }, - { - "cell_type": "code", - "source": [ - "# Least squares model average\n", - "ma.dcoef <- lm(D~dhat.lasso+dhat.rf+dhat.tr+dhat.boost-1)$coef\n", - "ma.ycoef <- lm(y~yhat.lasso+yhat.rf+yhat.tr+yhat.boost-1)$coef\n", - "ma.zcoef <- lm(Z~zhat.lasso+zhat.rf+zhat.tr+zhat.boost-1)$coef\n", - "\n", - "ma.yhat0 <- cbind(yhat0.lasso,yhat0.rf,yhat0.tr,yhat0.boost)%*%as.matrix(ma.ycoef)\n", - "ma.yhat1 <- cbind(yhat1.lasso,yhat1.rf,yhat1.tr,yhat1.boost)%*%as.matrix(ma.ycoef)\n", - "ma.dhat0 <- cbind(dhat0.lasso,dhat0.rf,dhat0.tr,dhat0.boost)%*%as.matrix(ma.dcoef)\n", - "ma.dhat1 <- cbind(dhat1.lasso,dhat1.rf,dhat1.tr,dhat1.boost)%*%as.matrix(ma.dcoef)\n", - "ma.zhat <- cbind(zhat.lasso,zhat.rf,zhat.tr,zhat.boost)%*%as.matrix(ma.zcoef)\n", - "\n", - "# Prediction of treatment and outcome for observed instrument\n", - "ma.yhat <- ma.yhat0 * (1 - Z) + ma.yhat1 * Z\n", - "ma.dhat <- ma.dhat0 * (1 - Z) + ma.dhat1 * Z\n", - "\n", - "# residuals\n", - "ma.ytil <- y-ma.yhat\n", - "ma.dtil <- D-ma.dhat\n", - "ma.ztil <- Z-ma.zhat\n", - "\n", - "# doubly robust quantity for every sample\n", - "HZ <- Z / ma.zhat - (1 - Z) / (1 - ma.zhat)\n", - "drZ <- ma.yhat1 - ma.yhat0 + (y - ma.yhat) * HZ\n", - "drD <- ma.dhat1 - ma.dhat0 + (D - ma.dhat) * HZ\n", - "coef.est <- mean(drZ) / mean(drD)\n", - "psi <- drZ - coef.est * drD\n", - "Jhat <- mean(drD)\n", - "variance <- mean(psi^2) / Jhat^2\n", - "se <- sqrt(variance / nrow(X))\n", - "\n", - "sum.ma <- summary(coef.est, se, ma.yhat, ma.dhat, ma.zhat, ma.ytil, ma.dtil, ma.ztil, drZ, drD, name = 'Model Average')\n", - "table <- rbind(table, sum.ma)\n", - "table" - ], - "metadata": { - "id": "1H4sCbO2lLpJ" - }, - "id": "1H4sCbO2lLpJ", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Comparing with the PLR model" - ], - "metadata": { - "id": "UflbjTEG5SXV" - }, - "id": "UflbjTEG5SXV" - }, - { - "cell_type": "code", - "source": [ - "tableplr" - ], - "metadata": { - "id": "CIS-58oi4sa1" - }, - "id": "CIS-58oi4sa1", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "We find that the PLR model overestimates the effect by around 1k; though both sets of results have overlapping confidence intervals." - ], - "metadata": { - "id": "M4Zi0FPH5VZG" - }, - "id": "M4Zi0FPH5VZG" - }, - { - "cell_type": "markdown", - "source": [ - "\n", - "Again as before, ideally we would do (semi) cross-fitting with AutoML in order to find good first-stage models and re-run DML with these models. Unfortunately this is not easy to do in R.\n", - "\n", - "As before, in the below analysis of robust inference, we choose Boosted Trees as they perform well in RMSE and accuracy on first-stage models." 
- ], - "metadata": { - "id": "VrBkj_pc5qgm" - }, - "id": "VrBkj_pc5qgm" - }, - { - "cell_type": "code", - "source": [ - "iivm_robust_inference <- function(point, stderr, yhat, Dhat, Zhat, resy, resD, resZ, drZ, drD, X, Z, D, y, grid, alpha = 0.05) {\n", - " # Inference in the partially linear IV model that is robust to weak identification.\n", - " # grid: grid of theta values to search over when trying to identify the confidence region\n", - " # alpha: confidence level\n", - "\n", - " n <- dim(X)[1]\n", - " thr <- qchisq(1 - alpha, df = 1)\n", - " accept <- c()\n", - "\n", - " for (theta in grid) {\n", - " moment <- drZ - theta * drD\n", - " test <- n * mean(moment)^2 / var(moment)\n", - " if (test <= thr) {\n", - " accept <- c(accept, theta)\n", - " }\n", - " }\n", - "\n", - " return(accept)\n", - "}\n" - ], - "metadata": { - "id": "bj67nsgcCDoS" - }, - "id": "bj67nsgcCDoS", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "grid <- seq(0, 20000, length.out = 10000)\n", - "region <- iivm_robust_inference(point = DML2.results$coef.est, stderr = DML2.results$se, yhat = DML2.results$yhat, Dhat = DML2.results$dhat, Zhat = DML2.results$zhat, resy = DML2.results$ytil, resD = DML2.results$dtil, resZ = DML2.results$ztil, drZ = DML2.results$drZ, drD = DML2.results$drD, X=X, Z=Z, D=D, y=y, grid=grid)\n", - "\n", - "# Calculate min and max\n", - "min_region <- min(region)\n", - "max_region <- max(region)\n", - "\n", - "print(min_region)\n", - "print(max_region)" - ], - "metadata": { - "id": "KqgPk1Jm4sdo" - }, - "id": "KqgPk1Jm4sdo", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "We find again that the robust inference confidence region is almost identical to the normal based inference. We are most probably in the strong instrument regime. We can check the t-statistic for the effect of the instrument on the treatment to verify this." - ], - "metadata": { - "id": "akCGDMZJCN3h" - }, - "id": "akCGDMZJCN3h" - }, - { - "cell_type": "markdown", - "id": "01de9f24", - "metadata": { - "papermill": { - "duration": 0.010725, - "end_time": "2022-04-19T09:06:51.098483", - "exception": false, - "start_time": "2022-04-19T09:06:51.087758", - "status": "completed" - }, - "tags": [], - "id": "01de9f24" - }, - "source": [ - "# DoubleML package\n", - "\n", - "There exist nice packages out there that can help us do our estimation with the simple call of a function. Such packages include `EconML` (Python) and `DoubleML` (Python and R).\n", - "\n", - "We run through IIVM using `DoubleML` below to illustrate. The `DoubleML` package internally builds on `mlr3`. We use the meta package `mlr3` to generate predictions with machine learning methods. A comprehensive introduction and description of the `mlr3` package is provided in the [mlr3book](https://mlr3book.mlr-org.com/). A list of all learners that you can use in `mlr3` can be found [here](https://mlr3extralearners.mlr-org.com/articles/learners/list_learners.html). The entry in the columns *mlr3 Package* and *Packages* indicate which packages must be installed/loaded in your R session.\n", - "\n", - "You find additional information about `DoubleML` on the package on the package website https://docs.doubleml.org/ and the R documentation page https://docs.doubleml.org/r/stable/." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2846a36a", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:06:51.144230Z", - "iopub.status.busy": "2022-04-19T09:06:51.142682Z", - "iopub.status.idle": "2022-04-19T09:07:11.366508Z", - "shell.execute_reply": "2022-04-19T09:07:11.364676Z" - }, - "papermill": { - "duration": 20.239271, - "end_time": "2022-04-19T09:07:11.369618", - "exception": false, - "start_time": "2022-04-19T09:06:51.130347", - "status": "completed" - }, - "tags": [], - "id": "2846a36a" - }, - "outputs": [], - "source": [ - "install.packages(\"DoubleML\")\n", - "install.packages(\"mlr3learners\")\n", - "install.packages(\"mlr3\")\n", - "install.packages(\"data.table\")\n", - "install.packages(\"ranger\")\n", - "\n", - "library(DoubleML)\n", - "library(mlr3learners)\n", - "library(mlr3)\n", - "library(data.table)\n", - "library(ranger)" - ] - }, - { - "cell_type": "markdown", - "id": "2259ae1c", - "metadata": { - "papermill": { - "duration": 0.015455, - "end_time": "2022-04-19T09:12:00.920079", - "exception": false, - "start_time": "2022-04-19T09:12:00.904624", - "status": "completed" - }, - "tags": [], - "id": "2259ae1c" - }, - "source": [ - "## Local Average Treatment Effects of 401(k) Participation on Net Financial Assets" - ] - }, - { - "cell_type": "markdown", - "id": "9c27e413", - "metadata": { - "papermill": { - "duration": 0.015158, - "end_time": "2022-04-19T09:12:00.950542", - "exception": false, - "start_time": "2022-04-19T09:12:00.935384", - "status": "completed" - }, - "tags": [], - "id": "9c27e413" - }, - "source": [ - "## Interactive IV Model (IIVM)" - ] - }, - { - "cell_type": "markdown", - "id": "4fa23c70", - "metadata": { - "papermill": { - "duration": 0.015304, - "end_time": "2022-04-19T09:12:00.981285", - "exception": false, - "start_time": "2022-04-19T09:12:00.965981", - "status": "completed" - }, - "tags": [], - "id": "4fa23c70" - }, - "source": [ - "Now, we consider estimation of local average treatment effects (LATE) of participation with the binary instrument `e401`. As before, $Y$ denotes the outcome `net_tfa`, and $X$ is the vector of covariates. 
Here the structural equation model is:\n", - "\n", - "\\begin{eqnarray}\n", - "& Y = g_0(Z,X) + U, &\\quad E[U\\mid Z,X] = 0,\\\\\n", - "& D = r_0(Z,X) + V, &\\quad E[V\\mid Z, X] = 0,\\\\\n", - "& Z = m_0(X) + \\zeta, &\\quad E[\\zeta \\mid X] = 0.\n", - "\\end{eqnarray}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cb223b75", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:12:01.015264Z", - "iopub.status.busy": "2022-04-19T09:12:01.013761Z", - "iopub.status.idle": "2022-04-19T09:12:01.062714Z", - "shell.execute_reply": "2022-04-19T09:12:01.060993Z" - }, - "papermill": { - "duration": 0.06823, - "end_time": "2022-04-19T09:12:01.064957", - "exception": false, - "start_time": "2022-04-19T09:12:00.996727", - "status": "completed" - }, - "tags": [], - "id": "cb223b75" - }, - "outputs": [], - "source": [ - "# Constructing the data (as DoubleMLData)\n", - "formula_flex2 = \"net_tfa ~ p401+ e401 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown\"\n", - "model_flex2 = as.data.table(model.frame(formula_flex2, data))\n", - "x_cols = colnames(model_flex2)[-c(1,2,3)]\n", - "data_IV = DoubleMLData$new(model_flex2, y_col = \"net_tfa\", d_cols = \"p401\", z_cols =\"e401\",x_cols=x_cols)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e652ffad", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:12:01.099480Z", - "iopub.status.busy": "2022-04-19T09:12:01.097954Z", - "iopub.status.idle": "2022-04-19T09:12:22.012186Z", - "shell.execute_reply": "2022-04-19T09:12:22.010302Z" - }, - "papermill": { - "duration": 20.934595, - "end_time": "2022-04-19T09:12:22.014866", - "exception": false, - "start_time": "2022-04-19T09:12:01.080271", - "status": "completed" - }, - "tags": [], - "id": "e652ffad" - }, - "outputs": [], - "source": [ - "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", - "lasso <- lrn(\"regr.cv_glmnet\",nfolds = 5, s = \"lambda.min\")\n", - "lasso_class <- lrn(\"classif.cv_glmnet\", nfolds = 5, s = \"lambda.min\")\n", - "dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = lasso,\n", - " ml_m = lasso_class, ml_r = lasso_class,n_folds=5, subgroups = list(always_takers = FALSE,\n", - " never_takers = TRUE))\n", - "dml_MLIIVM$fit(store_predictions=TRUE)\n", - "dml_MLIIVM$summary()\n", - "lasso_MLIIVM <- dml_MLIIVM$coef\n", - "lasso_std_MLIIVM <- dml_MLIIVM$se" - ] - }, - { - "cell_type": "markdown", - "id": "63103667", - "metadata": { - "papermill": { - "duration": 0.015382, - "end_time": "2022-04-19T09:12:22.045989", - "exception": false, - "start_time": "2022-04-19T09:12:22.030607", - "status": "completed" - }, - "tags": [], - "id": "63103667" - }, - "source": [ - "The confidence interval for the local average treatment effect of participation is given by" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "322855c4", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:12:22.080639Z", - "iopub.status.busy": "2022-04-19T09:12:22.079018Z", - "iopub.status.idle": "2022-04-19T09:12:22.101731Z", - "shell.execute_reply": "2022-04-19T09:12:22.100047Z" - }, - "papermill": { - "duration": 0.042067, - "end_time": "2022-04-19T09:12:22.103953", - "exception": false, - "start_time": "2022-04-19T09:12:22.061886", - "status": "completed" - }, - "tags": [], - "id": "322855c4" - }, - "outputs": [], - "source": [ - "dml_MLIIVM$confint(level = 0.95)" - ] - }, - { - "cell_type": "markdown", 
- "id": "2965410d", - "metadata": { - "papermill": { - "duration": 0.015374, - "end_time": "2022-04-19T09:12:22.134875", - "exception": false, - "start_time": "2022-04-19T09:12:22.119501", - "status": "completed" - }, - "tags": [], - "id": "2965410d" - }, - "source": [ - "Here we can also check the accuracy of the model:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1476fd27", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:12:22.168618Z", - "iopub.status.busy": "2022-04-19T09:12:22.167046Z", - "iopub.status.idle": "2022-04-19T09:12:22.203398Z", - "shell.execute_reply": "2022-04-19T09:12:22.201502Z" - }, - "papermill": { - "duration": 0.056054, - "end_time": "2022-04-19T09:12:22.206157", - "exception": false, - "start_time": "2022-04-19T09:12:22.150103", - "status": "completed" - }, - "tags": [], - "id": "1476fd27" - }, - "outputs": [], - "source": [ - "# variables\n", - "y <- as.matrix(pension$net_tfa) # true observations\n", - "d <- as.matrix(pension$p401)\n", - "z <- as.matrix(pension$e401)\n", - "\n", - "# predictions\n", - "dml_MLIIVM$params_names()\n", - "g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(z=0, X)\n", - "g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(z=1, X)\n", - "g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0\n", - "r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(z=0, X)\n", - "r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(z=1, X)\n", - "r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0\n", - "m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "444c53f4", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:12:22.241038Z", - "iopub.status.busy": "2022-04-19T09:12:22.239185Z", - "iopub.status.idle": "2022-04-19T09:12:22.275804Z", - "shell.execute_reply": "2022-04-19T09:12:22.273813Z" - }, - "papermill": { - "duration": 0.056945, - "end_time": "2022-04-19T09:12:22.278593", - "exception": false, - "start_time": "2022-04-19T09:12:22.221648", - "status": "completed" - }, - "tags": [], - "id": "444c53f4" - }, - "outputs": [], - "source": [ - "# cross-fitted RMSE: outcome\n", - "lasso_y_MLIIVM <- sqrt(mean((y-g_hat)^2))\n", - "lasso_y_MLIIVM\n", - "\n", - "# cross-fitted RMSE: treatment\n", - "lasso_d_MLIIVM <- sqrt(mean((d-r_hat)^2))\n", - "lasso_d_MLIIVM\n", - "\n", - "# cross-fitted RMSE: instrument\n", - "lasso_z_MLIIVM <- sqrt(mean((z-m_hat)^2))\n", - "lasso_z_MLIIVM\n" - ] - }, - { - "cell_type": "markdown", - "id": "a7461966", - "metadata": { - "papermill": { - "duration": 0.016468, - "end_time": "2022-04-19T09:12:22.311250", - "exception": false, - "start_time": "2022-04-19T09:12:22.294782", - "status": "completed" - }, - "tags": [], - "id": "a7461966" - }, - "source": [ - "Again, we repeat the procedure for the other machine learning methods:" - ] - }, - { - "cell_type": "code", - "source": [ - "# needed to run boosting\n", - "remotes::install_github(\"mlr-org/mlr3extralearners\")\n", - "install.packages(\"mlr3extralearners\")\n", - "install.packages(\"mboost\")\n", - "library(mlr3extralearners)\n", - "library(mboost)" - ], - "metadata": { - "id": "59YzwIcpEnyV" - }, - "id": "59YzwIcpEnyV", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Forest\n", - "randomForest <- lrn(\"regr.ranger\")\n", - "randomForest_class <- lrn(\"classif.ranger\")\n", - "\n", - "# 
Trees\n", - "trees <- lrn(\"regr.rpart\")\n", - "trees_class <- lrn(\"classif.rpart\")\n", - "\n", - "# Boosting\n", - "boost <- lrn(\"regr.glmboost\")\n", - "boost_class <- lrn(\"classif.glmboost\")" - ], - "metadata": { - "id": "Ec0g3ch3EjAl" - }, - "id": "Ec0g3ch3EjAl", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3935dfc5", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:12:22.346966Z", - "iopub.status.busy": "2022-04-19T09:12:22.345453Z", - "iopub.status.idle": "2022-04-19T09:13:31.739923Z", - "shell.execute_reply": "2022-04-19T09:13:31.738086Z" - }, - "papermill": { - "duration": 69.414354, - "end_time": "2022-04-19T09:13:31.742249", - "exception": false, - "start_time": "2022-04-19T09:12:22.327895", - "status": "completed" - }, - "tags": [], - "id": "3935dfc5" - }, - "outputs": [], - "source": [ - "### random forest ###\n", - "\n", - "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", - "dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = randomForest,\n", - " ml_m = randomForest_class, ml_r = randomForest_class,n_folds=3, subgroups = list(always_takers = FALSE,\n", - " never_takers = TRUE))\n", - "dml_MLIIVM$fit(store_predictions=TRUE)\n", - "dml_MLIIVM$summary()\n", - "forest_MLIIVM <- dml_MLIIVM$coef\n", - "forest_std_MLIIVM <- dml_MLIIVM$se\n", - "\n", - "# predictions\n", - "g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(Z=0, X)\n", - "g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(Z=1, X)\n", - "g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0\n", - "r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(Z=0, X)\n", - "r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(Z=1, X)\n", - "r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0\n", - "m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o\n", - "\n", - "# cross-fitted RMSE: outcome\n", - "forest_y_MLIIVM <- sqrt(mean((y-g_hat)^2))\n", - "forest_y_MLIIVM\n", - "\n", - "# cross-fitted RMSE: treatment\n", - "forest_d_MLIIVM <- sqrt(mean((d-r_hat)^2))\n", - "forest_d_MLIIVM\n", - "\n", - "# cross-fitted RMSE: instrument\n", - "forest_z_MLIIVM <- sqrt(mean((z-m_hat)^2))\n", - "forest_z_MLIIVM\n", - "\n", - "### trees ###\n", - "\n", - "dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = trees,\n", - " ml_m = trees_class, ml_r = trees_class,n_folds=3, subgroups = list(always_takers = FALSE,\n", - " never_takers = TRUE))\n", - "dml_MLIIVM$fit(store_predictions=TRUE)\n", - "dml_MLIIVM$summary()\n", - "tree_MLIIVM <- dml_MLIIVM$coef\n", - "tree_std_MLIIVM <- dml_MLIIVM$se\n", - "\n", - "# predictions\n", - "g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(Z=0, X)\n", - "g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(Z=1, X)\n", - "g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0\n", - "r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(Z=0, X)\n", - "r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(Z=1, X)\n", - "r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0\n", - "m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o\n", - "\n", - "# cross-fitted RMSE: outcome\n", - "tree_y_MLIIVM <- sqrt(mean((y-g_hat)^2))\n", - "tree_y_MLIIVM\n", - "\n", - "# cross-fitted RMSE: treatment\n", - "tree_d_MLIIVM <- sqrt(mean((d-r_hat)^2))\n", - "tree_d_MLIIVM\n", - "\n", - "# cross-fitted RMSE: instrument\n", - "tree_z_MLIIVM <- sqrt(mean((z-m_hat)^2))\n", 
- "tree_z_MLIIVM\n", - "\n", - "\n", - "### boosting ###\n", - "dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = boost,\n", - " ml_m = boost_class, ml_r = boost_class,n_folds=3, subgroups = list(always_takers = FALSE,\n", - " never_takers = TRUE))\n", - "dml_MLIIVM$fit(store_predictions=TRUE)\n", - "dml_MLIIVM$summary()\n", - "boost_MLIIVM <- dml_MLIIVM$coef\n", - "boost_std_MLIIVM <- dml_MLIIVM$se\n", - "\n", - "# predictions\n", - "g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(Z=0, X)\n", - "g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(Z=1, X)\n", - "g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0\n", - "r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(Z=0, X)\n", - "r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(Z=1, X)\n", - "r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0\n", - "m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o\n", - "\n", - "# cross-fitted RMSE: outcome\n", - "boost_y_MLIIVM <- sqrt(mean((y-g_hat)^2))\n", - "boost_y_MLIIVM\n", - "\n", - "# cross-fitted RMSE: treatment\n", - "boost_d_MLIIVM <- sqrt(mean((d-r_hat)^2))\n", - "boost_d_MLIIVM\n", - "\n", - "# cross-fitted RMSE: instrument\n", - "boost_z_MLIIVM <- sqrt(mean((z-m_hat)^2))\n", - "boost_z_MLIIVM" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7187fc74", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:13:31.782847Z", - "iopub.status.busy": "2022-04-19T09:13:31.781148Z", - "iopub.status.idle": "2022-04-19T09:13:31.821873Z", - "shell.execute_reply": "2022-04-19T09:13:31.820166Z" - }, - "papermill": { - "duration": 0.061872, - "end_time": "2022-04-19T09:13:31.824148", - "exception": false, - "start_time": "2022-04-19T09:13:31.762276", - "status": "completed" - }, - "tags": [], - "id": "7187fc74" - }, - "outputs": [], - "source": [ - "table <- matrix(0, 5, 4)\n", - "table[1,1:4] <- c(lasso_MLIIVM,forest_MLIIVM,tree_MLIIVM,boost_MLIIVM)\n", - "table[2,1:4] <- c(lasso_std_MLIIVM,forest_std_MLIIVM,tree_std_MLIIVM,boost_std_MLIIVM)\n", - "table[3,1:4] <- c(lasso_y_MLIIVM,forest_y_MLIIVM,tree_y_MLIIVM,boost_y_MLIIVM)\n", - "table[4,1:4] <- c(lasso_d_MLIIVM,forest_d_MLIIVM,tree_d_MLIIVM,boost_d_MLIIVM)\n", - "table[5,1:4] <- c(lasso_z_MLIIVM,forest_z_MLIIVM,tree_z_MLIIVM,boost_z_MLIIVM)\n", - "rownames(table) <- c(\"Estimate\",\"Std.Error\",\"RMSE Y\",\"RMSE D\",\"RMSE Z\")\n", - "colnames(table) <- c(\"Lasso\",\"Random Forest\",\"Trees\",\"Boosting\")\n", - "tab<- xtable(table, digits = 2)\n", - "tab" - ] - }, - { - "cell_type": "markdown", - "id": "f4ce7be1", - "metadata": { - "papermill": { - "duration": 0.017437, - "end_time": "2022-04-19T09:13:31.859052", - "exception": false, - "start_time": "2022-04-19T09:13:31.841615", - "status": "completed" - }, - "tags": [], - "id": "f4ce7be1" - }, - "source": [ - "We report results based on four ML methods for estimating the nuisance functions used in\n", - "forming the orthogonal estimating equations. We find again that the estimates of the treatment effect are stable across ML methods. The estimates are highly significant, hence we would reject the hypothesis\n", - "that the effect of 401(k) participation has no effect on financial health." 
- ] - }, - { - "cell_type": "markdown", - "id": "4939cd9c", - "metadata": { - "papermill": { - "duration": 0.017163, - "end_time": "2022-04-19T09:13:31.893361", - "exception": false, - "start_time": "2022-04-19T09:13:31.876198", - "status": "completed" - }, - "tags": [], - "id": "4939cd9c" - }, - "source": [ - "We might rerun the model using the best ML method for each equation to get a final estimate for the treatment effect of participation:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ca612b71", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:13:31.931238Z", - "iopub.status.busy": "2022-04-19T09:13:31.929630Z", - "iopub.status.idle": "2022-04-19T09:13:52.687242Z", - "shell.execute_reply": "2022-04-19T09:13:52.685428Z" - }, - "papermill": { - "duration": 20.780029, - "end_time": "2022-04-19T09:13:52.690594", - "exception": false, - "start_time": "2022-04-19T09:13:31.910565", - "status": "completed" - }, - "tags": [], - "id": "ca612b71" - }, - "outputs": [], - "source": [ - "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", - "dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = randomForest,\n", - " ml_m = lasso_class, ml_r = lasso_class,n_folds=5, subgroups = list(always_takers = FALSE,\n", - " never_takers = TRUE))\n", - "dml_MLIIVM$fit(store_predictions=TRUE)\n", - "dml_MLIIVM$summary()\n", - "best_MLIIVM <- dml_MLIIVM$coef\n", - "best_std_MLIIVM <- dml_MLIIVM$se" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.0.5" - }, + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": { + "id": "f02fa044", "papermill": { - "default_parameters": {}, - "duration": 427.936706, - "end_time": "2022-04-19T09:13:53.230849", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2022-04-19T09:06:45.294143", - "version": "2.3.4" - }, - "colab": { - "provenance": [] - } + "duration": 0.012988, + "end_time": "2022-04-19T09:06:48.772902", + "exception": false, + "start_time": "2022-04-19T09:06:48.759914", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Inference on Predictive and Causal Effects in High-Dimensional Nonlinear Models" + ] + }, + { + "cell_type": "markdown", + "id": "1", + "metadata": { + "id": "23154404", + "papermill": { + "duration": 0.009437, + "end_time": "2022-04-19T09:06:48.791895", + "exception": false, + "start_time": "2022-04-19T09:06:48.782458", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Impact of 401(k) on Financial Wealth\n", + "\n", + "We consider estimation of the effect of 401(k) participation\n", + "on accumulated assets. 401(k) plans are pension accounts sponsored by employers. The key problem in determining the effect of participation in 401(k) plans on accumulated assets is saver heterogeneity coupled with the fact that the decision to enroll in a 401(k) is non-random. It is generally recognized that some people have a higher preference for saving than others. It also seems likely that those individuals with high unobserved preference for saving would be most likely to choose to participate in tax-advantaged retirement savings plans and would tend to have otherwise high amounts of accumulated assets. 
The presence of unobserved savings preferences with these properties then implies that conventional estimates that do not account for saver heterogeneity and endogeneity of participation will be biased upward, tending to overstate the savings effects of 401(k) participation.\n", + "\n", + "One can argue that eligibility for enrolling in a 401(k) plan in this data can be taken as exogenous after conditioning on a few observables of which the most important for their argument is income. The basic idea is that, at least around the time 401(k)’s initially became available, people were unlikely to be basing their employment decisions on whether an employer offered a 401(k) but would instead focus on income and other aspects of the job." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": { + "id": "KmAkbDiVE7wm" + }, + "outputs": [], + "source": [ + "install.packages(\"xtable\")\n", + "install.packages(\"hdm\")\n", + "install.packages(\"sandwich\")\n", + "install.packages(\"ggplot2\")\n", + "install.packages(\"randomForest\")\n", + "install.packages(\"glmnet\")\n", + "install.packages(\"rpart\")\n", + "install.packages(\"gbm\")\n", + "\n", + "library(xtable)\n", + "library(hdm)\n", + "library(sandwich)\n", + "library(ggplot2)\n", + "library(randomForest)\n", + "library(data.table)\n", + "library(glmnet)\n", + "library(rpart)\n", + "library(gbm)\n", + "\n", + "set.seed(123)" + ] + }, + { + "cell_type": "markdown", + "id": "3", + "metadata": { + "id": "7e23cba0", + "papermill": { + "duration": 0.009588, + "end_time": "2022-04-19T09:06:48.810853", + "exception": false, + "start_time": "2022-04-19T09:06:48.801265", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### Data\n", + "\n", + "The raw dataset can be found [here](https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/401k.csv).\n", + "The data set can be loaded from the `hdm` package for R directly by typing:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4", + "metadata": { + "id": "c442abdc", + "papermill": { + "duration": 0.46397, + "end_time": "2022-04-19T09:06:49.283933", + "exception": false, + "start_time": "2022-04-19T09:06:48.819963", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "data(pension)\n", + "data <- pension\n", + "dim(data)" + ] + }, + { + "cell_type": "markdown", + "id": "5", + "metadata": { + "id": "e47fa9d3", + "papermill": { + "duration": 0.009462, + "end_time": "2022-04-19T09:06:49.302928", + "exception": false, + "start_time": "2022-04-19T09:06:49.293466", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "See the \"Details\" section on the description of the data set, which can be accessed by\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6", + "metadata": { + "id": "00e04b82", + "papermill": { + "duration": 0.35227, + "end_time": "2022-04-19T09:06:49.664810", + "exception": false, + "start_time": "2022-04-19T09:06:49.312540", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "help(pension)" + ] + }, + { + "cell_type": "markdown", + "id": "7", + "metadata": { + "id": "24b41e4a", + "papermill": { + "duration": 0.009357, + "end_time": "2022-04-19T09:06:49.683784", + "exception": false, + "start_time": "2022-04-19T09:06:49.674427", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "The data consist of 9,915 observations at the household level drawn from the 1991 Survey of Income and Program 
Participation (SIPP). All the variables are referred to 1990. We use net financial assets (*net\\_tfa*) as the outcome variable, $Y$, in our analysis. The net financial assets are computed as the sum of IRA balances, 401(k) balances, checking accounts, saving bonds, other interest-earning accounts, other interest-earning assets, stocks, and mutual funds less non mortgage debts." + ] + }, + { + "cell_type": "markdown", + "id": "8", + "metadata": { + "id": "ed9d4e82", + "papermill": { + "duration": 0.009242, + "end_time": "2022-04-19T09:06:49.702401", + "exception": false, + "start_time": "2022-04-19T09:06:49.693159", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Among the $9915$ individuals, $3682$ are eligible to participate in the program. The variable *e401* indicates eligibility and *p401* indicates participation, respectively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9", + "metadata": { + "id": "63519184", + "papermill": { + "duration": 0.618528, + "end_time": "2022-04-19T09:06:50.330218", + "exception": false, + "start_time": "2022-04-19T09:06:49.711690", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "hist_e401 = ggplot(data, aes(x = e401, fill = factor(e401))) + geom_bar()\n", + "hist_e401" + ] + }, + { + "cell_type": "markdown", + "id": "10", + "metadata": { + "id": "823d2628", + "papermill": { + "duration": 0.009686, + "end_time": "2022-04-19T09:06:50.349766", + "exception": false, + "start_time": "2022-04-19T09:06:50.340080", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Eligibility is highly associated with financial wealth:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11", + "metadata": { + "id": "5d8faf9c", + "papermill": { + "duration": 0.554613, + "end_time": "2022-04-19T09:06:50.914133", + "exception": false, + "start_time": "2022-04-19T09:06:50.359520", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dens_net_tfa = ggplot(data, aes(x = net_tfa, color = factor(e401), fill = factor(e401)) ) +\n", + " geom_density() + xlim(c(-20000, 150000)) +\n", + " facet_wrap(.~e401)\n", + "\n", + "dens_net_tfa" + ] + }, + { + "cell_type": "markdown", + "id": "12", + "metadata": { + "id": "0f4f86a7", + "papermill": { + "duration": 0.010335, + "end_time": "2022-04-19T09:06:50.935024", + "exception": false, + "start_time": "2022-04-19T09:06:50.924689", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "The unconditional APE of e401 is about $19559$:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13", + "metadata": { + "id": "836c6af7", + "papermill": { + "duration": 0.038096, + "end_time": "2022-04-19T09:06:50.983602", + "exception": false, + "start_time": "2022-04-19T09:06:50.945506", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "e1 <- data[data$e401==1,]\n", + "e0 <- data[data$e401==0,]\n", + "round(mean(e1$net_tfa)-mean(e0$net_tfa),0)" + ] + }, + { + "cell_type": "markdown", + "id": "14", + "metadata": { + "id": "22b09926", + "papermill": { + "duration": 0.01047, + "end_time": "2022-04-19T09:06:51.004618", + "exception": false, + "start_time": "2022-04-19T09:06:50.994148", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Among the $3682$ individuals that are eligible, $2594$ decided to participate in the program. 
The unconditional APE of p401 is about $27372$:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15", + "metadata": { + "id": "e78aaa58", + "papermill": { + "duration": 0.039305, + "end_time": "2022-04-19T09:06:51.054616", + "exception": false, + "start_time": "2022-04-19T09:06:51.015311", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "p1 <- data[data$p401==1,]\n", + "p0 <- data[data$p401==0,]\n", + "round(mean(p1$net_tfa)-mean(p0$net_tfa),0)" + ] + }, + { + "cell_type": "markdown", + "id": "16", + "metadata": { + "id": "e0af3c81", + "papermill": { + "duration": 0.010831, + "end_time": "2022-04-19T09:06:51.076114", + "exception": false, + "start_time": "2022-04-19T09:06:51.065283", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "As discussed, these estimates are biased since they do not account for saver heterogeneity and endogeneity of participation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17", + "metadata": { + "id": "A03YWrvUW0Sm" + }, + "outputs": [], + "source": [ + "# instrument variable\n", + "Z <- data[,'e401']\n", + "# treatment variable\n", + "D <- data[, 'p401']\n", + "# outcome variable\n", + "y <- data[,'net_tfa']" + ] + }, + { + "cell_type": "markdown", + "id": "18", + "metadata": { + "id": "RVUbOMRRWwBm" + }, + "source": [ + "### We construct the engineered features for controls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19", + "metadata": { + "id": "7vt1hbdBG8cb" + }, + "outputs": [], + "source": [ + "# Constructing the controls\n", + "X_formula = \"~ poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown\"\n", + "X = as.data.table(model.frame(X_formula, pension))\n", + "head(X)" + ] + }, + { + "cell_type": "markdown", + "id": "20", + "metadata": { + "id": "yzNigd7YYVuA" + }, + "source": [ + "# Instrumental Variables: Effect of 401k Participation on Financial Assets" + ] + }, + { + "cell_type": "markdown", + "id": "21", + "metadata": { + "id": "FI2u5KU7YWIF" + }, + "source": [ + "## Double ML IV under Partial Linearity" + ] + }, + { + "cell_type": "markdown", + "id": "22", + "metadata": { + "id": "uhDK6Em_YWSm" + }, + "source": [ + "Now, we consider estimation of average treatment effects of participation in 401k, i.e. `p401`, with the binary instrument being eligibility in 401k, i.e. `e401`. As before, $Y$ denotes the outcome `net_tfa`, and $X$ is the vector of covariates. We consider a partially linear structural equation model:\n", + "\\begin{eqnarray*}\n", + "Y & := & g_Y(\\epsilon_Y) D + f_Y(A, X, \\epsilon_Y), \\\\\n", + "D & := & f_D(Z, X, A, \\epsilon_D), \\\\\n", + "Z & := & f_Z(X, \\epsilon_Z),\\\\\n", + "A & : = & f_A(X, \\epsilon_A), \\\\\n", + "X & := & \\epsilon_X,\n", + "\\end{eqnarray*}\n", + "where $A$ is a vector of un-observed confounders.\n", + "\n", + "Under this structural equation model, the average treatment effect:\n", + "\\begin{align}\n", + "\\alpha = E[Y(1) - Y(0)]\n", + "\\end{align}\n", + "can be identified by the moment restriction:\n", + "\\begin{align}\n", + "E[(\\tilde{Y} - \\alpha \\tilde{D}) \\tilde{Z}] = 0\n", + "\\end{align}\n", + "where for any variable $V$, we denote with $\\tilde{V} = V - E[V|X]$." 
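The moment restriction above translates directly into a sample-analog estimator once the residuals are formed. Below is a minimal sketch of the point estimate together with a plug-in standard error derived from the influence function; the names `resy`, `resD`, `resZ` are assumptions matching the residualized Y, D, Z computed in the next cell, which itself reports only the point estimate.

```r
# Minimal sketch, assuming resy, resD, resZ hold residuals V - E[V|X] for Y, D, Z
# Orthogonal moment: E[(resy - alpha * resD) * resZ] = 0
alpha_hat <- mean(resy * resZ) / mean(resD * resZ)

# Plug-in standard error via the influence function psi = (resy - alpha*resD)*resZ / E[resD*resZ]
psi <- (resy - alpha_hat * resD) * resZ / mean(resD * resZ)
se_hat <- sqrt(mean(psi^2) / length(psi))

c(estimate = alpha_hat, std_error = se_hat,
  lower95 = alpha_hat - 1.96 * se_hat, upper95 = alpha_hat + 1.96 * se_hat)
```

Without cross-fitting or the theoretically justified penalty, this standard error should be read as descriptive only, as noted below.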
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23", + "metadata": { + "id": "bdUGB53AYf3S" + }, + "outputs": [], + "source": [ + "set.seed(1)\n", + "yfit.lasso.cv <- cv.glmnet(as.matrix(X), y, family=\"gaussian\", alpha=1) # family gaussian means that we'll be using square loss\n", + "Dfit.lasso.cv <- cv.glmnet(as.matrix(X), D, family=\"gaussian\", alpha=1) # family gaussian means that we'll be using square loss\n", + "Zfit.lasso.cv <- cv.glmnet(as.matrix(X), Z, family=\"gaussian\", alpha=1) # family gaussian means that we'll be using square loss\n", + "\n", + "\n", + "yhat.lasso.cv <- predict(yfit.lasso.cv, newx = as.matrix(X)) # predictions\n", + "Dhat.lasso.cv <- predict(Dfit.lasso.cv, newx = as.matrix(X)) # predictions\n", + "Zhat.lasso.cv <- predict(Zfit.lasso.cv, newx = as.matrix(X)) # predictions\n", + "\n", + "resy <- y-yhat.lasso.cv\n", + "resD <- D-Dhat.lasso.cv\n", + "resZ <- Z-Zhat.lasso.cv\n", + "\n", + "# Estimate\n", + "mean(resy * resZ) / mean(resZ*resD)" + ] + }, + { + "cell_type": "markdown", + "id": "24", + "metadata": { + "id": "Fw1ZxeBKZcRm" + }, + "source": [ + "Recall if we want to do inference, we need to either use the theoretically driven penalty paramter for Lasso or perform cross-fitting." + ] + }, + { + "cell_type": "markdown", + "id": "25", + "metadata": { + "id": "jnBOtXXuZnkz" + }, + "source": [ + "### DML with Non-Linear ML Models and Cross-fitting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26", + "metadata": { + "id": "K_vQlMYmz91I" + }, + "outputs": [], + "source": [ + "# DML for PLIVM with D and Z as classifiers or regressors\n", + "DML2.for.PLIVM <- function(x, d, z, y, dreg, yreg, zreg, nfold=5, method=\"regression\") {\n", + " nobs <- nrow(x)\n", + " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)]\n", + " I <- split(1:nobs, foldid)\n", + " # create residualized objects to fill\n", + " ytil <- dtil <- ztil<- rep(NA, nobs)\n", + " # obtain cross-fitted residuals\n", + " cat(\"fold: \")\n", + " for(b in 1:length(I)){\n", + " if (method == \"randomforest\"){\n", + " # take a fold out\n", + " dfit <- dreg(x[-I[[b]],], d[-I[[b]]])\n", + " zfit <- zreg(x[-I[[b]],], z[-I[[b]]])\n", + " yfit <- yreg(x[-I[[b]],], y[-I[[b]]])\n", + " # predict the fold out\n", + " dhat <- predict(dfit, x[I[[b]],], type=\"prob\")[,2] # type = \"prob\" is like predict_proba in scikitlearn\n", + " zhat <- predict(zfit, x[I[[b]],], type=\"prob\")[,2]\n", + " yhat <- predict(yfit, x[I[[b]],]) # default type = \"response\" for regression for RF, type = \"vector\" for regression for Decision Trees\n", + " # record residual\n", + " dtil[I[[b]]] <- (as.numeric(d[I[[b]]])-1 - dhat) # as.numeric will turn d = as.factor(d) from 0,1 to 1,2 so subtract 1!\n", + " ztil[I[[b]]] <- (as.numeric(z[I[[b]]])-1 - zhat)\n", + " ytil[I[[b]]] <- (y[I[[b]]] - yhat)\n", + " } else if (method == \"regression\") { # works for both boosted trees and glmnet\n", + " # take a fold out\n", + " dfit <- dreg(x[-I[[b]],], d[-I[[b]]])\n", + " zfit <- zreg(x[-I[[b]],], z[-I[[b]]])\n", + " yfit <- yreg(x[-I[[b]],], y[-I[[b]]])\n", + " # predict the fold out\n", + " dhat <- predict(dfit, x[I[[b]],], type=\"response\")\n", + " zhat <- predict(zfit, x[I[[b]],], type=\"response\")\n", + " yhat <- predict(yfit, x[I[[b]],], type=\"response\")\n", + " # record residual\n", + " dtil[I[[b]]] <- (d[I[[b]]] - dhat)\n", + " ztil[I[[b]]] <- (z[I[[b]]] - zhat)\n", + " ytil[I[[b]]] <- (y[I[[b]]] - yhat)\n", + " } else if (method == 
\"decisiontrees\"){\n", + " # take a fold out\n", + " dfit <- dreg(x[-I[[b]],], as.factor(d)[-I[[b]]])\n", + " zfit <- zreg(x[-I[[b]],], as.factor(z)[-I[[b]]])\n", + " yfit <- yreg(x[-I[[b]],], y[-I[[b]]])\n", + " # predict the fold out\n", + " dhat <- predict(dfit, x[I[[b]],])[,2]\n", + " zhat <- predict(zfit, x[I[[b]],])[,2]\n", + " yhat <- predict(yfit, x[I[[b]],])\n", + " # record residual\n", + " dtil[I[[b]]] <- (d[I[[b]]] - dhat)\n", + " ztil[I[[b]]] <- (z[I[[b]]] - zhat)\n", + " ytil[I[[b]]] <- (y[I[[b]]] - yhat)\n", + " }\n", + "\n", + " cat(b,\" \")\n", + " }\n", + " ivfit = tsls(y=ytil,d=dtil, x=NULL, z=ztil, intercept=FALSE)\n", + " coef.est <- ivfit$coef #extract coefficient\n", + " se <- ivfit$se #record standard error\n", + " cat(sprintf(\"\\ncoef (se) = %g (%g)\\n\", coef.est , se))\n", + "\n", + " return( list(coef.est=coef.est, se=se, dtil=dtil, ytil=ytil, ztil=ztil) )\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27", + "metadata": { + "id": "puSCLNvofQxA" + }, + "outputs": [], + "source": [ + "summary <- function(point, stderr, resy, resD, resZ, name) {\n", + " data <- data.frame(\n", + " estimate = point, # point estimate\n", + " stderr = stderr, # standard error\n", + " lower = point - 1.96 * stderr, # lower end of 95% confidence interval\n", + " upper = point + 1.96 * stderr, # upper end of 95% confidence interval\n", + " `rmse y` = sqrt(mean(resy^2)), # RMSE of model that predicts outcome y\n", + " `rmse D` = sqrt(mean(resD^2)), # RMSE of model that predicts treatment D\n", + " `rmse Z` = sqrt(mean(resZ^2)), # RMSE of model that predicts treatment D\n", + " `accuracy D` = mean(abs(resD) < 0.5), # binary classification accuracy of model for D\n", + " `accuracy Z` = mean(abs(resZ) < 0.5) # binary classification accuracy of model for Z\n", + " )\n", + " rownames(data) <- name\n", + " return(data)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "28", + "metadata": { + "id": "1Z5vrvrlbuPj" + }, + "source": [ + "#### Double Lasso with Cross-Fitting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29", + "metadata": { + "id": "vBJm7BkUYgsG" + }, + "outputs": [], + "source": [ + "# DML with LassoCV\n", + "set.seed(123)\n", + "cat(sprintf(\"\\nDML with Lasso CV \\n\"))\n", + "\n", + "dreg.lasso.cv <- function(x,d){ cv.glmnet(x, d, family=\"gaussian\", alpha=1, nfolds=5)}\n", + "yreg.lasso.cv <- function(x,y){ cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", + "zreg.lasso.cv <- function(x,z){ cv.glmnet(x, z, family=\"gaussian\", alpha=1, nfolds=5)}\n", + "\n", + "DML2.results <- DML2.for.PLIVM(as.matrix(X), D, Z, y, dreg.lasso.cv, yreg.lasso.cv, zreg.lasso.cv, nfold=5, method=\"regression\")\n", + "sum.lasso.cv <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'LassoCV')\n", + "tableplr <- data.frame()\n", + "tableplr <- rbind(sum.lasso.cv)\n", + "tableplr\n", + "\n", + "ytil.lasso <- DML2.results$ytil\n", + "dtil.lasso <- DML2.results$dtil\n", + "ztil.lasso <- DML2.results$ztil\n" + ] + }, + { + "cell_type": "markdown", + "id": "30", + "metadata": { + "id": "pyrem2YniNls" + }, + "source": [ + "#### Using a $\\ell_2$ Penalized Logistic Regression for D and Z" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31", + "metadata": { + "id": "FM6WvQXKYgxL" + }, + "outputs": [], + "source": [ + "# DML with Lasso/Ridge\n", + "set.seed(123)\n", + "cat(sprintf(\"\\nDML with Lasso/Logistic \\n\"))\n", + "\n", + 
"dreg.ridge.cv <- function(x,d){cv.glmnet(x, d, family=\"binomial\", alpha=0, nfolds=5)}\n", + "yreg.ridge.cv <- function(x,y){cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", + "zreg.ridge.cv <- function(x,z){cv.glmnet(x, z, family=\"binomial\", alpha=0, nfolds=5)}\n", + "\n", + "DML2.results <- DML2.for.PLIVM(as.matrix(X), D, Z, y, dreg.ridge.cv, yreg.ridge.cv, zreg.ridge.cv, nfold=5, method=\"regression\")\n", + "sum.lasso_ridge.cv <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'LassoCV/LogisticCV')\n", + "tableplr <- rbind(tableplr, sum.lasso_ridge.cv)\n", + "tableplr\n", + "\n", + "ytil.ridge <- DML2.results$ytil\n", + "dtil.ridge <- DML2.results$dtil\n", + "ztil.ridge <- DML2.results$ztil" + ] + }, + { + "cell_type": "markdown", + "id": "32", + "metadata": { + "id": "yfTdX3__jcwI" + }, + "source": [ + "### Random Forests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33", + "metadata": { + "id": "mMvJT6NZHW1_" + }, + "outputs": [], + "source": [ + "# DML with Random Forest\n", + "set.seed(123)\n", + "cat(sprintf(\"\\nDML with Random Forest \\n\"))\n", + "\n", + "dreg.rf <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest\n", + "yreg.rf <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest\n", + "zreg.rf <- function(x,z){randomForest(x, z, ntree=1000, nodesize=10)} #ML method=Forest\n", + "\n", + "DML2.results = DML2.for.PLIVM(as.matrix(X), as.factor(D), as.factor(Z), y, dreg.rf, yreg.rf, zreg.rf, nfold=5, method=\"randomforest\")\n", + "sum.rf <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'RF')\n", + "tableplr <- rbind(tableplr, sum.rf)\n", + "tableplr\n", + "\n", + "ytil.rf <- DML2.results$ytil\n", + "dtil.rf <- DML2.results$dtil\n", + "ztil.rf <- DML2.results$ztil" + ] + }, + { + "cell_type": "markdown", + "id": "34", + "metadata": { + "id": "4I1oVQutjeqE" + }, + "source": [ + "### Decision Trees" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35", + "metadata": { + "id": "ayrnTPeBHW88" + }, + "outputs": [], + "source": [ + "# DML with Decision Trees\n", + "set.seed(123)\n", + "cat(sprintf(\"\\nDML with Decision Trees \\n\"))\n", + "\n", + "dreg.tr <- function(x,d){rpart(as.formula(\"D~.\"), cbind(data.frame(D=d),x), method = \"class\", minbucket=10, cp = 0.001)}\n", + "yreg.tr <- function(x,y){rpart(as.formula(\"y~.\"), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)}\n", + "zreg.tr <- function(x,z){rpart(as.formula(\"Z~.\"), cbind(data.frame(Z=z),x), method = \"class\", minbucket=10, cp = 0.001)}\n", + "\n", + "DML2.results = DML2.for.PLIVM(X, D, Z, y, dreg.tr, yreg.tr, zreg.tr, nfold=5, method=\"decisiontrees\") # decision tree takes in X as dataframe, not matrix/array\n", + "sum.tr <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'Decision Trees')\n", + "tableplr <- rbind(tableplr, sum.tr)\n", + "tableplr\n", + "\n", + "ytil.tr <- DML2.results$ytil\n", + "dtil.tr <- DML2.results$dtil\n", + "ztil.tr <- DML2.results$ztil" + ] + }, + { + "cell_type": "markdown", + "id": "36", + "metadata": { + "id": "h7Jo_WXUjgjb" + }, + "source": [ + "### Boosted Trees" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37", + "metadata": { + "id": "nzlszy9zjiSy" + }, + "outputs": [], + "source": [ + "# DML with Boosted Trees\n", + "set.seed(123)\n", + 
"cat(sprintf(\"\\nDML with Boosted Trees \\n\"))\n", + "\n", + "# NB: early stopping cannot easily be implemented with gbm\n", + "## set n.trees = best, where best <- gbm.perf(dreg.boost, plot.it = FALSE)\n", + "dreg.boost <- function(x,d){gbm(as.formula(\"D~.\"), cbind(data.frame(D=d),x), distribution= \"bernoulli\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", + "yreg.boost <- function(x,y){gbm(as.formula(\"y~.\"), cbind(data.frame(y=y),x), distribution= \"gaussian\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", + "zreg.boost <- function(x,z){gbm(as.formula(\"Z~.\"), cbind(data.frame(Z=z),x), distribution= \"bernoulli\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", + "\n", + "# passing these through regression as type=\"response\", and D and Z should not be factors!\n", + "DML2.results = DML2.for.PLIVM(X, D, Z, y, dreg.boost, yreg.boost, zreg.boost, nfold=5, method = \"regression\")\n", + "sum.boost <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'Boosted Trees')\n", + "tableplr <- rbind(tableplr, sum.boost)\n", + "tableplr\n", + "\n", + "ytil.boost <- DML2.results$ytil\n", + "dtil.boost <- DML2.results$dtil\n", + "ztil.boost <- DML2.results$ztil" + ] + }, + { + "cell_type": "markdown", + "id": "38", + "metadata": { + "id": "oQpoYedAc4Ic" + }, + "source": [ + "## Ensembles" + ] + }, + { + "cell_type": "markdown", + "id": "39", + "metadata": { + "id": "_LLsorarc8Mh" + }, + "source": [ + "Boosted trees give the best RMSE for Y, D, and Z, so the ensemble based on choosing the best performing prediction rule is identical to boosting in this case." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40", + "metadata": { + "id": "kAePILCadEVh" + }, + "outputs": [], + "source": [ + "# Best fit is boosted trees for D, Z, Y\n", + "\n", + "sum.best <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$dtil, name = 'Best')\n", + "tableplr <- rbind(tableplr, sum.best)\n", + "tableplr" + ] + }, + { + "cell_type": "markdown", + "id": "41", + "metadata": { + "id": "KaaDX4kkdIMx" + }, + "source": [ + "We'll form a model average with unconstrained least squares weights." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42", + "metadata": { + "id": "mCsyY3iJdHm_" + }, + "outputs": [], + "source": [ + "# Least squares model average\n", + "\n", + "dhat.lasso <- D - dtil.lasso\n", + "dhat.ridge <- D - dtil.ridge\n", + "dhat.rf <- D - dtil.rf\n", + "dhat.tr <- D - dtil.tr\n", + "dhat.boost <- D - dtil.boost\n", + "\n", + "yhat.lasso <- y - ytil.lasso\n", + "yhat.ridge <- y - ytil.ridge\n", + "yhat.rf <- y - ytil.rf\n", + "yhat.tr <- y - ytil.tr\n", + "yhat.boost <- y - ytil.boost\n", + "\n", + "zhat.lasso <- Z - ztil.lasso\n", + "zhat.ridge <- Z - ztil.ridge\n", + "zhat.rf <- Z - ztil.rf\n", + "zhat.tr <- Z - ztil.tr\n", + "zhat.boost <- Z - ztil.boost\n", + "\n", + "ma.dtil <- lm(D~dhat.lasso+dhat.ridge+dhat.rf+dhat.tr+dhat.boost)$residuals\n", + "ma.ytil <- lm(y~yhat.lasso+yhat.ridge+yhat.rf+yhat.tr+yhat.boost)$residuals\n", + "ma.ztil <- lm(Z~zhat.lasso+zhat.ridge+zhat.rf+zhat.tr+zhat.boost)$residuals\n", + "\n", + "ivfit = tsls(y=ma.ytil,d=ma.dtil, x=NULL, z=ma.ztil, intercept=FALSE)\n", + "coef.est <- ivfit$coef #extract coefficient\n", + "se <- ivfit$se #record standard error\n", + "\n", + "sum.ma <- summary(coef.est, se, ma.ytil, ma.dtil, ma.ztil, name = 'Model Average')\n", + "tableplr <- rbind(tableplr, sum.ma)\n", + "tableplr" + ] + }, + { + "cell_type": "markdown", + "id": "43", + "metadata": { + "id": "8OUusM2BpZH4" + }, + "source": [ + "## Inference Robust to Weak Identification\n", + "\n", + "Now we turn toward robustness when the instrument is weak.\n", + "\n", + "Ideally, we would do (semi) cross-fitting with AutoML in order to find good first-stage models and re-run DML with these models. Unfortunately this is not easy to do in R. In the case of semi cross-fitting, we can use R's H20 AutoML trained on the entire training set $y\\sim X$, $D \\sim X$, $Z\\sim X$ to determine the best model (eg ensemble), but H20 does not allow you to extract the best model so we can re-use that in DML.\n", + "\n", + "Thus, in the below analysis of robust inference, we choose Boosted Trees as they perform well." 
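Before scanning a grid of candidate values, it is useful to gauge instrument strength directly: a large first-stage t-statistic in the regression of the treatment residual on the instrument residual suggests we are in the strong-instrument regime, so the robust region should essentially coincide with the usual confidence interval. A quick sketch, assuming the boosted-trees residuals `dtil.boost` and `ztil.boost` computed above:

```r
# Rough first-stage strength check on the cross-fitted residuals
# (homoskedastic SEs, so treat this as a diagnostic rather than formal inference)
first_stage <- lm(dtil.boost ~ ztil.boost - 1)  # no intercept: residuals are (approximately) centered
summary(first_stage)$coefficients               # look at the t value on ztil.boost
```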
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44", + "metadata": { + "id": "UeNF5j1ApYYy" + }, + "outputs": [], + "source": [ + "robust_inference <- function(point, stderr, resD, resy, resZ, grid, alpha = 0.05) {\n", + " # Inference in the partially linear IV model that is robust to weak identification.\n", + " # grid: grid of theta values to search over when trying to identify the confidence region\n", + " # alpha: confidence level\n", + "\n", + " n <- dim(X)[1]\n", + " thr <- qchisq(1 - alpha, df = 1)\n", + " accept <- c()\n", + "\n", + " for (theta in grid) {\n", + " moment <- (resy - theta * resD) * resZ\n", + " test <- n * mean(moment)^2 / var(moment)\n", + " if (test <= thr) {\n", + " accept <- c(accept, theta)\n", + " }\n", + " }\n", + "\n", + " return(accept)\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45", + "metadata": { + "id": "X21PuuUnsa25" + }, + "outputs": [], + "source": [ + "grid <- seq(0, 20000, length.out = 10000)\n", + "region <- robust_inference(DML2.results$coef.est, DML2.results$stderr, DML2.results$dtil, DML2.results$ytil, DML2.results$ztil, grid=grid)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46", + "metadata": { + "id": "x-ZSzMkVqI45" + }, + "outputs": [], + "source": [ + "grid <- seq(0, 20000, length.out = 10000)\n", + "region <- robust_inference(DML2.results$coef.est, DML2.results$stderr, DML2.results$dtil, DML2.results$ytil, DML2.results$ztil, grid=grid)# Calculate min and max\n", + "min_region <- min(region)\n", + "max_region <- max(region)\n", + "\n", + "print(min_region)\n", + "print(max_region)" + ] + }, + { + "cell_type": "markdown", + "id": "47", + "metadata": { + "id": "nKQGPfXWIKmh" + }, + "source": [ + "# Interactive IV Model and LATE" + ] + }, + { + "cell_type": "markdown", + "id": "48", + "metadata": { + "id": "bCayhAlaINjL" + }, + "source": [ + "Now, we consider estimation of local average treatment effects (LATE) of participation `p401`, with the binary instrument `e401`. As before, $Y$ denotes the outcome `net_tfa`, and $X$ is the vector of covariates. Here the structural equation model is:\n", + "\\begin{eqnarray}\n", + "Y &:=& f_Y (D, X, A, \\epsilon_Y) \\\\\n", + "D &:= & f_D(Z, X, A, \\epsilon_D) \\in \\{0,1\\}, \\\\\n", + "Z &:= & f_Z(X,\\epsilon_Z) \\in \\{0,1\\}, \\\\\n", + "X &:=& \\epsilon_X, \\quad A = \\epsilon_A,\n", + "\\end{eqnarray}\n", + "where $\\epsilon$'s are all exogenous and independent,\n", + "and\n", + "$$\n", + "z \\mapsto f_D(z , A, X, \\epsilon_D) \\text{ is weakly increasing (weakly monotone)}.\n", + "$$\n", + "and $A$ is a vector of unobserved confounders. Note that in our setting monotonicity is satisfied, since participation is only feasible when it is eligible. Thus we have that $D=0$ whenever $Z=0$. 
Thus it can only be that $f_D(1, A, X, \\epsilon_D) \\geq 0 = f_D(0, A, X, \\epsilon_D)$.\n", + "\n", + "In this case, we can estimate the local average treatment effect (LATE):\n", + "$$\n", + "\\alpha = E[Y(1) - Y(0) | D(1) > D(0)]\n", + "$$\n", + "This can be identified using the Neyman orthogonal moment equation:\n", + "\\begin{align}\n", + "E\\left[g(1, X) - g(0, X) + H(Z) (Y - g(Z, X)) - \\alpha \\cdot (m(1, X) - m(0, X) + H(Z) (D - m(Z, X))\\right] = 0\n", + "\\end{align}\n", + "where\n", + "\\begin{align}\n", + "g(Z,X) =~& E[Y|Z,X],\\\\\n", + "m(Z,X) =~& E[D|Z,X],\\\\\n", + "H(Z) =~& \\frac{Z}{Pr(Z=1|X)} - \\frac{1 - Z}{1 - Pr(Z=1|X)}\n", + "\\end{align}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49", + "metadata": { + "id": "rQYifUnFIt5z" + }, + "outputs": [], + "source": [ + "# DML for IIVM with D and Z as classifiers or regressors\n", + "DML2.for.IIVM <- function(x, d, z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method=\"classification\", dt=0, bt=0) {\n", + " # this implements DML2 algorithm, where there moments are estimated via DML, before constructing\n", + " # the pooled estimate of theta randomly split data into folds\n", + "\n", + " ## NB This method has many if statements to accommodate the various estimators we will use.\n", + " ## Unlike Python's sklearn, all methods have idfferent default arguments in their predict functions.\n", + " ## See official R documentation for details.\n", + "\n", + " yhat0 <- rep(0, length(y))\n", + " yhat1 <- rep(0, length(y))\n", + " dhat0 <- rep(0, length(d))\n", + " dhat1 <- rep(0, length(d))\n", + " zhat <- rep(0, length(Z))\n", + "\n", + " nobs <- nrow(X)\n", + " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)]\n", + " I <- split(1:nobs, foldid)\n", + " # create residualized objects to fill\n", + " ytil <- dtil <- ztil<- rep(NA, nobs)\n", + "\n", + " # obtain cross-fitted residuals\n", + " cat(\"fold: \")\n", + " for(b in 1:length(I)){\n", + "\n", + " # define helpful variables\n", + " Xb = X[I[[b]],]\n", + " Xnotb = X[-I[[b]],]\n", + " Znotb = Z[-I[[b]]]\n", + "\n", + " # training dfs subsetted on the -I[[b]] fold\n", + " XZ0 = X[-I[[b]],][Z[-I[[b]]]==0]\n", + " yZ0 = y[-I[[b]]][Z[-I[[b]]]==0]\n", + " XZ1 = X[-I[[b]],][Z[-I[[b]]]==1]\n", + " yZ1 = y[-I[[b]]][Z[-I[[b]]]==1]\n", + " DZ0 = d[-I[[b]]][Z[-I[[b]]]==0]\n", + " DZ1 = d[-I[[b]]][Z[-I[[b]]]==1]\n", + "\n", + "\n", + " if (method == \"regression\") {\n", + " XZ0 = as.matrix(XZ0)\n", + " XZ1 = as.matrix(XZ1)\n", + " Xb = as.matrix(Xb)\n", + " Xnotb = as.matrix(Xnotb)\n", + "\n", + " # Train an outcome model on training data that received Z=0 and predict outcome on all data in the test set\n", + " yfit0 <- yreg0((XZ0), yZ0)\n", + " yhat0[I[[b]]] <- predict(yfit0, (Xb)) # default type = \"response\"\n", + "\n", + " # train an outcome model on training data that received Z=1 and predict outcome on all data in test set\n", + " yfit1 <- yreg1((XZ1), yZ1)\n", + " yhat1[I[[b]]] <- predict(yfit1, (Xb))\n", + "\n", + " # train a treatment model on training data that received Z=0 and predict treatment on all data in test set\n", + " if (mean(DZ0) > 0) { # it could be that D=0, whenever Z=0 deterministically\n", + " dreg0_ <- dreg0\n", + " dfit0 <- dreg0_((XZ0), DZ0)\n", + " dhat0[I[[b]]] <- predict(dfit0, (Xb), type=\"response\") # default type = \"response\", but for family binomial it's logg odds\n", + " }\n", + " # train a treamtent model on training data that received Z=1 and predict treatment on all data in test 
set\n", + " if (mean(DZ1) < 1) { # it could be that D=1, whenever Z=1 deterministically\n", + " dreg1_ <- dreg1\n", + " dfit1 <- dreg1_((XZ1), DZ1)\n", + " dhat1[I[[b]]] <- predict(dfit1, (Xb), type=\"response\")\n", + " } else {\n", + " dhat1[I[[b]]] <- 1\n", + " }\n", + "\n", + " } else if (method == \"randomforest\") {\n", + " DZ0factor = as.factor(D)[-I[[b]]][Z[-I[[b]]]==0]\n", + " DZ1factor = as.factor(D)[-I[[b]]][Z[-I[[b]]]==1]\n", + " Znotb = as.factor(Znotb)\n", + "\n", + " yfit0 <- yreg0((XZ0), yZ0)\n", + " yhat0[I[[b]]] <- predict(yfit0, (Xb), type=\"response\")\n", + " yfit1 <- yreg1((XZ1), yZ1)\n", + " yhat1[I[[b]]] <- predict(yfit1, (Xb), type=\"response\")\n", + "\n", + " if (mean(DZ0) > 0) {\n", + " dreg0_ <- dreg0\n", + " dfit0 <- dreg0_((XZ0), DZ0factor)\n", + " dhat0[I[[b]]] <- predict(dfit0, (Xb), type=\"prob\")[,2] # get second column because type = \"prob\"\n", + " }\n", + " if (mean(DZ1) < 1) {\n", + " dreg1_ <- dreg1\n", + " dfit1 <- dreg1_((XZ1), DZ1factor)\n", + " dhat1[I[[b]]] <- predict(dfit1, (Xb), type=\"prob\")[,2]\n", + " } else {\n", + " dhat1[I[[b]]] <- 1\n", + " }\n", + "\n", + " } else if (method == \"decisiontrees\") {\n", + " XZ0 = as.data.frame(XZ0)\n", + " XZ1 = as.data.frame(XZ1)\n", + " Xb = as.data.frame(Xb)\n", + " Xnotb = as.data.frame(Xnotb)\n", + "\n", + " yfit0 <- yreg0((XZ0), yZ0)\n", + " yhat0[I[[b]]] <- predict(yfit0, (Xb)) # default type = \"response\" for decision trees for continuous response\n", + "\n", + " yfit1 <- yreg1((XZ1), yZ1)\n", + " yhat1[I[[b]]] <- predict(yfit1, (Xb))\n", + "\n", + " if (mean(DZ0) > 0) {\n", + " dreg0_ <- dreg0\n", + " dfit0 <- dreg0_((XZ0), as.factor(DZ0))\n", + " dhat0[I[[b]]] <- predict(dfit0, (Xb))[,2] # for decision trees, default = \"prob\" for decision trees with factor responses\n", + " }\n", + "\n", + " if (mean(DZ1) < 1) {\n", + " dreg1_ <- dreg1\n", + " dfit1 <- dreg1_((XZ1), as.factor(DZ1))\n", + " dhat1[I[[b]]] <- predict(dfit1, (Xb))[,2]\n", + " } else {\n", + " dhat1[I[[b]]] <- 1\n", + " }\n", + "\n", + " } else if (method == \"boostedtrees\") {\n", + " XZ0 = as.data.frame(XZ0)\n", + " XZ1 = as.data.frame(XZ1)\n", + " Xb = as.data.frame(Xb)\n", + " Xnotb = as.data.frame(Xnotb)\n", + "\n", + " yfit0 <- yreg0((XZ0), yZ0)\n", + " yhat0[I[[b]]] <- predict(yfit0, (Xb)) # default type = \"response\" for boosted trees\n", + " yfit1 <- yreg1((XZ1), yZ1)\n", + " yhat1[I[[b]]] <- predict(yfit1, (Xb))\n", + "\n", + " if (mean(DZ0) > 0) {\n", + " dreg0_ <- dreg0\n", + " dfit0 <- dreg0_((XZ0), DZ0)\n", + " dhat0[I[[b]]] <- predict(dfit0, (Xb), type = \"response\") # default for boosted trees is log odds.\n", + " }\n", + " if (mean(DZ1) < 1) {\n", + " dreg1_ <- dreg1\n", + " dfit1 <- dreg1_((XZ1), DZ1)\n", + " dhat1[I[[b]]] <- predict(dfit1, (Xb), type = \"response\")\n", + " } else {\n", + " dhat1[I[[b]]] <- 1\n", + " }\n", + "\n", + " }\n", + "\n", + " # propensity scores:\n", + " if (method == \"regression\"){\n", + " zfit_b <- zreg((Xnotb), Znotb)\n", + " zhat_b <- predict(zfit_b, (Xb), type=\"response\")\n", + " } else if (method == \"randomforest\"){\n", + " zfit_b <- zreg((Xnotb), Znotb)\n", + " zhat_b <- predict(zfit_b, (Xb), type = \"prob\")[,2]\n", + " } else if (method == \"decisiontrees\"){\n", + " zfit_b <- zreg((Xnotb), as.factor(Znotb))\n", + " zhat_b <- predict(zfit_b, (Xb)) # default is prob, so get second column\n", + " zhat_b = zhat_b[,2]\n", + " } else if (method == \"boostedtrees\"){\n", + " zfit_b <- zreg((Xnotb), Znotb)\n", + " zhat_b <- predict(zfit_b, (Xb), type = \"response\")\n", + 
" }\n", + " zhat_b <- pmax(pmin(zhat_b, 1 - trimming), trimming) # trimming so scores are between [trimming, (1-trimming)]\n", + " zhat[I[[b]]] <- zhat_b\n", + "\n", + " cat(b,\" \")\n", + " }\n", + "\n", + "\n", + " # Prediction of treatment and outcome for observed instrument\n", + " yhat <- yhat0 * (1 - Z) + yhat1 * Z\n", + " dhat <- dhat0 * (1 - Z) + dhat1 * Z\n", + "\n", + " # residuals\n", + " ytil <- y-yhat\n", + " dtil <- D-dhat\n", + " ztil <- Z-zhat\n", + "\n", + " # doubly robust quantity for every sample\n", + " HZ <- Z / zhat - (1 - Z) / (1 - zhat)\n", + " drZ <- yhat1 - yhat0 + (y - yhat) * HZ\n", + " drD <- dhat1 - dhat0 + (D - dhat) * HZ\n", + " coef.est <- mean(drZ) / mean(drD)\n", + " cat(\"point\", coef.est)\n", + " psi <- drZ - coef.est * drD\n", + " Jhat <- mean(drD)\n", + " variance <- mean(psi^2) / Jhat^2\n", + " se <- sqrt(variance / nrow(X))\n", + " cat(\"se\", se)\n", + "\n", + " return(list(coef.est = coef.est, se = se, yhat = yhat, dhat = dhat, zhat = zhat, ytil = ytil, dtil = dtil, ztil = ztil, drZ = drZ, drD = drD, yhat0 = yhat0, yhat1 = yhat1, dhat0 = dhat0, dhat1 = dhat1))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50", + "metadata": { + "id": "iArB2WQHBXuV" + }, + "outputs": [], + "source": [ + "summary <- function(coef.est, se, yhat, dhat, zhat, ytil, dtil, ztil, drZ, drD, X, Z, D, y, name) {\n", + " summary_data <- data.frame(estimate = coef.est, # point estimate\n", + " se = se, # standard error\n", + " lower = coef.est - 1.96 * se, # lower end of 95% confidence interval\n", + " upper = coef.est + 1.96 * se, # upper end of 95% confidence interval\n", + " rmse_y = sqrt(mean(ytil^2)), # res of model that predicts outcome y\n", + " rmse_D = sqrt(mean(dtil^2)), # res of model that predicts treatment D\n", + " rmse_Z = sqrt(mean(ztil^2)), # res of model that predicts instrument Z\n", + " accuracy_D = mean(abs(dtil) < 0.5), # binary classification accuracy of model for D\n", + " accuracy_Z = mean(abs(ztil) < 0.5) # binary classification accuracy of model for Z\n", + " )\n", + " row.names(summary_data) <- name\n", + " return(summary_data)\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51", + "metadata": { + "id": "Tj-8FFF3BXxV" + }, + "outputs": [], + "source": [ + "# DML with Lasso/Ridge\n", + "set.seed(123)\n", + "cat(sprintf(\"\\nDML with Lasso/Logistic \\n\"))\n", + "# DML with Lasso/Ridge\n", + "dreg0 <- function(x,d){cv.glmnet(x, d, family=\"binomial\", alpha=0, nfolds=5)}\n", + "dreg1 <- function(x,d){cv.glmnet(x, d, family=\"binomial\", alpha=0, nfolds=5)}\n", + "yreg0 <- function(x,y){cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", + "yreg1 <- function(x,y){cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", + "zreg <- function(x,z){cv.glmnet(x, z, family=\"binomial\", alpha=0, nfolds=5)}\n", + "\n", + "DML2.results <- DML2.for.IIVM(as.matrix(X), D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method=\"regression\")\n", + "sum.lasso_ridge.cv <-summary(DML2.results$coef.est, DML2.results$se, DML2.results$yhat, DML2.results$dhat, DML2.results$zhat, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, DML2.results$drZ, DML2.results$drD, name = 'LassoCV/LogisticCV')\n", + "table <- data.frame()\n", + "table <- rbind(table, sum.lasso_ridge.cv)\n", + "table\n", + "\n", + "yhat.lasso = DML2.results$yhat\n", + "dhat.lasso = DML2.results$dhat\n", + "yhat0.lasso = DML2.results$yhat0\n", + "yhat1.lasso = DML2.results$yhat1\n", + "dhat0.lasso = 
DML2.results$dhat0\n", + "dhat1.lasso = DML2.results$dhat1\n", + "zhat.lasso = DML2.results$zhat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52", + "metadata": { + "id": "sXjbvMbEkYJd" + }, + "outputs": [], + "source": [ + "# DML with Random Forest\n", + "set.seed(123)\n", + "cat(sprintf(\"\\nDML with Random Forest \\n\"))\n", + "\n", + "dreg0 <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest\n", + "dreg1 <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest\n", + "yreg0 <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest\n", + "yreg1 <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest\n", + "zreg <- function(x,z){randomForest(x, z, ntree=1000, nodesize=10)} #ML method=Forest\n", + "\n", + "DML2.results <- DML2.for.IIVM(X,D,Z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method=\"randomforest\")\n", + "sum.rf <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$yhat, DML2.results$dhat, DML2.results$zhat, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, DML2.results$drZ, DML2.results$drD, name = 'RF')\n", + "table <- rbind(table, sum.rf)\n", + "table\n", + "\n", + "yhat.rf = DML2.results$yhat\n", + "dhat.rf = DML2.results$dhat\n", + "yhat0.rf = DML2.results$yhat0\n", + "yhat1.rf = DML2.results$yhat1\n", + "dhat0.rf = DML2.results$dhat0\n", + "dhat1.rf = DML2.results$dhat1\n", + "zhat.rf = DML2.results$zhat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53", + "metadata": { + "id": "ZZRXpY8YkYNN" + }, + "outputs": [], + "source": [ + "# DML with Decision Trees\n", + "set.seed(123)\n", + "cat(sprintf(\"\\nDML with Decision Trees \\n\"))\n", + "\n", + "dreg0 <- function(x,d){rpart(as.formula(\"D~.\"), cbind(data.frame(D=d),x), method = \"class\", minbucket=10, cp = 0.001)}\n", + "dreg1 <- function(x,d){rpart(as.formula(\"D~.\"), cbind(data.frame(D=d),x), method = \"class\", minbucket=10, cp = 0.001)}\n", + "yreg0 <- function(x,y){rpart(as.formula(\"y~.\"), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)}\n", + "yreg1 <- function(x,y){rpart(as.formula(\"y~.\"), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)}\n", + "zreg <- function(x,z){rpart(as.formula(\"Z~.\"), cbind(data.frame(Z=z),x), method = \"class\", minbucket=10, cp = 0.001)}\n", + "\n", + "DML2.results <- DML2.for.IIVM(X, D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method=\"decisiontrees\")\n", + "sum.tr <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$yhat, DML2.results$dhat, DML2.results$zhat, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, DML2.results$drZ, DML2.results$drD, name = 'Decision Trees')\n", + "table <- rbind(table, sum.tr)\n", + "table\n", + "\n", + "yhat.tr = DML2.results$yhat\n", + "dhat.tr = DML2.results$dhat\n", + "yhat0.tr = DML2.results$yhat0\n", + "yhat1.tr = DML2.results$yhat1\n", + "dhat0.tr = DML2.results$dhat0\n", + "dhat1.tr = DML2.results$dhat1\n", + "zhat.tr = DML2.results$zhat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54", + "metadata": { + "id": "RYqykjPskYQJ" + }, + "outputs": [], + "source": [ + "# DML with Boosted Trees\n", + "set.seed(123)\n", + "cat(sprintf(\"\\nDML with Boosted Trees \\n\"))\n", + "\n", + "# NB: early stopping cannot easily be implemented with gbm\n", + "## set n.trees = best, where best <- gbm.perf(dreg.boost, plot.it = FALSE)\n", + "dreg0 <- function(x,d){gbm(as.formula(\"D~.\"), 
cbind(data.frame(D=d),x), distribution= \"bernoulli\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", + "dreg1 <- function(x,d){gbm(as.formula(\"D~.\"), cbind(data.frame(D=d),x), distribution= \"bernoulli\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", + "yreg0 <- function(x,y){gbm(as.formula(\"y~.\"), cbind(data.frame(y=y),x), distribution= \"gaussian\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", + "yreg1 <- function(x,y){gbm(as.formula(\"y~.\"), cbind(data.frame(y=y),x), distribution= \"gaussian\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", + "zreg <- function(x,z){gbm(as.formula(\"Z~.\"), cbind(data.frame(Z=z),x), distribution= \"bernoulli\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", + "\n", + "# passing these through regression as type=\"response\", and D and Z should not be factors!\n", + "DML2.results <- DML2.for.IIVM(X, D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method=\"boostedtrees\")\n", + "sum.boost <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$yhat, DML2.results$dhat, DML2.results$zhat, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, DML2.results$drZ, DML2.results$drD, name = 'Boosted Trees')\n", + "table <- rbind(table, sum.boost)\n", + "table\n", + "\n", + "yhat.boost = DML2.results$yhat\n", + "dhat.boost = DML2.results$dhat\n", + "yhat0.boost = DML2.results$yhat0\n", + "yhat1.boost = DML2.results$yhat1\n", + "dhat0.boost = DML2.results$dhat0\n", + "dhat1.boost = DML2.results$dhat1\n", + "zhat.boost = DML2.results$zhat" + ] + }, + { + "cell_type": "markdown", + "id": "55", + "metadata": { + "id": "29vcyCYsktQ3" + }, + "source": [ + "## Ensembles" + ] + }, + { + "cell_type": "markdown", + "id": "56", + "metadata": { + "id": "vuyfd9UJkw9G" + }, + "source": [ + "Boosted trees give the best RMSE for D and Z and random forests give the best RMSE for Y." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57", + "metadata": { + "id": "Y9_T5SMUk3Rd" + }, + "outputs": [], + "source": [ + "# Best fit is boosted trees for D, Z and random forests for Y\n", + "\n", + "best.yhat0 <- yhat0.rf\n", + "best.yhat1 <- yhat1.rf\n", + "best.yhat <- yhat.rf\n", + "\n", + "best.dhat0 <- dhat0.boost\n", + "best.dhat1 <- dhat1.boost\n", + "best.dhat <- dhat.boost\n", + "\n", + "best.zhat <- zhat.boost\n", + "\n", + "ytil.best <- y - best.yhat\n", + "dtil.best <- D - best.dhat\n", + "ztil.best <- Z - best.zhat\n", + "\n", + "# doubly robust quantity for every sample\n", + "HZ <- Z / best.zhat - (1 - Z) / (1 - best.zhat)\n", + "drZ <- best.yhat1 - best.yhat0 + (y - best.yhat) * HZ\n", + "drD <- best.dhat1 - best.dhat0 + (D - best.dhat) * HZ\n", + "coef.est <- mean(drZ) / mean(drD)\n", + "psi <- drZ - coef.est * drD\n", + "Jhat <- mean(drD)\n", + "variance <- mean(psi^2) / Jhat^2\n", + "se <- sqrt(variance / nrow(X))\n", + "\n", + "sum.best <- summary(coef.est, se, best.yhat, best.dhat, best.zhat, ytil.best, dtil.best, ztil.best, drZ, drD, name = 'Best')\n", + "table <- rbind(table, sum.best)\n", + "table" + ] + }, + { + "cell_type": "markdown", + "id": "58", + "metadata": { + "id": "RyRS9zNUlMCF" + }, + "source": [ + "We'll form a model average with unconstrained least squares weights." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59", + "metadata": { + "id": "1H4sCbO2lLpJ" + }, + "outputs": [], + "source": [ + "# Least squares model average\n", + "ma.dcoef <- lm(D~dhat.lasso+dhat.rf+dhat.tr+dhat.boost-1)$coef\n", + "ma.ycoef <- lm(y~yhat.lasso+yhat.rf+yhat.tr+yhat.boost-1)$coef\n", + "ma.zcoef <- lm(Z~zhat.lasso+zhat.rf+zhat.tr+zhat.boost-1)$coef\n", + "\n", + "ma.yhat0 <- cbind(yhat0.lasso,yhat0.rf,yhat0.tr,yhat0.boost)%*%as.matrix(ma.ycoef)\n", + "ma.yhat1 <- cbind(yhat1.lasso,yhat1.rf,yhat1.tr,yhat1.boost)%*%as.matrix(ma.ycoef)\n", + "ma.dhat0 <- cbind(dhat0.lasso,dhat0.rf,dhat0.tr,dhat0.boost)%*%as.matrix(ma.dcoef)\n", + "ma.dhat1 <- cbind(dhat1.lasso,dhat1.rf,dhat1.tr,dhat1.boost)%*%as.matrix(ma.dcoef)\n", + "ma.zhat <- cbind(zhat.lasso,zhat.rf,zhat.tr,zhat.boost)%*%as.matrix(ma.zcoef)\n", + "\n", + "# Prediction of treatment and outcome for observed instrument\n", + "ma.yhat <- ma.yhat0 * (1 - Z) + ma.yhat1 * Z\n", + "ma.dhat <- ma.dhat0 * (1 - Z) + ma.dhat1 * Z\n", + "\n", + "# residuals\n", + "ma.ytil <- y-ma.yhat\n", + "ma.dtil <- D-ma.dhat\n", + "ma.ztil <- Z-ma.zhat\n", + "\n", + "# doubly robust quantity for every sample\n", + "HZ <- Z / ma.zhat - (1 - Z) / (1 - ma.zhat)\n", + "drZ <- ma.yhat1 - ma.yhat0 + (y - ma.yhat) * HZ\n", + "drD <- ma.dhat1 - ma.dhat0 + (D - ma.dhat) * HZ\n", + "coef.est <- mean(drZ) / mean(drD)\n", + "psi <- drZ - coef.est * drD\n", + "Jhat <- mean(drD)\n", + "variance <- mean(psi^2) / Jhat^2\n", + "se <- sqrt(variance / nrow(X))\n", + "\n", + "sum.ma <- summary(coef.est, se, ma.yhat, ma.dhat, ma.zhat, ma.ytil, ma.dtil, ma.ztil, drZ, drD, name = 'Model Average')\n", + "table <- rbind(table, sum.ma)\n", + "table" + ] + }, + { + "cell_type": "markdown", + "id": "60", + "metadata": { + "id": "UflbjTEG5SXV" + }, + "source": [ + "Comparing with the PLR model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61", + "metadata": { + "id": "CIS-58oi4sa1" + }, + "outputs": [], + "source": [ + "tableplr" + ] + }, + { + "cell_type": "markdown", + "id": "62", + "metadata": { + "id": "M4Zi0FPH5VZG" + }, + "source": [ + "We find that the PLR model overestimates the effect by around 1k; though both sets of results have overlapping confidence intervals." + ] + }, + { + "cell_type": "markdown", + "id": "63", + "metadata": { + "id": "VrBkj_pc5qgm" + }, + "source": [ + "\n", + "Again as before, ideally we would do (semi) cross-fitting with AutoML in order to find good first-stage models and re-run DML with these models. Unfortunately this is not easy to do in R.\n", + "\n", + "As before, in the below analysis of robust inference, we choose Boosted Trees as they perform well in RMSE and accuracy on first-stage models." 
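As a quick diagnostic before the grid search, one can also look at the t-statistic of the denominator $E[drD]$, i.e. the estimated compliance margin $E[m(1,X) - m(0,X) + H(Z)(D - m(Z,X))]$; when it is large, weak identification is unlikely to be a concern. A sketch, assuming `DML2.results` still holds the boosted-trees IIVM fit from above:

```r
# t-statistic for the estimated share of compliers (the LATE denominator)
drD <- DML2.results$drD
t_stat <- mean(drD) / sqrt(var(drD) / length(drD))
t_stat
```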
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64", + "metadata": { + "id": "bj67nsgcCDoS" + }, + "outputs": [], + "source": [ + "iivm_robust_inference <- function(point, stderr, yhat, Dhat, Zhat, resy, resD, resZ, drZ, drD, X, Z, D, y, grid, alpha = 0.05) {\n", + " # Inference in the partially linear IV model that is robust to weak identification.\n", + " # grid: grid of theta values to search over when trying to identify the confidence region\n", + " # alpha: confidence level\n", + "\n", + " n <- dim(X)[1]\n", + " thr <- qchisq(1 - alpha, df = 1)\n", + " accept <- c()\n", + "\n", + " for (theta in grid) {\n", + " moment <- drZ - theta * drD\n", + " test <- n * mean(moment)^2 / var(moment)\n", + " if (test <= thr) {\n", + " accept <- c(accept, theta)\n", + " }\n", + " }\n", + "\n", + " return(accept)\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65", + "metadata": { + "id": "KqgPk1Jm4sdo" + }, + "outputs": [], + "source": [ + "grid <- seq(0, 20000, length.out = 10000)\n", + "region <- iivm_robust_inference(point = DML2.results$coef.est, stderr = DML2.results$se, yhat = DML2.results$yhat, Dhat = DML2.results$dhat, Zhat = DML2.results$zhat, resy = DML2.results$ytil, resD = DML2.results$dtil, resZ = DML2.results$ztil, drZ = DML2.results$drZ, drD = DML2.results$drD, X=X, Z=Z, D=D, y=y, grid=grid)\n", + "\n", + "# Calculate min and max\n", + "min_region <- min(region)\n", + "max_region <- max(region)\n", + "\n", + "print(min_region)\n", + "print(max_region)" + ] + }, + { + "cell_type": "markdown", + "id": "66", + "metadata": { + "id": "akCGDMZJCN3h" + }, + "source": [ + "We find again that the robust inference confidence region is almost identical to the normal based inference. We are most probably in the strong instrument regime. We can check the t-statistic for the effect of the instrument on the treatment to verify this." + ] + }, + { + "cell_type": "markdown", + "id": "67", + "metadata": { + "id": "01de9f24", + "papermill": { + "duration": 0.010725, + "end_time": "2022-04-19T09:06:51.098483", + "exception": false, + "start_time": "2022-04-19T09:06:51.087758", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# DoubleML package\n", + "\n", + "There exist nice packages out there that can help us do our estimation with the simple call of a function. Such packages include `EconML` (Python) and `DoubleML` (Python and R).\n", + "\n", + "We run through IIVM using `DoubleML` below to illustrate. The `DoubleML` package internally builds on `mlr3`. We use the meta package `mlr3` to generate predictions with machine learning methods. A comprehensive introduction and description of the `mlr3` package is provided in the [mlr3book](https://mlr3book.mlr-org.com/). A list of all learners that you can use in `mlr3` can be found [here](https://mlr3extralearners.mlr-org.com/articles/learners/list_learners.html). The entry in the columns *mlr3 Package* and *Packages* indicate which packages must be installed/loaded in your R session.\n", + "\n", + "You find additional information about `DoubleML` on the package on the package website https://docs.doubleml.org/ and the R documentation page https://docs.doubleml.org/r/stable/." 
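If you want to see programmatically which learner keys are available in your session and which packages they require, you can inspect the learner dictionary directly; a small sketch, assuming `mlr3`, `mlr3learners`, and `data.table` are installed as in the next cell:

```r
# List available mlr3 learners together with the packages they depend on
library(mlr3)
library(mlr3learners)  # adds glmnet/ranger-based learners such as "regr.cv_glmnet"
library(data.table)
learners <- as.data.table(mlr_learners)
head(learners[, .(key, packages)])
```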
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68", + "metadata": { + "id": "2846a36a", + "papermill": { + "duration": 20.239271, + "end_time": "2022-04-19T09:07:11.369618", + "exception": false, + "start_time": "2022-04-19T09:06:51.130347", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "install.packages(\"DoubleML\")\n", + "install.packages(\"mlr3learners\")\n", + "install.packages(\"mlr3\")\n", + "install.packages(\"data.table\")\n", + "install.packages(\"ranger\")\n", + "\n", + "library(DoubleML)\n", + "library(mlr3learners)\n", + "library(mlr3)\n", + "library(data.table)\n", + "library(ranger)" + ] + }, + { + "cell_type": "markdown", + "id": "69", + "metadata": { + "id": "2259ae1c", + "papermill": { + "duration": 0.015455, + "end_time": "2022-04-19T09:12:00.920079", + "exception": false, + "start_time": "2022-04-19T09:12:00.904624", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Local Average Treatment Effects of 401(k) Participation on Net Financial Assets" + ] + }, + { + "cell_type": "markdown", + "id": "70", + "metadata": { + "id": "9c27e413", + "papermill": { + "duration": 0.015158, + "end_time": "2022-04-19T09:12:00.950542", + "exception": false, + "start_time": "2022-04-19T09:12:00.935384", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Interactive IV Model (IIVM)" + ] + }, + { + "cell_type": "markdown", + "id": "71", + "metadata": { + "id": "4fa23c70", + "papermill": { + "duration": 0.015304, + "end_time": "2022-04-19T09:12:00.981285", + "exception": false, + "start_time": "2022-04-19T09:12:00.965981", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Now, we consider estimation of local average treatment effects (LATE) of participation with the binary instrument `e401`. As before, $Y$ denotes the outcome `net_tfa`, and $X$ is the vector of covariates. 
Here the structural equation model is:\n", + "\n", + "\\begin{eqnarray}\n", + "& Y = g_0(Z,X) + U, &\\quad E[U\\mid Z,X] = 0,\\\\\n", + "& D = r_0(Z,X) + V, &\\quad E[V\\mid Z, X] = 0,\\\\\n", + "& Z = m_0(X) + \\zeta, &\\quad E[\\zeta \\mid X] = 0.\n", + "\\end{eqnarray}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72", + "metadata": { + "id": "cb223b75", + "papermill": { + "duration": 0.06823, + "end_time": "2022-04-19T09:12:01.064957", + "exception": false, + "start_time": "2022-04-19T09:12:00.996727", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Constructing the data (as DoubleMLData)\n", + "formula_flex2 = \"net_tfa ~ p401+ e401 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown\"\n", + "model_flex2 = as.data.table(model.frame(formula_flex2, data))\n", + "x_cols = colnames(model_flex2)[-c(1,2,3)]\n", + "data_IV = DoubleMLData$new(model_flex2, y_col = \"net_tfa\", d_cols = \"p401\", z_cols =\"e401\",x_cols=x_cols)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73", + "metadata": { + "id": "e652ffad", + "papermill": { + "duration": 20.934595, + "end_time": "2022-04-19T09:12:22.014866", + "exception": false, + "start_time": "2022-04-19T09:12:01.080271", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", + "lasso <- lrn(\"regr.cv_glmnet\",nfolds = 5, s = \"lambda.min\")\n", + "lasso_class <- lrn(\"classif.cv_glmnet\", nfolds = 5, s = \"lambda.min\")\n", + "dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = lasso,\n", + " ml_m = lasso_class, ml_r = lasso_class,n_folds=5, subgroups = list(always_takers = FALSE,\n", + " never_takers = TRUE))\n", + "dml_MLIIVM$fit(store_predictions=TRUE)\n", + "dml_MLIIVM$summary()\n", + "lasso_MLIIVM <- dml_MLIIVM$coef\n", + "lasso_std_MLIIVM <- dml_MLIIVM$se" + ] + }, + { + "cell_type": "markdown", + "id": "74", + "metadata": { + "id": "63103667", + "papermill": { + "duration": 0.015382, + "end_time": "2022-04-19T09:12:22.045989", + "exception": false, + "start_time": "2022-04-19T09:12:22.030607", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "The confidence interval for the local average treatment effect of participation is given by" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75", + "metadata": { + "id": "322855c4", + "papermill": { + "duration": 0.042067, + "end_time": "2022-04-19T09:12:22.103953", + "exception": false, + "start_time": "2022-04-19T09:12:22.061886", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dml_MLIIVM$confint(level = 0.95)" + ] + }, + { + "cell_type": "markdown", + "id": "76", + "metadata": { + "id": "2965410d", + "papermill": { + "duration": 0.015374, + "end_time": "2022-04-19T09:12:22.134875", + "exception": false, + "start_time": "2022-04-19T09:12:22.119501", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Here we can also check the accuracy of the model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77", + "metadata": { + "id": "1476fd27", + "papermill": { + "duration": 0.056054, + "end_time": "2022-04-19T09:12:22.206157", + "exception": false, + "start_time": "2022-04-19T09:12:22.150103", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# variables\n", + "y <- as.matrix(pension$net_tfa) # true observations\n", + "d 
<- as.matrix(pension$p401)\n", + "z <- as.matrix(pension$e401)\n", + "\n", + "# predictions\n", + "dml_MLIIVM$params_names()\n", + "g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(z=0, X)\n", + "g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(z=1, X)\n", + "g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0\n", + "r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(z=0, X)\n", + "r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(z=1, X)\n", + "r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0\n", + "m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78", + "metadata": { + "id": "444c53f4", + "papermill": { + "duration": 0.056945, + "end_time": "2022-04-19T09:12:22.278593", + "exception": false, + "start_time": "2022-04-19T09:12:22.221648", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# cross-fitted RMSE: outcome\n", + "lasso_y_MLIIVM <- sqrt(mean((y-g_hat)^2))\n", + "lasso_y_MLIIVM\n", + "\n", + "# cross-fitted RMSE: treatment\n", + "lasso_d_MLIIVM <- sqrt(mean((d-r_hat)^2))\n", + "lasso_d_MLIIVM\n", + "\n", + "# cross-fitted RMSE: instrument\n", + "lasso_z_MLIIVM <- sqrt(mean((z-m_hat)^2))\n", + "lasso_z_MLIIVM\n" + ] + }, + { + "cell_type": "markdown", + "id": "79", + "metadata": { + "id": "a7461966", + "papermill": { + "duration": 0.016468, + "end_time": "2022-04-19T09:12:22.311250", + "exception": false, + "start_time": "2022-04-19T09:12:22.294782", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Again, we repeat the procedure for the other machine learning methods:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80", + "metadata": { + "id": "59YzwIcpEnyV" + }, + "outputs": [], + "source": [ + "# needed to run boosting\n", + "remotes::install_github(\"mlr-org/mlr3extralearners\")\n", + "install.packages(\"mlr3extralearners\")\n", + "install.packages(\"mboost\")\n", + "library(mlr3extralearners)\n", + "library(mboost)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81", + "metadata": { + "id": "Ec0g3ch3EjAl" + }, + "outputs": [], + "source": [ + "# Forest\n", + "randomForest <- lrn(\"regr.ranger\")\n", + "randomForest_class <- lrn(\"classif.ranger\")\n", + "\n", + "# Trees\n", + "trees <- lrn(\"regr.rpart\")\n", + "trees_class <- lrn(\"classif.rpart\")\n", + "\n", + "# Boosting\n", + "boost <- lrn(\"regr.glmboost\")\n", + "boost_class <- lrn(\"classif.glmboost\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82", + "metadata": { + "id": "3935dfc5", + "papermill": { + "duration": 69.414354, + "end_time": "2022-04-19T09:13:31.742249", + "exception": false, + "start_time": "2022-04-19T09:12:22.327895", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "### random forest ###\n", + "\n", + "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", + "dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = randomForest,\n", + " ml_m = randomForest_class, ml_r = randomForest_class,n_folds=3, subgroups = list(always_takers = FALSE,\n", + " never_takers = TRUE))\n", + "dml_MLIIVM$fit(store_predictions=TRUE)\n", + "dml_MLIIVM$summary()\n", + "forest_MLIIVM <- dml_MLIIVM$coef\n", + "forest_std_MLIIVM <- dml_MLIIVM$se\n", + "\n", + "# predictions\n", + "g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(Z=0, X)\n", + "g1_hat <- 
as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(Z=1, X)\n", + "g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0\n", + "r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(Z=0, X)\n", + "r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(Z=1, X)\n", + "r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0\n", + "m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o\n", + "\n", + "# cross-fitted RMSE: outcome\n", + "forest_y_MLIIVM <- sqrt(mean((y-g_hat)^2))\n", + "forest_y_MLIIVM\n", + "\n", + "# cross-fitted RMSE: treatment\n", + "forest_d_MLIIVM <- sqrt(mean((d-r_hat)^2))\n", + "forest_d_MLIIVM\n", + "\n", + "# cross-fitted RMSE: instrument\n", + "forest_z_MLIIVM <- sqrt(mean((z-m_hat)^2))\n", + "forest_z_MLIIVM\n", + "\n", + "### trees ###\n", + "\n", + "dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = trees,\n", + " ml_m = trees_class, ml_r = trees_class,n_folds=3, subgroups = list(always_takers = FALSE,\n", + " never_takers = TRUE))\n", + "dml_MLIIVM$fit(store_predictions=TRUE)\n", + "dml_MLIIVM$summary()\n", + "tree_MLIIVM <- dml_MLIIVM$coef\n", + "tree_std_MLIIVM <- dml_MLIIVM$se\n", + "\n", + "# predictions\n", + "g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(Z=0, X)\n", + "g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(Z=1, X)\n", + "g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0\n", + "r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(Z=0, X)\n", + "r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(Z=1, X)\n", + "r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0\n", + "m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o\n", + "\n", + "# cross-fitted RMSE: outcome\n", + "tree_y_MLIIVM <- sqrt(mean((y-g_hat)^2))\n", + "tree_y_MLIIVM\n", + "\n", + "# cross-fitted RMSE: treatment\n", + "tree_d_MLIIVM <- sqrt(mean((d-r_hat)^2))\n", + "tree_d_MLIIVM\n", + "\n", + "# cross-fitted RMSE: instrument\n", + "tree_z_MLIIVM <- sqrt(mean((z-m_hat)^2))\n", + "tree_z_MLIIVM\n", + "\n", + "\n", + "### boosting ###\n", + "dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = boost,\n", + " ml_m = boost_class, ml_r = boost_class,n_folds=3, subgroups = list(always_takers = FALSE,\n", + " never_takers = TRUE))\n", + "dml_MLIIVM$fit(store_predictions=TRUE)\n", + "dml_MLIIVM$summary()\n", + "boost_MLIIVM <- dml_MLIIVM$coef\n", + "boost_std_MLIIVM <- dml_MLIIVM$se\n", + "\n", + "# predictions\n", + "g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(Z=0, X)\n", + "g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(Z=1, X)\n", + "g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0\n", + "r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(Z=0, X)\n", + "r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(Z=1, X)\n", + "r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0\n", + "m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o\n", + "\n", + "# cross-fitted RMSE: outcome\n", + "boost_y_MLIIVM <- sqrt(mean((y-g_hat)^2))\n", + "boost_y_MLIIVM\n", + "\n", + "# cross-fitted RMSE: treatment\n", + "boost_d_MLIIVM <- sqrt(mean((d-r_hat)^2))\n", + "boost_d_MLIIVM\n", + "\n", + "# cross-fitted RMSE: instrument\n", + "boost_z_MLIIVM <- sqrt(mean((z-m_hat)^2))\n", + "boost_z_MLIIVM" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83", + "metadata": { + "id": "7187fc74", + "papermill": { + "duration": 0.061872, + "end_time": 
"2022-04-19T09:13:31.824148", + "exception": false, + "start_time": "2022-04-19T09:13:31.762276", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "table <- matrix(0, 5, 4)\n", + "table[1,1:4] <- c(lasso_MLIIVM,forest_MLIIVM,tree_MLIIVM,boost_MLIIVM)\n", + "table[2,1:4] <- c(lasso_std_MLIIVM,forest_std_MLIIVM,tree_std_MLIIVM,boost_std_MLIIVM)\n", + "table[3,1:4] <- c(lasso_y_MLIIVM,forest_y_MLIIVM,tree_y_MLIIVM,boost_y_MLIIVM)\n", + "table[4,1:4] <- c(lasso_d_MLIIVM,forest_d_MLIIVM,tree_d_MLIIVM,boost_d_MLIIVM)\n", + "table[5,1:4] <- c(lasso_z_MLIIVM,forest_z_MLIIVM,tree_z_MLIIVM,boost_z_MLIIVM)\n", + "rownames(table) <- c(\"Estimate\",\"Std.Error\",\"RMSE Y\",\"RMSE D\",\"RMSE Z\")\n", + "colnames(table) <- c(\"Lasso\",\"Random Forest\",\"Trees\",\"Boosting\")\n", + "tab<- xtable(table, digits = 2)\n", + "tab" + ] + }, + { + "cell_type": "markdown", + "id": "84", + "metadata": { + "id": "f4ce7be1", + "papermill": { + "duration": 0.017437, + "end_time": "2022-04-19T09:13:31.859052", + "exception": false, + "start_time": "2022-04-19T09:13:31.841615", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We report results based on four ML methods for estimating the nuisance functions used in\n", + "forming the orthogonal estimating equations. We find again that the estimates of the treatment effect are stable across ML methods. The estimates are highly significant, hence we would reject the hypothesis\n", + "that the effect of 401(k) participation has no effect on financial health." + ] + }, + { + "cell_type": "markdown", + "id": "85", + "metadata": { + "id": "4939cd9c", + "papermill": { + "duration": 0.017163, + "end_time": "2022-04-19T09:13:31.893361", + "exception": false, + "start_time": "2022-04-19T09:13:31.876198", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We might rerun the model using the best ML method for each equation to get a final estimate for the treatment effect of participation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86", + "metadata": { + "id": "ca612b71", + "papermill": { + "duration": 20.780029, + "end_time": "2022-04-19T09:13:52.690594", + "exception": false, + "start_time": "2022-04-19T09:13:31.910565", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", + "dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = randomForest,\n", + " ml_m = lasso_class, ml_r = lasso_class,n_folds=5, subgroups = list(always_takers = FALSE,\n", + " never_takers = TRUE))\n", + "dml_MLIIVM$fit(store_predictions=TRUE)\n", + "dml_MLIIVM$summary()\n", + "best_MLIIVM <- dml_MLIIVM$coef\n", + "best_std_MLIIVM <- dml_MLIIVM$se" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.0.5" }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "papermill": { + "default_parameters": {}, + "duration": 427.936706, + "end_time": "2022-04-19T09:13:53.230849", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2022-04-19T09:06:45.294143", + "version": "2.3.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/AC2/r-weak-iv-experiments.Rmd 
b/AC2/r-weak-iv-experiments.Rmd new file mode 100644 index 00000000..211fd149 --- /dev/null +++ b/AC2/r-weak-iv-experiments.Rmd @@ -0,0 +1,74 @@ +--- +title: An R Markdown document converted from "AC2/r-weak-iv-experiments.irnb" +output: html_document +--- + +# A Simple Example of Properties of IV estimator when Instruments are Weak + +```{r} +install.packages("hdm") + +library(hdm) +``` + +Simulation Design + +```{r} +# Simulation Design +set.seed(1) + +n=100 +beta = .1 # .1 weak IV +#beta = 1 # 1 strong IV + +# One realization +U = rnorm(n) +Z = rnorm(n) # generate instrument +D = beta*Z + U # generate endogenougs variable +Y = D + U # the true causal effect is 1 + + +summary(lm(D~Z)) # first stage is very weak here when we set beta = .1 +``` + +```{r} +summary(tsls(x=NULL, d=D, y=Y, z=Z)) +``` + +Note that the instrument is weak here (strength of the instrument is controlled by setting $\beta$) -- the t-stat is smaller than any rule-of-thumb suggested in the literature (e.g. $\sqrt{10}$) . + +# Run 10000 trials to evaluate distribution of the IV estimator + +```{r} +# Simulation Design + +set.seed(1) +B= 10000 # trials +IVEst = rep(0, B) + +for(i in 1:B){ +U = rnorm(n) +Z = rnorm(n) #generate instrument +D = beta*Z + U #generate endogenougs variable +Y = D+ U # the true causal effect is 1 +IVEst[i] = coef(tsls(x=NULL, d=D, y=Y, z=Z))[1,1] +} + +``` + +# Plot the Actual Distribution against the Normal Approximation (based on Strong Instrument Assumption) + +```{r} +plot(density(IVEst-1, n=1000, from=-5, to=5),col=4, xlim= c(-5, 5), + xlab= "IV Estimator -True Effect", main="Actual Distribution vs Gaussian") + +val=seq(-5, 5, by=.05) +var = (1/beta^2)*(1/100) # theoretical variance of IV +sd = sqrt(var) +lines(val, dnorm(val, sd=sd), col=2, lty=2) + +rejection.frequency = sum(( abs(IVEst-1)/sd > 1.96))/B + +cat(c("Rejection Frequency is ", rejection.frequency, " while we expect it to be .05")) +``` + diff --git a/AC2/r-weak-iv-experiments.irnb b/AC2/r-weak-iv-experiments.irnb index 7bc69a14..0660950f 100644 --- a/AC2/r-weak-iv-experiments.irnb +++ b/AC2/r-weak-iv-experiments.irnb @@ -1,166 +1,163 @@ { - "metadata": { - "kernelspec": { - "name": "ir", - "display_name": "R", - "language": "R" - }, - "language_info": { - "name": "R", - "codemirror_mode": "r", - "pygments_lexer": "r", - "mimetype": "text/x-r-source", - "file_extension": ".r", - "version": "3.6.3" - }, - "colab": { - "provenance": [] - } + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "k2wYjiHw-MNx" + }, + "source": [ + "# A Simple Example of Properties of IV estimator when Instruments are Weak" + ] }, - "nbformat_minor": 0, - "nbformat": 4, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# A Simple Example of Properties of IV estimator when Instruments are Weak" - ], - "metadata": { - "id": "k2wYjiHw-MNx" - } - }, - { - "cell_type": "code", - "source": [ - "install.packages(\"hdm\")\n", - "\n", - "library(hdm)" - ], - "metadata": { - "id": "Fr38spdV-Xdr" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Simulation Design" - ], - "metadata": { - "id": "w7UUXXwZ-MNz" - } - }, - { - "cell_type": "code", - "source": [ - "# Simulation Design\n", - "set.seed(1)\n", - "\n", - "n=100\n", - "beta = .1 # .1 weak IV\n", - "#beta = 1 # 1 strong IV\n", - "\n", - "# One realization\n", - "U = rnorm(n)\n", - "Z = rnorm(n) # generate instrument\n", - "D = beta*Z + U # generate endogenougs variable\n", - "Y = D + U # the true causal effect is 1\n", - "\n", - 
"\n", - "summary(lm(D~Z)) # first stage is very weak here when we set beta = .1" - ], - "metadata": { - "trusted": true, - "id": "svk7GbHx-MNz" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "summary(tsls(x=NULL, d=D, y=Y, z=Z))" - ], - "metadata": { - "id": "086YR0feH0LU" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Note that the instrument is weak here (strength of the instrument is controlled by setting $\\beta$) -- the t-stat is smaller than any rule-of-thumb suggested in the literature (e.g. $\\sqrt{10}$) ." - ], - "metadata": { - "id": "G_O5sAa7-MN1" - } - }, - { - "cell_type": "markdown", - "source": [ - "# Run 10000 trials to evaluate distribution of the IV estimator" - ], - "metadata": { - "id": "u-SaSi2L-MN1" - } - }, - { - "cell_type": "code", - "source": [ - "# Simulation Design\n", - "\n", - "set.seed(1)\n", - "B= 10000 # trials\n", - "IVEst = rep(0, B)\n", - "\n", - "for(i in 1:B){\n", - "U = rnorm(n)\n", - "Z = rnorm(n) #generate instrument\n", - "D = beta*Z + U #generate endogenougs variable\n", - "Y = D+ U # the true causal effect is 1\n", - "IVEst[i] = coef(tsls(x=NULL, d=D, y=Y, z=Z))[1,1]\n", - "}\n", - "\n" - ], - "metadata": { - "trusted": true, - "id": "Fc2Sjzhk-MN1" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Plot the Actual Distribution against the Normal Approximation (based on Strong Instrument Assumption)" - ], - "metadata": { - "id": "-uvOcpa1-MN1" - } - }, - { - "cell_type": "code", - "source": [ - "plot(density(IVEst-1, n=1000, from=-5, to=5),col=4, xlim= c(-5, 5),\n", - " xlab= \"IV Estimator -True Effect\", main=\"Actual Distribution vs Gaussian\")\n", - "\n", - "val=seq(-5, 5, by=.05)\n", - "var = (1/beta^2)*(1/100) # theoretical variance of IV\n", - "sd = sqrt(var)\n", - "lines(val, dnorm(val, sd=sd), col=2, lty=2)\n", - "\n", - "rejection.frequency = sum(( abs(IVEst-1)/sd > 1.96))/B\n", - "\n", - "cat(c(\"Rejection Frequency is \", rejection.frequency, \" while we expect it to be .05\"))\n" - ], - "metadata": { - "trusted": true, - "id": "Mvq4cLzC-MN2" - }, - "execution_count": null, - "outputs": [] - } - ] + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Fr38spdV-Xdr" + }, + "outputs": [], + "source": [ + "install.packages(\"hdm\")\n", + "\n", + "library(hdm)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w7UUXXwZ-MNz" + }, + "source": [ + "Simulation Design" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "svk7GbHx-MNz" + }, + "outputs": [], + "source": [ + "# Simulation Design\n", + "set.seed(1)\n", + "\n", + "n=100\n", + "beta = .1 # .1 weak IV\n", + "#beta = 1 # 1 strong IV\n", + "\n", + "# One realization\n", + "U = rnorm(n)\n", + "Z = rnorm(n) # generate instrument\n", + "D = beta*Z + U # generate endogenougs variable\n", + "Y = D + U # the true causal effect is 1\n", + "\n", + "\n", + "summary(lm(D~Z)) # first stage is very weak here when we set beta = .1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "086YR0feH0LU" + }, + "outputs": [], + "source": [ + "summary(tsls(x=NULL, d=D, y=Y, z=Z))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "G_O5sAa7-MN1" + }, + "source": [ + "Note that the instrument is weak here (strength of the instrument is controlled by setting $\\beta$) -- the t-stat is smaller than any rule-of-thumb suggested in the 
literature (e.g. $\\sqrt{10}$) ." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u-SaSi2L-MN1" + }, + "source": [ + "# Run 10000 trials to evaluate distribution of the IV estimator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Fc2Sjzhk-MN1" + }, + "outputs": [], + "source": [ + "# Simulation Design\n", + "\n", + "set.seed(1)\n", + "B= 10000 # trials\n", + "IVEst = rep(0, B)\n", + "\n", + "for(i in 1:B){\n", + "U = rnorm(n)\n", + "Z = rnorm(n) #generate instrument\n", + "D = beta*Z + U #generate endogenougs variable\n", + "Y = D+ U # the true causal effect is 1\n", + "IVEst[i] = coef(tsls(x=NULL, d=D, y=Y, z=Z))[1,1]\n", + "}\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-uvOcpa1-MN1" + }, + "source": [ + "# Plot the Actual Distribution against the Normal Approximation (based on Strong Instrument Assumption)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Mvq4cLzC-MN2" + }, + "outputs": [], + "source": [ + "plot(density(IVEst-1, n=1000, from=-5, to=5),col=4, xlim= c(-5, 5),\n", + " xlab= \"IV Estimator -True Effect\", main=\"Actual Distribution vs Gaussian\")\n", + "\n", + "val=seq(-5, 5, by=.05)\n", + "var = (1/beta^2)*(1/100) # theoretical variance of IV\n", + "sd = sqrt(var)\n", + "lines(val, dnorm(val, sd=sd), col=2, lty=2)\n", + "\n", + "rejection.frequency = sum(( abs(IVEst-1)/sd > 1.96))/B\n", + "\n", + "cat(c(\"Rejection Frequency is \", rejection.frequency, \" while we expect it to be .05\"))\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } From e2df16170c3b49bdaee0171232eaac85c88f7dd4 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 06:19:11 -0700 Subject: [PATCH 073/261] linting errors --- PM2/r_convergence_hypothesis_double_lasso.irnb | 2 ++ PM2/r_experiment_non_orthogonal.irnb | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/PM2/r_convergence_hypothesis_double_lasso.irnb b/PM2/r_convergence_hypothesis_double_lasso.irnb index 8e58138e..49e1545d 100644 --- a/PM2/r_convergence_hypothesis_double_lasso.irnb +++ b/PM2/r_convergence_hypothesis_double_lasso.irnb @@ -762,6 +762,8 @@ "outputs": [], "source": [ "double_lasso <- function(y, D, W) {\n", + " require(hdm)\n", + "\n", " # residualize outcome with Lasso\n", " yfit_rlasso <- rlasso(W, y, post = FALSE)\n", " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))\n", diff --git a/PM2/r_experiment_non_orthogonal.irnb b/PM2/r_experiment_non_orthogonal.irnb index 21b9b719..e830beff 100644 --- a/PM2/r_experiment_non_orthogonal.irnb +++ b/PM2/r_experiment_non_orthogonal.irnb @@ -211,6 +211,8 @@ "# Now we simply replace OLS with Lasso to implement the Double Lasso process\n", "\n", "double_lasso <- function(y, D, W) {\n", + " require(hdm)\n", + "\n", " # residualize outcome with Lasso\n", " yfit_rlasso <- rlasso(W, y, post = FALSE)\n", " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))\n", @@ -619,6 +621,8 @@ "# Now we simply replace OLS with Lasso to implement the Double Lasso process\n", "\n", "double_lasso <- function(y, D, W) {\n", + " require(hdm)\n", + "\n", " # residualize outcome with Lasso\n", " yfit_rlasso <- rlasso(W, y, post = FALSE)\n", " yhat_rlasso <- 
predict(yfit_rlasso, as.data.frame(W))\n", From d20e4a30749f0904793cf5f3b848b18dc58323f2 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 06:25:12 -0700 Subject: [PATCH 074/261] Create .lintr --- .lintr | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .lintr diff --git a/.lintr b/.lintr new file mode 100644 index 00000000..e8ce173f --- /dev/null +++ b/.lintr @@ -0,0 +1,4 @@ +linters: linters_with_defaults( + line_length_linter = line_length_linter(120), + object_name_linter = object_name_linter(styles = c("snake_case", "CamelCase")) + ) \ No newline at end of file From 182bbee44fb7ce41c9e563f222bb6372923b00bd Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 06:29:59 -0700 Subject: [PATCH 075/261] linting R --- .lintr | 4 ++-- PM2/r_convergence_hypothesis_double_lasso.irnb | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.lintr b/.lintr index e8ce173f..3a02d2c2 100644 --- a/.lintr +++ b/.lintr @@ -1,4 +1,4 @@ linters: linters_with_defaults( - line_length_linter = line_length_linter(120), - object_name_linter = object_name_linter(styles = c("snake_case", "CamelCase")) + line_length_linter(120), + object_name_linter(styles = c("snake_case", "CamelCase")) ) \ No newline at end of file diff --git a/PM2/r_convergence_hypothesis_double_lasso.irnb b/PM2/r_convergence_hypothesis_double_lasso.irnb index 49e1545d..4f56cbe9 100644 --- a/PM2/r_convergence_hypothesis_double_lasso.irnb +++ b/PM2/r_convergence_hypothesis_double_lasso.irnb @@ -637,8 +637,8 @@ "se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors\n", "\n", "# print unconditional effect of gdpsh465 and the corresponding standard error\n", - "cat (\"The estimated coefficient on gdpsh465 is\", est,\n", - " \" and the corresponding robust standard error is\", se)\n", + "cat(\"The estimated coefficient on gdpsh465 is\", est,\n", + " \" and the corresponding robust standard error is\", se)\n", "\n", "# Calculate the 95% confidence interval for 'gdpsh465'\n", "lower_ci <- est - 1.96 * se\n", @@ -658,7 +658,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" From db7b167f6156108bea508d75049271d839ecb1f0 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 07:44:30 -0700 Subject: [PATCH 076/261] Update .lintr --- .lintr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.lintr b/.lintr index 3a02d2c2..c015a66d 100644 --- a/.lintr +++ b/.lintr @@ -1,4 +1,4 @@ linters: linters_with_defaults( line_length_linter(120), object_name_linter(styles = c("snake_case", "CamelCase")) - ) \ No newline at end of file + ) From 334c41aa4d5bb6cbe1e970054f7cc468c0d89722 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 15 Jul 2024 15:18:12 +0000 Subject: [PATCH 077/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in T --- T/T-3 Diff-in-Diff Minimum Wage Example.Rmd | 614 +++++ T/T-3 Diff-in-Diff Minimum Wage Example.irnb | 2138 +++++++---------- ...ression_Discontinuity_on_Progresa_Data.Rmd | 575 +++++ ...ession_Discontinuity_on_Progresa_Data.irnb | 1776 +++++++------- ...r-conditional-average-treatment-effect.Rmd | 626 +++++ ...-conditional-average-treatment-effect.irnb | 1802 +++++++------- 6 files changed, 4429 insertions(+), 3102 deletions(-) create mode 100644 T/T-3 Diff-in-Diff Minimum Wage Example.Rmd create mode 100644 T/T_4_Regression_Discontinuity_on_Progresa_Data.Rmd create mode 100644 
T/dml-for-conditional-average-treatment-effect.Rmd diff --git a/T/T-3 Diff-in-Diff Minimum Wage Example.Rmd b/T/T-3 Diff-in-Diff Minimum Wage Example.Rmd new file mode 100644 index 00000000..985b7d70 --- /dev/null +++ b/T/T-3 Diff-in-Diff Minimum Wage Example.Rmd @@ -0,0 +1,614 @@ +--- +title: An R Markdown document converted from "T/T-3 Diff-in-Diff Minimum Wage Example.irnb" +output: html_document +--- + +# Minimum Wage Example Notebook with DiD + +This notebook implements Difference-in-Differences in an application on +the effect of minimum wage changes on teen employment. We use data from +[Callaway +(2022)](https://bcallaway11.github.io/files/Callaway-Chapter-2022/main.pdf). The data are annual county level data from the United States covering 2001 to 2007. The outcome variable is log county-level teen employment, and the treatment variable is an indicator for whether the county has a minimum wage above the federal minimum wage. Note that this definition of the treatment variable makes the analysis straightforward but ignores the nuances of the exact value of the minimum wage in each county and how far those values are from the federal minimum. The data also include county population and county average annual pay. +See [Callaway and Sant’Anna +(2021)](https://www.sciencedirect.com/science/article/abs/pii/S0304407620303948) +for additional details on the data. + +First, we will load some libraries. + +*(The installation of the packages might take up to 5 minutes)* + +```{r} +dependencies <- c("BMisc", "glmnet", "randomForest", "rpart", "xtable", "data.table") +install.packages(dependencies) +lapply(dependencies, library, character.only = TRUE) + +set.seed(772023) +options(warn=-1) +``` + +## Loading the data + +```{r} +data <- read.csv("https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/minwage_data.csv", row.names=1) +data <- data.table(data) +``` + +```{r} +head(data) +``` + +### Data Preparation + +We remove observations that are already treated in the first observed period (2001). We drop all variables that we won't use in our analysis. + +```{r} +data <- subset(data, (G==0) | (G>2001)) +data <- data[, -c("countyreal","state_name","FIPS","emp0A01_BS", + "quarter", "censusdiv","pop","annual_avg_pay", + "state_mw","fed_mw", "ever_treated")] +``` + +Next, we create the treatment groups. We focus our analysis exclusively on the set of counties that had wage increases away from the federal minimum wage in 2004. That is, we treat 2003 and earlier as the pre-treatment period. 
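Before subsetting, it can be helpful to see how counties are spread across first-treatment cohorts. The sketch below is an added check, not part of the original analysis; it assumes (as the construction of the control groups below suggests) that `G` records the first year a county's minimum wage exceeds the federal level, with `G = 0` for counties that never do so in the sample:

```{r}
# Number of counties per first-treatment cohort, using the 2003 cross-section
data[year == 2003, table(G)]
```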
+ +```{r} +treat1 <- subset(data, (G == 2004) & (year == 2001)) +treat2 <- subset(data, (G == 2004) & (year == 2002)) +treat3 <- subset(data, (G == 2004) & (year == 2003)) +treat4 <- subset(data, (G == 2004) & (year == 2004)) +treat5 <- subset(data, (G == 2004) & (year == 2005)) +treat6 <- subset(data, (G == 2004) & (year == 2006)) +treat7 <- subset(data, (G == 2004) & (year == 2007)) + +cont1 <- subset(data, (G == 0 | G > 2001) & (year == 2001)) +cont2 <- subset(data, (G == 0 | G > 2002) & (year == 2002)) +cont3 <- subset(data, (G == 0 | G > 2003) & (year == 2003)) +cont4 <- subset(data, (G == 0 | G > 2004) & (year == 2004)) +cont5 <- subset(data, (G == 0 | G > 2005) & (year == 2005)) +cont6 <- subset(data, (G == 0 | G > 2006) & (year == 2006)) +cont7 <- subset(data, (G == 0 | G > 2007) & (year == 2007)) +``` + +We assume that the basic assumptions, particularly parallel trends, hold after conditioning on pre-treatment variables: 2001 population, 2001 average pay and 2001 teen employment, as well as the region in which the county is located. (The region is characterized by four +categories.) + +Consequently, we want to extract the control variables for both treatment and control group in 2001. + +```{r} +treat1 <- treat1[ , -c("year","G","region","treated")] + +cont1 <- cont1[ , -c("year","G","region","treated")] +``` + +2003 serves as the pre-treatment period for both counties that do receive the treatment in 2004 and those that do not. + +```{r} +treatB <- merge(treat3, treat1, by = "id", suffixes = c(".pre",".0")) +treatB <- treatB[ , -c("treated","lpop.pre","lavg_pay.pre","year","G")] + +contB <- merge(cont3, cont1, by = "id", suffixes = c(".pre",".0")) +contB <- contB[ , -c("treated","lpop.pre","lavg_pay.pre","year","G")] +``` + +We estimate the ATET in 2004-2007, which corresponds to the effect in the year of treatment as well as in the three years after the treatment. The control observations are the observations that still have the federal minimum wage in each year. (The control group is shrinking in each year as additional units receive treatment). 
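As a quick plausibility check before constructing the year-specific estimation samples (an added sketch, not in the original analysis), we can compare average pre-treatment characteristics of the 2004-treated counties with those of the comparison counties:

```{r}
# Means of 2003 log employment and 2001 covariates, 2004-treated vs. control counties
rbind(
  treated_2004 = colMeans(treatB[, .(lemp.pre, lemp.0, lpop.0, lavg_pay.0)]),
  control = colMeans(contB[, .(lemp.pre, lemp.0, lpop.0, lavg_pay.0)])
)
```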
+ +```{r} +treat4 <- treat4[ , -c("lpop","lavg_pay","year","G","region")] +treat5 <- treat5[ , -c("lpop","lavg_pay","year","G","region")] +treat6 <- treat6[ , -c("lpop","lavg_pay","year","G","region")] +treat7 <- treat7[ , -c("lpop","lavg_pay","year","G","region")] + +cont4 <- cont4[ , -c("lpop","lavg_pay","year","G","region")] +cont5 <- cont5[ , -c("lpop","lavg_pay","year","G","region")] +cont6 <- cont6[ , -c("lpop","lavg_pay","year","G","region")] +cont7 <- cont7[ , -c("lpop","lavg_pay","year","G","region")] + +tdid04 <- merge(treat4, treatB, by = "id") +dy <- tdid04$lemp-tdid04$lemp.pre +tdid04$dy <- dy +tdid04 <- tdid04[ , -c("id","lemp","lemp.pre")] + +tdid05 <- merge(treat5, treatB, by = "id") +dy <- tdid05$lemp-tdid05$lemp.pre +tdid05$dy <- dy +tdid05 <- tdid05[ , -c("id","lemp","lemp.pre")] + +tdid06 <- merge(treat6, treatB, by = "id") +dy <- tdid06$lemp-tdid06$lemp.pre +tdid06$dy <- dy +tdid06 <- tdid06[ , -c("id","lemp","lemp.pre")] + +tdid07 <- merge(treat7, treatB, by = "id") +dy <- tdid07$lemp-tdid07$lemp.pre +tdid07$dy <- dy +tdid07 <- tdid07[ , -c("id","lemp","lemp.pre")] + +cdid04 <- merge(cont4, contB, by = "id") +dy <- cdid04$lemp-cdid04$lemp.pre +cdid04$dy <- dy +cdid04 <- cdid04[ , -c("id","lemp","lemp.pre")] + +cdid05 <- merge(cont5, contB, by = "id") +dy <- cdid05$lemp-cdid05$lemp.pre +cdid05$dy <- dy +cdid05 <- cdid05[ , -c("id","lemp","lemp.pre")] + +cdid06 <- merge(cont6, contB, by = "id") +dy <- cdid06$lemp-cdid06$lemp.pre +cdid06$dy <- dy +cdid06 <- cdid06[ , -c("id","lemp","lemp.pre")] + +cdid07 <- merge(cont7, contB, by = "id") +dy <- cdid07$lemp-cdid07$lemp.pre +cdid07$dy <- dy +cdid07 <- cdid07[ , -c("id","lemp","lemp.pre")] +``` + +### Estimation of the ATET with DML + +We estimate the ATET of the county level minimum wage being larger than the federal minimum with the DML algorithm presented in Section 16.3 in the book. This requires estimation of the nuisance functions $E[Y|D=0,X]$, $E[D|X]$ as well as $P(D = 1)$. For the conditional expectation functions, we will consider different modern ML regression methods, namely: Constant (= no controls); a linear combination of the controls; an expansion of the raw control variables including all third order interactions; Lasso (CV); Ridge (CV); Random Forest; Shallow Tree; Deep Tree; and CV Tree. +The methods indicated with CV have their tuning parameter selected by cross-validation. + +The following code block implements the DML cross-fitting procedure. 
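For reference, the estimator assembled inside that loop is, up to the cross-fitting details, the doubly robust ATET moment

$$
\widehat{\mathrm{ATET}} \;=\; \frac{\frac{1}{n}\sum_{i=1}^{n} \dfrac{D_i - \hat m(X_i)}{\hat p\,(1-\hat m(X_i))}\,\bigl(\Delta Y_i - \hat g(X_i)\bigr)}{\frac{1}{n}\sum_{i=1}^{n} D_i/\hat p},
$$

where $\Delta Y_i$ is the change in log teen employment relative to 2003, $\hat g(X) = \hat E[\Delta Y \mid D = 0, X]$, $\hat m(X) = \hat E[D \mid X]$, and $\hat p = \hat P(D = 1)$, all obtained by cross-fitting. For orientation, the corresponding unconditional difference-in-differences contrast for 2004 is simply `mean(tdid04$dy) - mean(cdid04$dy)`.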
+ +```{r} +att <- matrix(NA,4,10) +se.att <- matrix(NA,4,10) +RMSE.d <- matrix(NA,4,9) +RMSE.y <- matrix(NA,4,9) +trimmed <- matrix(NA,4,9) + +print("DML estimation starting, please wait") +for(ii in 1:4){ # ii refer to the 4 investigated post-treatment periods + + tdata <- get(paste("tdid0",(3+ii),sep="")) # Treatment data + cdata <- get(paste("cdid0",(3+ii),sep="")) # Control data + usedata <- rbind(tdata,cdata) + + #----------------------------------------------------------------------------- + # Cross-fit setup + n <- nrow(usedata) + Kf <- 5 # Number of folds + sampleframe <- rep(1:Kf, ceiling(n/Kf)) + cfgroup <- sample(sampleframe, size=n, replace = FALSE) # Cross-fitting groups + + # Initialize variables for CV predictions + yGd0x.fit <- matrix(NA,n,9) + dGx.fit <- matrix(NA,n,9) + pd.fit <-matrix(NA,n,1) + + #----------------------------------------------------------------------------- + # Cross-fit loop + for(k in 1:Kf) { + cat("year: ",ii+2003,"; fold: ",k,"\n") + indk <- cfgroup == k + + ktrain <- usedata[!indk,] + ktest <- usedata[indk,] + + # Build some matrices for later + ytrain <- as.matrix(usedata[!indk,"dy"]) + ytest <- as.matrix(usedata[indk,"dy"]) + dtrain <- as.matrix(usedata[!indk,"treated"]) + dtest <- as.matrix(usedata[indk,"treated"]) + + # Expansion for lasso/ridge (region specific cubic polynomial) + X.expand <- model.matrix( ~ region*(polym(lemp.0 , lpop.0 , lavg_pay.0 , + degree = 3, raw = TRUE)), + data = usedata) + + xtrain <- as.matrix(X.expand[!indk,]) + xtest <- as.matrix(X.expand[indk,]) + + #----------------------------------------------------------------------------- + # P(D = 1) + pd.fit[indk,1] <- mean(ktrain$treated) + + #----------------------------------------------------------------------------- + # E[D|X] + + # 1) Constant + dGx.fit[indk,1] <- mean(ktrain$treated) + + # 2) Baseline controls + glmXdk <- glm(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0, + family = "binomial", data = ktrain) + dGx.fit[indk,2] <- predict(glmXdk, newdata = ktest, type = "response") + + # 3) Region specific linear index + glmRXdk <- glm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0), + family = "binomial", data = ktrain) + dGx.fit[indk,3] <- predict(glmRXdk, newdata = ktest, type = "response") + + # 4) Lasso - expansion - default CV tuning + lassoXdk <- cv.glmnet(xtrain , dtrain , family = "binomial", type.measure = "mse") + dGx.fit[indk,4] <- predict(lassoXdk, newx = xtest, type = "response" , + s = "lambda.min") + + # 5) Ridge - expansion - default CV tuning + ridgeXdk <- cv.glmnet(xtrain , dtrain , family = "binomial", + type.measure = "mse", alpha = 0) + dGx.fit[indk,5] <- predict(ridgeXdk, newx = xtest, type = "response" , + s = "lambda.min") + + # 6) Random forest + rfXdk <- randomForest(as.factor(treated) ~ region + lemp.0 + lpop.0 + lavg_pay.0 , + data = ktrain , mtry = 4, ntree = 1000) + dGx.fit[indk,6] <- predict(rfXdk, ktest, type = "prob")[, 2] + + # 7) Tree (start big) + btXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0 , + data = ktrain, method = "anova", + control=rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10)) + # xval is the number of cross-validation splits. E.g. xval = 5 is five fold CV + dGx.fit[indk,7] <- predict(btXdk, ktest) + + # 8) Tree (small tree) + stXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0 , + data = ktrain, method = "anova", + control=rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10)) + # xval is the number of cross-validation splits. E.g. 
xval = 5 is five fold CV + dGx.fit[indk,8] <- predict(stXdk, ktest) + + # 9) Tree (cv) + bestcp <- btXdk$cptable[which.min(btXdk$cptable[,"xerror"]),"CP"] + cvXdk <- prune(btXdk , cp=bestcp) + dGx.fit[indk,9] <- predict(cvXdk, ktest) + + #----------------------------------------------------------------------------- + # E[Y|D=0,X] + + # subset to D = 0 + ktrain0 = ktrain[ktrain$treated == 0, ] + + ytrain0 = ytrain[ktrain$treated == 0, ] + xtrain0 = xtrain[ktrain$treated == 0, ] + + # 1) Constant + yGd0x.fit[indk,1] <- mean(ktrain0$dy) + + # 2) Baseline controls + lmXyk <- lm(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, data = ktrain0) + yGd0x.fit[indk,2] <- predict(lmXyk, newdata = ktest) + + # 3) Region specific linear index + lmRXyk <- lm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0), + data = ktrain) + yGd0x.fit[indk,3] <- predict(lmRXyk, newdata = ktest) + + # 4) Lasso - expansion - default CV tuning + lassoXyk <- cv.glmnet(xtrain0 , ytrain0) + yGd0x.fit[indk,4] <- predict(lassoXyk, newx = xtest , s = "lambda.min") + + # 5) Ridge - expansion - default CV tuning + ridgeXyk <- cv.glmnet(xtrain0 , ytrain0 , alpha = 0) + yGd0x.fit[indk,5] <- predict(ridgeXyk, newx = xtest, s = "lambda.min") + + # 6) Random forest + rfXyk <- randomForest(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 , + data = ktrain0 , mtry = 4, ntree = 1000) + yGd0x.fit[indk,6] <- predict(rfXyk, ktest) + + # 7) Tree (start big) + btXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 , + data = ktrain0, method = "anova", + control=rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10)) + yGd0x.fit[indk,7] <- predict(btXyk, ktest) + + # 8) Tree (small tree) + stXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 , + data = ktrain, method = "anova", + control=rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10)) + yGd0x.fit[indk,8] <- predict(stXyk, ktest) + + # 9) Tree (cv) + bestcp <- btXyk$cptable[which.min(btXyk$cptable[,"xerror"]),"CP"] + cvXyk <- prune(btXyk , cp=bestcp) + yGd0x.fit[indk,9] <- predict(cvXyk, ktest) + + } + + RMSE.d[ii, ] <- sqrt(colMeans((usedata$treated - dGx.fit)^2)) + RMSE.y[ii, ] <- sqrt(colMeans((usedata$dy[usedata$treated == 0] - + yGd0x.fit[usedata$treated == 0, ])^2)) + + # trim propensity scores of 1 to .95 + for(r in 1:9) { + trimmed[ii,r] = sum(dGx.fit[ , r] > .95) + dGx.fit[dGx.fit[ ,r] > .95,r] <- .95 + } + + att.num <- c(colMeans(((usedata$treated - dGx.fit)/((pd.fit%*%matrix(1,1,9))*(1-dGx.fit)))* + (usedata$dy-yGd0x.fit)) , + mean(((usedata$treated - dGx.fit[ ,which.min(RMSE.d[ii, ])]) + /(pd.fit*(1-dGx.fit[ ,which.min(RMSE.d[ii, ])])))* + (usedata$dy-yGd0x.fit[ ,which.min(RMSE.y[ii, ])]))) + att.den <- mean(usedata$treated/pd.fit) + + att[ii, ] <- att.num/att.den + + phihat <- cbind(((usedata$treated - dGx.fit)/((pd.fit%*%matrix(1,1,9))*(1-dGx.fit)))* + (usedata$dy-yGd0x.fit) , + ((usedata$treated - dGx.fit[ ,which.min(RMSE.d[ii, ])]) + /(pd.fit*(1-dGx.fit[ ,which.min(RMSE.d[ii, ])])))* + (usedata$dy-yGd0x.fit[ ,which.min(RMSE.y[ii, ])]))/att.den + se.att[ii, ] <- sqrt(colMeans((phihat^2))/n) + + +} +``` + +We start by reporting the RMSE obtained during cross-fitting for each learner in each period. 
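Alongside the RMSE comparison, a quick diagnostic (an added sketch, using the `trimmed` counts recorded in the loop) is how often the estimated propensity score exceeded the 0.95 trimming threshold for each learner and year:

```{r}
# Counts of trimmed propensity scores by year (rows) and learner (columns)
trim_check <- trimmed
rownames(trim_check) <- c("2004", "2005", "2006", "2007")
colnames(trim_check) <- c("No Controls", "Basic", "Expansion", "Lasso (CV)", "Ridge (CV)",
                          "Random Forest", "Deep Tree", "Shallow Tree", "Tree (CV)")
trim_check
```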
+ +```{r} +table1y <- matrix(0, 9, 4) +table1y <- t(RMSE.y) +colnames(table1y)<- c("2004","2005","2006","2007") +rownames(table1y)<- c("No Controls", "Basic", "Expansion", + "Lasso (CV)", "Ridge (CV)", + "Random Forest","Deep Tree", + "Shallow Tree", "Tree (CV)") +table1y +``` + +```{r} +table1d <- matrix(0, 9, 4) +table1d <- t(RMSE.d) +colnames(table1d)<- c("2004","2005","2006","2007") +rownames(table1d)<- c("No Controls", "Basic", "Expansion", + "Lasso (CV)", "Ridge (CV)", + "Random Forest","Deep Tree", + "Shallow Tree", "Tree (CV)") +table1d +``` + +Here we see that the Deep Tree systematically performs worse in terms of cross-fit predictions than the other learners for both tasks and that Expansion performs similarly poorly for the outcome prediction. It also appears there is some signal in the regressors, especially for the propensity score, as all methods outside of Deep Tree and Expansion produce smaller RMSEs than the No Controls baseline. The other methods all produce similar RMSEs, with a small edge going to Ridge and Lasso. While it would be hard to reliably conclude which of the relatively good performing methods is statistically best here, one could exclude Expansion and Deep Tree from further consideration on the basis of out-of-sample performance suggesting +they are doing a poor job approximating the nuisance functions. Best (or a different ensemble) provides a good baseline that is principled in the sense that one could pre-commit to using the best learners without having first looked at the subsequent estimation results. + +We report estimates of the ATET in each period in the following table. + +```{r} +table2 <- matrix(0, 20, 4) +table2[seq(1,20,2),] <- t(att) +table2[seq(2,20,2),] <- t(se.att) +colnames(table2)<- c("2004","2005","2006","2007") +rownames(table2)<- c("No Controls","s.e.","Basic","s.e.", + "Expansion","s.e.","Lasso (CV)","s.e.", + "Ridge (CV)","s.e.","Random Forest","s.e.", + "Deep Tree","s.e.","Shallow Tree","s.e.", + "Tree (CV)","s.e.","Best","s.e.") +table2 +``` + +Here, we see that the majority of methods provide point estimates that suggest the effect of the minimum wage increase leads to decreases in youth employment with small effects in the initial period that become larger in the years following the treatment. This pattern seems economically plausible as it may take time for firms to adjust employment and other input choices in response to a minimum wage change. The methods that produce estiamtes that are not consistent with this pattern are Deep Tree and Expansion which are both suspect as they systematically underperform in terms of having poor cross-fit prediction performance. In terms of point estimates, the other pattern that emerges is that all estimates that use the covariates produce ATET estimates that are systematically smaller in magnitude than the No Controls baseline, suggesting that failing to include the controls may lead to overstatement of treatment effects in this example. + +Turning to inference, we would reject the hypothesis of no minimum wage effect two or more years after the change at the 5% level, even after multiple testing correction, if we were to focus on the row "Best" (or many of the other individual rows). Focusing on "Best" is a reasonable ex ante strategy that could be committed to prior to conducting any analysis. It is, of course, reassuring that this broad conclusion is also obtained using many of the individual learners suggesting some robustness to the exact choice of learner made. 
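To make the multiple-testing remark concrete, the sketch below (an addition, not part of the original analysis) forms simultaneous confidence intervals for the "Best" row across the four post-treatment years using a simple Bonferroni adjustment:

```{r}
best_att <- att[, 10]  # "Best" column: learners selected by cross-fitted RMSE
best_se <- se.att[, 10]
crit <- qnorm(1 - 0.05 / (2 * 4))  # Bonferroni-adjusted critical value for 4 periods
cbind(year = 2004:2007,
      estimate = best_att,
      lower = best_att - crit * best_se,
      upper = best_att + crit * best_se)
```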
+ +### Assess pre-trends + +Because we have data for the period 2001-2007, we can perform a so-called pre-trends test to provide some evidence about the plausibility of the conditional parallel trends assumption. Specifically, we can continue to use 2003 as the reference period but now consider 2002 to be the treatment period. Sensible economic mechanisms underlying the assumption would then typically suggest that the ATET in 2002 - before the 2004 minimum wage change we are considering - should be zero. Finding evidence that the ATET in 2002 is non-zero then calls into question the validity of the assumption. + +We change the treatment status of those observations, which received treatment in 2004 in the 2002 data and create a placebo treatment as well as control group. + +```{r} +treat2 <- treat2[ , -c("lpop","lavg_pay","year","G","region")] +treat2$treated <- 1 # Code these observations as treated + +tdid02 <- merge(treat2, treatB, by = "id") +dy <- tdid02$lemp-tdid02$lemp.pre +tdid02$dy <- dy +tdid02 <- tdid02[ , -c("id","lemp","lemp.pre")] + +cont2 <- cont2[ , -c("lpop","lavg_pay","year","G","region")] + +cdid02 <- merge(cont2, contB, by = "id") +dy <- cdid02$lemp-cdid02$lemp.pre +cdid02$dy <- dy +cdid02 <- cdid02[ , -c("id","lemp","lemp.pre")] +``` + +We repeat the exercise for obtaining our ATET estimates and standard error for 2004-2007. Particularly, we also use all the learners as mentioned above. + +```{r} +attP <- matrix(NA,1,10) +se.attP <- matrix(NA,1,10) +RMSE.dP <- matrix(NA,1,9) +RMSE.yP <- matrix(NA,1,9) +trimmedP <- matrix(NA,1,9) +for(ii in 1){ + + tdata <- get(paste("tdid0",(3-ii),sep="")) # Treatment data + cdata <- get(paste("cdid0",(3-ii),sep="")) # Control data + usedata <- rbind(tdata,cdata) + + #----------------------------------------------------------------------------- + # Cross-fit setup + n <- nrow(usedata) + Kf <- 5 # Number of folds + sampleframe <- rep(1:Kf, ceiling(n/Kf)) + cfgroup <- sample(sampleframe, size=n, replace = FALSE) # Cross-fitting groups + + # Initialize variables for CV predictions + yGd0x.fit <- matrix(NA,n,9) + dGx.fit <- matrix(NA,n,9) + pd.fit <-matrix(NA,n,1) + + #----------------------------------------------------------------------------- + # Cross-fit loop + for(k in 1:Kf) { + cat("year: ",ii+2001,"; fold: ",k,"\n") + indk <- cfgroup == k + + ktrain <- usedata[!indk,] + ktest <- usedata[indk,] + + # Build some matrices for later + ytrain <- as.matrix(usedata[!indk,"dy"]) + ytest <- as.matrix(usedata[indk,"dy"]) + dtrain <- as.matrix(usedata[!indk,"treated"]) + dtest <- as.matrix(usedata[indk,"treated"]) + + # Expansion for lasso/ridge (region specific cubic polynomial) + X.expand <- model.matrix( ~ region*(polym(lemp.0 , lpop.0 , lavg_pay.0 , + degree = 3, raw = TRUE)), + data = usedata) + + xtrain <- as.matrix(X.expand[!indk,]) + xtest <- as.matrix(X.expand[indk,]) + + #----------------------------------------------------------------------------- + # P(D = 1) + pd.fit[indk,1] <- mean(ktrain$treated) + + #----------------------------------------------------------------------------- + # E[D|X] + + # 1) Constant + dGx.fit[indk,1] <- mean(ktrain$treated) + + # 2) Baseline controls + glmXdk <- glm(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0, + family = "binomial", data = ktrain) + dGx.fit[indk,2] <- predict(glmXdk, newdata = ktest, type = "response") + + # 3) Region specific linear index + glmRXdk <- glm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0), + family = "binomial", data = ktrain) + dGx.fit[indk,3] <- predict(glmRXdk, 
newdata = ktest, type = "response") + + # 4) Lasso - expansion - default CV tuning + lassoXdk <- cv.glmnet(xtrain , dtrain , family = "binomial", type.measure = "mse") + dGx.fit[indk,4] <- predict(lassoXdk, newx = xtest, type = "response" , + s = "lambda.min") + + # 5) Ridge - expansion - default CV tuning + ridgeXdk <- cv.glmnet(xtrain , dtrain , family = "binomial", + type.measure = "mse", alpha = 0) + dGx.fit[indk,5] <- predict(ridgeXdk, newx = xtest, type = "response" , + s = "lambda.min") + + # 6) Random forest + rfXdk <- randomForest(as.factor(treated) ~ region + lemp.0 + lpop.0 + lavg_pay.0 , + data = ktrain , mtry = 4, ntree = 1000) + dGx.fit[indk,6] <- predict(rfXdk, ktest, type = "prob")[, 2] + + # 7) Tree (start big) + btXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0 , + data = ktrain, method = "anova", + control=rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10)) + # xval is the number of cross-validation splits. E.g. xval = 5 is five fold CV + dGx.fit[indk,7] <- predict(btXdk, ktest) + + # 8) Tree (small tree) + stXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0 , + data = ktrain, method = "anova", + control=rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10)) + # xval is the number of cross-validation splits. E.g. xval = 5 is five fold CV + dGx.fit[indk,8] <- predict(stXdk, ktest) + + # 9) Tree (cv) + bestcp <- btXdk$cptable[which.min(btXdk$cptable[,"xerror"]),"CP"] + cvXdk <- prune(btXdk , cp=bestcp) + dGx.fit[indk,9] <- predict(cvXdk, ktest) + + #----------------------------------------------------------------------------- + # E[Y|D=0,X] + + # subset to D = 0 + ktrain0 = ktrain[ktrain$treated == 0, ] + + ytrain0 = ytrain[ktrain$treated == 0, ] + xtrain0 = xtrain[ktrain$treated == 0, ] + + # 1) Constant + yGd0x.fit[indk,1] <- mean(ktrain0$dy) + + # 2) Baseline controls + lmXyk <- lm(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, data = ktrain0) + yGd0x.fit[indk,2] <- predict(lmXyk, newdata = ktest) + + # 3) Region specific linear index + lmRXyk <- lm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0), + data = ktrain) + yGd0x.fit[indk,3] <- predict(lmRXyk, newdata = ktest) + + # 4) Lasso - expansion - default CV tuning + lassoXyk <- cv.glmnet(xtrain0 , ytrain0) + yGd0x.fit[indk,4] <- predict(lassoXyk, newx = xtest , s = "lambda.min") + + # 5) Ridge - expansion - default CV tuning + ridgeXyk <- cv.glmnet(xtrain0 , ytrain0 , alpha = 0) + yGd0x.fit[indk,5] <- predict(ridgeXyk, newx = xtest, s = "lambda.min") + + # 6) Random forest + rfXyk <- randomForest(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 , + data = ktrain0 , mtry = 4, ntree = 1000) + yGd0x.fit[indk,6] <- predict(rfXyk, ktest) + + # 7) Tree (start big) + btXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 , + data = ktrain0, method = "anova", + control=rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10)) + yGd0x.fit[indk,7] <- predict(btXyk, ktest) + + # 8) Tree (small tree) + stXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 , + data = ktrain, method = "anova", + control=rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10)) + yGd0x.fit[indk,8] <- predict(stXyk, ktest) + + # 9) Tree (cv) + bestcp <- btXyk$cptable[which.min(btXyk$cptable[,"xerror"]),"CP"] + cvXyk <- prune(btXyk , cp=bestcp) + yGd0x.fit[indk,9] <- predict(cvXyk, ktest) + + } + + RMSE.dP[ii, ] <- sqrt(colMeans((usedata$treated - dGx.fit)^2)) + RMSE.yP[ii, ] <- sqrt(colMeans((usedata$dy[usedata$treated == 0] - + yGd0x.fit[usedata$treated == 0, ])^2)) + + # trim propensity scores of 1 
to .95 + for(r in 1:9) { + trimmedP[ii,r] = sum(dGx.fit[ , r] > .95) + dGx.fit[dGx.fit[ ,r] > .95,r] <- .95 + } + + att.num <- c(colMeans(((usedata$treated - dGx.fit)/((pd.fit%*%matrix(1,1,9))*(1-dGx.fit)))* + (usedata$dy-yGd0x.fit)) , + mean(((usedata$treated - dGx.fit[ ,which.min(RMSE.d[ii, ])]) + /(pd.fit*(1-dGx.fit[ ,which.min(RMSE.d[ii, ])])))* + (usedata$dy-yGd0x.fit[ ,which.min(RMSE.y[ii, ])]))) + att.den <- mean(usedata$treated/pd.fit) + + attP[ii, ] <- att.num/att.den + + phihat <- cbind(((usedata$treated - dGx.fit)/((pd.fit%*%matrix(1,1,9))*(1-dGx.fit)))* + (usedata$dy-yGd0x.fit) , + ((usedata$treated - dGx.fit[ ,which.min(RMSE.d[ii, ])]) + /(pd.fit*(1-dGx.fit[ ,which.min(RMSE.d[ii, ])])))* + (usedata$dy-yGd0x.fit[ ,which.min(RMSE.y[ii, ])]))/att.den + se.attP[ii, ] <- sqrt(colMeans((phihat^2))/n) + + +} +``` + +We report the results in the following table. + +```{r} +tableP <- matrix(0, 4, 10) +tableP[1,] <- c(RMSE.yP, min(RMSE.yP)) +tableP[2,] <- c(RMSE.dP, min(RMSE.dP)) +tableP[3,] <- attP +tableP[4,] <- se.attP +rownames(tableP)<- c("RMSE Y","RMSE D","ATET","s.e.") +colnames(tableP)<- c("No Controls", "Basic", "Expansion", + "Lasso (CV)", "Ridge (CV)", + "Random Forest","Deep Tree", + "Shallow Tree", "Tree (CV)" , "Best") +tableP = t(tableP) +tableP +``` + +Here we see broad agreement across all methods in the sense of returning point estimates that are small in magnitude and small relative to standard errors. In no case would we reject the hypothesis that the pre-event effect in 2002 is different from zero at usual levels of significance. We note that failing to reject the hypothesis of no pre-event effects certainly does not imply that the conditional DiD assumption is in fact satisfied. For example, confidence intervals include values that would be consistent with relatively large pre-event effects. However, it is reassuring to see that there is not strong evidence of a violation of the underlying identifying assumption. + diff --git a/T/T-3 Diff-in-Diff Minimum Wage Example.irnb b/T/T-3 Diff-in-Diff Minimum Wage Example.irnb index 86edef38..32c73347 100644 --- a/T/T-3 Diff-in-Diff Minimum Wage Example.irnb +++ b/T/T-3 Diff-in-Diff Minimum Wage Example.irnb @@ -1,1257 +1,889 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "rsOnp1Y-TJy_" + }, + "source": [ + "# Minimum Wage Example Notebook with DiD" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "trvqH1pjTJpR" + }, + "source": [ + "This notebook implements Difference-in-Differences in an application on\n", + "the effect of minimum wage changes on teen employment. We use data from\n", + "[Callaway\n", + "(2022)](https://bcallaway11.github.io/files/Callaway-Chapter-2022/main.pdf). The data are annual county level data from the United States covering 2001 to 2007. The outcome variable is log county-level teen employment, and the treatment variable is an indicator for whether the county has a minimum wage above the federal minimum wage. Note that this definition of the treatment variable makes the analysis straightforward but ignores the nuances of the exact value of the minimum wage in each county and how far those values are from the federal minimum. 
The data also include county population and county average annual pay.\n", + "See [Callaway and Sant’Anna\n", + "(2021)](https://www.sciencedirect.com/science/article/abs/pii/S0304407620303948)\n", + "for additional details on the data.\n", + "\n", + "First, we will load some libraries.\n", + "\n", + "*(The installation of the packages might take up to 5 minutes)*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "ir", - "display_name": "R" - }, - "language_info": { - "name": "R" - } + "base_uri": "https://localhost:8080/", + "height": 201 + }, + "id": "FFlG2QhXTJav", + "outputId": "254f92c0-bae4-41e2-fb92-7d32f17eb751" + }, + "outputs": [], + "source": [ + "dependencies <- c(\"BMisc\", \"glmnet\", \"randomForest\", \"rpart\", \"xtable\", \"data.table\")\n", + "install.packages(dependencies)\n", + "lapply(dependencies, library, character.only = TRUE)\n", + "\n", + "set.seed(772023)\n", + "options(warn=-1)" + ] }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Minimum Wage Example Notebook with DiD" - ], - "metadata": { - "id": "rsOnp1Y-TJy_" - } - }, - { - "cell_type": "markdown", - "source": [ - "This notebook implements Difference-in-Differences in an application on\n", - "the effect of minimum wage changes on teen employment. We use data from\n", - "[Callaway\n", - "(2022)](https://bcallaway11.github.io/files/Callaway-Chapter-2022/main.pdf). The data are annual county level data from the United States covering 2001 to 2007. The outcome variable is log county-level teen employment, and the treatment variable is an indicator for whether the county has a minimum wage above the federal minimum wage. Note that this definition of the treatment variable makes the analysis straightforward but ignores the nuances of the exact value of the minimum wage in each county and how far those values are from the federal minimum. The data also include county population and county average annual pay.\n", - "See [Callaway and Sant’Anna\n", - "(2021)](https://www.sciencedirect.com/science/article/abs/pii/S0304407620303948)\n", - "for additional details on the data.\n", - "\n", - "First, we will load some libraries.\n", - "\n", - "*(The installation of the packages might take up to 5 minutes)*" - ], - "metadata": { - "id": "trvqH1pjTJpR" - } - }, - { - "cell_type": "code", - "source": [ - "dependencies <- c(\"BMisc\", \"glmnet\", \"randomForest\", \"rpart\", \"xtable\", \"data.table\")\n", - "install.packages(dependencies)\n", - "lapply(dependencies, library, character.only = TRUE)\n", - "\n", - "set.seed(772023)\n", - "options(warn=-1)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 201 - }, - "id": "FFlG2QhXTJav", - "outputId": "254f92c0-bae4-41e2-fb92-7d32f17eb751" - }, - "execution_count": 60, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "Installing packages into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "
    \n", - "\t
      [lapply(dependencies, library, character.only = TRUE) returns, for each of the six packages, the vector of attached packages: data.table, xtable, rpart, randomForest, glmnet, Matrix, BMisc, stats, graphics, grDevices, utils, datasets, methods, base]
\n" - ], - "text/markdown": "1. 1. 'data.table'\n2. 'xtable'\n3. 'rpart'\n4. 'randomForest'\n5. 'glmnet'\n6. 'Matrix'\n7. 'BMisc'\n8. 'stats'\n9. 'graphics'\n10. 'grDevices'\n11. 'utils'\n12. 'datasets'\n13. 'methods'\n14. 'base'\n\n\n\n2. 1. 'data.table'\n2. 'xtable'\n3. 'rpart'\n4. 'randomForest'\n5. 'glmnet'\n6. 'Matrix'\n7. 'BMisc'\n8. 'stats'\n9. 'graphics'\n10. 'grDevices'\n11. 'utils'\n12. 'datasets'\n13. 'methods'\n14. 'base'\n\n\n\n3. 1. 'data.table'\n2. 'xtable'\n3. 'rpart'\n4. 'randomForest'\n5. 'glmnet'\n6. 'Matrix'\n7. 'BMisc'\n8. 'stats'\n9. 'graphics'\n10. 'grDevices'\n11. 'utils'\n12. 'datasets'\n13. 'methods'\n14. 'base'\n\n\n\n4. 1. 'data.table'\n2. 'xtable'\n3. 'rpart'\n4. 'randomForest'\n5. 'glmnet'\n6. 'Matrix'\n7. 'BMisc'\n8. 'stats'\n9. 'graphics'\n10. 'grDevices'\n11. 'utils'\n12. 'datasets'\n13. 'methods'\n14. 'base'\n\n\n\n5. 1. 'data.table'\n2. 'xtable'\n3. 'rpart'\n4. 'randomForest'\n5. 'glmnet'\n6. 'Matrix'\n7. 'BMisc'\n8. 'stats'\n9. 'graphics'\n10. 'grDevices'\n11. 'utils'\n12. 'datasets'\n13. 'methods'\n14. 'base'\n\n\n\n6. 1. 'data.table'\n2. 'xtable'\n3. 'rpart'\n4. 'randomForest'\n5. 'glmnet'\n6. 'Matrix'\n7. 'BMisc'\n8. 'stats'\n9. 'graphics'\n10. 'grDevices'\n11. 'utils'\n12. 'datasets'\n13. 'methods'\n14. 'base'\n\n\n\n\n\n", - "text/latex": "\\begin{enumerate}\n\\item \\begin{enumerate*}\n\\item 'data.table'\n\\item 'xtable'\n\\item 'rpart'\n\\item 'randomForest'\n\\item 'glmnet'\n\\item 'Matrix'\n\\item 'BMisc'\n\\item 'stats'\n\\item 'graphics'\n\\item 'grDevices'\n\\item 'utils'\n\\item 'datasets'\n\\item 'methods'\n\\item 'base'\n\\end{enumerate*}\n\n\\item \\begin{enumerate*}\n\\item 'data.table'\n\\item 'xtable'\n\\item 'rpart'\n\\item 'randomForest'\n\\item 'glmnet'\n\\item 'Matrix'\n\\item 'BMisc'\n\\item 'stats'\n\\item 'graphics'\n\\item 'grDevices'\n\\item 'utils'\n\\item 'datasets'\n\\item 'methods'\n\\item 'base'\n\\end{enumerate*}\n\n\\item \\begin{enumerate*}\n\\item 'data.table'\n\\item 'xtable'\n\\item 'rpart'\n\\item 'randomForest'\n\\item 'glmnet'\n\\item 'Matrix'\n\\item 'BMisc'\n\\item 'stats'\n\\item 'graphics'\n\\item 'grDevices'\n\\item 'utils'\n\\item 'datasets'\n\\item 'methods'\n\\item 'base'\n\\end{enumerate*}\n\n\\item \\begin{enumerate*}\n\\item 'data.table'\n\\item 'xtable'\n\\item 'rpart'\n\\item 'randomForest'\n\\item 'glmnet'\n\\item 'Matrix'\n\\item 'BMisc'\n\\item 'stats'\n\\item 'graphics'\n\\item 'grDevices'\n\\item 'utils'\n\\item 'datasets'\n\\item 'methods'\n\\item 'base'\n\\end{enumerate*}\n\n\\item \\begin{enumerate*}\n\\item 'data.table'\n\\item 'xtable'\n\\item 'rpart'\n\\item 'randomForest'\n\\item 'glmnet'\n\\item 'Matrix'\n\\item 'BMisc'\n\\item 'stats'\n\\item 'graphics'\n\\item 'grDevices'\n\\item 'utils'\n\\item 'datasets'\n\\item 'methods'\n\\item 'base'\n\\end{enumerate*}\n\n\\item \\begin{enumerate*}\n\\item 'data.table'\n\\item 'xtable'\n\\item 'rpart'\n\\item 'randomForest'\n\\item 'glmnet'\n\\item 'Matrix'\n\\item 'BMisc'\n\\item 'stats'\n\\item 'graphics'\n\\item 'grDevices'\n\\item 'utils'\n\\item 'datasets'\n\\item 'methods'\n\\item 'base'\n\\end{enumerate*}\n\n\\end{enumerate}\n", - "text/plain": [ - "[[1]]\n", - " [1] \"data.table\" \"xtable\" \"rpart\" \"randomForest\" \"glmnet\" \n", - " [6] \"Matrix\" \"BMisc\" \"stats\" \"graphics\" \"grDevices\" \n", - "[11] \"utils\" \"datasets\" \"methods\" \"base\" \n", - "\n", - "[[2]]\n", - " [1] \"data.table\" \"xtable\" \"rpart\" \"randomForest\" \"glmnet\" \n", - " [6] \"Matrix\" \"BMisc\" \"stats\" \"graphics\" \"grDevices\" \n", - "[11] 
\"utils\" \"datasets\" \"methods\" \"base\" \n", - "\n", - "[[3]]\n", - " [1] \"data.table\" \"xtable\" \"rpart\" \"randomForest\" \"glmnet\" \n", - " [6] \"Matrix\" \"BMisc\" \"stats\" \"graphics\" \"grDevices\" \n", - "[11] \"utils\" \"datasets\" \"methods\" \"base\" \n", - "\n", - "[[4]]\n", - " [1] \"data.table\" \"xtable\" \"rpart\" \"randomForest\" \"glmnet\" \n", - " [6] \"Matrix\" \"BMisc\" \"stats\" \"graphics\" \"grDevices\" \n", - "[11] \"utils\" \"datasets\" \"methods\" \"base\" \n", - "\n", - "[[5]]\n", - " [1] \"data.table\" \"xtable\" \"rpart\" \"randomForest\" \"glmnet\" \n", - " [6] \"Matrix\" \"BMisc\" \"stats\" \"graphics\" \"grDevices\" \n", - "[11] \"utils\" \"datasets\" \"methods\" \"base\" \n", - "\n", - "[[6]]\n", - " [1] \"data.table\" \"xtable\" \"rpart\" \"randomForest\" \"glmnet\" \n", - " [6] \"Matrix\" \"BMisc\" \"stats\" \"graphics\" \"grDevices\" \n", - "[11] \"utils\" \"datasets\" \"methods\" \"base\" \n" - ] - }, - "metadata": {} - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Loading the data" - ], - "metadata": { - "id": "u6jWjkrzU8I6" - } - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": { - "id": "znh8YcAXSp3E" - }, - "outputs": [], - "source": [ - "data <- read.csv(\"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/minwage_data.csv\", row.names=1)\n", - "data <- data.table(data)" - ] - }, - { - "cell_type": "code", - "source": [ - "head(data)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 306 - }, - "id": "PQdsT6BnWKeq", - "outputId": "d71da67c-541c-4c5f-f65a-7e6dd70230cd" - }, - "execution_count": 62, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\n", - "
A data.table: 6 × 19
countyreal | state_name | year | FIPS | emp0A01_BS | quarter | censusdiv | pop | annual_avg_pay | state_mw | fed_mw | treated | G | lemp | lpop | lavg_pay | region | ever_treated | id
[six annual rows for county 2013 (Alaska), years 2001–2006]
\n" - ], - "text/markdown": "\nA data.table: 6 × 19\n\n| countyreal <int> | state_name <chr> | year <int> | FIPS <int> | emp0A01_BS <int> | quarter <int> | censusdiv <int> | pop <int> | annual_avg_pay <int> | state_mw <dbl> | fed_mw <dbl> | treated <int> | G <int> | lemp <dbl> | lpop <dbl> | lavg_pay <dbl> | region <int> | ever_treated <int> | id <int> |\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n| 2013 | Alaska | 2001 | 2013 | 15 | 1 | 9 | 2459 | 22155 | 5.65 | 5.15 | 1 | 2001 | 2.708050 | 7.807510 | 10.00582 | 4 | 1 | 2013 |\n| 2013 | Alaska | 2002 | 2013 | 17 | 1 | 9 | 2664 | 28447 | 5.65 | 5.15 | 1 | 2001 | 2.833213 | 7.887584 | 10.25580 | 4 | 1 | 2013 |\n| 2013 | Alaska | 2003 | 2013 | 12 | 1 | 9 | 2715 | 30184 | 7.15 | 5.15 | 1 | 2001 | 2.484907 | 7.906547 | 10.31507 | 4 | 1 | 2013 |\n| 2013 | Alaska | 2004 | 2013 | 13 | 1 | 9 | 2677 | 27557 | 7.15 | 5.15 | 1 | 2001 | 2.564949 | 7.892452 | 10.22401 | 4 | 1 | 2013 |\n| 2013 | Alaska | 2005 | 2013 | 11 | 1 | 9 | 2646 | 30396 | 7.15 | 5.15 | 1 | 2001 | 2.397895 | 7.880804 | 10.32207 | 4 | 1 | 2013 |\n| 2013 | Alaska | 2006 | 2013 | 9 | 1 | 9 | 2610 | 30323 | 7.15 | 5.15 | 1 | 2001 | 2.197225 | 7.867106 | 10.31966 | 4 | 1 | 2013 |\n\n", - "text/latex": "A data.table: 6 × 19\n\\begin{tabular}{lllllllllllllllllll}\n countyreal & state\\_name & year & FIPS & emp0A01\\_BS & quarter & censusdiv & pop & annual\\_avg\\_pay & state\\_mw & fed\\_mw & treated & G & lemp & lpop & lavg\\_pay & region & ever\\_treated & id\\\\\n & & & & & & & & & & & & & & & & & & \\\\\n\\hline\n\t 2013 & Alaska & 2001 & 2013 & 15 & 1 & 9 & 2459 & 22155 & 5.65 & 5.15 & 1 & 2001 & 2.708050 & 7.807510 & 10.00582 & 4 & 1 & 2013\\\\\n\t 2013 & Alaska & 2002 & 2013 & 17 & 1 & 9 & 2664 & 28447 & 5.65 & 5.15 & 1 & 2001 & 2.833213 & 7.887584 & 10.25580 & 4 & 1 & 2013\\\\\n\t 2013 & Alaska & 2003 & 2013 & 12 & 1 & 9 & 2715 & 30184 & 7.15 & 5.15 & 1 & 2001 & 2.484907 & 7.906547 & 10.31507 & 4 & 1 & 2013\\\\\n\t 2013 & Alaska & 2004 & 2013 & 13 & 1 & 9 & 2677 & 27557 & 7.15 & 5.15 & 1 & 2001 & 2.564949 & 7.892452 & 10.22401 & 4 & 1 & 2013\\\\\n\t 2013 & Alaska & 2005 & 2013 & 11 & 1 & 9 & 2646 & 30396 & 7.15 & 5.15 & 1 & 2001 & 2.397895 & 7.880804 & 10.32207 & 4 & 1 & 2013\\\\\n\t 2013 & Alaska & 2006 & 2013 & 9 & 1 & 9 & 2610 & 30323 & 7.15 & 5.15 & 1 & 2001 & 2.197225 & 7.867106 & 10.31966 & 4 & 1 & 2013\\\\\n\\end{tabular}\n", - "text/plain": [ - " countyreal state_name year FIPS emp0A01_BS quarter censusdiv pop \n", - "1 2013 Alaska 2001 2013 15 1 9 2459\n", - "2 2013 Alaska 2002 2013 17 1 9 2664\n", - "3 2013 Alaska 2003 2013 12 1 9 2715\n", - "4 2013 Alaska 2004 2013 13 1 9 2677\n", - "5 2013 Alaska 2005 2013 11 1 9 2646\n", - "6 2013 Alaska 2006 2013 9 1 9 2610\n", - " annual_avg_pay state_mw fed_mw treated G lemp lpop lavg_pay region\n", - "1 22155 5.65 5.15 1 2001 2.708050 7.807510 10.00582 4 \n", - "2 28447 5.65 5.15 1 2001 2.833213 7.887584 10.25580 4 \n", - "3 30184 7.15 5.15 1 2001 2.484907 7.906547 10.31507 4 \n", - "4 27557 7.15 5.15 1 2001 2.564949 7.892452 10.22401 4 \n", - "5 30396 7.15 5.15 1 2001 2.397895 7.880804 10.32207 4 \n", - "6 30323 7.15 5.15 1 2001 2.197225 7.867106 10.31966 4 \n", - " ever_treated id \n", - "1 1 2013\n", - "2 1 2013\n", - "3 1 2013\n", - "4 1 2013\n", - "5 1 2013\n", - "6 1 2013" - ] - }, - "metadata": {} - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "### Data Preparation\n", - "\n", - "We remove observations that are already treated in the first observed period (2001). 
We drop all variables that we won't use in our analysis." - ], - "metadata": { - "id": "v37g7zlwW5pH" - } - }, - { - "cell_type": "code", - "source": [ - "data <- subset(data, (G==0) | (G>2001))\n", - "data <- data[, -c(\"countyreal\",\"state_name\",\"FIPS\",\"emp0A01_BS\",\n", - " \"quarter\", \"censusdiv\",\"pop\",\"annual_avg_pay\",\n", - " \"state_mw\",\"fed_mw\", \"ever_treated\")]" - ], - "metadata": { - "id": "W6ob7pptW49G" - }, - "execution_count": 63, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Next, we create the treatment groups. We focus our analysis exclusively on the set of counties that had wage increases away from the federal minimum wage in 2004. That is, we treat 2003 and earlier as the pre-treatment period." - ], - "metadata": { - "id": "Ri12EDNJaAfF" - } - }, - { - "cell_type": "code", - "source": [ - "treat1 <- subset(data, (G == 2004) & (year == 2001))\n", - "treat2 <- subset(data, (G == 2004) & (year == 2002))\n", - "treat3 <- subset(data, (G == 2004) & (year == 2003))\n", - "treat4 <- subset(data, (G == 2004) & (year == 2004))\n", - "treat5 <- subset(data, (G == 2004) & (year == 2005))\n", - "treat6 <- subset(data, (G == 2004) & (year == 2006))\n", - "treat7 <- subset(data, (G == 2004) & (year == 2007))\n", - "\n", - "cont1 <- subset(data, (G == 0 | G > 2001) & (year == 2001))\n", - "cont2 <- subset(data, (G == 0 | G > 2002) & (year == 2002))\n", - "cont3 <- subset(data, (G == 0 | G > 2003) & (year == 2003))\n", - "cont4 <- subset(data, (G == 0 | G > 2004) & (year == 2004))\n", - "cont5 <- subset(data, (G == 0 | G > 2005) & (year == 2005))\n", - "cont6 <- subset(data, (G == 0 | G > 2006) & (year == 2006))\n", - "cont7 <- subset(data, (G == 0 | G > 2007) & (year == 2007))" - ], - "metadata": { - "id": "huj7huQ1aQSq" - }, - "execution_count": 64, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "We assume that the basic assumptions, particularly parallel trends, hold after conditioning on pre-treatment variables: 2001 population, 2001 average pay and 2001 teen employment, as well as the region in which the county is located. (The region is characterized by four\n", - "categories.)\n", - "\n", - "Consequently, we want to extract the control variables for both treatment and control group in 2001." - ], - "metadata": { - "id": "6HC1PX_Uc5bQ" - } - }, - { - "cell_type": "code", - "source": [ - "treat1 <- treat1[ , -c(\"year\",\"G\",\"region\",\"treated\")]\n", - "\n", - "cont1 <- cont1[ , -c(\"year\",\"G\",\"region\",\"treated\")]" - ], - "metadata": { - "id": "KvkwAdL6evsU" - }, - "execution_count": 65, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "2003 serves as the pre-treatment period for both counties that do receive the treatment in 2004 and those that do not." - ], - "metadata": { - "id": "zU7rM_5Ne3Xr" - } - }, - { - "cell_type": "code", - "source": [ - "treatB <- merge(treat3, treat1, by = \"id\", suffixes = c(\".pre\",\".0\"))\n", - "treatB <- treatB[ , -c(\"treated\",\"lpop.pre\",\"lavg_pay.pre\",\"year\",\"G\")]\n", - "\n", - "contB <- merge(cont3, cont1, by = \"id\", suffixes = c(\".pre\",\".0\"))\n", - "contB <- contB[ , -c(\"treated\",\"lpop.pre\",\"lavg_pay.pre\",\"year\",\"G\")]" - ], - "metadata": { - "id": "3cd3dBDqeyqa" - }, - "execution_count": 66, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "We estimate the ATET in 2004-2007, which corresponds to the effect in the year of treatment as well as in the three years after the treatment. 
The control observations are the observations that still have the federal minimum wage in each year. (The control group is shrinking in each year as additional units receive treatment)." - ], - "metadata": { - "id": "xL1fSfb5e82d" - } - }, - { - "cell_type": "code", - "source": [ - "treat4 <- treat4[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", - "treat5 <- treat5[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", - "treat6 <- treat6[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", - "treat7 <- treat7[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", - "\n", - "cont4 <- cont4[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", - "cont5 <- cont5[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", - "cont6 <- cont6[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", - "cont7 <- cont7[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", - "\n", - "tdid04 <- merge(treat4, treatB, by = \"id\")\n", - "dy <- tdid04$lemp-tdid04$lemp.pre\n", - "tdid04$dy <- dy\n", - "tdid04 <- tdid04[ , -c(\"id\",\"lemp\",\"lemp.pre\")]\n", - "\n", - "tdid05 <- merge(treat5, treatB, by = \"id\")\n", - "dy <- tdid05$lemp-tdid05$lemp.pre\n", - "tdid05$dy <- dy\n", - "tdid05 <- tdid05[ , -c(\"id\",\"lemp\",\"lemp.pre\")]\n", - "\n", - "tdid06 <- merge(treat6, treatB, by = \"id\")\n", - "dy <- tdid06$lemp-tdid06$lemp.pre\n", - "tdid06$dy <- dy\n", - "tdid06 <- tdid06[ , -c(\"id\",\"lemp\",\"lemp.pre\")]\n", - "\n", - "tdid07 <- merge(treat7, treatB, by = \"id\")\n", - "dy <- tdid07$lemp-tdid07$lemp.pre\n", - "tdid07$dy <- dy\n", - "tdid07 <- tdid07[ , -c(\"id\",\"lemp\",\"lemp.pre\")]\n", - "\n", - "cdid04 <- merge(cont4, contB, by = \"id\")\n", - "dy <- cdid04$lemp-cdid04$lemp.pre\n", - "cdid04$dy <- dy\n", - "cdid04 <- cdid04[ , -c(\"id\",\"lemp\",\"lemp.pre\")]\n", - "\n", - "cdid05 <- merge(cont5, contB, by = \"id\")\n", - "dy <- cdid05$lemp-cdid05$lemp.pre\n", - "cdid05$dy <- dy\n", - "cdid05 <- cdid05[ , -c(\"id\",\"lemp\",\"lemp.pre\")]\n", - "\n", - "cdid06 <- merge(cont6, contB, by = \"id\")\n", - "dy <- cdid06$lemp-cdid06$lemp.pre\n", - "cdid06$dy <- dy\n", - "cdid06 <- cdid06[ , -c(\"id\",\"lemp\",\"lemp.pre\")]\n", - "\n", - "cdid07 <- merge(cont7, contB, by = \"id\")\n", - "dy <- cdid07$lemp-cdid07$lemp.pre\n", - "cdid07$dy <- dy\n", - "cdid07 <- cdid07[ , -c(\"id\",\"lemp\",\"lemp.pre\")]" - ], - "metadata": { - "id": "zvN6Nmy0gPy4" - }, - "execution_count": 67, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Estimation of the ATET with DML\n", - "\n", - "We estimate the ATET of the county level minimum wage being larger than the federal minimum with the DML algorithm presented in Section 16.3 in the book. This requires estimation of the nuisance functions $E[Y|D=0,X]$, $E[D|X]$ as well as $P(D = 1)$. For the conditional expectation functions, we will consider different modern ML regression methods, namely: Constant (= no controls); a linear combination of the controls; an expansion of the raw control variables including all third order interactions; Lasso (CV); Ridge (CV); Random Forest; Shallow Tree; Deep Tree; and CV Tree.\n", - "The methods indicated with CV have their tuning parameter selected by cross-validation.\n", - "\n", - "The following code block implements the DML cross-fitting procedure." 
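        "\n",
        "In this notation, the point estimate assembled at the end of the cross-fitting loop below (the `att.num` / `att.den` lines) is\n",
        "\n",
        "$$\\widehat{ATET} = \\frac{\\frac{1}{n}\\sum_{i=1}^{n} \\frac{D_i - \\hat{m}(X_i)}{\\hat{p}\\,(1-\\hat{m}(X_i))}\\,\\big(\\Delta Y_i - \\hat{\\ell}(X_i)\\big)}{\\frac{1}{n}\\sum_{i=1}^{n} D_i/\\hat{p}},$$\n",
        "\n",
        "where $\\Delta Y_i$ is the change in log teen employment between 2003 and the post-treatment year, $\\hat{p} = \\widehat{P}(D=1)$, $\\hat{m}(X) = \\widehat{E}[D|X]$, and $\\hat{\\ell}(X) = \\widehat{E}[\\Delta Y|D=0,X]$ are the cross-fitted nuisance estimates. Standard errors are computed as $\\sqrt{\\overline{\\varphi^2}/n}$ for the corresponding scaled score $\\varphi$, and the final (\"Best\") column plugs in, for each nuisance function, the learner with the smallest cross-fit RMSE in that period."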
- ], - "metadata": { - "id": "EqHmiHaZgPZz" - } - }, - { - "cell_type": "code", - "source": [ - "att <- matrix(NA,4,10)\n", - "se.att <- matrix(NA,4,10)\n", - "RMSE.d <- matrix(NA,4,9)\n", - "RMSE.y <- matrix(NA,4,9)\n", - "trimmed <- matrix(NA,4,9)\n", - "\n", - "print(\"DML estimation starting, please wait\")\n", - "for(ii in 1:4){ # ii refer to the 4 investigated post-treatment periods\n", - "\n", - " tdata <- get(paste(\"tdid0\",(3+ii),sep=\"\")) # Treatment data\n", - " cdata <- get(paste(\"cdid0\",(3+ii),sep=\"\")) # Control data\n", - " usedata <- rbind(tdata,cdata)\n", - "\n", - " #-----------------------------------------------------------------------------\n", - " # Cross-fit setup\n", - " n <- nrow(usedata)\n", - " Kf <- 5 # Number of folds\n", - " sampleframe <- rep(1:Kf, ceiling(n/Kf))\n", - " cfgroup <- sample(sampleframe, size=n, replace = FALSE) # Cross-fitting groups\n", - "\n", - " # Initialize variables for CV predictions\n", - " yGd0x.fit <- matrix(NA,n,9)\n", - " dGx.fit <- matrix(NA,n,9)\n", - " pd.fit <-matrix(NA,n,1)\n", - "\n", - " #-----------------------------------------------------------------------------\n", - " # Cross-fit loop\n", - " for(k in 1:Kf) {\n", - " cat(\"year: \",ii+2003,\"; fold: \",k,\"\\n\")\n", - " indk <- cfgroup == k\n", - "\n", - " ktrain <- usedata[!indk,]\n", - " ktest <- usedata[indk,]\n", - "\n", - " # Build some matrices for later\n", - " ytrain <- as.matrix(usedata[!indk,\"dy\"])\n", - " ytest <- as.matrix(usedata[indk,\"dy\"])\n", - " dtrain <- as.matrix(usedata[!indk,\"treated\"])\n", - " dtest <- as.matrix(usedata[indk,\"treated\"])\n", - "\n", - " # Expansion for lasso/ridge (region specific cubic polynomial)\n", - " X.expand <- model.matrix( ~ region*(polym(lemp.0 , lpop.0 , lavg_pay.0 ,\n", - " degree = 3, raw = TRUE)),\n", - " data = usedata)\n", - "\n", - " xtrain <- as.matrix(X.expand[!indk,])\n", - " xtest <- as.matrix(X.expand[indk,])\n", - "\n", - " #-----------------------------------------------------------------------------\n", - " # P(D = 1)\n", - " pd.fit[indk,1] <- mean(ktrain$treated)\n", - "\n", - " #-----------------------------------------------------------------------------\n", - " # E[D|X]\n", - "\n", - " # 1) Constant\n", - " dGx.fit[indk,1] <- mean(ktrain$treated)\n", - "\n", - " # 2) Baseline controls\n", - " glmXdk <- glm(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0,\n", - " family = \"binomial\", data = ktrain)\n", - " dGx.fit[indk,2] <- predict(glmXdk, newdata = ktest, type = \"response\")\n", - "\n", - " # 3) Region specific linear index\n", - " glmRXdk <- glm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0),\n", - " family = \"binomial\", data = ktrain)\n", - " dGx.fit[indk,3] <- predict(glmRXdk, newdata = ktest, type = \"response\")\n", - "\n", - " # 4) Lasso - expansion - default CV tuning\n", - " lassoXdk <- cv.glmnet(xtrain , dtrain , family = \"binomial\", type.measure = \"mse\")\n", - " dGx.fit[indk,4] <- predict(lassoXdk, newx = xtest, type = \"response\" ,\n", - " s = \"lambda.min\")\n", - "\n", - " # 5) Ridge - expansion - default CV tuning\n", - " ridgeXdk <- cv.glmnet(xtrain , dtrain , family = \"binomial\",\n", - " type.measure = \"mse\", alpha = 0)\n", - " dGx.fit[indk,5] <- predict(ridgeXdk, newx = xtest, type = \"response\" ,\n", - " s = \"lambda.min\")\n", - "\n", - " # 6) Random forest\n", - " rfXdk <- randomForest(as.factor(treated) ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", - " data = ktrain , mtry = 4, ntree = 1000)\n", - " dGx.fit[indk,6] <- predict(rfXdk, ktest, type = 
\"prob\")[, 2]\n", - "\n", - " # 7) Tree (start big)\n", - " btXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", - " data = ktrain, method = \"anova\",\n", - " control=rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10))\n", - " # xval is the number of cross-validation splits. E.g. xval = 5 is five fold CV\n", - " dGx.fit[indk,7] <- predict(btXdk, ktest)\n", - "\n", - " # 8) Tree (small tree)\n", - " stXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", - " data = ktrain, method = \"anova\",\n", - " control=rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10))\n", - " # xval is the number of cross-validation splits. E.g. xval = 5 is five fold CV\n", - " dGx.fit[indk,8] <- predict(stXdk, ktest)\n", - "\n", - " # 9) Tree (cv)\n", - " bestcp <- btXdk$cptable[which.min(btXdk$cptable[,\"xerror\"]),\"CP\"]\n", - " cvXdk <- prune(btXdk , cp=bestcp)\n", - " dGx.fit[indk,9] <- predict(cvXdk, ktest)\n", - "\n", - " #-----------------------------------------------------------------------------\n", - " # E[Y|D=0,X]\n", - "\n", - " # subset to D = 0\n", - " ktrain0 = ktrain[ktrain$treated == 0, ]\n", - "\n", - " ytrain0 = ytrain[ktrain$treated == 0, ]\n", - " xtrain0 = xtrain[ktrain$treated == 0, ]\n", - "\n", - " # 1) Constant\n", - " yGd0x.fit[indk,1] <- mean(ktrain0$dy)\n", - "\n", - " # 2) Baseline controls\n", - " lmXyk <- lm(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, data = ktrain0)\n", - " yGd0x.fit[indk,2] <- predict(lmXyk, newdata = ktest)\n", - "\n", - " # 3) Region specific linear index\n", - " lmRXyk <- lm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0),\n", - " data = ktrain)\n", - " yGd0x.fit[indk,3] <- predict(lmRXyk, newdata = ktest)\n", - "\n", - " # 4) Lasso - expansion - default CV tuning\n", - " lassoXyk <- cv.glmnet(xtrain0 , ytrain0)\n", - " yGd0x.fit[indk,4] <- predict(lassoXyk, newx = xtest , s = \"lambda.min\")\n", - "\n", - " # 5) Ridge - expansion - default CV tuning\n", - " ridgeXyk <- cv.glmnet(xtrain0 , ytrain0 , alpha = 0)\n", - " yGd0x.fit[indk,5] <- predict(ridgeXyk, newx = xtest, s = \"lambda.min\")\n", - "\n", - " # 6) Random forest\n", - " rfXyk <- randomForest(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", - " data = ktrain0 , mtry = 4, ntree = 1000)\n", - " yGd0x.fit[indk,6] <- predict(rfXyk, ktest)\n", - "\n", - " # 7) Tree (start big)\n", - " btXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", - " data = ktrain0, method = \"anova\",\n", - " control=rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10))\n", - " yGd0x.fit[indk,7] <- predict(btXyk, ktest)\n", - "\n", - " # 8) Tree (small tree)\n", - " stXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", - " data = ktrain, method = \"anova\",\n", - " control=rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10))\n", - " yGd0x.fit[indk,8] <- predict(stXyk, ktest)\n", - "\n", - " # 9) Tree (cv)\n", - " bestcp <- btXyk$cptable[which.min(btXyk$cptable[,\"xerror\"]),\"CP\"]\n", - " cvXyk <- prune(btXyk , cp=bestcp)\n", - " yGd0x.fit[indk,9] <- predict(cvXyk, ktest)\n", - "\n", - " }\n", - "\n", - " RMSE.d[ii, ] <- sqrt(colMeans((usedata$treated - dGx.fit)^2))\n", - " RMSE.y[ii, ] <- sqrt(colMeans((usedata$dy[usedata$treated == 0] -\n", - " yGd0x.fit[usedata$treated == 0, ])^2))\n", - "\n", - " # trim propensity scores of 1 to .95\n", - " for(r in 1:9) {\n", - " trimmed[ii,r] = sum(dGx.fit[ , r] > .95)\n", - " dGx.fit[dGx.fit[ ,r] > .95,r] <- .95\n", - " }\n", - "\n", - " att.num <- c(colMeans(((usedata$treated - 
dGx.fit)/((pd.fit%*%matrix(1,1,9))*(1-dGx.fit)))*\n", - " (usedata$dy-yGd0x.fit)) ,\n", - " mean(((usedata$treated - dGx.fit[ ,which.min(RMSE.d[ii, ])])\n", - " /(pd.fit*(1-dGx.fit[ ,which.min(RMSE.d[ii, ])])))*\n", - " (usedata$dy-yGd0x.fit[ ,which.min(RMSE.y[ii, ])])))\n", - " att.den <- mean(usedata$treated/pd.fit)\n", - "\n", - " att[ii, ] <- att.num/att.den\n", - "\n", - " phihat <- cbind(((usedata$treated - dGx.fit)/((pd.fit%*%matrix(1,1,9))*(1-dGx.fit)))*\n", - " (usedata$dy-yGd0x.fit) ,\n", - " ((usedata$treated - dGx.fit[ ,which.min(RMSE.d[ii, ])])\n", - " /(pd.fit*(1-dGx.fit[ ,which.min(RMSE.d[ii, ])])))*\n", - " (usedata$dy-yGd0x.fit[ ,which.min(RMSE.y[ii, ])]))/att.den\n", - " se.att[ii, ] <- sqrt(colMeans((phihat^2))/n)\n", - "\n", - "\n", - "}" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "PMFoM90-guGI", - "outputId": "a088bc6b-8c4b-466a-dade-2e133e917250" - }, - "execution_count": 68, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[1] \"DML estimation starting, please wait\"\n", - "year: 2004 ; fold: 1 \n", - "year: 2004 ; fold: 2 \n", - "year: 2004 ; fold: 3 \n", - "year: 2004 ; fold: 4 \n", - "year: 2004 ; fold: 5 \n", - "year: 2005 ; fold: 1 \n", - "year: 2005 ; fold: 2 \n", - "year: 2005 ; fold: 3 \n", - "year: 2005 ; fold: 4 \n", - "year: 2005 ; fold: 5 \n", - "year: 2006 ; fold: 1 \n", - "year: 2006 ; fold: 2 \n", - "year: 2006 ; fold: 3 \n", - "year: 2006 ; fold: 4 \n", - "year: 2006 ; fold: 5 \n", - "year: 2007 ; fold: 1 \n", - "year: 2007 ; fold: 2 \n", - "year: 2007 ; fold: 3 \n", - "year: 2007 ; fold: 4 \n", - "year: 2007 ; fold: 5 \n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "We start by reporting the RMSE obtained during cross-fitting for each learner in each period." - ], - "metadata": { - "id": "o-r2aJIv2_Yh" - } - }, - { - "cell_type": "code", - "source": [ - "table1y <- matrix(0, 9, 4)\n", - "table1y <- t(RMSE.y)\n", - "colnames(table1y)<- c(\"2004\",\"2005\",\"2006\",\"2007\")\n", - "rownames(table1y)<- c(\"No Controls\", \"Basic\", \"Expansion\",\n", - " \"Lasso (CV)\", \"Ridge (CV)\",\n", - " \"Random Forest\",\"Deep Tree\",\n", - " \"Shallow Tree\", \"Tree (CV)\")\n", - "table1y" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 349 - }, - "id": "xRazffP5kaq8", - "outputId": "eb34dd8d-fb75-4d48-cfe7-63baf9906645" - }, - "execution_count": 69, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\t\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\n", - "
A matrix: 9 × 4 of type dbl
| 2004 | 2005 | 2006 | 2007 |
[cross-fit RMSE for the outcome regression, one row per learner: No Controls, Basic, Expansion, Lasso (CV), Ridge (CV), Random Forest, Deep Tree, Shallow Tree, Tree (CV)]
\n" - ], - "text/markdown": "\nA matrix: 9 × 4 of type dbl\n\n| | 2004 | 2005 | 2006 | 2007 |\n|---|---|---|---|---|\n| No Controls | 0.1633223 | 0.1881522 | 0.2234581 | 0.2301717 |\n| Basic | 0.1634008 | 0.1854012 | 0.2191227 | 0.2215959 |\n| Expansion | 0.1887055 | 0.2121826 | 0.2444626 | 0.2709898 |\n| Lasso (CV) | 0.1631202 | 0.1850846 | 0.2192553 | 0.2214365 |\n| Ridge (CV) | 0.1630572 | 0.1851415 | 0.2190881 | 0.2212952 |\n| Random Forest | 0.1716009 | 0.1982386 | 0.2329605 | 0.2387630 |\n| Deep Tree | 0.1922407 | 0.2249977 | 0.2599250 | 0.2708277 |\n| Shallow Tree | 0.1677878 | 0.1924450 | 0.2279024 | 0.2289600 |\n| Tree (CV) | 0.1633223 | 0.1888902 | 0.2178365 | 0.2226685 |\n\n", - "text/latex": "A matrix: 9 × 4 of type dbl\n\\begin{tabular}{r|llll}\n & 2004 & 2005 & 2006 & 2007\\\\\n\\hline\n\tNo Controls & 0.1633223 & 0.1881522 & 0.2234581 & 0.2301717\\\\\n\tBasic & 0.1634008 & 0.1854012 & 0.2191227 & 0.2215959\\\\\n\tExpansion & 0.1887055 & 0.2121826 & 0.2444626 & 0.2709898\\\\\n\tLasso (CV) & 0.1631202 & 0.1850846 & 0.2192553 & 0.2214365\\\\\n\tRidge (CV) & 0.1630572 & 0.1851415 & 0.2190881 & 0.2212952\\\\\n\tRandom Forest & 0.1716009 & 0.1982386 & 0.2329605 & 0.2387630\\\\\n\tDeep Tree & 0.1922407 & 0.2249977 & 0.2599250 & 0.2708277\\\\\n\tShallow Tree & 0.1677878 & 0.1924450 & 0.2279024 & 0.2289600\\\\\n\tTree (CV) & 0.1633223 & 0.1888902 & 0.2178365 & 0.2226685\\\\\n\\end{tabular}\n", - "text/plain": [ - " 2004 2005 2006 2007 \n", - "No Controls 0.1633223 0.1881522 0.2234581 0.2301717\n", - "Basic 0.1634008 0.1854012 0.2191227 0.2215959\n", - "Expansion 0.1887055 0.2121826 0.2444626 0.2709898\n", - "Lasso (CV) 0.1631202 0.1850846 0.2192553 0.2214365\n", - "Ridge (CV) 0.1630572 0.1851415 0.2190881 0.2212952\n", - "Random Forest 0.1716009 0.1982386 0.2329605 0.2387630\n", - "Deep Tree 0.1922407 0.2249977 0.2599250 0.2708277\n", - "Shallow Tree 0.1677878 0.1924450 0.2279024 0.2289600\n", - "Tree (CV) 0.1633223 0.1888902 0.2178365 0.2226685" - ] - }, - "metadata": {} - } - ] - }, - { - "cell_type": "code", - "source": [ - "table1d <- matrix(0, 9, 4)\n", - "table1d <- t(RMSE.d)\n", - "colnames(table1d)<- c(\"2004\",\"2005\",\"2006\",\"2007\")\n", - "rownames(table1d)<- c(\"No Controls\", \"Basic\", \"Expansion\",\n", - " \"Lasso (CV)\", \"Ridge (CV)\",\n", - " \"Random Forest\",\"Deep Tree\",\n", - " \"Shallow Tree\", \"Tree (CV)\")\n", - "table1d" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 349 - }, - "id": "tlrNObn1kvpF", - "outputId": "80322a03-006e-4696-ae2e-a02c1c97422a" - }, - "execution_count": 70, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\t\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\n", - "
A matrix: 9 × 4 of type dbl
| 2004 | 2005 | 2006 | 2007 |
[cross-fit RMSE for the propensity score, one row per learner: No Controls, Basic, Expansion, Lasso (CV), Ridge (CV), Random Forest, Deep Tree, Shallow Tree, Tree (CV)]
\n" - ], - "text/markdown": "\nA matrix: 9 × 4 of type dbl\n\n| | 2004 | 2005 | 2006 | 2007 |\n|---|---|---|---|---|\n| No Controls | 0.1982598 | 0.2006048 | 0.2111019 | 0.2503029 |\n| Basic | 0.1986435 | 0.2009229 | 0.2112507 | 0.2216673 |\n| Expansion | 0.1988052 | 0.2007440 | 0.2112708 | 0.2216673 |\n| Lasso (CV) | 0.1968467 | 0.1985849 | 0.2082957 | 0.2197175 |\n| Ridge (CV) | 0.1971335 | 0.1988855 | 0.2085507 | 0.2198350 |\n| Random Forest | 0.2004907 | 0.2051167 | 0.2128325 | 0.2355156 |\n| Deep Tree | 0.2206856 | 0.2363919 | 0.2302537 | 0.2744030 |\n| Shallow Tree | 0.1921123 | 0.1944296 | 0.2029052 | 0.2300568 |\n| Tree (CV) | 0.1936877 | 0.1955365 | 0.2039038 | 0.2311018 |\n\n", - "text/latex": "A matrix: 9 × 4 of type dbl\n\\begin{tabular}{r|llll}\n & 2004 & 2005 & 2006 & 2007\\\\\n\\hline\n\tNo Controls & 0.1982598 & 0.2006048 & 0.2111019 & 0.2503029\\\\\n\tBasic & 0.1986435 & 0.2009229 & 0.2112507 & 0.2216673\\\\\n\tExpansion & 0.1988052 & 0.2007440 & 0.2112708 & 0.2216673\\\\\n\tLasso (CV) & 0.1968467 & 0.1985849 & 0.2082957 & 0.2197175\\\\\n\tRidge (CV) & 0.1971335 & 0.1988855 & 0.2085507 & 0.2198350\\\\\n\tRandom Forest & 0.2004907 & 0.2051167 & 0.2128325 & 0.2355156\\\\\n\tDeep Tree & 0.2206856 & 0.2363919 & 0.2302537 & 0.2744030\\\\\n\tShallow Tree & 0.1921123 & 0.1944296 & 0.2029052 & 0.2300568\\\\\n\tTree (CV) & 0.1936877 & 0.1955365 & 0.2039038 & 0.2311018\\\\\n\\end{tabular}\n", - "text/plain": [ - " 2004 2005 2006 2007 \n", - "No Controls 0.1982598 0.2006048 0.2111019 0.2503029\n", - "Basic 0.1986435 0.2009229 0.2112507 0.2216673\n", - "Expansion 0.1988052 0.2007440 0.2112708 0.2216673\n", - "Lasso (CV) 0.1968467 0.1985849 0.2082957 0.2197175\n", - "Ridge (CV) 0.1971335 0.1988855 0.2085507 0.2198350\n", - "Random Forest 0.2004907 0.2051167 0.2128325 0.2355156\n", - "Deep Tree 0.2206856 0.2363919 0.2302537 0.2744030\n", - "Shallow Tree 0.1921123 0.1944296 0.2029052 0.2300568\n", - "Tree (CV) 0.1936877 0.1955365 0.2039038 0.2311018" - ] - }, - "metadata": {} - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "Here we see that the Deep Tree systematically performs worse in terms of cross-fit predictions than the other learners for both tasks and that Expansion performs similarly poorly for the outcome prediction. It also appears there is some signal in the regressors, especially for the propensity score, as all methods outside of Deep Tree and Expansion produce smaller RMSEs than the No Controls baseline. The other methods all produce similar RMSEs, with a small edge going to Ridge and Lasso. While it would be hard to reliably conclude which of the relatively good performing methods is statistically best here, one could exclude Expansion and Deep Tree from further consideration on the basis of out-of-sample performance suggesting\n", - "they are doing a poor job approximating the nuisance functions. Best (or a different ensemble) provides a good baseline that is principled in the sense that one could pre-commit to using the best learners without having first looked at the subsequent estimation results." - ], - "metadata": { - "id": "nxlhGchj4n-9" - } - }, - { - "cell_type": "markdown", - "source": [ - "We report estimates of the ATET in each period in the following table." 
- ], - "metadata": { - "id": "85jrGLgG4-Gw" - } - }, - { - "cell_type": "code", - "source": [ - "table2 <- matrix(0, 20, 4)\n", - "table2[seq(1,20,2),] <- t(att)\n", - "table2[seq(2,20,2),] <- t(se.att)\n", - "colnames(table2)<- c(\"2004\",\"2005\",\"2006\",\"2007\")\n", - "rownames(table2)<- c(\"No Controls\",\"s.e.\",\"Basic\",\"s.e.\",\n", - " \"Expansion\",\"s.e.\",\"Lasso (CV)\",\"s.e.\",\n", - " \"Ridge (CV)\",\"s.e.\",\"Random Forest\",\"s.e.\",\n", - " \"Deep Tree\",\"s.e.\",\"Shallow Tree\",\"s.e.\",\n", - " \"Tree (CV)\",\"s.e.\",\"Best\",\"s.e.\")\n", - "table2" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 693 - }, - "id": "-tfALIgnkvao", - "outputId": "cf1c07a7-60ef-41a3-f8b3-51a01819c4c9" - }, - "execution_count": 71, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\t\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\n", - "
A matrix: 20 × 4 of type dbl
| 2004 | 2005 | 2006 | 2007 |
[ATET point estimate and standard error, one pair of rows per learner: No Controls, Basic, Expansion, Lasso (CV), Ridge (CV), Random Forest, Deep Tree, Shallow Tree, Tree (CV), Best]
\n" - ], - "text/markdown": "\nA matrix: 20 × 4 of type dbl\n\n| | 2004 | 2005 | 2006 | 2007 |\n|---|---|---|---|---|\n| No Controls | -0.03929934 | -0.075846130 | -0.11667635 | -0.13120725 |\n| s.e. | 0.01868421 | 0.021063618 | 0.02281314 | 0.02607473 |\n| Basic | -0.03663512 | -0.066280602 | -0.08837113 | -0.04108086 |\n| s.e. | 0.01842984 | 0.020412243 | 0.02108130 | 0.03259285 |\n| Expansion | -0.02243204 | -0.046360609 | -0.06148823 | 0.30300328 |\n| s.e. | 0.02499826 | 0.029872098 | 0.03266593 | 0.22708133 |\n| Lasso (CV) | -0.03455314 | -0.062192311 | -0.08226398 | -0.04932912 |\n| s.e. | 0.01842588 | 0.020047216 | 0.02086891 | 0.03061909 |\n| Ridge (CV) | -0.03479934 | -0.062216632 | -0.08300939 | -0.06134609 |\n| s.e. | 0.01845471 | 0.020110151 | 0.02087059 | 0.02518957 |\n| Random Forest | 0.01321134 | -0.056233389 | -0.03857945 | -0.07055396 |\n| s.e. | 0.02914845 | 0.024028011 | 0.02824594 | 0.03801528 |\n| Deep Tree | 0.07732570 | 0.006890783 | 0.09978114 | -0.47009336 |\n| s.e. | 0.07899597 | 0.172330204 | 0.08032430 | 0.17774703 |\n| Shallow Tree | -0.02788830 | -0.040227625 | -0.05756143 | -0.06500586 |\n| s.e. | 0.01893650 | 0.020784187 | 0.02085503 | 0.02567692 |\n| Tree (CV) | -0.02675490 | -0.044549388 | -0.06034715 | -0.06855990 |\n| s.e. | 0.01887985 | 0.021384449 | 0.02074813 | 0.02499548 |\n| Best | -0.02800609 | -0.051160252 | -0.05451478 | -0.05480770 |\n| s.e. | 0.01867106 | 0.021279527 | 0.02111372 | 0.03083662 |\n\n", - "text/latex": "A matrix: 20 × 4 of type dbl\n\\begin{tabular}{r|llll}\n & 2004 & 2005 & 2006 & 2007\\\\\n\\hline\n\tNo Controls & -0.03929934 & -0.075846130 & -0.11667635 & -0.13120725\\\\\n\ts.e. & 0.01868421 & 0.021063618 & 0.02281314 & 0.02607473\\\\\n\tBasic & -0.03663512 & -0.066280602 & -0.08837113 & -0.04108086\\\\\n\ts.e. & 0.01842984 & 0.020412243 & 0.02108130 & 0.03259285\\\\\n\tExpansion & -0.02243204 & -0.046360609 & -0.06148823 & 0.30300328\\\\\n\ts.e. & 0.02499826 & 0.029872098 & 0.03266593 & 0.22708133\\\\\n\tLasso (CV) & -0.03455314 & -0.062192311 & -0.08226398 & -0.04932912\\\\\n\ts.e. & 0.01842588 & 0.020047216 & 0.02086891 & 0.03061909\\\\\n\tRidge (CV) & -0.03479934 & -0.062216632 & -0.08300939 & -0.06134609\\\\\n\ts.e. & 0.01845471 & 0.020110151 & 0.02087059 & 0.02518957\\\\\n\tRandom Forest & 0.01321134 & -0.056233389 & -0.03857945 & -0.07055396\\\\\n\ts.e. & 0.02914845 & 0.024028011 & 0.02824594 & 0.03801528\\\\\n\tDeep Tree & 0.07732570 & 0.006890783 & 0.09978114 & -0.47009336\\\\\n\ts.e. & 0.07899597 & 0.172330204 & 0.08032430 & 0.17774703\\\\\n\tShallow Tree & -0.02788830 & -0.040227625 & -0.05756143 & -0.06500586\\\\\n\ts.e. & 0.01893650 & 0.020784187 & 0.02085503 & 0.02567692\\\\\n\tTree (CV) & -0.02675490 & -0.044549388 & -0.06034715 & -0.06855990\\\\\n\ts.e. & 0.01887985 & 0.021384449 & 0.02074813 & 0.02499548\\\\\n\tBest & -0.02800609 & -0.051160252 & -0.05451478 & -0.05480770\\\\\n\ts.e. & 0.01867106 & 0.021279527 & 0.02111372 & 0.03083662\\\\\n\\end{tabular}\n", - "text/plain": [ - " 2004 2005 2006 2007 \n", - "No Controls -0.03929934 -0.075846130 -0.11667635 -0.13120725\n", - "s.e. 0.01868421 0.021063618 0.02281314 0.02607473\n", - "Basic -0.03663512 -0.066280602 -0.08837113 -0.04108086\n", - "s.e. 0.01842984 0.020412243 0.02108130 0.03259285\n", - "Expansion -0.02243204 -0.046360609 -0.06148823 0.30300328\n", - "s.e. 0.02499826 0.029872098 0.03266593 0.22708133\n", - "Lasso (CV) -0.03455314 -0.062192311 -0.08226398 -0.04932912\n", - "s.e. 
0.01842588 0.020047216 0.02086891 0.03061909\n", - "Ridge (CV) -0.03479934 -0.062216632 -0.08300939 -0.06134609\n", - "s.e. 0.01845471 0.020110151 0.02087059 0.02518957\n", - "Random Forest 0.01321134 -0.056233389 -0.03857945 -0.07055396\n", - "s.e. 0.02914845 0.024028011 0.02824594 0.03801528\n", - "Deep Tree 0.07732570 0.006890783 0.09978114 -0.47009336\n", - "s.e. 0.07899597 0.172330204 0.08032430 0.17774703\n", - "Shallow Tree -0.02788830 -0.040227625 -0.05756143 -0.06500586\n", - "s.e. 0.01893650 0.020784187 0.02085503 0.02567692\n", - "Tree (CV) -0.02675490 -0.044549388 -0.06034715 -0.06855990\n", - "s.e. 0.01887985 0.021384449 0.02074813 0.02499548\n", - "Best -0.02800609 -0.051160252 -0.05451478 -0.05480770\n", - "s.e. 0.01867106 0.021279527 0.02111372 0.03083662" - ] - }, - "metadata": {} - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "Here, we see that the majority of methods provide point estimates that suggest the effect of the minimum wage increase leads to decreases in youth employment with small effects in the initial period that become larger in the years following the treatment. This pattern seems economically plausible as it may take time for firms to adjust employment and other input choices in response to a minimum wage change. The methods that produce estiamtes that are not consistent with this pattern are Deep Tree and Expansion which are both suspect as they systematically underperform in terms of having poor cross-fit prediction performance. In terms of point estimates, the other pattern that emerges is that all estimates that use the covariates produce ATET estimates that are systematically smaller in magnitude than the No Controls baseline, suggesting that failing to include the controls may lead to overstatement of treatment effects in this example.\n", - "\n", - "Turning to inference, we would reject the hypothesis of no minimum wage effect two or more years after the change at the 5% level, even after multiple testing correction, if we were to focus on the row \"Best\" (or many of the other individual rows). Focusing on \"Best\" is a reasonable ex ante strategy that could be committed to prior to conducting any analysis. It is, of course, reassuring that this broad conclusion is also obtained using many of the individual learners suggesting some robustness to the exact choice of learner made." - ], - "metadata": { - "id": "euBQPSYs47iZ" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Assess pre-trends\n", - "\n", - "Because we have data for the period 2001-2007, we can perform a so-called pre-trends test to provide some evidence about the plausibility of the conditional parallel trends assumption. Specifically, we can continue to use 2003 as the reference period but now consider 2002 to be the treatment period. Sensible economic mechanisms underlying the assumption would then typically suggest that the ATET in 2002 - before the 2004 minimum wage change we are considering - should be zero. Finding evidence that the ATET in 2002 is non-zero then calls into question the validity of the assumption.\n", - "\n", - "We change the treatment status of those observations, which received treatment in 2004 in the 2002 data and create a placebo treatment as well as control group." 
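        "\n",
        "In the notation used above, this placebo exercise estimates\n",
        "\n",
        "$$ATET_{2002} = E[Y_{2002} - Y_{2003} \\mid D=1] - E\\big[\\,E[Y_{2002} - Y_{2003} \\mid D=0, X] \\;\\big|\\; D=1\\big],$$\n",
        "\n",
        "where $D$ indicates counties that raised their minimum wage in 2004. If the conditional parallel trends assumption extends to the pre-treatment years, this quantity is zero, so a clearly non-zero estimate would be evidence against the identifying assumption."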
- ], - "metadata": { - "id": "MhFmwCr3kulb" - } - }, - { - "cell_type": "code", - "source": [ - "treat2 <- treat2[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", - "treat2$treated <- 1 # Code these observations as treated\n", - "\n", - "tdid02 <- merge(treat2, treatB, by = \"id\")\n", - "dy <- tdid02$lemp-tdid02$lemp.pre\n", - "tdid02$dy <- dy\n", - "tdid02 <- tdid02[ , -c(\"id\",\"lemp\",\"lemp.pre\")]\n", - "\n", - "cont2 <- cont2[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", - "\n", - "cdid02 <- merge(cont2, contB, by = \"id\")\n", - "dy <- cdid02$lemp-cdid02$lemp.pre\n", - "cdid02$dy <- dy\n", - "cdid02 <- cdid02[ , -c(\"id\",\"lemp\",\"lemp.pre\")]" - ], - "metadata": { - "id": "dogBL7W4o3P4" - }, - "execution_count": 72, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "We repeat the exercise for obtaining our ATET estimates and standard error for 2004-2007. Particularly, we also use all the learners as mentioned above." - ], - "metadata": { - "id": "JhjbleYX599z" - } - }, - { - "cell_type": "code", - "source": [ - "attP <- matrix(NA,1,10)\n", - "se.attP <- matrix(NA,1,10)\n", - "RMSE.dP <- matrix(NA,1,9)\n", - "RMSE.yP <- matrix(NA,1,9)\n", - "trimmedP <- matrix(NA,1,9)\n", - "for(ii in 1){\n", - "\n", - " tdata <- get(paste(\"tdid0\",(3-ii),sep=\"\")) # Treatment data\n", - " cdata <- get(paste(\"cdid0\",(3-ii),sep=\"\")) # Control data\n", - " usedata <- rbind(tdata,cdata)\n", - "\n", - " #-----------------------------------------------------------------------------\n", - " # Cross-fit setup\n", - " n <- nrow(usedata)\n", - " Kf <- 5 # Number of folds\n", - " sampleframe <- rep(1:Kf, ceiling(n/Kf))\n", - " cfgroup <- sample(sampleframe, size=n, replace = FALSE) # Cross-fitting groups\n", - "\n", - " # Initialize variables for CV predictions\n", - " yGd0x.fit <- matrix(NA,n,9)\n", - " dGx.fit <- matrix(NA,n,9)\n", - " pd.fit <-matrix(NA,n,1)\n", - "\n", - " #-----------------------------------------------------------------------------\n", - " # Cross-fit loop\n", - " for(k in 1:Kf) {\n", - " cat(\"year: \",ii+2001,\"; fold: \",k,\"\\n\")\n", - " indk <- cfgroup == k\n", - "\n", - " ktrain <- usedata[!indk,]\n", - " ktest <- usedata[indk,]\n", - "\n", - " # Build some matrices for later\n", - " ytrain <- as.matrix(usedata[!indk,\"dy\"])\n", - " ytest <- as.matrix(usedata[indk,\"dy\"])\n", - " dtrain <- as.matrix(usedata[!indk,\"treated\"])\n", - " dtest <- as.matrix(usedata[indk,\"treated\"])\n", - "\n", - " # Expansion for lasso/ridge (region specific cubic polynomial)\n", - " X.expand <- model.matrix( ~ region*(polym(lemp.0 , lpop.0 , lavg_pay.0 ,\n", - " degree = 3, raw = TRUE)),\n", - " data = usedata)\n", - "\n", - " xtrain <- as.matrix(X.expand[!indk,])\n", - " xtest <- as.matrix(X.expand[indk,])\n", - "\n", - " #-----------------------------------------------------------------------------\n", - " # P(D = 1)\n", - " pd.fit[indk,1] <- mean(ktrain$treated)\n", - "\n", - " #-----------------------------------------------------------------------------\n", - " # E[D|X]\n", - "\n", - " # 1) Constant\n", - " dGx.fit[indk,1] <- mean(ktrain$treated)\n", - "\n", - " # 2) Baseline controls\n", - " glmXdk <- glm(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0,\n", - " family = \"binomial\", data = ktrain)\n", - " dGx.fit[indk,2] <- predict(glmXdk, newdata = ktest, type = \"response\")\n", - "\n", - " # 3) Region specific linear index\n", - " glmRXdk <- glm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0),\n", - " family = \"binomial\", data = 
ktrain)\n", - " dGx.fit[indk,3] <- predict(glmRXdk, newdata = ktest, type = \"response\")\n", - "\n", - " # 4) Lasso - expansion - default CV tuning\n", - " lassoXdk <- cv.glmnet(xtrain , dtrain , family = \"binomial\", type.measure = \"mse\")\n", - " dGx.fit[indk,4] <- predict(lassoXdk, newx = xtest, type = \"response\" ,\n", - " s = \"lambda.min\")\n", - "\n", - " # 5) Ridge - expansion - default CV tuning\n", - " ridgeXdk <- cv.glmnet(xtrain , dtrain , family = \"binomial\",\n", - " type.measure = \"mse\", alpha = 0)\n", - " dGx.fit[indk,5] <- predict(ridgeXdk, newx = xtest, type = \"response\" ,\n", - " s = \"lambda.min\")\n", - "\n", - " # 6) Random forest\n", - " rfXdk <- randomForest(as.factor(treated) ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", - " data = ktrain , mtry = 4, ntree = 1000)\n", - " dGx.fit[indk,6] <- predict(rfXdk, ktest, type = \"prob\")[, 2]\n", - "\n", - " # 7) Tree (start big)\n", - " btXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", - " data = ktrain, method = \"anova\",\n", - " control=rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10))\n", - " # xval is the number of cross-validation splits. E.g. xval = 5 is five fold CV\n", - " dGx.fit[indk,7] <- predict(btXdk, ktest)\n", - "\n", - " # 8) Tree (small tree)\n", - " stXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", - " data = ktrain, method = \"anova\",\n", - " control=rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10))\n", - " # xval is the number of cross-validation splits. E.g. xval = 5 is five fold CV\n", - " dGx.fit[indk,8] <- predict(stXdk, ktest)\n", - "\n", - " # 9) Tree (cv)\n", - " bestcp <- btXdk$cptable[which.min(btXdk$cptable[,\"xerror\"]),\"CP\"]\n", - " cvXdk <- prune(btXdk , cp=bestcp)\n", - " dGx.fit[indk,9] <- predict(cvXdk, ktest)\n", - "\n", - " #-----------------------------------------------------------------------------\n", - " # E[Y|D=0,X]\n", - "\n", - " # subset to D = 0\n", - " ktrain0 = ktrain[ktrain$treated == 0, ]\n", - "\n", - " ytrain0 = ytrain[ktrain$treated == 0, ]\n", - " xtrain0 = xtrain[ktrain$treated == 0, ]\n", - "\n", - " # 1) Constant\n", - " yGd0x.fit[indk,1] <- mean(ktrain0$dy)\n", - "\n", - " # 2) Baseline controls\n", - " lmXyk <- lm(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, data = ktrain0)\n", - " yGd0x.fit[indk,2] <- predict(lmXyk, newdata = ktest)\n", - "\n", - " # 3) Region specific linear index\n", - " lmRXyk <- lm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0),\n", - " data = ktrain)\n", - " yGd0x.fit[indk,3] <- predict(lmRXyk, newdata = ktest)\n", - "\n", - " # 4) Lasso - expansion - default CV tuning\n", - " lassoXyk <- cv.glmnet(xtrain0 , ytrain0)\n", - " yGd0x.fit[indk,4] <- predict(lassoXyk, newx = xtest , s = \"lambda.min\")\n", - "\n", - " # 5) Ridge - expansion - default CV tuning\n", - " ridgeXyk <- cv.glmnet(xtrain0 , ytrain0 , alpha = 0)\n", - " yGd0x.fit[indk,5] <- predict(ridgeXyk, newx = xtest, s = \"lambda.min\")\n", - "\n", - " # 6) Random forest\n", - " rfXyk <- randomForest(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", - " data = ktrain0 , mtry = 4, ntree = 1000)\n", - " yGd0x.fit[indk,6] <- predict(rfXyk, ktest)\n", - "\n", - " # 7) Tree (start big)\n", - " btXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", - " data = ktrain0, method = \"anova\",\n", - " control=rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10))\n", - " yGd0x.fit[indk,7] <- predict(btXyk, ktest)\n", - "\n", - " # 8) Tree (small tree)\n", - " stXyk <- rpart(dy ~ region + lemp.0 + 
lpop.0 + lavg_pay.0 ,\n", - " data = ktrain, method = \"anova\",\n", - " control=rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10))\n", - " yGd0x.fit[indk,8] <- predict(stXyk, ktest)\n", - "\n", - " # 9) Tree (cv)\n", - " bestcp <- btXyk$cptable[which.min(btXyk$cptable[,\"xerror\"]),\"CP\"]\n", - " cvXyk <- prune(btXyk , cp=bestcp)\n", - " yGd0x.fit[indk,9] <- predict(cvXyk, ktest)\n", - "\n", - " }\n", - "\n", - " RMSE.dP[ii, ] <- sqrt(colMeans((usedata$treated - dGx.fit)^2))\n", - " RMSE.yP[ii, ] <- sqrt(colMeans((usedata$dy[usedata$treated == 0] -\n", - " yGd0x.fit[usedata$treated == 0, ])^2))\n", - "\n", - " # trim propensity scores of 1 to .95\n", - " for(r in 1:9) {\n", - " trimmedP[ii,r] = sum(dGx.fit[ , r] > .95)\n", - " dGx.fit[dGx.fit[ ,r] > .95,r] <- .95\n", - " }\n", - "\n", - " att.num <- c(colMeans(((usedata$treated - dGx.fit)/((pd.fit%*%matrix(1,1,9))*(1-dGx.fit)))*\n", - " (usedata$dy-yGd0x.fit)) ,\n", - " mean(((usedata$treated - dGx.fit[ ,which.min(RMSE.d[ii, ])])\n", - " /(pd.fit*(1-dGx.fit[ ,which.min(RMSE.d[ii, ])])))*\n", - " (usedata$dy-yGd0x.fit[ ,which.min(RMSE.y[ii, ])])))\n", - " att.den <- mean(usedata$treated/pd.fit)\n", - "\n", - " attP[ii, ] <- att.num/att.den\n", - "\n", - " phihat <- cbind(((usedata$treated - dGx.fit)/((pd.fit%*%matrix(1,1,9))*(1-dGx.fit)))*\n", - " (usedata$dy-yGd0x.fit) ,\n", - " ((usedata$treated - dGx.fit[ ,which.min(RMSE.d[ii, ])])\n", - " /(pd.fit*(1-dGx.fit[ ,which.min(RMSE.d[ii, ])])))*\n", - " (usedata$dy-yGd0x.fit[ ,which.min(RMSE.y[ii, ])]))/att.den\n", - " se.attP[ii, ] <- sqrt(colMeans((phihat^2))/n)\n", - "\n", - "\n", - "}" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Df8A2qj_odSh", - "outputId": "7b81df57-3ad3-4f15-9651-1313dd53189f" - }, - "execution_count": 73, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "year: 2002 ; fold: 1 \n", - "year: 2002 ; fold: 2 \n", - "year: 2002 ; fold: 3 \n", - "year: 2002 ; fold: 4 \n", - "year: 2002 ; fold: 5 \n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "We report the results in the following table." - ], - "metadata": { - "id": "ZpL1SPbe6HRE" - } - }, - { - "cell_type": "code", - "source": [ - "tableP <- matrix(0, 4, 10)\n", - "tableP[1,] <- c(RMSE.yP, min(RMSE.yP))\n", - "tableP[2,] <- c(RMSE.dP, min(RMSE.dP))\n", - "tableP[3,] <- attP\n", - "tableP[4,] <- se.attP\n", - "rownames(tableP)<- c(\"RMSE Y\",\"RMSE D\",\"ATET\",\"s.e.\")\n", - "colnames(tableP)<- c(\"No Controls\", \"Basic\", \"Expansion\",\n", - " \"Lasso (CV)\", \"Ridge (CV)\",\n", - " \"Random Forest\",\"Deep Tree\",\n", - " \"Shallow Tree\", \"Tree (CV)\" , \"Best\")\n", - "tableP = t(tableP)\n", - "tableP" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 380 - }, - "id": "2y3YugM4omz4", - "outputId": "44e812c4-bc35-4309-e567-f6a37bca1d0f" - }, - "execution_count": 74, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\t\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\n", - "
A matrix: 10 × 4 of type dbl
RMSE YRMSE DATETs.e.
No Controls0.15431950.1944604-0.0037220380.01310626
Basic0.15405620.1948737-0.0043577720.01301002
Expansion0.15772370.1949051 0.0045885230.01395891
Lasso (CV)0.15441080.1932033-0.0038828200.01312425
Ridge (CV)0.15439470.1934664-0.0053156960.01309610
Random Forest0.16348230.2265493 0.0229668860.02648961
Deep Tree0.18221660.2233871 0.0080466530.02757672
Shallow Tree0.16196530.1884214-0.0037129590.01337397
Tree (CV)0.15502540.1905196-0.0056030100.01330858
Best0.00000000.0000000-0.0030723040.01342307
\n" - ], - "text/markdown": "\nA matrix: 10 × 4 of type dbl\n\n| | RMSE Y | RMSE D | ATET | s.e. |\n|---|---|---|---|---|\n| No Controls | 0.1543195 | 0.1944604 | -0.003722038 | 0.01310626 |\n| Basic | 0.1540562 | 0.1948737 | -0.004357772 | 0.01301002 |\n| Expansion | 0.1577237 | 0.1949051 | 0.004588523 | 0.01395891 |\n| Lasso (CV) | 0.1544108 | 0.1932033 | -0.003882820 | 0.01312425 |\n| Ridge (CV) | 0.1543947 | 0.1934664 | -0.005315696 | 0.01309610 |\n| Random Forest | 0.1634823 | 0.2265493 | 0.022966886 | 0.02648961 |\n| Deep Tree | 0.1822166 | 0.2233871 | 0.008046653 | 0.02757672 |\n| Shallow Tree | 0.1619653 | 0.1884214 | -0.003712959 | 0.01337397 |\n| Tree (CV) | 0.1550254 | 0.1905196 | -0.005603010 | 0.01330858 |\n| Best | 0.0000000 | 0.0000000 | -0.003072304 | 0.01342307 |\n\n", - "text/latex": "A matrix: 10 × 4 of type dbl\n\\begin{tabular}{r|llll}\n & RMSE Y & RMSE D & ATET & s.e.\\\\\n\\hline\n\tNo Controls & 0.1543195 & 0.1944604 & -0.003722038 & 0.01310626\\\\\n\tBasic & 0.1540562 & 0.1948737 & -0.004357772 & 0.01301002\\\\\n\tExpansion & 0.1577237 & 0.1949051 & 0.004588523 & 0.01395891\\\\\n\tLasso (CV) & 0.1544108 & 0.1932033 & -0.003882820 & 0.01312425\\\\\n\tRidge (CV) & 0.1543947 & 0.1934664 & -0.005315696 & 0.01309610\\\\\n\tRandom Forest & 0.1634823 & 0.2265493 & 0.022966886 & 0.02648961\\\\\n\tDeep Tree & 0.1822166 & 0.2233871 & 0.008046653 & 0.02757672\\\\\n\tShallow Tree & 0.1619653 & 0.1884214 & -0.003712959 & 0.01337397\\\\\n\tTree (CV) & 0.1550254 & 0.1905196 & -0.005603010 & 0.01330858\\\\\n\tBest & 0.0000000 & 0.0000000 & -0.003072304 & 0.01342307\\\\\n\\end{tabular}\n", - "text/plain": [ - " RMSE Y RMSE D ATET s.e. \n", - "No Controls 0.1543195 0.1944604 -0.003722038 0.01310626\n", - "Basic 0.1540562 0.1948737 -0.004357772 0.01301002\n", - "Expansion 0.1577237 0.1949051 0.004588523 0.01395891\n", - "Lasso (CV) 0.1544108 0.1932033 -0.003882820 0.01312425\n", - "Ridge (CV) 0.1543947 0.1934664 -0.005315696 0.01309610\n", - "Random Forest 0.1634823 0.2265493 0.022966886 0.02648961\n", - "Deep Tree 0.1822166 0.2233871 0.008046653 0.02757672\n", - "Shallow Tree 0.1619653 0.1884214 -0.003712959 0.01337397\n", - "Tree (CV) 0.1550254 0.1905196 -0.005603010 0.01330858\n", - "Best 0.1540562 0.1884214 -0.003072304 0.01342307" - ] - }, - "metadata": {} - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "Here we see broad agreement across all methods in the sense of returning point estimates that are small in magnitude and small relative to standard errors. In no case would we reject the hypothesis that the pre-event effect in 2002 is different from zero at usual levels of significance. We note that failing to reject the hypothesis of no pre-event effects certainly does not imply that the conditional DiD assumption is in fact satisfied. For example, confidence intervals include values that would be consistent with relatively large pre-event effects. However, it is reassuring to see that there is not strong evidence of a violation of the underlying identifying assumption." 
- ], - "metadata": { - "id": "k0jGwOFO6NZD" - } - } - ] + { + "cell_type": "markdown", + "metadata": { + "id": "u6jWjkrzU8I6" + }, + "source": [ + "## Loading the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "znh8YcAXSp3E" + }, + "outputs": [], + "source": [ + "data <- read.csv(\"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/minwage_data.csv\", row.names=1)\n", + "data <- data.table(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 306 + }, + "id": "PQdsT6BnWKeq", + "outputId": "d71da67c-541c-4c5f-f65a-7e6dd70230cd" + }, + "outputs": [], + "source": [ + "head(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v37g7zlwW5pH" + }, + "source": [ + "### Data Preparation\n", + "\n", + "We remove observations that are already treated in the first observed period (2001). We drop all variables that we won't use in our analysis." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "W6ob7pptW49G" + }, + "outputs": [], + "source": [ + "data <- subset(data, (G==0) | (G>2001))\n", + "data <- data[, -c(\"countyreal\",\"state_name\",\"FIPS\",\"emp0A01_BS\",\n", + " \"quarter\", \"censusdiv\",\"pop\",\"annual_avg_pay\",\n", + " \"state_mw\",\"fed_mw\", \"ever_treated\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ri12EDNJaAfF" + }, + "source": [ + "Next, we create the treatment groups. We focus our analysis exclusively on the set of counties that had wage increases away from the federal minimum wage in 2004. That is, we treat 2003 and earlier as the pre-treatment period." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "huj7huQ1aQSq" + }, + "outputs": [], + "source": [ + "treat1 <- subset(data, (G == 2004) & (year == 2001))\n", + "treat2 <- subset(data, (G == 2004) & (year == 2002))\n", + "treat3 <- subset(data, (G == 2004) & (year == 2003))\n", + "treat4 <- subset(data, (G == 2004) & (year == 2004))\n", + "treat5 <- subset(data, (G == 2004) & (year == 2005))\n", + "treat6 <- subset(data, (G == 2004) & (year == 2006))\n", + "treat7 <- subset(data, (G == 2004) & (year == 2007))\n", + "\n", + "cont1 <- subset(data, (G == 0 | G > 2001) & (year == 2001))\n", + "cont2 <- subset(data, (G == 0 | G > 2002) & (year == 2002))\n", + "cont3 <- subset(data, (G == 0 | G > 2003) & (year == 2003))\n", + "cont4 <- subset(data, (G == 0 | G > 2004) & (year == 2004))\n", + "cont5 <- subset(data, (G == 0 | G > 2005) & (year == 2005))\n", + "cont6 <- subset(data, (G == 0 | G > 2006) & (year == 2006))\n", + "cont7 <- subset(data, (G == 0 | G > 2007) & (year == 2007))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6HC1PX_Uc5bQ" + }, + "source": [ + "We assume that the basic assumptions, particularly parallel trends, hold after conditioning on pre-treatment variables: 2001 population, 2001 average pay and 2001 teen employment, as well as the region in which the county is located. (The region is characterized by four\n", + "categories.)\n", + "\n", + "Consequently, we want to extract the control variables for both treatment and control group in 2001." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KvkwAdL6evsU" + }, + "outputs": [], + "source": [ + "treat1 <- treat1[ , -c(\"year\",\"G\",\"region\",\"treated\")]\n", + "\n", + "cont1 <- cont1[ , -c(\"year\",\"G\",\"region\",\"treated\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zU7rM_5Ne3Xr" + }, + "source": [ + "2003 serves as the pre-treatment period for both counties that do receive the treatment in 2004 and those that do not." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3cd3dBDqeyqa" + }, + "outputs": [], + "source": [ + "treatB <- merge(treat3, treat1, by = \"id\", suffixes = c(\".pre\",\".0\"))\n", + "treatB <- treatB[ , -c(\"treated\",\"lpop.pre\",\"lavg_pay.pre\",\"year\",\"G\")]\n", + "\n", + "contB <- merge(cont3, cont1, by = \"id\", suffixes = c(\".pre\",\".0\"))\n", + "contB <- contB[ , -c(\"treated\",\"lpop.pre\",\"lavg_pay.pre\",\"year\",\"G\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xL1fSfb5e82d" + }, + "source": [ + "We estimate the ATET in 2004-2007, which corresponds to the effect in the year of treatment as well as in the three years after the treatment. The control observations are the observations that still have the federal minimum wage in each year. (The control group is shrinking in each year as additional units receive treatment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zvN6Nmy0gPy4" + }, + "outputs": [], + "source": [ + "treat4 <- treat4[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", + "treat5 <- treat5[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", + "treat6 <- treat6[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", + "treat7 <- treat7[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", + "\n", + "cont4 <- cont4[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", + "cont5 <- cont5[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", + "cont6 <- cont6[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", + "cont7 <- cont7[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", + "\n", + "tdid04 <- merge(treat4, treatB, by = \"id\")\n", + "dy <- tdid04$lemp-tdid04$lemp.pre\n", + "tdid04$dy <- dy\n", + "tdid04 <- tdid04[ , -c(\"id\",\"lemp\",\"lemp.pre\")]\n", + "\n", + "tdid05 <- merge(treat5, treatB, by = \"id\")\n", + "dy <- tdid05$lemp-tdid05$lemp.pre\n", + "tdid05$dy <- dy\n", + "tdid05 <- tdid05[ , -c(\"id\",\"lemp\",\"lemp.pre\")]\n", + "\n", + "tdid06 <- merge(treat6, treatB, by = \"id\")\n", + "dy <- tdid06$lemp-tdid06$lemp.pre\n", + "tdid06$dy <- dy\n", + "tdid06 <- tdid06[ , -c(\"id\",\"lemp\",\"lemp.pre\")]\n", + "\n", + "tdid07 <- merge(treat7, treatB, by = \"id\")\n", + "dy <- tdid07$lemp-tdid07$lemp.pre\n", + "tdid07$dy <- dy\n", + "tdid07 <- tdid07[ , -c(\"id\",\"lemp\",\"lemp.pre\")]\n", + "\n", + "cdid04 <- merge(cont4, contB, by = \"id\")\n", + "dy <- cdid04$lemp-cdid04$lemp.pre\n", + "cdid04$dy <- dy\n", + "cdid04 <- cdid04[ , -c(\"id\",\"lemp\",\"lemp.pre\")]\n", + "\n", + "cdid05 <- merge(cont5, contB, by = \"id\")\n", + "dy <- cdid05$lemp-cdid05$lemp.pre\n", + "cdid05$dy <- dy\n", + "cdid05 <- cdid05[ , -c(\"id\",\"lemp\",\"lemp.pre\")]\n", + "\n", + "cdid06 <- merge(cont6, contB, by = \"id\")\n", + "dy <- cdid06$lemp-cdid06$lemp.pre\n", + "cdid06$dy <- dy\n", + "cdid06 <- cdid06[ , -c(\"id\",\"lemp\",\"lemp.pre\")]\n", + "\n", + "cdid07 <- merge(cont7, contB, by = \"id\")\n", + "dy <- 
cdid07$lemp-cdid07$lemp.pre\n", + "cdid07$dy <- dy\n", + "cdid07 <- cdid07[ , -c(\"id\",\"lemp\",\"lemp.pre\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EqHmiHaZgPZz" + }, + "source": [ + "### Estimation of the ATET with DML\n", + "\n", + "We estimate the ATET of the county level minimum wage being larger than the federal minimum with the DML algorithm presented in Section 16.3 in the book. This requires estimation of the nuisance functions $E[Y|D=0,X]$, $E[D|X]$ as well as $P(D = 1)$. For the conditional expectation functions, we will consider different modern ML regression methods, namely: Constant (= no controls); a linear combination of the controls; an expansion of the raw control variables including all third order interactions; Lasso (CV); Ridge (CV); Random Forest; Shallow Tree; Deep Tree; and CV Tree.\n", + "The methods indicated with CV have their tuning parameter selected by cross-validation.\n", + "\n", + "The following code block implements the DML cross-fitting procedure." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PMFoM90-guGI", + "outputId": "a088bc6b-8c4b-466a-dade-2e133e917250" + }, + "outputs": [], + "source": [ + "att <- matrix(NA,4,10)\n", + "se.att <- matrix(NA,4,10)\n", + "RMSE.d <- matrix(NA,4,9)\n", + "RMSE.y <- matrix(NA,4,9)\n", + "trimmed <- matrix(NA,4,9)\n", + "\n", + "print(\"DML estimation starting, please wait\")\n", + "for(ii in 1:4){ # ii refer to the 4 investigated post-treatment periods\n", + "\n", + " tdata <- get(paste(\"tdid0\",(3+ii),sep=\"\")) # Treatment data\n", + " cdata <- get(paste(\"cdid0\",(3+ii),sep=\"\")) # Control data\n", + " usedata <- rbind(tdata,cdata)\n", + "\n", + " #-----------------------------------------------------------------------------\n", + " # Cross-fit setup\n", + " n <- nrow(usedata)\n", + " Kf <- 5 # Number of folds\n", + " sampleframe <- rep(1:Kf, ceiling(n/Kf))\n", + " cfgroup <- sample(sampleframe, size=n, replace = FALSE) # Cross-fitting groups\n", + "\n", + " # Initialize variables for CV predictions\n", + " yGd0x.fit <- matrix(NA,n,9)\n", + " dGx.fit <- matrix(NA,n,9)\n", + " pd.fit <-matrix(NA,n,1)\n", + "\n", + " #-----------------------------------------------------------------------------\n", + " # Cross-fit loop\n", + " for(k in 1:Kf) {\n", + " cat(\"year: \",ii+2003,\"; fold: \",k,\"\\n\")\n", + " indk <- cfgroup == k\n", + "\n", + " ktrain <- usedata[!indk,]\n", + " ktest <- usedata[indk,]\n", + "\n", + " # Build some matrices for later\n", + " ytrain <- as.matrix(usedata[!indk,\"dy\"])\n", + " ytest <- as.matrix(usedata[indk,\"dy\"])\n", + " dtrain <- as.matrix(usedata[!indk,\"treated\"])\n", + " dtest <- as.matrix(usedata[indk,\"treated\"])\n", + "\n", + " # Expansion for lasso/ridge (region specific cubic polynomial)\n", + " X.expand <- model.matrix( ~ region*(polym(lemp.0 , lpop.0 , lavg_pay.0 ,\n", + " degree = 3, raw = TRUE)),\n", + " data = usedata)\n", + "\n", + " xtrain <- as.matrix(X.expand[!indk,])\n", + " xtest <- as.matrix(X.expand[indk,])\n", + "\n", + " #-----------------------------------------------------------------------------\n", + " # P(D = 1)\n", + " pd.fit[indk,1] <- mean(ktrain$treated)\n", + "\n", + " #-----------------------------------------------------------------------------\n", + " # E[D|X]\n", + "\n", + " # 1) Constant\n", + " dGx.fit[indk,1] <- mean(ktrain$treated)\n", + "\n", + " # 2) Baseline controls\n", + " glmXdk <- glm(treated ~ region + lemp.0 
+ lpop.0 + lavg_pay.0,\n", + " family = \"binomial\", data = ktrain)\n", + " dGx.fit[indk,2] <- predict(glmXdk, newdata = ktest, type = \"response\")\n", + "\n", + " # 3) Region specific linear index\n", + " glmRXdk <- glm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0),\n", + " family = \"binomial\", data = ktrain)\n", + " dGx.fit[indk,3] <- predict(glmRXdk, newdata = ktest, type = \"response\")\n", + "\n", + " # 4) Lasso - expansion - default CV tuning\n", + " lassoXdk <- cv.glmnet(xtrain , dtrain , family = \"binomial\", type.measure = \"mse\")\n", + " dGx.fit[indk,4] <- predict(lassoXdk, newx = xtest, type = \"response\" ,\n", + " s = \"lambda.min\")\n", + "\n", + " # 5) Ridge - expansion - default CV tuning\n", + " ridgeXdk <- cv.glmnet(xtrain , dtrain , family = \"binomial\",\n", + " type.measure = \"mse\", alpha = 0)\n", + " dGx.fit[indk,5] <- predict(ridgeXdk, newx = xtest, type = \"response\" ,\n", + " s = \"lambda.min\")\n", + "\n", + " # 6) Random forest\n", + " rfXdk <- randomForest(as.factor(treated) ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", + " data = ktrain , mtry = 4, ntree = 1000)\n", + " dGx.fit[indk,6] <- predict(rfXdk, ktest, type = \"prob\")[, 2]\n", + "\n", + " # 7) Tree (start big)\n", + " btXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", + " data = ktrain, method = \"anova\",\n", + " control=rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10))\n", + " # xval is the number of cross-validation splits. E.g. xval = 5 is five fold CV\n", + " dGx.fit[indk,7] <- predict(btXdk, ktest)\n", + "\n", + " # 8) Tree (small tree)\n", + " stXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", + " data = ktrain, method = \"anova\",\n", + " control=rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10))\n", + " # xval is the number of cross-validation splits. E.g. 
xval = 5 is five fold CV\n", + " dGx.fit[indk,8] <- predict(stXdk, ktest)\n", + "\n", + " # 9) Tree (cv)\n", + " bestcp <- btXdk$cptable[which.min(btXdk$cptable[,\"xerror\"]),\"CP\"]\n", + " cvXdk <- prune(btXdk , cp=bestcp)\n", + " dGx.fit[indk,9] <- predict(cvXdk, ktest)\n", + "\n", + " #-----------------------------------------------------------------------------\n", + " # E[Y|D=0,X]\n", + "\n", + " # subset to D = 0\n", + " ktrain0 = ktrain[ktrain$treated == 0, ]\n", + "\n", + " ytrain0 = ytrain[ktrain$treated == 0, ]\n", + " xtrain0 = xtrain[ktrain$treated == 0, ]\n", + "\n", + " # 1) Constant\n", + " yGd0x.fit[indk,1] <- mean(ktrain0$dy)\n", + "\n", + " # 2) Baseline controls\n", + " lmXyk <- lm(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, data = ktrain0)\n", + " yGd0x.fit[indk,2] <- predict(lmXyk, newdata = ktest)\n", + "\n", + " # 3) Region specific linear index\n", + " lmRXyk <- lm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0),\n", + " data = ktrain)\n", + " yGd0x.fit[indk,3] <- predict(lmRXyk, newdata = ktest)\n", + "\n", + " # 4) Lasso - expansion - default CV tuning\n", + " lassoXyk <- cv.glmnet(xtrain0 , ytrain0)\n", + " yGd0x.fit[indk,4] <- predict(lassoXyk, newx = xtest , s = \"lambda.min\")\n", + "\n", + " # 5) Ridge - expansion - default CV tuning\n", + " ridgeXyk <- cv.glmnet(xtrain0 , ytrain0 , alpha = 0)\n", + " yGd0x.fit[indk,5] <- predict(ridgeXyk, newx = xtest, s = \"lambda.min\")\n", + "\n", + " # 6) Random forest\n", + " rfXyk <- randomForest(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", + " data = ktrain0 , mtry = 4, ntree = 1000)\n", + " yGd0x.fit[indk,6] <- predict(rfXyk, ktest)\n", + "\n", + " # 7) Tree (start big)\n", + " btXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", + " data = ktrain0, method = \"anova\",\n", + " control=rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10))\n", + " yGd0x.fit[indk,7] <- predict(btXyk, ktest)\n", + "\n", + " # 8) Tree (small tree)\n", + " stXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", + " data = ktrain, method = \"anova\",\n", + " control=rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10))\n", + " yGd0x.fit[indk,8] <- predict(stXyk, ktest)\n", + "\n", + " # 9) Tree (cv)\n", + " bestcp <- btXyk$cptable[which.min(btXyk$cptable[,\"xerror\"]),\"CP\"]\n", + " cvXyk <- prune(btXyk , cp=bestcp)\n", + " yGd0x.fit[indk,9] <- predict(cvXyk, ktest)\n", + "\n", + " }\n", + "\n", + " RMSE.d[ii, ] <- sqrt(colMeans((usedata$treated - dGx.fit)^2))\n", + " RMSE.y[ii, ] <- sqrt(colMeans((usedata$dy[usedata$treated == 0] -\n", + " yGd0x.fit[usedata$treated == 0, ])^2))\n", + "\n", + " # trim propensity scores of 1 to .95\n", + " for(r in 1:9) {\n", + " trimmed[ii,r] = sum(dGx.fit[ , r] > .95)\n", + " dGx.fit[dGx.fit[ ,r] > .95,r] <- .95\n", + " }\n", + "\n", + " att.num <- c(colMeans(((usedata$treated - dGx.fit)/((pd.fit%*%matrix(1,1,9))*(1-dGx.fit)))*\n", + " (usedata$dy-yGd0x.fit)) ,\n", + " mean(((usedata$treated - dGx.fit[ ,which.min(RMSE.d[ii, ])])\n", + " /(pd.fit*(1-dGx.fit[ ,which.min(RMSE.d[ii, ])])))*\n", + " (usedata$dy-yGd0x.fit[ ,which.min(RMSE.y[ii, ])])))\n", + " att.den <- mean(usedata$treated/pd.fit)\n", + "\n", + " att[ii, ] <- att.num/att.den\n", + "\n", + " phihat <- cbind(((usedata$treated - dGx.fit)/((pd.fit%*%matrix(1,1,9))*(1-dGx.fit)))*\n", + " (usedata$dy-yGd0x.fit) ,\n", + " ((usedata$treated - dGx.fit[ ,which.min(RMSE.d[ii, ])])\n", + " /(pd.fit*(1-dGx.fit[ ,which.min(RMSE.d[ii, ])])))*\n", + " (usedata$dy-yGd0x.fit[ ,which.min(RMSE.y[ii, ])]))/att.den\n", + " 
se.att[ii, ] <- sqrt(colMeans((phihat^2))/n)\n", + "\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o-r2aJIv2_Yh" + }, + "source": [ + "We start by reporting the RMSE obtained during cross-fitting for each learner in each period." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 349 + }, + "id": "xRazffP5kaq8", + "outputId": "eb34dd8d-fb75-4d48-cfe7-63baf9906645" + }, + "outputs": [], + "source": [ + "table1y <- matrix(0, 9, 4)\n", + "table1y <- t(RMSE.y)\n", + "colnames(table1y)<- c(\"2004\",\"2005\",\"2006\",\"2007\")\n", + "rownames(table1y)<- c(\"No Controls\", \"Basic\", \"Expansion\",\n", + " \"Lasso (CV)\", \"Ridge (CV)\",\n", + " \"Random Forest\",\"Deep Tree\",\n", + " \"Shallow Tree\", \"Tree (CV)\")\n", + "table1y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 349 + }, + "id": "tlrNObn1kvpF", + "outputId": "80322a03-006e-4696-ae2e-a02c1c97422a" + }, + "outputs": [], + "source": [ + "table1d <- matrix(0, 9, 4)\n", + "table1d <- t(RMSE.d)\n", + "colnames(table1d)<- c(\"2004\",\"2005\",\"2006\",\"2007\")\n", + "rownames(table1d)<- c(\"No Controls\", \"Basic\", \"Expansion\",\n", + " \"Lasso (CV)\", \"Ridge (CV)\",\n", + " \"Random Forest\",\"Deep Tree\",\n", + " \"Shallow Tree\", \"Tree (CV)\")\n", + "table1d" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nxlhGchj4n-9" + }, + "source": [ + "Here we see that the Deep Tree systematically performs worse in terms of cross-fit predictions than the other learners for both tasks and that Expansion performs similarly poorly for the outcome prediction. It also appears there is some signal in the regressors, especially for the propensity score, as all methods outside of Deep Tree and Expansion produce smaller RMSEs than the No Controls baseline. The other methods all produce similar RMSEs, with a small edge going to Ridge and Lasso. While it would be hard to reliably conclude which of the relatively good performing methods is statistically best here, one could exclude Expansion and Deep Tree from further consideration on the basis of out-of-sample performance suggesting\n", + "they are doing a poor job approximating the nuisance functions. Best (or a different ensemble) provides a good baseline that is principled in the sense that one could pre-commit to using the best learners without having first looked at the subsequent estimation results." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "85jrGLgG4-Gw" + }, + "source": [ + "We report estimates of the ATET in each period in the following table." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 693 + }, + "id": "-tfALIgnkvao", + "outputId": "cf1c07a7-60ef-41a3-f8b3-51a01819c4c9" + }, + "outputs": [], + "source": [ + "table2 <- matrix(0, 20, 4)\n", + "table2[seq(1,20,2),] <- t(att)\n", + "table2[seq(2,20,2),] <- t(se.att)\n", + "colnames(table2)<- c(\"2004\",\"2005\",\"2006\",\"2007\")\n", + "rownames(table2)<- c(\"No Controls\",\"s.e.\",\"Basic\",\"s.e.\",\n", + " \"Expansion\",\"s.e.\",\"Lasso (CV)\",\"s.e.\",\n", + " \"Ridge (CV)\",\"s.e.\",\"Random Forest\",\"s.e.\",\n", + " \"Deep Tree\",\"s.e.\",\"Shallow Tree\",\"s.e.\",\n", + " \"Tree (CV)\",\"s.e.\",\"Best\",\"s.e.\")\n", + "table2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "euBQPSYs47iZ" + }, + "source": [ + "Here, we see that the majority of methods provide point estimates that suggest the effect of the minimum wage increase leads to decreases in youth employment, with small effects in the initial period that become larger in the years following the treatment. This pattern seems economically plausible as it may take time for firms to adjust employment and other input choices in response to a minimum wage change. The methods that produce estimates that are not consistent with this pattern are Deep Tree and Expansion, which are both suspect as they systematically underperform in terms of cross-fit prediction performance. In terms of point estimates, the other pattern that emerges is that all estimates that use the covariates produce ATET estimates that are systematically smaller in magnitude than the No Controls baseline, suggesting that failing to include the controls may lead to overstatement of treatment effects in this example.\n", + "\n", + "Turning to inference, we would reject the hypothesis of no minimum wage effect two or more years after the change at the 5% level, even after multiple testing correction, if we were to focus on the row \"Best\" (or many of the other individual rows). Focusing on \"Best\" is a reasonable ex ante strategy that could be committed to prior to conducting any analysis. It is, of course, reassuring that this broad conclusion is also obtained using many of the individual learners, suggesting some robustness to the exact choice of learner." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MhFmwCr3kulb" + }, + "source": [ + "### Assess pre-trends\n", + "\n", + "Because we have data for the period 2001-2007, we can perform a so-called pre-trends test to provide some evidence about the plausibility of the conditional parallel trends assumption. Specifically, we can continue to use 2003 as the reference period but now consider 2002 to be the treatment period. Sensible economic mechanisms underlying the assumption would then typically suggest that the ATET in 2002 - before the 2004 minimum wage change we are considering - should be zero. Finding evidence that the ATET in 2002 is non-zero then calls into question the validity of the assumption.\n", + "\n", + "In the 2002 data, we change the treatment status of the observations that received treatment in 2004, creating a placebo treatment group as well as a control group."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dogBL7W4o3P4" + }, + "outputs": [], + "source": [ + "treat2 <- treat2[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", + "treat2$treated <- 1 # Code these observations as treated\n", + "\n", + "tdid02 <- merge(treat2, treatB, by = \"id\")\n", + "dy <- tdid02$lemp-tdid02$lemp.pre\n", + "tdid02$dy <- dy\n", + "tdid02 <- tdid02[ , -c(\"id\",\"lemp\",\"lemp.pre\")]\n", + "\n", + "cont2 <- cont2[ , -c(\"lpop\",\"lavg_pay\",\"year\",\"G\",\"region\")]\n", + "\n", + "cdid02 <- merge(cont2, contB, by = \"id\")\n", + "dy <- cdid02$lemp-cdid02$lemp.pre\n", + "cdid02$dy <- dy\n", + "cdid02 <- cdid02[ , -c(\"id\",\"lemp\",\"lemp.pre\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JhjbleYX599z" + }, + "source": [ + "We repeat the exercise for obtaining our ATET estimates and standard error for 2004-2007. Particularly, we also use all the learners as mentioned above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Df8A2qj_odSh", + "outputId": "7b81df57-3ad3-4f15-9651-1313dd53189f" + }, + "outputs": [], + "source": [ + "attP <- matrix(NA,1,10)\n", + "se.attP <- matrix(NA,1,10)\n", + "RMSE.dP <- matrix(NA,1,9)\n", + "RMSE.yP <- matrix(NA,1,9)\n", + "trimmedP <- matrix(NA,1,9)\n", + "for(ii in 1){\n", + "\n", + " tdata <- get(paste(\"tdid0\",(3-ii),sep=\"\")) # Treatment data\n", + " cdata <- get(paste(\"cdid0\",(3-ii),sep=\"\")) # Control data\n", + " usedata <- rbind(tdata,cdata)\n", + "\n", + " #-----------------------------------------------------------------------------\n", + " # Cross-fit setup\n", + " n <- nrow(usedata)\n", + " Kf <- 5 # Number of folds\n", + " sampleframe <- rep(1:Kf, ceiling(n/Kf))\n", + " cfgroup <- sample(sampleframe, size=n, replace = FALSE) # Cross-fitting groups\n", + "\n", + " # Initialize variables for CV predictions\n", + " yGd0x.fit <- matrix(NA,n,9)\n", + " dGx.fit <- matrix(NA,n,9)\n", + " pd.fit <-matrix(NA,n,1)\n", + "\n", + " #-----------------------------------------------------------------------------\n", + " # Cross-fit loop\n", + " for(k in 1:Kf) {\n", + " cat(\"year: \",ii+2001,\"; fold: \",k,\"\\n\")\n", + " indk <- cfgroup == k\n", + "\n", + " ktrain <- usedata[!indk,]\n", + " ktest <- usedata[indk,]\n", + "\n", + " # Build some matrices for later\n", + " ytrain <- as.matrix(usedata[!indk,\"dy\"])\n", + " ytest <- as.matrix(usedata[indk,\"dy\"])\n", + " dtrain <- as.matrix(usedata[!indk,\"treated\"])\n", + " dtest <- as.matrix(usedata[indk,\"treated\"])\n", + "\n", + " # Expansion for lasso/ridge (region specific cubic polynomial)\n", + " X.expand <- model.matrix( ~ region*(polym(lemp.0 , lpop.0 , lavg_pay.0 ,\n", + " degree = 3, raw = TRUE)),\n", + " data = usedata)\n", + "\n", + " xtrain <- as.matrix(X.expand[!indk,])\n", + " xtest <- as.matrix(X.expand[indk,])\n", + "\n", + " #-----------------------------------------------------------------------------\n", + " # P(D = 1)\n", + " pd.fit[indk,1] <- mean(ktrain$treated)\n", + "\n", + " #-----------------------------------------------------------------------------\n", + " # E[D|X]\n", + "\n", + " # 1) Constant\n", + " dGx.fit[indk,1] <- mean(ktrain$treated)\n", + "\n", + " # 2) Baseline controls\n", + " glmXdk <- glm(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0,\n", + " family = \"binomial\", data = ktrain)\n", + " dGx.fit[indk,2] <- predict(glmXdk, newdata = ktest, type = \"response\")\n", 
+ "\n", + " # 3) Region specific linear index\n", + " glmRXdk <- glm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0),\n", + " family = \"binomial\", data = ktrain)\n", + " dGx.fit[indk,3] <- predict(glmRXdk, newdata = ktest, type = \"response\")\n", + "\n", + " # 4) Lasso - expansion - default CV tuning\n", + " lassoXdk <- cv.glmnet(xtrain , dtrain , family = \"binomial\", type.measure = \"mse\")\n", + " dGx.fit[indk,4] <- predict(lassoXdk, newx = xtest, type = \"response\" ,\n", + " s = \"lambda.min\")\n", + "\n", + " # 5) Ridge - expansion - default CV tuning\n", + " ridgeXdk <- cv.glmnet(xtrain , dtrain , family = \"binomial\",\n", + " type.measure = \"mse\", alpha = 0)\n", + " dGx.fit[indk,5] <- predict(ridgeXdk, newx = xtest, type = \"response\" ,\n", + " s = \"lambda.min\")\n", + "\n", + " # 6) Random forest\n", + " rfXdk <- randomForest(as.factor(treated) ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", + " data = ktrain , mtry = 4, ntree = 1000)\n", + " dGx.fit[indk,6] <- predict(rfXdk, ktest, type = \"prob\")[, 2]\n", + "\n", + " # 7) Tree (start big)\n", + " btXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", + " data = ktrain, method = \"anova\",\n", + " control=rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10))\n", + " # xval is the number of cross-validation splits. E.g. xval = 5 is five fold CV\n", + " dGx.fit[indk,7] <- predict(btXdk, ktest)\n", + "\n", + " # 8) Tree (small tree)\n", + " stXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", + " data = ktrain, method = \"anova\",\n", + " control=rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10))\n", + " # xval is the number of cross-validation splits. E.g. xval = 5 is five fold CV\n", + " dGx.fit[indk,8] <- predict(stXdk, ktest)\n", + "\n", + " # 9) Tree (cv)\n", + " bestcp <- btXdk$cptable[which.min(btXdk$cptable[,\"xerror\"]),\"CP\"]\n", + " cvXdk <- prune(btXdk , cp=bestcp)\n", + " dGx.fit[indk,9] <- predict(cvXdk, ktest)\n", + "\n", + " #-----------------------------------------------------------------------------\n", + " # E[Y|D=0,X]\n", + "\n", + " # subset to D = 0\n", + " ktrain0 = ktrain[ktrain$treated == 0, ]\n", + "\n", + " ytrain0 = ytrain[ktrain$treated == 0, ]\n", + " xtrain0 = xtrain[ktrain$treated == 0, ]\n", + "\n", + " # 1) Constant\n", + " yGd0x.fit[indk,1] <- mean(ktrain0$dy)\n", + "\n", + " # 2) Baseline controls\n", + " lmXyk <- lm(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, data = ktrain0)\n", + " yGd0x.fit[indk,2] <- predict(lmXyk, newdata = ktest)\n", + "\n", + " # 3) Region specific linear index\n", + " lmRXyk <- lm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0),\n", + " data = ktrain)\n", + " yGd0x.fit[indk,3] <- predict(lmRXyk, newdata = ktest)\n", + "\n", + " # 4) Lasso - expansion - default CV tuning\n", + " lassoXyk <- cv.glmnet(xtrain0 , ytrain0)\n", + " yGd0x.fit[indk,4] <- predict(lassoXyk, newx = xtest , s = \"lambda.min\")\n", + "\n", + " # 5) Ridge - expansion - default CV tuning\n", + " ridgeXyk <- cv.glmnet(xtrain0 , ytrain0 , alpha = 0)\n", + " yGd0x.fit[indk,5] <- predict(ridgeXyk, newx = xtest, s = \"lambda.min\")\n", + "\n", + " # 6) Random forest\n", + " rfXyk <- randomForest(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", + " data = ktrain0 , mtry = 4, ntree = 1000)\n", + " yGd0x.fit[indk,6] <- predict(rfXyk, ktest)\n", + "\n", + " # 7) Tree (start big)\n", + " btXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", + " data = ktrain0, method = \"anova\",\n", + " control=rpart.control(maxdepth = 15, cp = 0, xval = 5, 
minsplit = 10))\n", + " yGd0x.fit[indk,7] <- predict(btXyk, ktest)\n", + "\n", + " # 8) Tree (small tree)\n", + " stXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 ,\n", + " data = ktrain, method = \"anova\",\n", + " control=rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10))\n", + " yGd0x.fit[indk,8] <- predict(stXyk, ktest)\n", + "\n", + " # 9) Tree (cv)\n", + " bestcp <- btXyk$cptable[which.min(btXyk$cptable[,\"xerror\"]),\"CP\"]\n", + " cvXyk <- prune(btXyk , cp=bestcp)\n", + " yGd0x.fit[indk,9] <- predict(cvXyk, ktest)\n", + "\n", + " }\n", + "\n", + " RMSE.dP[ii, ] <- sqrt(colMeans((usedata$treated - dGx.fit)^2))\n", + " RMSE.yP[ii, ] <- sqrt(colMeans((usedata$dy[usedata$treated == 0] -\n", + " yGd0x.fit[usedata$treated == 0, ])^2))\n", + "\n", + " # trim propensity scores of 1 to .95\n", + " for(r in 1:9) {\n", + " trimmedP[ii,r] = sum(dGx.fit[ , r] > .95)\n", + " dGx.fit[dGx.fit[ ,r] > .95,r] <- .95\n", + " }\n", + "\n", + " att.num <- c(colMeans(((usedata$treated - dGx.fit)/((pd.fit%*%matrix(1,1,9))*(1-dGx.fit)))*\n", + " (usedata$dy-yGd0x.fit)) ,\n", + " mean(((usedata$treated - dGx.fit[ ,which.min(RMSE.d[ii, ])])\n", + " /(pd.fit*(1-dGx.fit[ ,which.min(RMSE.d[ii, ])])))*\n", + " (usedata$dy-yGd0x.fit[ ,which.min(RMSE.y[ii, ])])))\n", + " att.den <- mean(usedata$treated/pd.fit)\n", + "\n", + " attP[ii, ] <- att.num/att.den\n", + "\n", + " phihat <- cbind(((usedata$treated - dGx.fit)/((pd.fit%*%matrix(1,1,9))*(1-dGx.fit)))*\n", + " (usedata$dy-yGd0x.fit) ,\n", + " ((usedata$treated - dGx.fit[ ,which.min(RMSE.d[ii, ])])\n", + " /(pd.fit*(1-dGx.fit[ ,which.min(RMSE.d[ii, ])])))*\n", + " (usedata$dy-yGd0x.fit[ ,which.min(RMSE.y[ii, ])]))/att.den\n", + " se.attP[ii, ] <- sqrt(colMeans((phihat^2))/n)\n", + "\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZpL1SPbe6HRE" + }, + "source": [ + "We report the results in the following table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 380 + }, + "id": "2y3YugM4omz4", + "outputId": "44e812c4-bc35-4309-e567-f6a37bca1d0f" + }, + "outputs": [], + "source": [ + "tableP <- matrix(0, 4, 10)\n", + "tableP[1,] <- c(RMSE.yP, min(RMSE.yP))\n", + "tableP[2,] <- c(RMSE.dP, min(RMSE.dP))\n", + "tableP[3,] <- attP\n", + "tableP[4,] <- se.attP\n", + "rownames(tableP)<- c(\"RMSE Y\",\"RMSE D\",\"ATET\",\"s.e.\")\n", + "colnames(tableP)<- c(\"No Controls\", \"Basic\", \"Expansion\",\n", + " \"Lasso (CV)\", \"Ridge (CV)\",\n", + " \"Random Forest\",\"Deep Tree\",\n", + " \"Shallow Tree\", \"Tree (CV)\" , \"Best\")\n", + "tableP = t(tableP)\n", + "tableP" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k0jGwOFO6NZD" + }, + "source": [ + "Here we see broad agreement across all methods in the sense of returning point estimates that are small in magnitude and small relative to standard errors. In no case would we reject the hypothesis that the pre-event effect in 2002 is different from zero at usual levels of significance. We note that failing to reject the hypothesis of no pre-event effects certainly does not imply that the conditional DiD assumption is in fact satisfied. For example, confidence intervals include values that would be consistent with relatively large pre-event effects. However, it is reassuring to see that there is not strong evidence of a violation of the underlying identifying assumption." 
+ ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "name": "ir" + }, + "language_info": { + "name": "R" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/T/T_4_Regression_Discontinuity_on_Progresa_Data.Rmd b/T/T_4_Regression_Discontinuity_on_Progresa_Data.Rmd new file mode 100644 index 00000000..8a066de6 --- /dev/null +++ b/T/T_4_Regression_Discontinuity_on_Progresa_Data.Rmd @@ -0,0 +1,575 @@ +--- +title: An R Markdown document converted from "T/T_4_Regression_Discontinuity_on_Progresa_Data.irnb" +output: html_document +--- + +# Regression Discontinuity +This notebook illustrates the use of Regression Discontinuity in an empirical study. We analyze the effect of the antipoverty program *Progresa/Oportunidades* on the consumption behavior of families in Mexico in the early 2000s. + +The program was intended for families in extreme poverty and included financial incentives for participation in measures that improved the family's health, nutrition and children's education. The effect of this program is a widely studied problem in social and economic sciences and, according to the WHO, was a very successful measure in terms of reducing extreme poverty in Mexico. + +Eligibility for the program was determined based on a pre-intervention household poverty-index. Individuals above a certain threshold received the treatment (participation in the program) while individuals below the threshold were excluded and recorded as a control group. All observations above the threshold participated in the program, which makes the analysis fall into the standard (sharp) regression discontinuity design. + +First, we need to install and load some packages. This can take up to 15 minutes. + +```{r} +dependencies <- c("rdrobust", "fastDummies", "randomForest", "hdm", "gbm", "rdd") +install.packages(dependencies) +lapply(dependencies, library, character.only = TRUE) +``` + +We use a dataset assembled by [Calonico et al. (2014)](https://rdpackages.github.io/references/Calonico-Cattaneo-Titiunik_2014_ECMA--Supplemental.pdf) and follow the analysis in [Noack et al. (2023)](https://arxiv.org/pdf/2107.07942.pdf). + +First, we open the data and remove any observations that have NaN values. + +```{r} +df <- read.csv("https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/progresa.csv", row.names=1) +comp <- complete.cases(df) +df <- df[comp,] +print("Shape of Data:") +print(dim(df)) +print("Variable Names:") +print(colnames(df)) +head(df) +``` + +The data set contains 1,944 observations for which full covariate information of 27 variables is available. + +We want to measure the local average treatment effect of program participation on four outcome variables. The outcome variables are food and non-food consumption of the recorded families at two points in time, one year and two years after the implementation of the program. + +The baseline covariates, recorded prior to program implementation, include the household's size; household head's age, sex, years of education and employment status; spouse's age and years of education; number of children not older than five years and their sex, and physical characteristics of the house: whether the house has cement floors, water connection, water connection inside the house, a bathroom, electricity, number of rooms, pre-intervention consumption, and an identifier of the urban locality in which the house is located. 
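+ +Before turning to estimation, it is worth taking a quick look at the running variable itself. The short chunk below is a minimal sanity check (it assumes only the `index` column loaded above, which is centered so that the eligibility cutoff used throughout this notebook sits at zero): it reports the range of the poverty index and the number of households on each side of the cutoff. + +```{r} +# Range of the running variable and counts above/below the eligibility cutoff at 0 +summary(df$index) +table(df$index > 0) +```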
+ +The data fits the pattern of a sharp RD design: all individuals below the cut-off index received no intervention, and all individuals above the cut-off were eligible to join the *progresa* program and thus participated. + +## Estimation without Covariates + +First, we will perform a very simple RD estimation with a weighted linear regression. We use a triangular kernel, which assigns weights to observations based on their distance from the cutoff point. The weights decrease linearly as the distance from the cutoff point increases. + +```{r} +triangular_kernel <- function(index, h) { + weights <- 1 - abs(index)/h + weights[weights < 0] <- 0 + return(weights) +} +``` + +The parameter `h` is the bandwidth that controls the range of observations that receive non-zero weights. We use the `IKbandwidth` function from the `rdd` package, which implements the *Imbens-Kalyanaraman* method. Another standard approach would be to use the standard deviation of `index`. + +```{r} +h <- IKbandwidth(X=df$index, Y=df$conspcfood_t1, cutpoint = 0) +``` + +We use the triangular kernel function to calculate weights for each observation. After that, we can fit two separate linear regressions, one for the treatment group and one for the control group. + +```{r} +weights <- triangular_kernel(df$index, h) +model_treated <- lm(conspcfood_t1 ~ index, data = df[df$index > 0,], weights = weights[df$index > 0]) +model_control <- lm(conspcfood_t1 ~ index, data = df[df$index < 0,], weights = weights[df$index < 0]) +``` + +The treatment effect at the cutoff point is estimated as the difference between the predictions of the two models at the cutoff point. + +```{r} +cutoff <- 0 +treatment_effect <- predict(model_treated, newdata = data.frame(index = cutoff)) - + predict(model_control, newdata = data.frame(index = cutoff)) +treatment_effect +``` + +We estimate that participation in the program reduced food consumption by $22.1$ units in the first year after the intervention. We can repeat the estimation using the `rdrobust` package, which yields an estimate as well as a confidence band calculated according to the formulas presented in the book. We look at all four targets. + +```{r} +result <- c() +for (outcome in c("conspcfood_t1", "conspcnonfood_t1", "conspcfood_t2", "conspcnonfood_t2")){ + rdd_result <- rdrobust(df[[outcome]], df$index, c = cutoff, rho=1) + result <- rbind(result, c(rdd_result$coef[1], rdd_result$se["Robust",])) +} +resframe <- as.data.frame(result) +colnames(resframe) <- c("LATE", "s.e.") +rownames(resframe) <- c("Food T_1", "Non-Food T_1", "Food T_2", "Non-Food T_2") +print(resframe) +``` + +While the effects in the first year after the intervention are negative, we observe significant positive effects in the second year after an individual or household was accepted into the *Progresa* program. This is in accordance with the previous analysis of this dataset. One possible explanation is that program households have more money and can thus afford more; this was the desired effect of the program to combat hunger and extreme poverty. + +The following plot visualizes the two weighted regressions at the cut-off for the first outcome variable (food consumption in `t1`). We can clearly see the "jump" at the cut-off, which is our LATE. 
+ +```{r} +rdplot(df$conspcfood_t1, df$index, c=0, x.lim = c(-1,1), y.lim = c(250,400)) +``` + +## Estimation with Covariates + +For identification and estimation of the average treatment effect at the cutoff, no covariate information is required beyond the running variable. Nevertheless, additional covariates are collected in many applications, and these might be exploited in the analysis. + + +The standard approach is simply to include the additional regressors in the weighted least squares regressions. + +```{r} +model_treated <- lm(conspcfood_t1 ~ index + hhownhouse + headage + heademp + headeduc, data = df[df$index > 0,], weights = weights[df$index > 0]) +model_control <- lm(conspcfood_t1 ~ index + hhownhouse + headage + heademp + headeduc, data = df[df$index < 0,], weights = weights[df$index < 0]) +prediction_treated <- predict(model_treated, newdata = data.frame(index = cutoff, + hhownhouse = weighted.mean(df[df$index > 0,]$hhownhouse, w =weights[df$index > 0]), + headage = weighted.mean(df[df$index > 0,]$headage, w =weights[df$index > 0]), + heademp = weighted.mean(df[df$index > 0,]$heademp, w =weights[df$index > 0]), + headeduc = weighted.mean(df[df$index > 0,]$headeduc, w =weights[df$index > 0]))) +prediction_control <- predict(model_control, newdata = data.frame(index = cutoff, + hhownhouse = weighted.mean(df[df$index < 0,]$hhownhouse, w = weights[df$index < 0]), + headage = weighted.mean(df[df$index < 0,]$headage, w = weights[df$index < 0]), + heademp = weighted.mean(df[df$index < 0,]$heademp, w = weights[df$index < 0]), + headeduc = weighted.mean(df[df$index < 0,]$headeduc, w = weights[df$index < 0]))) +treatment_effect <- prediction_treated - prediction_control +treatment_effect +``` + +Including these selected covariates does not have a significant impact on the LATE estimate. + +Again, we can use `rdrobust` to repeat the estimation for all four outcomes. + +```{r} +result <- c() +for (outcome in c("conspcfood_t1", "conspcnonfood_t1", "conspcfood_t2", "conspcnonfood_t2")){ + rdd_result <- rdrobust(df[[outcome]], df$index, c = cutoff, rho=1, covs = df[,c(1:8,10:17,19,22)]) + result <- rbind(result, c(rdd_result$coef[1], rdd_result$se["Robust",])) +} +resframe_adj <- as.data.frame(result) +colnames(resframe_adj) <- c("LATE", "s.e.") +rownames(resframe_adj) <- c("Food T_1", "Non-Food T_1", "Food T_2", "Non-Food T_2") +resframe_adj["% reduction"] = (resframe_adj["s.e."] - resframe[,2]) * 100 / resframe[,2] +print(resframe_adj) +``` + +Overall, adjusting for only a few covariates has not changed the estimated coefficients much relative to the results without covariates. However, including covariates does reduce the standard errors. + +## Estimation using ML + +As discussed in the book, including many covariates in RDD estimation can be beneficial for multiple reasons: +1. **Efficiency and power improvements**: As in randomized controlled trials, using covariates can increase efficiency and improve power. +2. **Auxiliary information**: In RDD, the score determines the treatment assignment, and measurement errors in the running variable can distort the results. Additional covariates can be exploited to overcome these issues or to deal with missing data problems. +3. **Treatment effect heterogeneity**: Covariates can be used to define subgroups in which the treatment effects differ. +4. 
**Other parameters of interest and extrapolation**: As the identified treatment effect in RDD is local at the cutoff, additional covariates might help with extrapolation of the treatment effects or with identifying other causal parameters. + +However, including a high number of covariates also comes with additional challenges, such as variable selection, non-linearities, or interactions between covariates. The best way to overcome these challenges is to use modern ML methods. + +There are multiple ways to implement the estimators presented in the book; we will closely follow the analysis of [Noack et al. (2023)](https://arxiv.org/pdf/2107.07942.pdf). We set up the running variable and outcome as above. The baseline covariates will be all the other variables in the data. + +```{r} +# Running Variable and Outcome +df_ml = df +investigated_outcome = "conspcfood_t1" +names(df_ml)[names(df_ml) == "index"] <- "X" +names(df_ml)[names(df_ml) == investigated_outcome] <- "Y" + +# Baseline covariates including consumption +b.covs <- names(df_ml[,c(1:8,10:17,19,22)]) + +# Fixed effects for localities +i.fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE)) + +# Flexible covariates including localities indicators +f.covs <- as.matrix(model.matrix(~ .^2, data=df_ml[b.covs] ))[,-1] +Z.lasso <- as.matrix(cbind(i.fe, f.covs)) +``` + +We will use the package `rdrobust` for the RD estimation. Before starting the DML procedure, we have to estimate a bandwidth to restrict the samples used in the first stage estimation. + +```{r} +h.fs <- 2*rdrobust(df_ml$Y, df_ml$X, rho=1)$bws[[1]] +``` + +The next chunk sets up the cross-fitting and estimates the function $\eta(Z)$, which we will use to adjust $Y$ for the second stage. We use Random Forest, a Boosting implementation, Linear Regression, and Lasso with both a baseline and a flexible covariate structure. 
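+ +Concretely, in the notation used here (which simply mirrors the code below), each cross-fitting fold fits the outcome regression separately on observations above and below the cutoff within the first-stage bandwidth, giving fits $\hat{\mu}_{+}(Z)$ and $\hat{\mu}_{-}(Z)$, and the adjustment term is their average: + +$$\hat{\eta}(Z_i) = \tfrac{1}{2}\left(\hat{\mu}_{+}(Z_i) + \hat{\mu}_{-}(Z_i)\right), \qquad \tilde{Y}_i = Y_i - \hat{\eta}(Z_i).$$ + +The second stage then applies the same `rdrobust` estimator as before to the adjusted outcome $\tilde{Y}_i$.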
+ +```{r} +first_stage <- function(){ + # Set up the cross-fitting + n <- nrow(df_ml) + Kf <- 5 # Number of folds + sampleframe <- rep(1:Kf, ceiling(n/Kf)) + cfgroup <- sample(sampleframe, size=n, replace = FALSE) + + # Matrix to store eta predictions + eta.fit <- matrix(NA, n, 5) + + # Create vector of observations to be considered in the first stage model + weights <- (abs(df_ml$X)<h.fs) + + for(k in 1:Kf) { + fold <- (cfgroup == k) + + data_treated <- df_ml[df_ml$X>0 & !fold & weights>0,] + data_control <- df_ml[df_ml$X<0 & !fold & weights>0,] + + data_fold <- df_ml[fold,] + + model <- as.formula(paste("Y~", paste(b.covs, collapse = "+"))) + + rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000) + rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000) + eta.fit[fold,1] <- (predict(rf1, data_fold) + predict(rf0, data_fold))/2 + + gbm1 <- gbm(model, data = data_treated, n.trees = 100, interaction.depth = 1, + shrinkage = .1, distribution="gaussian") + gbm0 <- gbm(model, data = data_control, n.trees = 100, interaction.depth = 1, + shrinkage = .1, distribution="gaussian") + eta.fit[fold,2] <- (predict(gbm1, data_fold, n.trees=100) + predict(gbm0, data_fold, n.trees=100))/2 + + lm1 <- lm(model, data = data_treated) + lm0 <- lm(model, data = data_control) + eta.fit[fold,3] <- (predict(lm1, data_fold) + predict(lm0, data_fold))/2 + + las_base1 <- rlasso(model, data = data_treated) + las_base0 <- rlasso(model, data = data_control) + eta.fit[fold,4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold))/2 + + data_treated_extended = cbind(Z.lasso[rownames(data_treated),], data_treated) + data_control_extended = cbind(Z.lasso[rownames(data_control),], data_control) + data_fold_extended = cbind(Z.lasso[rownames(data_fold),], data_fold) + model_flex <- as.formula(paste("Y~", paste(c(b.covs, colnames(Z.lasso)), collapse = "+"))) + + las_flex1 <- rlasso(model_flex, data = data_treated_extended) + las_flex0 <- rlasso(model_flex, data = data_control_extended) + eta.fit[fold,5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended))/2 + } + return(eta.fit) +} + +eta.fit <- first_stage() +``` + +With the estimated $\hat{\eta}(Z)$ we can adjust $Y$ and then run the RD estimation on the adjusted outcome as the second stage. + +```{r} +methods <- c("Random Forest", "Gradient Boosting", "Linear Regression", "Lasso Baseline", "Lasso Flexible") + +second_stage <- function(eta.fit){ + adj_results <- NULL + + for(i in 1:length(methods)){ + M.Y <- df_ml$Y - eta.fit[,i] + rdd_result <- rdrobust(M.Y, df$index, c = cutoff, rho=1) + adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se["Robust",])) + } + return(adj_results) +} + +adj_frame <- as.data.frame(second_stage(eta.fit)) +rownames(adj_frame) <- methods +colnames(adj_frame) <- c("LATE", "s.e.") +print(adj_frame) +``` + +Finally, we run a small simulation study with only $R=20$ repetitions to show the variance-reducing effect of including ML-based estimators for the covariates. The next block can run for up to ten minutes. + +```{r} +estimates <- adj_frame[,1] +std.err <- adj_frame[,2] +R <- 19 + +for (i in 1:R){ + eta.fit <- first_stage() + adj_results <- second_stage(eta.fit) + estimates <- cbind(estimates, adj_results[,1]) + std.err <- cbind(std.err, adj_results[,2]) +} +``` + +We aggregate the median of the estimates and the mean of the standard errors, and we also calculate the mean reduction in standard error compared to the "no covariates" estimation. We see that including covariates can reduce the standard error of estimation by around 15-20%. 
+ +```{r} +med.est <- apply(estimates, 1, median) +mean.se <- apply(std.err, 1, mean) +adj_frame <- as.data.frame(cbind(med.est, mean.se)) +rownames(adj_frame) <- methods +colnames(adj_frame) <- c("LATE", "s.e.") +adj_frame["% reduction"] <- (adj_frame["s.e."] - resframe[1,2]) * 100 / resframe[1,2] +adj_frame["Linear Adjusted (no cross-fit)", ] = resframe_adj[1,] +print(adj_frame) +``` + +## We now repeat the exercise for the other outcomes (excluding the simulation). + +Non-Food Consumption (Year 1) + +```{r} +# Running Variable and Outcome +df_ml = df +investigated_outcome = "conspcnonfood_t1" +names(df_ml)[names(df_ml) == "index"] <- "X" +names(df_ml)[names(df_ml) == investigated_outcome] <- "Y" + +# Baseline covariates including consumption +b.covs <- names(df_ml[,c(1:8,10:17,19,22)]) + +# Fixed effects for localities +i.fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE)) + +# Flexible covariates including localities indicators +f.covs <- as.matrix(model.matrix(~ .^2, data=df_ml[b.covs] ))[,-1] +Z.lasso <- as.matrix(cbind(i.fe, f.covs)) + +h.fs <- 2*rdrobust(df_ml$Y, df_ml$X, rho=1)$bws[[1]] + +first_stage <- function(){ + # Set up the cross-fitting + n <- nrow(df_ml) + Kf <- 5 # Number of folds + sampleframe <- rep(1:Kf, ceiling(n/Kf)) + cfgroup <- sample(sampleframe, size=n, replace = FALSE) + + # Matrix to store eta predictions + eta.fit <- matrix(NA, n, 5) + + # Create vector of observations to be considered in the first stage model + weights <- (abs(df_ml$X)<h.fs) + + for(k in 1:Kf) { + fold <- (cfgroup == k) + + data_treated <- df_ml[df_ml$X>0 & !fold & weights>0,] + data_control <- df_ml[df_ml$X<0 & !fold & weights>0,] + + data_fold <- df_ml[fold,] + + model <- as.formula(paste("Y~", paste(b.covs, collapse = "+"))) + + rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000) + rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000) + eta.fit[fold,1] <- (predict(rf1, data_fold) + predict(rf0, data_fold))/2 + + gbm1 <- gbm(model, data = data_treated, n.trees = 100, interaction.depth = 1, + shrinkage = .1, distribution="gaussian") + gbm0 <- gbm(model, data = data_control, n.trees = 100, interaction.depth = 1, + shrinkage = .1, distribution="gaussian") + eta.fit[fold,2] <- (predict(gbm1, data_fold, n.trees=100) + predict(gbm0, data_fold, n.trees=100))/2 + + lm1 <- lm(model, data = data_treated) + lm0 <- lm(model, data = data_control) + eta.fit[fold,3] <- (predict(lm1, data_fold) + predict(lm0, data_fold))/2 + + las_base1 <- rlasso(model, data = data_treated) + las_base0 <- rlasso(model, data = data_control) + eta.fit[fold,4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold))/2 + + data_treated_extended = cbind(Z.lasso[rownames(data_treated),], data_treated) + data_control_extended = cbind(Z.lasso[rownames(data_control),], data_control) + data_fold_extended = cbind(Z.lasso[rownames(data_fold),], data_fold) + model_flex <- as.formula(paste("Y~", paste(c(b.covs, colnames(Z.lasso)), collapse = "+"))) + + las_flex1 <- rlasso(model_flex, data = data_treated_extended) + las_flex0 <- rlasso(model_flex, data = data_control_extended) + eta.fit[fold,5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended))/2 + } + return(eta.fit) +} + +eta.fit <- first_stage() + +methods <- c("Random Forest", "Gradient Boosting", "Linear Regression", "Lasso Baseline", "Lasso Flexible") + +second_stage <- function(eta.fit){ + adj_results <- NULL + + for(i in 1:length(methods)){ + M.Y <- df_ml$Y - eta.fit[,i] + rdd_result <- rdrobust(M.Y, df$index, c = cutoff, rho=1) + adj_results <- rbind(adj_results, 
c(rdd_result$coef[1], rdd_result$se["Robust",])) + } + return(adj_results) +} + +adj_frame <- as.data.frame(second_stage(eta.fit)) +rownames(adj_frame) <- methods +colnames(adj_frame) <- c("LATE", "s.e.") +print(adj_frame) +``` + +Food Consumption (Year 2) + +```{r} +# Running Variable and Outcome +df_ml = df +investigated_outcome = "conspcfood_t2" +names(df_ml)[names(df_ml) == "index"] <- "X" +names(df_ml)[names(df_ml) == investigated_outcome] <- "Y" + +# Baseline covariates including consumption +b.covs <- names(df_ml[,c(1:8,10:17,19,22)]) + +# Fixed effects for localities +i.fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE)) + +# Flexible covariates including localities indicators +f.covs <- as.matrix(model.matrix(~ .^2, data=df_ml[b.covs] ))[,-1] +Z.lasso <- as.matrix(cbind(i.fe, f.covs)) + +h.fs <- 2*rdrobust(df_ml$Y, df_ml$X, rho=1)$bws[[1]] + +first_stage <- function(){ + # Set up the cross-fitting + n <- nrow(df_ml) + Kf <- 5 # Number of folds + sampleframe <- rep(1:Kf, ceiling(n/Kf)) + cfgroup <- sample(sampleframe, size=n, replace = FALSE) + + # Matrix to store eta predictions + eta.fit <- matrix(NA, n, 5) + + # Create vector of observations to be considered in the first stage model + weights <- (abs(df_ml$X)<h.fs) + + for(k in 1:Kf) { + fold <- (cfgroup == k) + + data_treated <- df_ml[df_ml$X>0 & !fold & weights>0,] + data_control <- df_ml[df_ml$X<0 & !fold & weights>0,] + + data_fold <- df_ml[fold,] + + model <- as.formula(paste("Y~", paste(b.covs, collapse = "+"))) + + rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000) + rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000) + eta.fit[fold,1] <- (predict(rf1, data_fold) + predict(rf0, data_fold))/2 + + gbm1 <- gbm(model, data = data_treated, n.trees = 100, interaction.depth = 1, + shrinkage = .1, distribution="gaussian") + gbm0 <- gbm(model, data = data_control, n.trees = 100, interaction.depth = 1, + shrinkage = .1, distribution="gaussian") + eta.fit[fold,2] <- (predict(gbm1, data_fold, n.trees=100) + predict(gbm0, data_fold, n.trees=100))/2 + + lm1 <- lm(model, data = data_treated) + lm0 <- lm(model, data = data_control) + eta.fit[fold,3] <- (predict(lm1, data_fold) + predict(lm0, data_fold))/2 + + las_base1 <- rlasso(model, data = data_treated) + las_base0 <- rlasso(model, data = data_control) + eta.fit[fold,4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold))/2 + + data_treated_extended = cbind(Z.lasso[rownames(data_treated),], data_treated) + data_control_extended = cbind(Z.lasso[rownames(data_control),], data_control) + data_fold_extended = cbind(Z.lasso[rownames(data_fold),], data_fold) + model_flex <- as.formula(paste("Y~", paste(c(b.covs, colnames(Z.lasso)), collapse = "+"))) + + las_flex1 <- rlasso(model_flex, data = data_treated_extended) + las_flex0 <- rlasso(model_flex, data = data_control_extended) + eta.fit[fold,5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended))/2 + } + return(eta.fit) +} + +eta.fit <- first_stage() + +methods <- c("Random Forest", "Gradient Boosting", "Linear Regression", "Lasso Baseline", "Lasso Flexible") + +second_stage <- function(eta.fit){ + adj_results <- NULL + + for(i in 1:length(methods)){ + M.Y <- df_ml$Y - eta.fit[,i] + rdd_result <- rdrobust(M.Y, df$index, c = cutoff, rho=1) + adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se["Robust",])) + } + return(adj_results) +} + +adj_frame <- as.data.frame(second_stage(eta.fit)) +rownames(adj_frame) <- methods +colnames(adj_frame) <- c("LATE", "s.e.") +print(adj_frame) +``` + +Non-Food Consumption 
+
+Non-Food Consumption (Year 2)
+
+```{r}
+# Running Variable and Outcome
+df_ml = df
+investigated_outcome = "conspcnonfood_t2"
+names(df_ml)[names(df_ml) == "index"] <- "X"
+names(df_ml)[names(df_ml) == investigated_outcome] <- "Y"
+
+# Baseline covariates including consumption
+b.covs <- names(df_ml[,c(1:8,10:17,19,22)])
+
+# Fixed effects for localities
+i.fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE))
+
+# Flexible covariates including localities indicators
+f.covs <- as.matrix(model.matrix(~ .^2, data=df_ml[b.covs] ))[,-1]
+Z.lasso <- as.matrix(cbind(i.fe, f.covs))
+
+h.fs <- 2*rdrobust(df_ml$Y, df_ml$X, rho=1)$bws[[1]]
+
+first_stage <- function(){
+  # Set up the cross-fitting
+  n <- nrow(df_ml)
+  Kf <- 5 # Number of folds
+  sampleframe <- rep(1:Kf, ceiling(n/Kf))
+  cfgroup <- sample(sampleframe, size=n, replace = FALSE)
+
+  # Matrix to store eta predictions
+  eta.fit <- matrix(NA, n, 5)
+
+  # Create vector of observations to be considered in the first stage model
+  weights <- (abs(df_ml$X) < h.fs)
+
+  # First-stage estimation within each cross-fitting fold
+  for (k in 1:Kf) {
+    fold <- (cfgroup == k)
+
+    data_treated <- df_ml[df_ml$X>0 & !fold & weights>0,]
+    data_control <- df_ml[df_ml$X<0 & !fold & weights>0,]
+
+    data_fold <- df_ml[fold,]
+
+    model <- as.formula(paste("Y~", paste(b.covs, collapse = "+")))
+
+    rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000)
+    rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000)
+    eta.fit[fold,1] <- (predict(rf1, data_fold) + predict(rf0, data_fold))/2
+
+    gbm1 <- gbm(model, data = data_treated, n.trees = 100, interaction.depth = 1,
+                shrinkage = .1, distribution="gaussian")
+    gbm0 <- gbm(model, data = data_control, n.trees = 100, interaction.depth = 1,
+                shrinkage = .1, distribution="gaussian")
+    eta.fit[fold,2] <- (predict(gbm1, data_fold, n.trees=100) + predict(gbm0, data_fold, n.trees=100))/2
+
+    lm1 <- lm(model, data = data_treated)
+    lm0 <- lm(model, data = data_control)
+    eta.fit[fold,3] <- (predict(lm1, data_fold) + predict(lm0, data_fold))/2
+
+    las_base1 <- rlasso(model, data = data_treated)
+    las_base0 <- rlasso(model, data = data_control)
+    eta.fit[fold,4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold))/2
+
+    data_treated_extended = cbind(Z.lasso[rownames(data_treated),], data_treated)
+    data_control_extended = cbind(Z.lasso[rownames(data_control),], data_control)
+    data_fold_extended = cbind(Z.lasso[rownames(data_fold),], data_fold)
+    model_flex <- as.formula(paste("Y~", paste(c(b.covs, colnames(Z.lasso)), collapse = "+")))
+
+    las_flex1 <- rlasso(model_flex, data = data_treated_extended)
+    las_flex0 <- rlasso(model_flex, data = data_control_extended)
+    eta.fit[fold,5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended))/2
+  }
+  return(eta.fit)
+}
+
+eta.fit <- first_stage()
+
+methods <- c("Random Forest", "Gradient Boosting", "Linear Regression", "Lasso Baseline", "Lasso Flexible")
+
+second_stage <- function(eta.fit){
+  adj_results <- NULL
+
+  for(i in 1:length(methods)){
+    M.Y <- df_ml$Y - eta.fit[,i]
+    rdd_result <- rdrobust(M.Y, df$index, c = cutoff, rho=1)
+    adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se["Robust",]))
+  }
+  return(adj_results)
+}
+
+adj_frame <- as.data.frame(second_stage(eta.fit))
+rownames(adj_frame) <- methods
+colnames(adj_frame) <- c("LATE", "s.e.")
+print(adj_frame)
+```
+
diff --git a/T/T_4_Regression_Discontinuity_on_Progresa_Data.irnb b/T/T_4_Regression_Discontinuity_on_Progresa_Data.irnb
index 6e4844c1..89223532 100644
--- a/T/T_4_Regression_Discontinuity_on_Progresa_Data.irnb
+++ b/T/T_4_Regression_Discontinuity_on_Progresa_Data.irnb
@@ -1,889 +1,889 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "-VRZnOBNA6o7" - }, - "source": [ - "# Regression Discontinuity\n", - "This notebook illustrates the use of Regression Discontinuity in an empirical study. We analyze the effect of the antipoverty program *Progresa/Oportunidades* on the consumption behavior of families in Mexico in the early 2000s.\n", - "\n", - "The program was intended for families in extreme poverty and included financial incentives for participation in measures that improved the family's health, nutrition and children's education. The effect of this program is a widely studied problem in social and economic sciences and, according to the WHO, was a very successful measure in terms of reducing extreme poverty in Mexico.\n", - "\n", - "Eligibility for the program was determined based on a pre-intervention household poverty-index. Individuals above a certain threshold received the treatment (participation in the program) while individuals below the threshold were excluded and recorded as a control group. All observations above the threshold participated in the program, which makes the analysis fall into the standard (sharp) regression discontinuity design.\n", - "\n", - "First, we need to install and load some packages. This can take up to 15 minutes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1Yr5aL2yAgYN" - }, - "outputs": [], - "source": [ - "dependencies <- c(\"rdrobust\", \"fastDummies\", \"randomForest\", \"hdm\", \"gbm\", \"rdd\")\n", - "install.packages(dependencies)\n", - "lapply(dependencies, library, character.only = TRUE)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GH0wFmHSxnen" - }, - "source": [ - "We use a dataset assembled by [Calonico et al. (2014)](https://rdpackages.github.io/references/Calonico-Cattaneo-Titiunik_2014_ECMA--Supplemental.pdf) and follow the analysis in [Noack et al. (2023)](https://arxiv.org/pdf/2107.07942.pdf).\n", - "\n", - "First, we open the data and remove any observations that have NaN values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Rzbv0XXCxxJt" - }, - "outputs": [], - "source": [ - "df <- read.csv(\"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/progresa.csv\", row.names=1)\n", - "comp <- complete.cases(df)\n", - "df <- df[comp,]\n", - "print(\"Shape of Data:\")\n", - "print(dim(df))\n", - "print(\"Variable Names:\")\n", - "print(colnames(df))\n", - "head(df)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vGbvqQmpmoqV" - }, - "source": [ - "The data set contains 1,944 observations for which full covariate information of 27 variables is available.\n", - "\n", - "We want to measure the local average treatment effect of program participation on four outcome variables. 
The outcome variables are food and non-food consumption of the recorded families at two points in time, one year and two years after the implementation of the program.\n", - "\n", - "The baseline covariates, recorded prior to program implementation, include the household's size; household head's age, sex, years of education and employment status; spouse's age and years of education; number of children not older than five years and their sex, and physical characteristics of the house: whether the house has cement floors, water connection, water connection inside the house, a bathroom, electricity, number of rooms, pre-intervention consumption, and an identifier of the urban locality in which the house is located.\n", - "\n", - "The data fits to the pattern of a sharp RD design, namely, all individuals that were below the cut-off index received no intervention, and all individuals above the cut-off were eligible to join the *progresa* program and thus participated." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9yvX75wy98g9" - }, - "source": [ - "## Estimation without Covariates" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bCueRzpuqNXn" - }, - "source": [ - "First, we will perform a very simple RD estimation with a weighted linear regression. We use a triangular kernel, which assigns weights to observations based on their distance from the cutoff point. The weights decrease linearly as the distance from the cutoff point increases." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1LAMZP540pLn" - }, - "outputs": [], - "source": [ - "triangular_kernel <- function(index, h) {\n", - " weights <- 1 - abs(index)/h\n", - " weights[weights < 0] <- 0\n", - " return(weights)\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "N-I-EBps0ubO" - }, - "source": [ - "The parameter `h` is the bandwidth that controls the range of observations that receive non-zero weights. We use the `IKbandwidth` function from the `rdd` package that implements the *Imbens-Kalyanaraman* method. Another standard approach would be to use the standard deviation of `index`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "bFuzAouP04lO" - }, - "outputs": [], - "source": [ - "h <- IKbandwidth(X=df$index, Y=df$conspcfood_t1, cutpoint = 0)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "J9kU7tQ207A3" - }, - "source": [ - "We use the triangular kernel function to calculate weights for each observation. After that, we can fit two seperate linear regressions for both treatment and control groups." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cjc7f7F6qM36" - }, - "outputs": [], - "source": [ - "weights <- triangular_kernel(df$index, h)\n", - "model_treated <- lm(conspcfood_t1 ~ index, data = df[df$index > 0,], weights = weights[df$index > 0])\n", - "model_control <- lm(conspcfood_t1 ~ index, data = df[df$index < 0,], weights = weights[df$index < 0])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MC5vPB-I1jeH" - }, - "source": [ - "The treatment effect at the cutoff point is estimated as the difference between the predictions of the two models at the cutoff point." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "279my1C8o9a3" - }, - "outputs": [], - "source": [ - "cutoff <- 0\n", - "treatment_effect <- predict(model_treated, newdata = data.frame(index = cutoff)) -\n", - " predict(model_control, newdata = data.frame(index = cutoff))\n", - "treatment_effect" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uW6PYdz-BESB" - }, - "source": [ - "We estimate that the participation in the program reduced food consumption by $22.1$ units in the first year after the intervention. We can repeat the estimation using the `rdd` package, which yields us an estimate as well as a confidence band calculated according to the formulas presented in the book. We look at all four targets." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6rLo9c_YGWIq" - }, - "outputs": [], - "source": [ - "result <- c()\n", - "for (outcome in c(\"conspcfood_t1\", \"conspcnonfood_t1\", \"conspcfood_t2\", \"conspcnonfood_t2\")){\n", - " rdd_result <- rdrobust(df[[outcome]], df$index, c = cutoff, rho=1)\n", - " result <- rbind(result, c(rdd_result$coef[1], rdd_result$se[\"Robust\",]))\n", - "}\n", - "resframe <- as.data.frame(result)\n", - "colnames(resframe) <- c(\"LATE\", \"s.e.\")\n", - "rownames(resframe) <- c(\"Food T_1\", \"Non-Food T_1\", \"Food T_2\", \"Non-Food T_2\")\n", - "print(resframe)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BzzCc3oWZycJ" - }, - "source": [ - "While the effects in the first year after the intervention are negative, we observe significant positive effects in the second year after an individual or household was accepted in the *Progresa* program. This is in accordance to the previous analysis of this dataset. One possible explanation for this is that the program households have more money and can thus afford more. This was the desired effect of the program to combat hunger and extreme poverty.\n", - "\n", - "The following plot visualizes the two weighted regressions at the cut-off for the last outcome variable (non-food consumption in `t2`). We can clearly see the \"jump\" at the cut-off, which is our LATE." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gBbbIMxEZb6V" - }, - "outputs": [], - "source": [ - "rdplot(df$conspcfood_t1, df$index, c=0, x.lim = c(-1,1), y.lim = c(250,400))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hDEf53bE-Aki" - }, - "source": [ - "## Estimation with Covariates\n", - "\n", - "For identification and estimation of the average treatment effect at the cutoff value no covariate information is required except the running variable, but nevertheless in many applications additional covariates are collected which might be exploited for the analysis.\n", - "\n", - "\n", - "The standard approach is simply to take up the regressors in the weighted least squares regression." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JRdUQ8gcsGCg" - }, - "outputs": [], - "source": [ - "model_treated <- lm(conspcfood_t1 ~ index + hhownhouse + headage + heademp + headeduc, data = df[df$index > 0,], weights = weights[df$index > 0])\n", - "model_control <- lm(conspcfood_t1 ~ index + hhownhouse + headage + heademp + headeduc, data = df[df$index < 0,], weights = weights[df$index < 0])\n", - "prediction_treated <- predict(model_treated, newdata = data.frame(index = cutoff,\n", - " hhownhouse = weighted.mean(df[df$index > 0,]$hhownhouse, w =weights[df$index > 0]),\n", - " headage = weighted.mean(df[df$index > 0,]$headage, w =weights[df$index > 0]),\n", - " heademp = weighted.mean(df[df$index > 0,]$heademp, w =weights[df$index > 0]),\n", - " headeduc = weighted.mean(df[df$index > 0,]$headeduc, w =weights[df$index > 0])))\n", - "prediction_control <- predict(model_control, newdata = data.frame(index = cutoff,\n", - " hhownhouse = weighted.mean(df[df$index < 0,]$hhownhouse, w = weights[df$index < 0]),\n", - " headage = weighted.mean(df[df$index < 0,]$headage, w = weights[df$index < 0]),\n", - " heademp = weighted.mean(df[df$index < 0,]$heademp, w = weights[df$index < 0]),\n", - " headeduc = weighted.mean(df[df$index < 0,]$headeduc, w = weights[df$index < 0])))\n", - "treatment_effect <- prediction_treated - prediction_control\n", - "treatment_effect" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8_-6oj93FYY-" - }, - "source": [ - "Including these selected covariates does not have a significant impact on the LATE estimation.\n", - "\n", - "Again, we can also use `rdrobust` to repeat the estimation with all other outcomes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "OYQuZcvjyYx6" - }, - "outputs": [], - "source": [ - "result <- c()\n", - "for (outcome in c(\"conspcfood_t1\", \"conspcnonfood_t1\", \"conspcfood_t2\", \"conspcnonfood_t2\")){\n", - " rdd_result <- rdrobust(df[[outcome]], df$index, c = cutoff, rho=1, covs = df[,c(1:8,10:17,19,22)])\n", - " result <- rbind(result, c(rdd_result$coef[1], rdd_result$se[\"Robust\",]))\n", - "}\n", - "resframe_adj <- as.data.frame(result)\n", - "colnames(resframe_adj) <- c(\"LATE\", \"s.e.\")\n", - "rownames(resframe_adj) <- c(\"Food T_1\", \"Non-Food T_1\", \"Food T_2\", \"Non-Food T_2\")\n", - "resframe_adj[\"% reduction\"] = (resframe_adj[\"s.e.\"] - resframe[,2]) * 100 / resframe[,2]\n", - "print(resframe_adj)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5q8S0wNhabWy" - }, - "source": [ - "Overall, the adjustment by only a few covariates has not changed the estimated coefficient much from the result without covariates. However, including covariates does reduce the standard errors." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9U8UkHmv-D-0" - }, - "source": [ - "## Estimation using ML" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NiYSglH9E0Er" - }, - "source": [ - "As discussed in the book, including many covariates in RDD estimation can be beneficial for multiple reasons:\n", - "1. **Efficiency and power improvements**: As in randomized control trials, using covariates can increase efficiency and improve power.\n", - "2. **Auxiliary information**: In RDD the score determines the treatment assignment and measurement errors in the running variable can distort the results. 
Additional covariates can be exploited to overcome these issues or to deal with missing data problems.\n", - "3. **Treatment effect heterogeneity**: Covariates can be used to define subgroups in which the treatment effects differ.\n", - "4. **Other parameters of interest and extrapolation**: As the identified treatment effect in RDD is local at the cutoff, additional covariates might help for extrapolation of the treatment effects or identify other causal parameters.\n", - "\n", - "However, including a high number of covariates also comes with additional challenges, such as variables selection, non-linearities or interactions between covariates. The best way to overcome these is the use of modern ML methods.\n", - "\n", - "There are multiple ways to implement the estimators presented in the book, we will closely follow the analysis of [Noack et al. (2023)](https://arxiv.org/pdf/2107.07942.pdf). We set up running variable and outcome as above. The baseline covariates will be all the other variables in the data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "n2uoMwzkCq4P" - }, - "outputs": [], - "source": [ - "# Running Variable and Outcome\n", - "df_ml = df\n", - "investigated_outcome = \"conspcfood_t1\"\n", - "names(df_ml)[names(df_ml) == \"index\"] <- \"X\"\n", - "names(df_ml)[names(df_ml) == investigated_outcome] <- \"Y\"\n", - "\n", - "# Baseline covariates including consumption\n", - "b.covs <- names(df_ml[,c(1:8,10:17,19,22)])\n", - "\n", - "# Fixed effects for localities\n", - "i.fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE))\n", - "\n", - "# Flexible covariates including localities indicators\n", - "f.covs <- as.matrix(model.matrix(~ .^2, data=df_ml[b.covs] ))[,-1]\n", - "Z.lasso <- as.matrix(cbind(i.fe, f.covs))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2n8yvua4Ns_A" - }, - "source": [ - "We will use the package `rdrobust` for the RD estimation. Before starting the DML procedure, we have to estimate a bandwidth to restrict the samples in the first stage estimation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VIO-PQEtOKob" - }, - "outputs": [], - "source": [ - "h.fs <- 2*rdrobust(df_ml$Y, df_ml$X, rho=1)$bws[[1]]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nm8BC6JTQnV7" - }, - "source": [ - "The next chunk sets up the crossfitting and estimates the function $\\eta(Z)$, which we will use to adjust $Y$ for the second stage. We use Random Forest, a Boosting implementation, Linear Regression and Lasso with both a baseline and flexible covariate structure." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y-tGMe5iQhVd" - }, - "outputs": [], - "source": [ - "first_stage <- function(){\n", - " # Set up the cross-fitting\n", - " n <- nrow(df_ml)\n", - " Kf <- 5 # Number of folds\n", - " sampleframe <- rep(1:Kf, ceiling(n/Kf))\n", - " cfgroup <- sample(sampleframe, size=n, replace = FALSE)\n", - "\n", - " # Matrix to store eta predictions\n", - " eta.fit <- matrix(NA, n, 5)\n", - "\n", - " # Create vector of observations to be considered in the first stage model\n", - " weights <- (abs(df_ml$X)0 & !fold & weights>0,]\n", - " data_control <- df_ml[df_ml$X<0 & !fold & weights>0,]\n", - "\n", - " data_fold <- df_ml[fold,]\n", - "\n", - " model <- as.formula(paste(\"Y~\", paste(b.covs, collapse = \"+\")))\n", - "\n", - " rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000)\n", - " rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000)\n", - " eta.fit[fold,1] <- (predict(rf1, data_fold) + predict(rf0, data_fold))/2\n", - "\n", - " gbm1 <- gbm(model, data = data_treated, n.trees = 100, interaction.depth = 1,\n", - " shrinkage = .1, distribution=\"gaussian\")\n", - " gbm0 <- gbm(model, data = data_control, n.trees = 100, interaction.depth = 1,\n", - " shrinkage = .1, distribution=\"gaussian\")\n", - " eta.fit[fold,2] <- (predict(gbm1, data_fold, n.trees=100) + predict(gbm0, data_fold, n.trees=100))/2\n", - "\n", - " lm1 <- lm(model, data = data_treated)\n", - " lm0 <- lm(model, data = data_control)\n", - " eta.fit[fold,3] <- (predict(lm1, data_fold) + predict(lm0, data_fold))/2\n", - "\n", - " las_base1 <- rlasso(model, data = data_treated)\n", - " las_base0 <- rlasso(model, data = data_control)\n", - " eta.fit[fold,4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold))/2\n", - "\n", - " data_treated_extended = cbind(Z.lasso[rownames(data_treated),], data_treated)\n", - " data_control_extended = cbind(Z.lasso[rownames(data_control),], data_control)\n", - " data_fold_extended = cbind(Z.lasso[rownames(data_fold),], data_fold)\n", - " model_flex <- as.formula(paste(\"Y~\", paste(c(b.covs, colnames(Z.lasso)), collapse = \"+\")))\n", - "\n", - " las_flex1 <- rlasso(model_flex, data = data_treated_extended)\n", - " las_flex0 <- rlasso(model_flex, data = data_control_extended)\n", - " eta.fit[fold,5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended))/2\n", - " }\n", - " return(eta.fit)\n", - "}\n", - "\n", - "eta.fit <- first_stage()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ybTRUohWi_xE" - }, - "source": [ - "With the estimated $\\hat{\\eta}(Z)$ we can correct for confounding in $Y$ and now run the RDD estimation as second stage again." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WdJkfePmx4iN" - }, - "outputs": [], - "source": [ - "methods <- c(\"Random Forest\", \"Gradient Boosting\", \"Linear Regression\", \"Lasso Baseline\", \"Lasso Flexible\")\n", - "\n", - "second_stage <- function(eta.fit){\n", - " adj_results <- NULL\n", - "\n", - " for(i in 1:length(methods)){\n", - " M.Y <- df_ml$Y - eta.fit[,i]\n", - " rdd_result <- rdrobust(M.Y, df$index, c = cutoff, rho=1)\n", - " adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se[\"Robust\",]))\n", - " }\n", - " return(adj_results)\n", - "}\n", - "\n", - "adj_frame <- as.data.frame(second_stage(eta.fit))\n", - "rownames(adj_frame) <- methods\n", - "colnames(adj_frame) <- c(\"LATE\", \"s.e.\")\n", - "print(adj_frame)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YTk3TdLgokSU" - }, - "source": [ - "Finally, we create a small simulation study with only $R=20$ repetitions to show the variance reducing effect of the inclusion of ML-based estimators for the covariates. The next block runs up to ten minutes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "U8kFY0gzClWK" - }, - "outputs": [], - "source": [ - "estimates <- adj_frame[,1]\n", - "std.err <- adj_frame[,2]\n", - "R <- 19\n", - "\n", - "for (i in 1:R){\n", - " eta.fit <- first_stage()\n", - " adj_results <- second_stage(eta.fit)\n", - " estimates <- cbind(estimates, adj_results[,1])\n", - " std.err <- cbind(std.err, adj_results[,2])\n", - "}" - ] - }, - { - "cell_type": "markdown", - "source": [ - "We aggregate the median of the estimates, the mean of the standard errors and also calculate the mean reduction of standard error compared to the \"no covariates\" estimation. We see, that including covariates can reduce the standard error of estimation around 15-20%." - ], - "metadata": { - "id": "gNVwChjU-UHF" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "LqHHecS-FtEI" - }, - "outputs": [], - "source": [ - "med.est <- apply(estimates, 1, median)\n", - "mean.se <- apply(std.err, 1, mean)\n", - "adj_frame <- as.data.frame(cbind(med.est, mean.se))\n", - "rownames(adj_frame) <- methods\n", - "colnames(adj_frame) <- c(\"LATE\", \"s.e.\")\n", - "adj_frame[\"% reduction\"] <- (adj_frame[\"s.e.\"] - resframe[1,2]) * 100 / resframe[1,2]\n", - "adj_frame[\"Linear Adjusted (no cross-fit)\", ] = resframe_adj[1,]\n", - "print(adj_frame)" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## We now repeat the exercise for the other outcomes (excluding the simulation)." 
- ], - "metadata": { - "id": "Xo0j5accQWnO" - } - }, - { - "cell_type": "markdown", - "source": [ - "Non-Food Consumption (Year 1)" - ], - "metadata": { - "id": "RKpKI5olQyQr" - } - }, - { - "cell_type": "code", - "source": [ - "# Running Variable and Outcome\n", - "df_ml = df\n", - "investigated_outcome = \"conspcnonfood_t1\"\n", - "names(df_ml)[names(df_ml) == \"index\"] <- \"X\"\n", - "names(df_ml)[names(df_ml) == investigated_outcome] <- \"Y\"\n", - "\n", - "# Baseline covariates including consumption\n", - "b.covs <- names(df_ml[,c(1:8,10:17,19,22)])\n", - "\n", - "# Fixed effects for localities\n", - "i.fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE))\n", - "\n", - "# Flexible covariates including localities indicators\n", - "f.covs <- as.matrix(model.matrix(~ .^2, data=df_ml[b.covs] ))[,-1]\n", - "Z.lasso <- as.matrix(cbind(i.fe, f.covs))\n", - "\n", - "h.fs <- 2*rdrobust(df_ml$Y, df_ml$X, rho=1)$bws[[1]]\n", - "\n", - "first_stage <- function(){\n", - " # Set up the cross-fitting\n", - " n <- nrow(df_ml)\n", - " Kf <- 5 # Number of folds\n", - " sampleframe <- rep(1:Kf, ceiling(n/Kf))\n", - " cfgroup <- sample(sampleframe, size=n, replace = FALSE)\n", - "\n", - " # Matrix to store eta predictions\n", - " eta.fit <- matrix(NA, n, 5)\n", - "\n", - " # Create vector of observations to be considered in the first stage model\n", - " weights <- (abs(df_ml$X)0 & !fold & weights>0,]\n", - " data_control <- df_ml[df_ml$X<0 & !fold & weights>0,]\n", - "\n", - " data_fold <- df_ml[fold,]\n", - "\n", - " model <- as.formula(paste(\"Y~\", paste(b.covs, collapse = \"+\")))\n", - "\n", - " rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000)\n", - " rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000)\n", - " eta.fit[fold,1] <- (predict(rf1, data_fold) + predict(rf0, data_fold))/2\n", - "\n", - " gbm1 <- gbm(model, data = data_treated, n.trees = 100, interaction.depth = 1,\n", - " shrinkage = .1, distribution=\"gaussian\")\n", - " gbm0 <- gbm(model, data = data_control, n.trees = 100, interaction.depth = 1,\n", - " shrinkage = .1, distribution=\"gaussian\")\n", - " eta.fit[fold,2] <- (predict(gbm1, data_fold, n.trees=100) + predict(gbm0, data_fold, n.trees=100))/2\n", - "\n", - " lm1 <- lm(model, data = data_treated)\n", - " lm0 <- lm(model, data = data_control)\n", - " eta.fit[fold,3] <- (predict(lm1, data_fold) + predict(lm0, data_fold))/2\n", - "\n", - " las_base1 <- rlasso(model, data = data_treated)\n", - " las_base0 <- rlasso(model, data = data_control)\n", - " eta.fit[fold,4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold))/2\n", - "\n", - " data_treated_extended = cbind(Z.lasso[rownames(data_treated),], data_treated)\n", - " data_control_extended = cbind(Z.lasso[rownames(data_control),], data_control)\n", - " data_fold_extended = cbind(Z.lasso[rownames(data_fold),], data_fold)\n", - " model_flex <- as.formula(paste(\"Y~\", paste(c(b.covs, colnames(Z.lasso)), collapse = \"+\")))\n", - "\n", - " las_flex1 <- rlasso(model_flex, data = data_treated_extended)\n", - " las_flex0 <- rlasso(model_flex, data = data_control_extended)\n", - " eta.fit[fold,5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended))/2\n", - " }\n", - " return(eta.fit)\n", - "}\n", - "\n", - "eta.fit <- first_stage()\n", - "\n", - "methods <- c(\"Random Forest\", \"Gradient Boosting\", \"Linear Regression\", \"Lasso Baseline\", \"Lasso Flexible\")\n", - "\n", - "second_stage <- function(eta.fit){\n", - " adj_results <- 
NULL\n", - "\n", - " for(i in 1:length(methods)){\n", - " M.Y <- df_ml$Y - eta.fit[,i]\n", - " rdd_result <- rdrobust(M.Y, df$index, c = cutoff, rho=1)\n", - " adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se[\"Robust\",]))\n", - " }\n", - " return(adj_results)\n", - "}\n", - "\n", - "adj_frame <- as.data.frame(second_stage(eta.fit))\n", - "rownames(adj_frame) <- methods\n", - "colnames(adj_frame) <- c(\"LATE\", \"s.e.\")\n", - "print(adj_frame)\n" - ], - "metadata": { - "id": "nlp1hAxEQyAN" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Food Consumption (Year 2)" - ], - "metadata": { - "id": "KXzyyQWLQ5Pp" - } - }, - { - "cell_type": "code", - "source": [ - "# Running Variable and Outcome\n", - "df_ml = df\n", - "investigated_outcome = \"conspcfood_t2\"\n", - "names(df_ml)[names(df_ml) == \"index\"] <- \"X\"\n", - "names(df_ml)[names(df_ml) == investigated_outcome] <- \"Y\"\n", - "\n", - "# Baseline covariates including consumption\n", - "b.covs <- names(df_ml[,c(1:8,10:17,19,22)])\n", - "\n", - "# Fixed effects for localities\n", - "i.fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE))\n", - "\n", - "# Flexible covariates including localities indicators\n", - "f.covs <- as.matrix(model.matrix(~ .^2, data=df_ml[b.covs] ))[,-1]\n", - "Z.lasso <- as.matrix(cbind(i.fe, f.covs))\n", - "\n", - "h.fs <- 2*rdrobust(df_ml$Y, df_ml$X, rho=1)$bws[[1]]\n", - "\n", - "first_stage <- function(){\n", - " # Set up the cross-fitting\n", - " n <- nrow(df_ml)\n", - " Kf <- 5 # Number of folds\n", - " sampleframe <- rep(1:Kf, ceiling(n/Kf))\n", - " cfgroup <- sample(sampleframe, size=n, replace = FALSE)\n", - "\n", - " # Matrix to store eta predictions\n", - " eta.fit <- matrix(NA, n, 5)\n", - "\n", - " # Create vector of observations to be considered in the first stage model\n", - " weights <- (abs(df_ml$X)0 & !fold & weights>0,]\n", - " data_control <- df_ml[df_ml$X<0 & !fold & weights>0,]\n", - "\n", - " data_fold <- df_ml[fold,]\n", - "\n", - " model <- as.formula(paste(\"Y~\", paste(b.covs, collapse = \"+\")))\n", - "\n", - " rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000)\n", - " rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000)\n", - " eta.fit[fold,1] <- (predict(rf1, data_fold) + predict(rf0, data_fold))/2\n", - "\n", - " gbm1 <- gbm(model, data = data_treated, n.trees = 100, interaction.depth = 1,\n", - " shrinkage = .1, distribution=\"gaussian\")\n", - " gbm0 <- gbm(model, data = data_control, n.trees = 100, interaction.depth = 1,\n", - " shrinkage = .1, distribution=\"gaussian\")\n", - " eta.fit[fold,2] <- (predict(gbm1, data_fold, n.trees=100) + predict(gbm0, data_fold, n.trees=100))/2\n", - "\n", - " lm1 <- lm(model, data = data_treated)\n", - " lm0 <- lm(model, data = data_control)\n", - " eta.fit[fold,3] <- (predict(lm1, data_fold) + predict(lm0, data_fold))/2\n", - "\n", - " las_base1 <- rlasso(model, data = data_treated)\n", - " las_base0 <- rlasso(model, data = data_control)\n", - " eta.fit[fold,4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold))/2\n", - "\n", - " data_treated_extended = cbind(Z.lasso[rownames(data_treated),], data_treated)\n", - " data_control_extended = cbind(Z.lasso[rownames(data_control),], data_control)\n", - " data_fold_extended = cbind(Z.lasso[rownames(data_fold),], data_fold)\n", - " model_flex <- as.formula(paste(\"Y~\", paste(c(b.covs, colnames(Z.lasso)), collapse = \"+\")))\n", - "\n", - " las_flex1 <- 
rlasso(model_flex, data = data_treated_extended)\n", - " las_flex0 <- rlasso(model_flex, data = data_control_extended)\n", - " eta.fit[fold,5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended))/2\n", - " }\n", - " return(eta.fit)\n", - "}\n", - "\n", - "eta.fit <- first_stage()\n", - "\n", - "methods <- c(\"Random Forest\", \"Gradient Boosting\", \"Linear Regression\", \"Lasso Baseline\", \"Lasso Flexible\")\n", - "\n", - "second_stage <- function(eta.fit){\n", - " adj_results <- NULL\n", - "\n", - " for(i in 1:length(methods)){\n", - " M.Y <- df_ml$Y - eta.fit[,i]\n", - " rdd_result <- rdrobust(M.Y, df$index, c = cutoff, rho=1)\n", - " adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se[\"Robust\",]))\n", - " }\n", - " return(adj_results)\n", - "}\n", - "\n", - "adj_frame <- as.data.frame(second_stage(eta.fit))\n", - "rownames(adj_frame) <- methods\n", - "colnames(adj_frame) <- c(\"LATE\", \"s.e.\")\n", - "print(adj_frame)\n" - ], - "metadata": { - "id": "QPTlptD3QeGk" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Non-Food Consumption (Year 2)" - ], - "metadata": { - "id": "Hh3sHMUcQ8gP" - } - }, - { - "cell_type": "code", - "source": [ - "# Running Variable and Outcome\n", - "df_ml = df\n", - "investigated_outcome = \"conspcnonfood_t2\"\n", - "names(df_ml)[names(df_ml) == \"index\"] <- \"X\"\n", - "names(df_ml)[names(df_ml) == investigated_outcome] <- \"Y\"\n", - "\n", - "# Baseline covariates including consumption\n", - "b.covs <- names(df_ml[,c(1:8,10:17,19,22)])\n", - "\n", - "# Fixed effects for localities\n", - "i.fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE))\n", - "\n", - "# Flexible covariates including localities indicators\n", - "f.covs <- as.matrix(model.matrix(~ .^2, data=df_ml[b.covs] ))[,-1]\n", - "Z.lasso <- as.matrix(cbind(i.fe, f.covs))\n", - "\n", - "h.fs <- 2*rdrobust(df_ml$Y, df_ml$X, rho=1)$bws[[1]]\n", - "\n", - "first_stage <- function(){\n", - " # Set up the cross-fitting\n", - " n <- nrow(df_ml)\n", - " Kf <- 5 # Number of folds\n", - " sampleframe <- rep(1:Kf, ceiling(n/Kf))\n", - " cfgroup <- sample(sampleframe, size=n, replace = FALSE)\n", - "\n", - " # Matrix to store eta predictions\n", - " eta.fit <- matrix(NA, n, 5)\n", - "\n", - " # Create vector of observations to be considered in the first stage model\n", - " weights <- (abs(df_ml$X)0 & !fold & weights>0,]\n", - " data_control <- df_ml[df_ml$X<0 & !fold & weights>0,]\n", - "\n", - " data_fold <- df_ml[fold,]\n", - "\n", - " model <- as.formula(paste(\"Y~\", paste(b.covs, collapse = \"+\")))\n", - "\n", - " rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000)\n", - " rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000)\n", - " eta.fit[fold,1] <- (predict(rf1, data_fold) + predict(rf0, data_fold))/2\n", - "\n", - " gbm1 <- gbm(model, data = data_treated, n.trees = 100, interaction.depth = 1,\n", - " shrinkage = .1, distribution=\"gaussian\")\n", - " gbm0 <- gbm(model, data = data_control, n.trees = 100, interaction.depth = 1,\n", - " shrinkage = .1, distribution=\"gaussian\")\n", - " eta.fit[fold,2] <- (predict(gbm1, data_fold, n.trees=100) + predict(gbm0, data_fold, n.trees=100))/2\n", - "\n", - " lm1 <- lm(model, data = data_treated)\n", - " lm0 <- lm(model, data = data_control)\n", - " eta.fit[fold,3] <- (predict(lm1, data_fold) + predict(lm0, data_fold))/2\n", - "\n", - " las_base1 <- rlasso(model, data = data_treated)\n", - " las_base0 <- 
rlasso(model, data = data_control)\n", - " eta.fit[fold,4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold))/2\n", - "\n", - " data_treated_extended = cbind(Z.lasso[rownames(data_treated),], data_treated)\n", - " data_control_extended = cbind(Z.lasso[rownames(data_control),], data_control)\n", - " data_fold_extended = cbind(Z.lasso[rownames(data_fold),], data_fold)\n", - " model_flex <- as.formula(paste(\"Y~\", paste(c(b.covs, colnames(Z.lasso)), collapse = \"+\")))\n", - "\n", - " las_flex1 <- rlasso(model_flex, data = data_treated_extended)\n", - " las_flex0 <- rlasso(model_flex, data = data_control_extended)\n", - " eta.fit[fold,5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended))/2\n", - " }\n", - " return(eta.fit)\n", - "}\n", - "\n", - "eta.fit <- first_stage()\n", - "\n", - "methods <- c(\"Random Forest\", \"Gradient Boosting\", \"Linear Regression\", \"Lasso Baseline\", \"Lasso Flexible\")\n", - "\n", - "second_stage <- function(eta.fit){\n", - " adj_results <- NULL\n", - "\n", - " for(i in 1:length(methods)){\n", - " M.Y <- df_ml$Y - eta.fit[,i]\n", - " rdd_result <- rdrobust(M.Y, df$index, c = cutoff, rho=1)\n", - " adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se[\"Robust\",]))\n", - " }\n", - " return(adj_results)\n", - "}\n", - "\n", - "adj_frame <- as.data.frame(second_stage(eta.fit))\n", - "rownames(adj_frame) <- methods\n", - "colnames(adj_frame) <- c(\"LATE\", \"s.e.\")\n", - "print(adj_frame)\n" - ], - "metadata": { - "id": "_w5j1NYOQwqz" - }, - "execution_count": null, - "outputs": [] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "R", - "name": "ir" - }, - "language_info": { - "name": "R" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "-VRZnOBNA6o7" + }, + "source": [ + "# Regression Discontinuity\n", + "This notebook illustrates the use of Regression Discontinuity in an empirical study. We analyze the effect of the antipoverty program *Progresa/Oportunidades* on the consumption behavior of families in Mexico in the early 2000s.\n", + "\n", + "The program was intended for families in extreme poverty and included financial incentives for participation in measures that improved the family's health, nutrition and children's education. The effect of this program is a widely studied problem in social and economic sciences and, according to the WHO, was a very successful measure in terms of reducing extreme poverty in Mexico.\n", + "\n", + "Eligibility for the program was determined based on a pre-intervention household poverty-index. Individuals above a certain threshold received the treatment (participation in the program) while individuals below the threshold were excluded and recorded as a control group. All observations above the threshold participated in the program, which makes the analysis fall into the standard (sharp) regression discontinuity design.\n", + "\n", + "First, we need to install and load some packages. This can take up to 15 minutes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1Yr5aL2yAgYN" + }, + "outputs": [], + "source": [ + "dependencies <- c(\"rdrobust\", \"fastDummies\", \"randomForest\", \"hdm\", \"gbm\", \"rdd\")\n", + "install.packages(dependencies)\n", + "lapply(dependencies, library, character.only = TRUE)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GH0wFmHSxnen" + }, + "source": [ + "We use a dataset assembled by [Calonico et al. (2014)](https://rdpackages.github.io/references/Calonico-Cattaneo-Titiunik_2014_ECMA--Supplemental.pdf) and follow the analysis in [Noack et al. (2023)](https://arxiv.org/pdf/2107.07942.pdf).\n", + "\n", + "First, we open the data and remove any observations that have NaN values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Rzbv0XXCxxJt" + }, + "outputs": [], + "source": [ + "df <- read.csv(\"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/progresa.csv\", row.names=1)\n", + "comp <- complete.cases(df)\n", + "df <- df[comp,]\n", + "print(\"Shape of Data:\")\n", + "print(dim(df))\n", + "print(\"Variable Names:\")\n", + "print(colnames(df))\n", + "head(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vGbvqQmpmoqV" + }, + "source": [ + "The data set contains 1,944 observations for which full covariate information of 27 variables is available.\n", + "\n", + "We want to measure the local average treatment effect of program participation on four outcome variables. The outcome variables are food and non-food consumption of the recorded families at two points in time, one year and two years after the implementation of the program.\n", + "\n", + "The baseline covariates, recorded prior to program implementation, include the household's size; household head's age, sex, years of education and employment status; spouse's age and years of education; number of children not older than five years and their sex, and physical characteristics of the house: whether the house has cement floors, water connection, water connection inside the house, a bathroom, electricity, number of rooms, pre-intervention consumption, and an identifier of the urban locality in which the house is located.\n", + "\n", + "The data fits to the pattern of a sharp RD design, namely, all individuals that were below the cut-off index received no intervention, and all individuals above the cut-off were eligible to join the *progresa* program and thus participated." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9yvX75wy98g9" + }, + "source": [ + "## Estimation without Covariates" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bCueRzpuqNXn" + }, + "source": [ + "First, we will perform a very simple RD estimation with a weighted linear regression. We use a triangular kernel, which assigns weights to observations based on their distance from the cutoff point. The weights decrease linearly as the distance from the cutoff point increases." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1LAMZP540pLn" + }, + "outputs": [], + "source": [ + "triangular_kernel <- function(index, h) {\n", + " weights <- 1 - abs(index)/h\n", + " weights[weights < 0] <- 0\n", + " return(weights)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "N-I-EBps0ubO" + }, + "source": [ + "The parameter `h` is the bandwidth that controls the range of observations that receive non-zero weights. 
We use the `IKbandwidth` function from the `rdd` package that implements the *Imbens-Kalyanaraman* method. Another standard approach would be to use the standard deviation of `index`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bFuzAouP04lO" + }, + "outputs": [], + "source": [ + "h <- IKbandwidth(X=df$index, Y=df$conspcfood_t1, cutpoint = 0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "J9kU7tQ207A3" + }, + "source": [ + "We use the triangular kernel function to calculate weights for each observation. After that, we can fit two seperate linear regressions for both treatment and control groups." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cjc7f7F6qM36" + }, + "outputs": [], + "source": [ + "weights <- triangular_kernel(df$index, h)\n", + "model_treated <- lm(conspcfood_t1 ~ index, data = df[df$index > 0,], weights = weights[df$index > 0])\n", + "model_control <- lm(conspcfood_t1 ~ index, data = df[df$index < 0,], weights = weights[df$index < 0])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MC5vPB-I1jeH" + }, + "source": [ + "The treatment effect at the cutoff point is estimated as the difference between the predictions of the two models at the cutoff point." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "279my1C8o9a3" + }, + "outputs": [], + "source": [ + "cutoff <- 0\n", + "treatment_effect <- predict(model_treated, newdata = data.frame(index = cutoff)) -\n", + " predict(model_control, newdata = data.frame(index = cutoff))\n", + "treatment_effect" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uW6PYdz-BESB" + }, + "source": [ + "We estimate that the participation in the program reduced food consumption by $22.1$ units in the first year after the intervention. We can repeat the estimation using the `rdd` package, which yields us an estimate as well as a confidence band calculated according to the formulas presented in the book. We look at all four targets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6rLo9c_YGWIq" + }, + "outputs": [], + "source": [ + "result <- c()\n", + "for (outcome in c(\"conspcfood_t1\", \"conspcnonfood_t1\", \"conspcfood_t2\", \"conspcnonfood_t2\")){\n", + " rdd_result <- rdrobust(df[[outcome]], df$index, c = cutoff, rho=1)\n", + " result <- rbind(result, c(rdd_result$coef[1], rdd_result$se[\"Robust\",]))\n", + "}\n", + "resframe <- as.data.frame(result)\n", + "colnames(resframe) <- c(\"LATE\", \"s.e.\")\n", + "rownames(resframe) <- c(\"Food T_1\", \"Non-Food T_1\", \"Food T_2\", \"Non-Food T_2\")\n", + "print(resframe)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BzzCc3oWZycJ" + }, + "source": [ + "While the effects in the first year after the intervention are negative, we observe significant positive effects in the second year after an individual or household was accepted in the *Progresa* program. This is in accordance to the previous analysis of this dataset. One possible explanation for this is that the program households have more money and can thus afford more. This was the desired effect of the program to combat hunger and extreme poverty.\n", + "\n", + "The following plot visualizes the two weighted regressions at the cut-off for the last outcome variable (non-food consumption in `t2`). We can clearly see the \"jump\" at the cut-off, which is our LATE." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gBbbIMxEZb6V" + }, + "outputs": [], + "source": [ + "rdplot(df$conspcfood_t1, df$index, c=0, x.lim = c(-1,1), y.lim = c(250,400))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hDEf53bE-Aki" + }, + "source": [ + "## Estimation with Covariates\n", + "\n", + "For identification and estimation of the average treatment effect at the cutoff value no covariate information is required except the running variable, but nevertheless in many applications additional covariates are collected which might be exploited for the analysis.\n", + "\n", + "\n", + "The standard approach is simply to take up the regressors in the weighted least squares regression." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JRdUQ8gcsGCg" + }, + "outputs": [], + "source": [ + "model_treated <- lm(conspcfood_t1 ~ index + hhownhouse + headage + heademp + headeduc, data = df[df$index > 0,], weights = weights[df$index > 0])\n", + "model_control <- lm(conspcfood_t1 ~ index + hhownhouse + headage + heademp + headeduc, data = df[df$index < 0,], weights = weights[df$index < 0])\n", + "prediction_treated <- predict(model_treated, newdata = data.frame(index = cutoff,\n", + " hhownhouse = weighted.mean(df[df$index > 0,]$hhownhouse, w =weights[df$index > 0]),\n", + " headage = weighted.mean(df[df$index > 0,]$headage, w =weights[df$index > 0]),\n", + " heademp = weighted.mean(df[df$index > 0,]$heademp, w =weights[df$index > 0]),\n", + " headeduc = weighted.mean(df[df$index > 0,]$headeduc, w =weights[df$index > 0])))\n", + "prediction_control <- predict(model_control, newdata = data.frame(index = cutoff,\n", + " hhownhouse = weighted.mean(df[df$index < 0,]$hhownhouse, w = weights[df$index < 0]),\n", + " headage = weighted.mean(df[df$index < 0,]$headage, w = weights[df$index < 0]),\n", + " heademp = weighted.mean(df[df$index < 0,]$heademp, w = weights[df$index < 0]),\n", + " headeduc = weighted.mean(df[df$index < 0,]$headeduc, w = weights[df$index < 0])))\n", + "treatment_effect <- prediction_treated - prediction_control\n", + "treatment_effect" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8_-6oj93FYY-" + }, + "source": [ + "Including these selected covariates does not have a significant impact on the LATE estimation.\n", + "\n", + "Again, we can also use `rdrobust` to repeat the estimation with all other outcomes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OYQuZcvjyYx6" + }, + "outputs": [], + "source": [ + "result <- c()\n", + "for (outcome in c(\"conspcfood_t1\", \"conspcnonfood_t1\", \"conspcfood_t2\", \"conspcnonfood_t2\")){\n", + " rdd_result <- rdrobust(df[[outcome]], df$index, c = cutoff, rho=1, covs = df[,c(1:8,10:17,19,22)])\n", + " result <- rbind(result, c(rdd_result$coef[1], rdd_result$se[\"Robust\",]))\n", + "}\n", + "resframe_adj <- as.data.frame(result)\n", + "colnames(resframe_adj) <- c(\"LATE\", \"s.e.\")\n", + "rownames(resframe_adj) <- c(\"Food T_1\", \"Non-Food T_1\", \"Food T_2\", \"Non-Food T_2\")\n", + "resframe_adj[\"% reduction\"] = (resframe_adj[\"s.e.\"] - resframe[,2]) * 100 / resframe[,2]\n", + "print(resframe_adj)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5q8S0wNhabWy" + }, + "source": [ + "Overall, the adjustment by only a few covariates has not changed the estimated coefficient much from the result without covariates. 
However, including covariates does reduce the standard errors." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9U8UkHmv-D-0" + }, + "source": [ + "## Estimation using ML" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NiYSglH9E0Er" + }, + "source": [ + "As discussed in the book, including many covariates in RDD estimation can be beneficial for multiple reasons:\n", + "1. **Efficiency and power improvements**: As in randomized control trials, using covariates can increase efficiency and improve power.\n", + "2. **Auxiliary information**: In RDD the score determines the treatment assignment and measurement errors in the running variable can distort the results. Additional covariates can be exploited to overcome these issues or to deal with missing data problems.\n", + "3. **Treatment effect heterogeneity**: Covariates can be used to define subgroups in which the treatment effects differ.\n", + "4. **Other parameters of interest and extrapolation**: As the identified treatment effect in RDD is local at the cutoff, additional covariates might help for extrapolation of the treatment effects or identify other causal parameters.\n", + "\n", + "However, including a high number of covariates also comes with additional challenges, such as variables selection, non-linearities or interactions between covariates. The best way to overcome these is the use of modern ML methods.\n", + "\n", + "There are multiple ways to implement the estimators presented in the book, we will closely follow the analysis of [Noack et al. (2023)](https://arxiv.org/pdf/2107.07942.pdf). We set up running variable and outcome as above. The baseline covariates will be all the other variables in the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "n2uoMwzkCq4P" + }, + "outputs": [], + "source": [ + "# Running Variable and Outcome\n", + "df_ml = df\n", + "investigated_outcome = \"conspcfood_t1\"\n", + "names(df_ml)[names(df_ml) == \"index\"] <- \"X\"\n", + "names(df_ml)[names(df_ml) == investigated_outcome] <- \"Y\"\n", + "\n", + "# Baseline covariates including consumption\n", + "b.covs <- names(df_ml[,c(1:8,10:17,19,22)])\n", + "\n", + "# Fixed effects for localities\n", + "i.fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE))\n", + "\n", + "# Flexible covariates including localities indicators\n", + "f.covs <- as.matrix(model.matrix(~ .^2, data=df_ml[b.covs] ))[,-1]\n", + "Z.lasso <- as.matrix(cbind(i.fe, f.covs))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2n8yvua4Ns_A" + }, + "source": [ + "We will use the package `rdrobust` for the RD estimation. Before starting the DML procedure, we have to estimate a bandwidth to restrict the samples in the first stage estimation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VIO-PQEtOKob" + }, + "outputs": [], + "source": [ + "h.fs <- 2*rdrobust(df_ml$Y, df_ml$X, rho=1)$bws[[1]]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nm8BC6JTQnV7" + }, + "source": [ + "The next chunk sets up the crossfitting and estimates the function $\\eta(Z)$, which we will use to adjust $Y$ for the second stage. We use Random Forest, a Boosting implementation, Linear Regression and Lasso with both a baseline and flexible covariate structure." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y-tGMe5iQhVd" + }, + "outputs": [], + "source": [ + "first_stage <- function(){\n", + " # Set up the cross-fitting\n", + " n <- nrow(df_ml)\n", + " Kf <- 5 # Number of folds\n", + " sampleframe <- rep(1:Kf, ceiling(n/Kf))\n", + " cfgroup <- sample(sampleframe, size=n, replace = FALSE)\n", + "\n", + " # Matrix to store eta predictions\n", + " eta.fit <- matrix(NA, n, 5)\n", + "\n", + " # Create vector of observations to be considered in the first stage model\n", + " weights <- (abs(df_ml$X)0 & !fold & weights>0,]\n", + " data_control <- df_ml[df_ml$X<0 & !fold & weights>0,]\n", + "\n", + " data_fold <- df_ml[fold,]\n", + "\n", + " model <- as.formula(paste(\"Y~\", paste(b.covs, collapse = \"+\")))\n", + "\n", + " rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000)\n", + " rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000)\n", + " eta.fit[fold,1] <- (predict(rf1, data_fold) + predict(rf0, data_fold))/2\n", + "\n", + " gbm1 <- gbm(model, data = data_treated, n.trees = 100, interaction.depth = 1,\n", + " shrinkage = .1, distribution=\"gaussian\")\n", + " gbm0 <- gbm(model, data = data_control, n.trees = 100, interaction.depth = 1,\n", + " shrinkage = .1, distribution=\"gaussian\")\n", + " eta.fit[fold,2] <- (predict(gbm1, data_fold, n.trees=100) + predict(gbm0, data_fold, n.trees=100))/2\n", + "\n", + " lm1 <- lm(model, data = data_treated)\n", + " lm0 <- lm(model, data = data_control)\n", + " eta.fit[fold,3] <- (predict(lm1, data_fold) + predict(lm0, data_fold))/2\n", + "\n", + " las_base1 <- rlasso(model, data = data_treated)\n", + " las_base0 <- rlasso(model, data = data_control)\n", + " eta.fit[fold,4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold))/2\n", + "\n", + " data_treated_extended = cbind(Z.lasso[rownames(data_treated),], data_treated)\n", + " data_control_extended = cbind(Z.lasso[rownames(data_control),], data_control)\n", + " data_fold_extended = cbind(Z.lasso[rownames(data_fold),], data_fold)\n", + " model_flex <- as.formula(paste(\"Y~\", paste(c(b.covs, colnames(Z.lasso)), collapse = \"+\")))\n", + "\n", + " las_flex1 <- rlasso(model_flex, data = data_treated_extended)\n", + " las_flex0 <- rlasso(model_flex, data = data_control_extended)\n", + " eta.fit[fold,5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended))/2\n", + " }\n", + " return(eta.fit)\n", + "}\n", + "\n", + "eta.fit <- first_stage()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ybTRUohWi_xE" + }, + "source": [ + "With the estimated $\\hat{\\eta}(Z)$ we can correct for confounding in $Y$ and now run the RDD estimation as second stage again." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WdJkfePmx4iN" + }, + "outputs": [], + "source": [ + "methods <- c(\"Random Forest\", \"Gradient Boosting\", \"Linear Regression\", \"Lasso Baseline\", \"Lasso Flexible\")\n", + "\n", + "second_stage <- function(eta.fit){\n", + " adj_results <- NULL\n", + "\n", + " for(i in 1:length(methods)){\n", + " M.Y <- df_ml$Y - eta.fit[,i]\n", + " rdd_result <- rdrobust(M.Y, df$index, c = cutoff, rho=1)\n", + " adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se[\"Robust\",]))\n", + " }\n", + " return(adj_results)\n", + "}\n", + "\n", + "adj_frame <- as.data.frame(second_stage(eta.fit))\n", + "rownames(adj_frame) <- methods\n", + "colnames(adj_frame) <- c(\"LATE\", \"s.e.\")\n", + "print(adj_frame)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YTk3TdLgokSU" + }, + "source": [ + "Finally, we create a small simulation study with only $R=20$ repetitions to show the variance reducing effect of the inclusion of ML-based estimators for the covariates. The next block runs up to ten minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "U8kFY0gzClWK" + }, + "outputs": [], + "source": [ + "estimates <- adj_frame[,1]\n", + "std.err <- adj_frame[,2]\n", + "R <- 19\n", + "\n", + "for (i in 1:R){\n", + " eta.fit <- first_stage()\n", + " adj_results <- second_stage(eta.fit)\n", + " estimates <- cbind(estimates, adj_results[,1])\n", + " std.err <- cbind(std.err, adj_results[,2])\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gNVwChjU-UHF" + }, + "source": [ + "We aggregate the median of the estimates, the mean of the standard errors and also calculate the mean reduction of standard error compared to the \"no covariates\" estimation. We see, that including covariates can reduce the standard error of estimation around 15-20%." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LqHHecS-FtEI" + }, + "outputs": [], + "source": [ + "med.est <- apply(estimates, 1, median)\n", + "mean.se <- apply(std.err, 1, mean)\n", + "adj_frame <- as.data.frame(cbind(med.est, mean.se))\n", + "rownames(adj_frame) <- methods\n", + "colnames(adj_frame) <- c(\"LATE\", \"s.e.\")\n", + "adj_frame[\"% reduction\"] <- (adj_frame[\"s.e.\"] - resframe[1,2]) * 100 / resframe[1,2]\n", + "adj_frame[\"Linear Adjusted (no cross-fit)\", ] = resframe_adj[1,]\n", + "print(adj_frame)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Xo0j5accQWnO" + }, + "source": [ + "## We now repeat the exercise for the other outcomes (excluding the simulation)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RKpKI5olQyQr" + }, + "source": [ + "Non-Food Consumption (Year 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nlp1hAxEQyAN" + }, + "outputs": [], + "source": [ + "# Running Variable and Outcome\n", + "df_ml = df\n", + "investigated_outcome = \"conspcnonfood_t1\"\n", + "names(df_ml)[names(df_ml) == \"index\"] <- \"X\"\n", + "names(df_ml)[names(df_ml) == investigated_outcome] <- \"Y\"\n", + "\n", + "# Baseline covariates including consumption\n", + "b.covs <- names(df_ml[,c(1:8,10:17,19,22)])\n", + "\n", + "# Fixed effects for localities\n", + "i.fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE))\n", + "\n", + "# Flexible covariates including localities indicators\n", + "f.covs <- as.matrix(model.matrix(~ .^2, data=df_ml[b.covs] ))[,-1]\n", + "Z.lasso <- as.matrix(cbind(i.fe, f.covs))\n", + "\n", + "h.fs <- 2*rdrobust(df_ml$Y, df_ml$X, rho=1)$bws[[1]]\n", + "\n", + "first_stage <- function(){\n", + " # Set up the cross-fitting\n", + " n <- nrow(df_ml)\n", + " Kf <- 5 # Number of folds\n", + " sampleframe <- rep(1:Kf, ceiling(n/Kf))\n", + " cfgroup <- sample(sampleframe, size=n, replace = FALSE)\n", + "\n", + " # Matrix to store eta predictions\n", + " eta.fit <- matrix(NA, n, 5)\n", + "\n", + " # Create vector of observations to be considered in the first stage model\n", + " weights <- (abs(df_ml$X)0 & !fold & weights>0,]\n", + " data_control <- df_ml[df_ml$X<0 & !fold & weights>0,]\n", + "\n", + " data_fold <- df_ml[fold,]\n", + "\n", + " model <- as.formula(paste(\"Y~\", paste(b.covs, collapse = \"+\")))\n", + "\n", + " rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000)\n", + " rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000)\n", + " eta.fit[fold,1] <- (predict(rf1, data_fold) + predict(rf0, data_fold))/2\n", + "\n", + " gbm1 <- gbm(model, data = data_treated, n.trees = 100, interaction.depth = 1,\n", + " shrinkage = .1, distribution=\"gaussian\")\n", + " gbm0 <- gbm(model, data = data_control, n.trees = 100, interaction.depth = 1,\n", + " shrinkage = .1, distribution=\"gaussian\")\n", + " eta.fit[fold,2] <- (predict(gbm1, data_fold, n.trees=100) + predict(gbm0, data_fold, n.trees=100))/2\n", + "\n", + " lm1 <- lm(model, data = data_treated)\n", + " lm0 <- lm(model, data = data_control)\n", + " eta.fit[fold,3] <- (predict(lm1, data_fold) + predict(lm0, data_fold))/2\n", + "\n", + " las_base1 <- rlasso(model, data = data_treated)\n", + " las_base0 <- rlasso(model, data = data_control)\n", + " eta.fit[fold,4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold))/2\n", + "\n", + " data_treated_extended = cbind(Z.lasso[rownames(data_treated),], data_treated)\n", + " data_control_extended = cbind(Z.lasso[rownames(data_control),], data_control)\n", + " data_fold_extended = cbind(Z.lasso[rownames(data_fold),], data_fold)\n", + " model_flex <- as.formula(paste(\"Y~\", paste(c(b.covs, colnames(Z.lasso)), collapse = \"+\")))\n", + "\n", + " las_flex1 <- rlasso(model_flex, data = data_treated_extended)\n", + " las_flex0 <- rlasso(model_flex, data = data_control_extended)\n", + " eta.fit[fold,5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended))/2\n", + " }\n", + " return(eta.fit)\n", + "}\n", + "\n", + "eta.fit <- first_stage()\n", + "\n", + "methods <- c(\"Random Forest\", \"Gradient Boosting\", \"Linear Regression\", \"Lasso Baseline\", \"Lasso Flexible\")\n", + "\n", + "second_stage 
<- function(eta.fit){\n", + " adj_results <- NULL\n", + "\n", + " for(i in 1:length(methods)){\n", + " M.Y <- df_ml$Y - eta.fit[,i]\n", + " rdd_result <- rdrobust(M.Y, df$index, c = cutoff, rho=1)\n", + " adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se[\"Robust\",]))\n", + " }\n", + " return(adj_results)\n", + "}\n", + "\n", + "adj_frame <- as.data.frame(second_stage(eta.fit))\n", + "rownames(adj_frame) <- methods\n", + "colnames(adj_frame) <- c(\"LATE\", \"s.e.\")\n", + "print(adj_frame)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KXzyyQWLQ5Pp" + }, + "source": [ + "Food Consumption (Year 2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QPTlptD3QeGk" + }, + "outputs": [], + "source": [ + "# Running Variable and Outcome\n", + "df_ml = df\n", + "investigated_outcome = \"conspcfood_t2\"\n", + "names(df_ml)[names(df_ml) == \"index\"] <- \"X\"\n", + "names(df_ml)[names(df_ml) == investigated_outcome] <- \"Y\"\n", + "\n", + "# Baseline covariates including consumption\n", + "b.covs <- names(df_ml[,c(1:8,10:17,19,22)])\n", + "\n", + "# Fixed effects for localities\n", + "i.fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE))\n", + "\n", + "# Flexible covariates including localities indicators\n", + "f.covs <- as.matrix(model.matrix(~ .^2, data=df_ml[b.covs] ))[,-1]\n", + "Z.lasso <- as.matrix(cbind(i.fe, f.covs))\n", + "\n", + "h.fs <- 2*rdrobust(df_ml$Y, df_ml$X, rho=1)$bws[[1]]\n", + "\n", + "first_stage <- function(){\n", + " # Set up the cross-fitting\n", + " n <- nrow(df_ml)\n", + " Kf <- 5 # Number of folds\n", + " sampleframe <- rep(1:Kf, ceiling(n/Kf))\n", + " cfgroup <- sample(sampleframe, size=n, replace = FALSE)\n", + "\n", + " # Matrix to store eta predictions\n", + " eta.fit <- matrix(NA, n, 5)\n", + "\n", + " # Create vector of observations to be considered in the first stage model\n", + " weights <- (abs(df_ml$X)0 & !fold & weights>0,]\n", + " data_control <- df_ml[df_ml$X<0 & !fold & weights>0,]\n", + "\n", + " data_fold <- df_ml[fold,]\n", + "\n", + " model <- as.formula(paste(\"Y~\", paste(b.covs, collapse = \"+\")))\n", + "\n", + " rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000)\n", + " rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000)\n", + " eta.fit[fold,1] <- (predict(rf1, data_fold) + predict(rf0, data_fold))/2\n", + "\n", + " gbm1 <- gbm(model, data = data_treated, n.trees = 100, interaction.depth = 1,\n", + " shrinkage = .1, distribution=\"gaussian\")\n", + " gbm0 <- gbm(model, data = data_control, n.trees = 100, interaction.depth = 1,\n", + " shrinkage = .1, distribution=\"gaussian\")\n", + " eta.fit[fold,2] <- (predict(gbm1, data_fold, n.trees=100) + predict(gbm0, data_fold, n.trees=100))/2\n", + "\n", + " lm1 <- lm(model, data = data_treated)\n", + " lm0 <- lm(model, data = data_control)\n", + " eta.fit[fold,3] <- (predict(lm1, data_fold) + predict(lm0, data_fold))/2\n", + "\n", + " las_base1 <- rlasso(model, data = data_treated)\n", + " las_base0 <- rlasso(model, data = data_control)\n", + " eta.fit[fold,4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold))/2\n", + "\n", + " data_treated_extended = cbind(Z.lasso[rownames(data_treated),], data_treated)\n", + " data_control_extended = cbind(Z.lasso[rownames(data_control),], data_control)\n", + " data_fold_extended = cbind(Z.lasso[rownames(data_fold),], data_fold)\n", + " model_flex <- as.formula(paste(\"Y~\", paste(c(b.covs, colnames(Z.lasso)), collapse = 
\"+\")))\n", + "\n", + " las_flex1 <- rlasso(model_flex, data = data_treated_extended)\n", + " las_flex0 <- rlasso(model_flex, data = data_control_extended)\n", + " eta.fit[fold,5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended))/2\n", + " }\n", + " return(eta.fit)\n", + "}\n", + "\n", + "eta.fit <- first_stage()\n", + "\n", + "methods <- c(\"Random Forest\", \"Gradient Boosting\", \"Linear Regression\", \"Lasso Baseline\", \"Lasso Flexible\")\n", + "\n", + "second_stage <- function(eta.fit){\n", + " adj_results <- NULL\n", + "\n", + " for(i in 1:length(methods)){\n", + " M.Y <- df_ml$Y - eta.fit[,i]\n", + " rdd_result <- rdrobust(M.Y, df$index, c = cutoff, rho=1)\n", + " adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se[\"Robust\",]))\n", + " }\n", + " return(adj_results)\n", + "}\n", + "\n", + "adj_frame <- as.data.frame(second_stage(eta.fit))\n", + "rownames(adj_frame) <- methods\n", + "colnames(adj_frame) <- c(\"LATE\", \"s.e.\")\n", + "print(adj_frame)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Hh3sHMUcQ8gP" + }, + "source": [ + "Non-Food Consumption (Year 2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_w5j1NYOQwqz" + }, + "outputs": [], + "source": [ + "# Running Variable and Outcome\n", + "df_ml = df\n", + "investigated_outcome = \"conspcnonfood_t2\"\n", + "names(df_ml)[names(df_ml) == \"index\"] <- \"X\"\n", + "names(df_ml)[names(df_ml) == investigated_outcome] <- \"Y\"\n", + "\n", + "# Baseline covariates including consumption\n", + "b.covs <- names(df_ml[,c(1:8,10:17,19,22)])\n", + "\n", + "# Fixed effects for localities\n", + "i.fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE))\n", + "\n", + "# Flexible covariates including localities indicators\n", + "f.covs <- as.matrix(model.matrix(~ .^2, data=df_ml[b.covs] ))[,-1]\n", + "Z.lasso <- as.matrix(cbind(i.fe, f.covs))\n", + "\n", + "h.fs <- 2*rdrobust(df_ml$Y, df_ml$X, rho=1)$bws[[1]]\n", + "\n", + "first_stage <- function(){\n", + " # Set up the cross-fitting\n", + " n <- nrow(df_ml)\n", + " Kf <- 5 # Number of folds\n", + " sampleframe <- rep(1:Kf, ceiling(n/Kf))\n", + " cfgroup <- sample(sampleframe, size=n, replace = FALSE)\n", + "\n", + " # Matrix to store eta predictions\n", + " eta.fit <- matrix(NA, n, 5)\n", + "\n", + " # Create vector of observations to be considered in the first stage model\n", + " weights <- (abs(df_ml$X)0 & !fold & weights>0,]\n", + " data_control <- df_ml[df_ml$X<0 & !fold & weights>0,]\n", + "\n", + " data_fold <- df_ml[fold,]\n", + "\n", + " model <- as.formula(paste(\"Y~\", paste(b.covs, collapse = \"+\")))\n", + "\n", + " rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000)\n", + " rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000)\n", + " eta.fit[fold,1] <- (predict(rf1, data_fold) + predict(rf0, data_fold))/2\n", + "\n", + " gbm1 <- gbm(model, data = data_treated, n.trees = 100, interaction.depth = 1,\n", + " shrinkage = .1, distribution=\"gaussian\")\n", + " gbm0 <- gbm(model, data = data_control, n.trees = 100, interaction.depth = 1,\n", + " shrinkage = .1, distribution=\"gaussian\")\n", + " eta.fit[fold,2] <- (predict(gbm1, data_fold, n.trees=100) + predict(gbm0, data_fold, n.trees=100))/2\n", + "\n", + " lm1 <- lm(model, data = data_treated)\n", + " lm0 <- lm(model, data = data_control)\n", + " eta.fit[fold,3] <- (predict(lm1, data_fold) + predict(lm0, data_fold))/2\n", + "\n", + " las_base1 <- rlasso(model, 
data = data_treated)\n", + " las_base0 <- rlasso(model, data = data_control)\n", + " eta.fit[fold,4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold))/2\n", + "\n", + " data_treated_extended = cbind(Z.lasso[rownames(data_treated),], data_treated)\n", + " data_control_extended = cbind(Z.lasso[rownames(data_control),], data_control)\n", + " data_fold_extended = cbind(Z.lasso[rownames(data_fold),], data_fold)\n", + " model_flex <- as.formula(paste(\"Y~\", paste(c(b.covs, colnames(Z.lasso)), collapse = \"+\")))\n", + "\n", + " las_flex1 <- rlasso(model_flex, data = data_treated_extended)\n", + " las_flex0 <- rlasso(model_flex, data = data_control_extended)\n", + " eta.fit[fold,5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended))/2\n", + " }\n", + " return(eta.fit)\n", + "}\n", + "\n", + "eta.fit <- first_stage()\n", + "\n", + "methods <- c(\"Random Forest\", \"Gradient Boosting\", \"Linear Regression\", \"Lasso Baseline\", \"Lasso Flexible\")\n", + "\n", + "second_stage <- function(eta.fit){\n", + " adj_results <- NULL\n", + "\n", + " for(i in 1:length(methods)){\n", + " M.Y <- df_ml$Y - eta.fit[,i]\n", + " rdd_result <- rdrobust(M.Y, df$index, c = cutoff, rho=1)\n", + " adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se[\"Robust\",]))\n", + " }\n", + " return(adj_results)\n", + "}\n", + "\n", + "adj_frame <- as.data.frame(second_stage(eta.fit))\n", + "rownames(adj_frame) <- methods\n", + "colnames(adj_frame) <- c(\"LATE\", \"s.e.\")\n", + "print(adj_frame)\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "name": "ir" + }, + "language_info": { + "name": "R" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/T/dml-for-conditional-average-treatment-effect.Rmd b/T/dml-for-conditional-average-treatment-effect.Rmd new file mode 100644 index 00000000..484a1a7f --- /dev/null +++ b/T/dml-for-conditional-average-treatment-effect.Rmd @@ -0,0 +1,626 @@ +--- +title: An R Markdown document converted from "T/dml-for-conditional-average-treatment-effect.irnb" +output: html_document +--- + +# DML for CATE + +This is a simple demonstration of Debiased Machine Learning estimator for the Conditional Average Treatment Effect. +Goal is to estimate the effect of 401(k) eligibility on net financial assets for each value of income. +The method is based on the following paper. + +* Title: Debiased Machine Learning of Conditional Average Treatment Effect and Other Causal Functions + +* Authors: Semenova, Vira and Chernozhukov, Victor. + +* Arxiv version: https://arxiv.org/pdf/1702.06240.pdf + +* Published version with replication code: https://academic.oup.com/ectj/advance-article/doi/10.1093/ectj/utaa027/5899048 + +Background + +The target function is Conditional Average Treatment Effect, defined as + +$$ g(x)=E [ Y(1) - Y(0) |X=x], $$ + +where $Y(1)$ and $Y(0)$ are potential outcomes in treated and control group. In our case, $Y(1)$ is the potential Net Financial Assets if a subject is eligible for 401(k), and $Y(0)$ is the potential Net Financial Assets if a subject is ineligible. $X$ is a covariate of interest, in this case, income. +$ g(x)$ shows expected effect of eligibility on NET TFA for a subject whose income level is $x$. + + + +If eligibility indicator is independent of $Y(1), Y(0)$, given pre-401-k assignment characteristics $Z$, the function can expressed in terms of observed data (as opposed to hypothetical, or potential outcomes). 
Observed data consists of realized NET TFA $Y = D Y(1) + (1-D) Y(0)$, eligibility indicator $D$, and covariates $Z$ which includes $X$, income. The expression for $g(x)$ is + +$$ g(x) = E [ Y (\eta_0) \mid X=x], $$ +where the transformed outcome variable is + +$$Y (\eta) = \dfrac{D}{s(Z)} \left( Y - \mu(1,Z) \right) - \dfrac{1-D}{1-s(Z)} \left( Y - \mu(0,Z) \right) + \mu(1,Z) - \mu(0,Z),$$ + +the probability of eligibility is + +$$s_0(z) = Pr (D=1 \mid Z=z),$$ + +the expected net financial asset given $D =d \in \{1,0\}$ and $Z=z$ is + +$$ \mu(d,z) = E[ Y \mid Z=z, D=d]. $$ + +Our goal is to estimate $g(x)$. + + +In step 1, we estimate the unknown functions $s_0(z), \mu(1,z), \mu(0,z)$ and plug them into $Y (\eta)$. + + +In step 2, we approximate the function $g(x)$ by a linear combination of basis functions: + +$$ g(x) = p(x)' \beta_0, $$ + + +where $p(x)$ is a vector of polynomials or splines and + +$$ \beta_0 = (E p(X) p(X))^{-1} E p(X) Y (\eta_0) $$ + +is the best linear predictor. We report + +$$ +\widehat{g}(x) = p(x)' \widehat{\beta}, +$$ + +where $\widehat{\beta}$ is the ordinary least squares estimate of $\beta_0$ defined on the random sample $(X_i, D_i, Y_i)_{i=1}^N$ + +$$ + \widehat{\beta} :=\left( \dfrac{1}{N} \sum_{i=1}^N p(X_i) p(X_i)' \right)^{-1} \dfrac{1}{N} \sum_{i=1}^N p(X_i)Y_i(\widehat{\eta}) +$$ + + + + + + + +```{r} +## load packages +install.packages("foreign") +install.packages("quantreg") +install.packages("splines") +install.packages("lattice") +#install.packages("mnormt") +install.packages("Hmisc") +install.packages("fda") +install.packages("hdm") +install.packages("randomForest") +install.packages("ranger") +install.packages("sandwich") + +library(foreign) +library(quantreg) +library(splines) +library(lattice) +#library(mnormt) +library(Hmisc) +library(fda) +library(hdm) +library(randomForest) +library(ranger) +library(sandwich) +library(ggplot2) +``` + +```{r} +## 401k dataset +data(pension) +pension$net_tfa<-pension$net_tfa/10000 +## covariate of interest -- log income -- +pension$inc = log(pension$inc) +#pension$inc[is.na(pension$inc)]<-0 +pension<-pension[!is.na(pension$inc) & pension$inc!=-Inf & pension$inc !=Inf,] + + +## outcome variable -- total net financial assets +Y=pension$net_tfa +## binary treatment -- indicator of 401(k) eligibility +D=pension$e401 + + +X=pension$inc +## target parameter is CATE = E[ Y(1) - Y(0) | X] + + +## raw covariates so that Y(1) and Y(0) are independent of D given Z +Z = pension[,c("age","inc","fsize","educ","male","db","marr","twoearn","pira","hown","hval","hequity","hmort", + "nohs","hs","smcol")] + + +y_name <- "net_tfa"; +d_name <- "e401"; +form_z <- "(poly(age, 6) + poly(inc, 8) + poly(educ, 4) + poly(fsize,2) + as.factor(marr) + as.factor(twoearn) + as.factor(db) + as.factor(pira) + as.factor(hown))^2"; + + + +cat(sprintf("\n sample size is %g \n", length(Y) )) +cat(sprintf("\n num raw covariates z is %g \n", dim(Z)[2] )) +``` + +In Step 1, we estimate three functions: + +1. probability of treatment assignment $s_0(z)$ + +2.-3. regression functions $\mu_0(1,z)$ and $\mu_0(0,z)$. + +We use the cross-fitting procedure with $K=2$ holds. For definition of cross-fitting with $K$ folds, check the sample splitting in ```DML2.for.PLM``` function defined in https://www.kaggle.com/victorchernozhukov/debiased-ml-for-partially-linear-model-in-r + +For each function, we try random forest. 
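+
+Before estimating these nuisance functions, a minimal self-contained sketch (our own illustration on simulated data, not part of the original 401(k) analysis) may help clarify the transformed outcome $Y(\eta)$ defined above: with the true nuisances plugged in, its sample average recovers the average treatment effect. All object names below (`z`, `s0`, `d`, `mu0`, `mu1`, `y_eta`) are illustrative choices, not objects used later in this notebook.
+
+```{r}
+# Toy sketch of the doubly robust transformed outcome Y(eta); assumes oracle
+# nuisances s0, mu1, mu0 are known, which is of course not the case in the data.
+set.seed(1)
+n <- 5000
+z <- rnorm(n)
+s0 <- plogis(z)                        # propensity score Pr(D = 1 | Z = z)
+d <- rbinom(n, 1, s0)                  # treatment indicator
+mu0 <- z                               # E[Y | D = 0, Z = z]
+mu1 <- z + 1                           # E[Y | D = 1, Z = z]; true effect is 1
+y <- ifelse(d == 1, mu1, mu0) + rnorm(n)
+y_eta <- d / s0 * (y - mu1) - (1 - d) / (1 - s0) * (y - mu0) + mu1 - mu0
+mean(y_eta)                            # close to the true ATE of 1
+```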
+ +First Stage: estimate $\mu_0(1,z)$ and $\mu_0(0,z)$ and $s_0(z)$ by lasso + +```{r} + +first_stage_lasso<-function(data,d_name,y_name, form_z, seed=1) { + + # Sample size + N<-dim(data)[1] + # Estimated regression function in control group + mu0.hat<-rep(1,N) + # Estimated regression function in treated group + mu1.hat<-rep(1,N) + # Propensity score + s.hat<-rep(1,N) + seed=1 + ## define sample splitting + set.seed(seed) + inds.train=sample(1:N,floor(N/2)) + inds.eval=setdiff(1:N,inds.train) + + print ("Estimate treatment probability, first half") + ## conditional probability of 401 k eligibility (i.e., propensity score) based on random forest + fitted.lasso.pscore<-rlassologit(as.formula(paste0(d_name,"~",form_z )),data=data[inds.train,]) + + s.hat[inds.eval]<-predict(fitted.lasso.pscore,data[inds.eval,],type="response") + print ("Estimate treatment probability, second half") + fitted.lasso.pscore<-rlassologit(as.formula(paste0(d_name,"~",form_z )),data=data[inds.eval,]) + s.hat[inds.train]<-predict( fitted.lasso.pscore,data[inds.train,],type="response") + + + + + + data1<-data + data1[,d_name]<-1 + + data0<-data + data0[,d_name]<-0 + + print ("Estimate expectation function, first half") + fitted.lasso.mu<-rlasso(as.formula(paste0(y_name,"~",d_name,"+(",form_z,")" )),data=data[inds.train,]) + mu1.hat[inds.eval]<-predict( fitted.lasso.mu,data1[inds.eval,]) + mu0.hat[inds.eval]<-predict( fitted.lasso.mu,data0[inds.eval,]) + + print ("Estimate expectation function, second half") + fitted.lasso.mu<-rlasso(as.formula(paste0(y_name,"~",d_name,"+(",form_z,")" )),data=data[inds.eval,]) + mu1.hat[inds.train]<-predict( fitted.lasso.mu,data1[inds.train,]) + mu0.hat[inds.train]<-predict( fitted.lasso.mu,data0[inds.train,]) + + return (list(mu1.hat=mu1.hat, + mu0.hat=mu0.hat, + s.hat=s.hat)) + +} +``` + +First Stage: estimate $\mu_0(1,z)$ and $\mu_0(0,z)$ and $s_0(z)$ by random forest + +```{r} +first_stage_rf<-function(Y,D,Z,seed=1) { + + # Sample size + N<-length(D) + # Estimated regression function in control group + mu0.hat<-rep(1,N) + # Estimated regression function in treated group + mu1.hat<-rep(1,N) + # Propensity score + s.hat<-rep(1,N) + + + ## define sample splitting + set.seed(seed) + inds.train=sample(1:N,floor(N/2)) + inds.eval=setdiff(1:N,inds.train) + + print ("Estimate treatment probability, first half") + ## conditional probability of 401 k eligibility (i.e., propensity score) based on random forest + D.f<-as.factor(as.character(D)) + fitted.rf.pscore<-randomForest(Z,D.f,subset=inds.train) + s.hat[inds.eval]<-predict(fitted.rf.pscore,Z[inds.eval,],type="prob")[,2] + print ("Estimate treatment probability, second half") + fitted.rf<-randomForest(Z,D.f,subset=inds.eval) + s.hat[inds.train]<-predict(fitted.rf.pscore,Z[inds.train,],type="prob")[,2] + + ## conditional expected net financial assets (i.e., regression function) based on random forest + + covariates<-cbind(Z,D) + + covariates1<-cbind(Z,D=rep(1,N)) + covariates0<-cbind(Z,D=rep(0,N)) + + print ("Estimate expectation function, first half") + fitted.rf.mu<-randomForest(cbind(Z,D),Y,subset=inds.train) + mu1.hat[inds.eval]<-predict( fitted.rf.mu,covariates1[inds.eval,]) + mu0.hat[inds.eval]<-predict( fitted.rf.mu,covariates0[inds.eval,]) + + print ("Estimate expectation function, second half") + fitted.rf.mu<-randomForest(cbind(Z,D),Y,subset=inds.eval) + mu1.hat[inds.train]<-predict( fitted.rf.mu,covariates1[inds.train,]) + mu0.hat[inds.train]<-predict( fitted.rf.mu,covariates0[inds.train,]) + + return (list(mu1.hat=mu1.hat, + 
mu0.hat=mu0.hat, + s.hat=s.hat)) + +} +``` + +In Step 2, we approximate $Y(\eta_0)$ by a vector of basis functions. There are two use cases: +**** +2.A. Group Average Treatment Effect, described above + + +2.B. Average Treatment Effect conditional on income value. There are three smoothing options: + +1. splines offered in ```least_squares_splines``` + +2. orthogonal polynomials with the highest degree chosen by cross-validation ```least_squares_series``` + +3. standard polynomials with the highest degree input by user ```least_squares_series_old``` + + +The default option is option 3. + +2.A. The simplest use case of Conditional Average Treatment Effect is GATE, or Group Average Treatment Effect. Partition the support of income as + +$$ - \infty = \ell_0 < \ell_1 < \ell_2 \dots \ell_K = \infty $$ + +define intervals $I_k = [ \ell_{k-1}, \ell_{k})$. Let $X$ be income covariate. For $X$, define a group indicator + +$$ G_k(X) = 1[X \in I_k], $$ + +and the vector of basis functions + +$$ p(X) = (G_1(X), G_2(X), \dots, G_K(X)) $$ + +Then, the Best Linear Predictor $\beta_0$ vector shows the average treatment effect for each group. + +```{r} +## estimate first stage functions by random forest +## may take a while +fs.hat.rf = first_stage_rf(Y,D,Z) +``` + +```{r} +X=pension$inc +fs.hat<-fs.hat.rf +min_cutoff=0.01 +# regression function +mu1.hat<-fs.hat[["mu1.hat"]] +mu0.hat<-fs.hat[["mu0.hat"]] +# propensity score +s.hat<-fs.hat[["s.hat"]] +s.hat<-sapply(s.hat,max,min_cutoff) +### Construct Orthogonal Signal + + +RobustSignal<-(Y - mu1.hat)*D/s.hat - (Y - mu0.hat)*(1-D)/(1-s.hat) + mu1.hat - mu0.hat +``` + +```{r} +qtmax <- function(C, S=10000, alpha) + {; + p <- nrow(C); + tmaxs <- apply(abs(matrix(rnorm(p*S), nrow = p, ncol = S)), 2, max); + return(quantile(tmaxs, 1-alpha)); + }; + +# This function computes the square root of a symmetric matrix using the spectral decomposition; + + +group_average_treatment_effect<-function(X,Y,max_grid=5,alpha=0.05, B=10000) { + + grid<-quantile(X,probs=c((0:max_grid)/max_grid)) + X.raw<-matrix(NA, nrow=length(Y),ncol=length(grid)-1) + + for (k in 2:((length(grid)))) { + X.raw[,k-1]<-sapply(X, function (x) ifelse (x>=grid[k-1] & x=grid[k-1] & x<=grid[k],1,0) ) + + ols.fit<- lm(Y~X.raw-1) + coefs <- coef(ols.fit) + vars <- names(coefs) + HCV.coefs <- vcovHC(ols.fit, type = 'HC') + coefs.se <- sqrt(diag(HCV.coefs)) # White std errors + ## this is an identity matrix + ## qtmax is simplified + C.coefs <- (diag(1/sqrt(diag(HCV.coefs)))) %*% HCV.coefs %*% (diag(1/sqrt(diag(HCV.coefs)))); + + + tes <- coefs + tes.se <- coefs.se + tes.cor <- C.coefs + crit.val <- qtmax(tes.cor,B,alpha); + + tes.ucb <- tes + crit.val * tes.se; + tes.lcb <- tes - crit.val * tes.se; + + tes.uci <- tes + qnorm(1-alpha/2) * tes.se; + tes.lci <- tes + qnorm(alpha/2) * tes.se; + + + return(list(beta.hat=coefs, ghat.lower.point=tes.lci, ghat.upper.point=tes.uci, + ghat.lower=tes.lcb, ghat.upper= tes.ucb, crit.val=crit.val )) +} +``` + +```{r} +res<-group_average_treatment_effect(X=X,Y=RobustSignal) +``` + +```{r} +## this code is taken from L1 14.382 taught at MIT +## author: Mert Demirer +options(repr.plot.width=10, repr.plot.height=8) + +tes<-res$beta.hat +tes.lci<-res$ghat.lower.point +tes.uci<-res$ghat.upper.point + +tes.lcb<-res$ghat.lower +tes.ucb<-res$ghat.upper +tes.lev<-c('0%-20%', '20%-40%','40%-60%','60%-80%','80%-100%') + +plot( c(1,5), las = 2, xlim =c(0.6, 5.4), ylim = c(.05, 2.09), type="n",xlab="Income group", + ylab="Average Effect on NET TFA (per 10 K)", main="Group Average 
Treatment Effects on NET TFA", xaxt="n"); +axis(1, at=1:5, labels=tes.lev); +for (i in 1:5) +{; + rect(i-0.2, tes.lci[i], i+0.2, tes.uci[i], col = NA, border = "red", lwd = 3); + rect(i-0.2, tes.lcb[i], i+0.2, tes.ucb[i], col = NA, border = 4, lwd = 3 ); + segments(i-0.2, tes[i], i+0.2, tes[i], lwd = 5 ); +}; +abline(h=0); + +legend(2.5, 2.0, c('Regression Estimate', '95% Simultaneous Confidence Interval', '95% Pointwise Confidence Interval'), col = c(1,4,2), lwd = c(4,3,3), horiz = F, bty = 'n', cex=0.8); + +dev.off() +``` + +```{r} +least_squares_splines<-function(X,Y,max_knot=9,norder,nderiv,...) { + ## Create technical regressors + cv.bsp<-rep(0,max_knot-1) + for (knot in 2:max_knot) { + breaks<- quantile(X, c(0:knot)/knot) + formula.bsp <- Y ~ bsplineS(X, breaks =breaks, norder = norder, nderiv = nderiv)[ ,-1] + fit <- lm(formula.bsp); + cv.bsp[knot-1] <- sum( (fit$res / (1 - hatvalues(fit)) )^2); + } + ## Number of knots chosen by cross-validation + cv_knot<-which.min(cv.bsp)+1 + breaks<- quantile(X, c(0:cv_knot)/cv_knot) + formula.bsp <- Y ~ bsplineS(X, breaks =breaks, norder = norder, nderiv = 0)[ ,-1] + fit <- lm(formula.bsp); + + + return(list(cv_knot=cv_knot,fit=fit)) +} + + +least_squares_series<-function(X, Y,max_degree,...) { + + cv.pol<-rep(0,max_degree) + for (degree in 1:max_degree) { + formula.pol <- Y ~ poly(X, degree) + fit <- lm(formula.pol ); + cv.pol[degree] <- sum( (fit$res / (1 - hatvalues(fit)) )^2); + } + ## Number of knots chosen by cross-validation + cv_degree<-which.min(cv.pol) + ## Estimate coefficients + formula.pol <- Y ~ poly(X, cv_degree) + fit <- lm(formula.pol); + + + return(list(fit=fit,cv_degree=cv_degree)) +} +``` + +```{r} +msqrt <- function(C) + {; + C.eig <- eigen(C); + return(C.eig$vectors %*% diag(sqrt(C.eig$values)) %*% solve(C.eig$vectors)); + }; + + +tboot<-function(regressors_grid, Omega.hat ,alpha, B=10000) { + + + numerator_grid<-regressors_grid%*%msqrt( Omega.hat) + denominator_grid<-sqrt(diag(regressors_grid%*% Omega.hat%*%t(regressors_grid))) + + norm_numerator_grid<-numerator_grid + for (k in 1:dim(numerator_grid)[1]) { + norm_numerator_grid[k,]<-numerator_grid[k,]/denominator_grid[k] + } + + tmaxs <- apply(abs( norm_numerator_grid%*% matrix(rnorm(dim(numerator_grid)[2]*B), nrow = dim(numerator_grid)[2], ncol = B)), 2, max) + return(quantile(tmaxs, 1-alpha)) + +} +``` + +```{r} +second_stage<-function(fs.hat,Y,D,X,max_degree=3,norder=4,nderiv=0,ss_method="poly",min_cutoff=0.01,alpha=0.05,eps=0.1,...) 
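+  # second_stage: approximate g(x) = E[Y(eta) | X = x] by least squares on a basis of X.
+  # As implemented below: (i) truncate the estimated propensity score at min_cutoff and
+  # build the orthogonal signal from the first-stage fits; (ii) regress the signal on
+  # cross-validated orthogonal polynomials (ss_method = "ortho_poly") or B-splines
+  # (ss_method = "splines"); (iii) evaluate the fit on the grid X_grid and return
+  # pointwise (normal) and uniform (multiplier bootstrap, via tboot) confidence bands.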
{ + + X_grid = seq(min(X),max(X),eps) + mu1.hat<-fs.hat[["mu1.hat"]] + mu0.hat<-fs.hat[["mu0.hat"]] + # propensity score + s.hat<-fs.hat[["s.hat"]] + s.hat<-sapply(s.hat,max,min_cutoff) + ### Construct Orthogonal Signal + + RobustSignal<-(Y - mu1.hat)*D/s.hat - (Y - mu0.hat)*(1-D)/(1-s.hat) + mu1.hat - mu0.hat + + + + # Estimate the target function using least squares series + if (ss_method == "ortho_poly") { + res<-least_squares_series(X=X,Y=RobustSignal,eps=0.1,max_degree=max_degree) + fit<-res$fit + cv_degree<-res$cv_degree + regressors_grid<-cbind( rep(1,length(X_grid)), poly(X_grid,cv_degree)) + + } + if (ss_method == "splines") { + + res<-least_squares_splines(X=X,Y=RobustSignal,eps=0.1,norder=norder,nderiv=nderiv) + fit<-res$fit + cv_knot<-res$cv_knot + breaks<- quantile(X, c(0:cv_knot)/cv_knot) + regressors_grid<-cbind( rep(1,length(X_grid)), bsplineS(X_grid, breaks =breaks, norder = norder, nderiv = nderiv)[ ,-1]) + degree=cv_knot + + + } + + + g.hat<-regressors_grid%*%coef(fit) + + + HCV.coefs <- vcovHC(fit, type = 'HC') + #Omega.hat<-white_vcov(regressors,Y,b.hat=coef(fit)) + standard_error<-sqrt(diag(regressors_grid%*% HCV.coefs%*%t(regressors_grid))) + ### Lower Pointwise CI + ghat.lower.point<-g.hat+qnorm(alpha/2)*standard_error + ### Upper Pointwise CI + ghat.upper.point<-g.hat+qnorm(1-alpha/2)*standard_error + + max_tstat<-tboot(regressors_grid=regressors_grid, Omega.hat=HCV.coefs,alpha=alpha) + + + ## Lower Uniform CI + ghat.lower<-g.hat-max_tstat*standard_error + ## Upper Uniform CI + ghat.upper<-g.hat+max_tstat*standard_error + return(list(ghat.lower=ghat.lower,g.hat=g.hat, ghat.upper=ghat.upper,fit=fit,ghat.lower.point=ghat.lower.point, + ghat.upper.point=ghat.upper.point,X_grid=X_grid)) + + + +} +``` + +```{r} +make_plot<-function(res,lowy,highy,degree,ss_method = "series",uniform=TRUE,...) 
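+  # make_plot: ggplot of the estimated function over the income grid, with income shown
+  # on the dollar scale via exp(income) since it enters in logs. The solid black line is
+  # the estimate, dashed blue lines are pointwise bands, and (when uniform = TRUE) solid
+  # blue lines are uniform bands.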
{ + + + title=paste0("Effect of 401(k) on Net TFA, ", ss_method) + X_grid=res$X_grid + len = length(X_grid) + + + if (uniform) { + group <-c(rep("UCI",len), rep("PCI",len), rep("Estimate",len),rep("PCIL",len),rep("UCIL",len)) + group_type<- c(rep("CI",len), rep("CI",len), rep("Estimate",len),rep("CI",len),rep("CI",len)) + group_ci_type<-c(rep("Uniform",len), rep("Point",len), rep("Uniform",len),rep("Point",len),rep("Uniform",len)) + + df<-data.frame(income=rep(X_grid,5), outcome = c(res$ghat.lower,res$ghat.lower.point,res$g.hat,res$ghat.upper.point,res$ghat.upper),group=group, group_col = group_type,group_line =group_ci_type ) + p<-ggplot(data=df)+ + aes(x=exp(income),y=outcome,colour=group )+ + theme_bw()+ + xlab("Income")+ + ylab("Net TFA, (thousand dollars)")+ + scale_colour_manual(values=c("black","blue","blue","blue","blue"))+ + theme(plot.title = element_text(hjust = 0.5),text=element_text(size=20, family="serif"))+ + theme(legend.title=element_blank())+ + theme(legend.position="none")+ + ylim(low=lowy,high=highy)+ + geom_line(aes(linetype = group_line),size=1.5)+ + scale_linetype_manual(values=c("dashed","solid"))+ + ggtitle(title) + } + + if (!uniform) { + group <-c( rep("PCI",len), rep("Estimate",len),rep("PCIL",len)) + group_type<- c(rep("CI",len), rep("Estimate",len),rep("CI",len)) + group_ci_type<-c(rep("Point",len), rep("Uniform",len),rep("Point",len)) + + df<-data.frame(income=rep(X_grid,3), outcome = c(res$ghat.lower.point,res$g.hat,res$ghat.upper.point),group=group, group_col = group_type,group_line =group_ci_type ) + + p<-ggplot(data=df)+ + aes(x=exp(income),y=outcome,colour=group )+ + theme_bw()+ + xlab("Income")+ + ylab("Net TFA, (thousand dollars)")+ + scale_colour_manual(values=c("black","blue","blue","blue","blue"))+ + theme(plot.title = element_text(hjust = 0.5),text=element_text(size=20, family="serif"))+ + theme(legend.title=element_blank())+ + theme(legend.position="none")+ + ylim(low=lowy,high=highy)+ + geom_line(aes(linetype = group_line),size=1.5)+ + scale_linetype_manual(values=c("dashed","solid"))+ + ggtitle(title) + + } + + + + return(p) +} +``` + +```{r} +res_ortho_rf_splines=second_stage(fs.hat=fs.hat.rf,X=X,D=D,Y=Y,ss_method="splines",max_degree=3) +``` + +```{r} +res_ortho_rf_ortho_poly=second_stage(fs.hat=fs.hat.rf,X=X,D=D,Y=Y,ss_method="ortho_poly",max_degree=3) +``` + +#plot findings: + +-- black solid line shows estimated function $p(x)' \widehat{\beta}$ + +-- blue dashed lines show pointwise confidence bands for this function + +```{r} +p<-make_plot(res_ortho_rf_ortho_poly,ss_method="ortho_poly",uniform=FALSE, lowy=-10,highy=20) +options(repr.plot.width=15, repr.plot.height=10) +print(p) +``` + +plot findings: + +-- black solid line shows estimated function $p(x)' \widehat{\beta}$ + +-- blue dashed lines show pointwise confidence bands for this function. I.e., for each fixed point $x_0$, i.e., $x_0=1$, they cover $p(x_0)'\beta_0$ with probability 0.95 + +-- blue solid lines show uniform confidence bands for this function. 
I.e., they cover the whole function $x \rightarrow p(x)'\beta_0$ with probability 0.95 on some compact range + +```{r} +p<-make_plot(res_ortho_rf_ortho_poly,ss_method="ortho polynomials",uniform=TRUE,lowy=-10,highy=25) +options(repr.plot.width=15, repr.plot.height=10) +print(p) +``` + +```{r} +p<-make_plot(res_ortho_rf_splines,ss_method="splines",uniform=FALSE, lowy=-15,highy=10) +options(repr.plot.width=15, repr.plot.height=10) +print(p) +``` + +```{r} +p<-make_plot(res_ortho_rf_splines,ss_method="splines",uniform=TRUE,lowy=-20,highy=20) +options(repr.plot.width=15, repr.plot.height=10) +print(p) +``` + diff --git a/T/dml-for-conditional-average-treatment-effect.irnb b/T/dml-for-conditional-average-treatment-effect.irnb index b16109f0..6aaec1fb 100644 --- a/T/dml-for-conditional-average-treatment-effect.irnb +++ b/T/dml-for-conditional-average-treatment-effect.irnb @@ -1,963 +1,843 @@ { - "metadata": { - "kernelspec": { - "name": "ir", - "display_name": "R", - "language": "R" - }, - "language_info": { - "name": "R", - "codemirror_mode": "r", - "pygments_lexer": "r", - "mimetype": "text/x-r-source", - "file_extension": ".r", - "version": "3.6.3" - }, - "colab": { - "provenance": [] - } + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "PtRhnrDslwi1" + }, + "source": [ + "# DML for CATE\n", + "\n", + "This is a simple demonstration of Debiased Machine Learning estimator for the Conditional Average Treatment Effect.\n", + "Goal is to estimate the effect of 401(k) eligibility on net financial assets for each value of income.\n", + "The method is based on the following paper.\n", + "\n", + "* Title: Debiased Machine Learning of Conditional Average Treatment Effect and Other Causal Functions\n", + "\n", + "* Authors: Semenova, Vira and Chernozhukov, Victor.\n", + "\n", + "* Arxiv version: https://arxiv.org/pdf/1702.06240.pdf\n", + "\n", + "* Published version with replication code: https://academic.oup.com/ectj/advance-article/doi/10.1093/ectj/utaa027/5899048\n" + ] }, - "nbformat_minor": 0, - "nbformat": 4, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# DML for CATE\n", - "\n", - "This is a simple demonstration of Debiased Machine Learning estimator for the Conditional Average Treatment Effect.\n", - "Goal is to estimate the effect of 401(k) eligibility on net financial assets for each value of income.\n", - "The method is based on the following paper.\n", - "\n", - "* Title: Debiased Machine Learning of Conditional Average Treatment Effect and Other Causal Functions\n", - "\n", - "* Authors: Semenova, Vira and Chernozhukov, Victor.\n", - "\n", - "* Arxiv version: https://arxiv.org/pdf/1702.06240.pdf\n", - "\n", - "* Published version with replication code: https://academic.oup.com/ectj/advance-article/doi/10.1093/ectj/utaa027/5899048\n" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-05-24T12:51:59.763661Z", - "iopub.execute_input": "2021-05-24T12:51:59.765554Z", - "iopub.status.idle": "2021-05-24T12:51:59.866888Z" - }, - "id": "PtRhnrDslwi1" - } - }, - { - "cell_type": "markdown", - "source": [ - "Background\n", - "\n", - "The target function is Conditional Average Treatment Effect, defined as\n", - "\n", - "$$ g(x)=E [ Y(1) - Y(0) |X=x], $$\n", - "\n", - "where $Y(1)$ and $Y(0)$ are potential outcomes in treated and control group. In our case, $Y(1)$ is the potential Net Financial Assets if a subject is eligible for 401(k), and $Y(0)$ is the potential Net Financial Assets if a subject is ineligible. 
$X$ is a covariate of interest, in this case, income.\n", - "$ g(x)$ shows expected effect of eligibility on NET TFA for a subject whose income level is $x$.\n", - "\n", - "\n", - "\n", - "If eligibility indicator is independent of $Y(1), Y(0)$, given pre-401-k assignment characteristics $Z$, the function can expressed in terms of observed data (as opposed to hypothetical, or potential outcomes). Observed data consists of realized NET TFA $Y = D Y(1) + (1-D) Y(0)$, eligibility indicator $D$, and covariates $Z$ which includes $X$, income. The expression for $g(x)$ is\n", - "\n", - "$$ g(x) = E [ Y (\\eta_0) \\mid X=x], $$\n", - "where the transformed outcome variable is\n", - "\n", - "$$Y (\\eta) = \\dfrac{D}{s(Z)} \\left( Y - \\mu(1,Z) \\right) - \\dfrac{1-D}{1-s(Z)} \\left( Y - \\mu(0,Z) \\right) + \\mu(1,Z) - \\mu(0,Z),$$\n", - "\n", - "the probability of eligibility is\n", - "\n", - "$$s_0(z) = Pr (D=1 \\mid Z=z),$$\n", - "\n", - "the expected net financial asset given $D =d \\in \\{1,0\\}$ and $Z=z$ is\n", - "\n", - "$$ \\mu(d,z) = E[ Y \\mid Z=z, D=d]. $$\n", - "\n", - "Our goal is to estimate $g(x)$.\n", - "\n", - "\n", - "In step 1, we estimate the unknown functions $s_0(z), \\mu(1,z), \\mu(0,z)$ and plug them into $Y (\\eta)$.\n", - "\n", - "\n", - "In step 2, we approximate the function $g(x)$ by a linear combination of basis functions:\n", - "\n", - "$$ g(x) = p(x)' \\beta_0, $$\n", - "\n", - "\n", - "where $p(x)$ is a vector of polynomials or splines and\n", - "\n", - "$$ \\beta_0 = (E p(X) p(X))^{-1} E p(X) Y (\\eta_0) $$\n", - "\n", - "is the best linear predictor. We report\n", - "\n", - "$$\n", - "\\widehat{g}(x) = p(x)' \\widehat{\\beta},\n", - "$$\n", - "\n", - "where $\\widehat{\\beta}$ is the ordinary least squares estimate of $\\beta_0$ defined on the random sample $(X_i, D_i, Y_i)_{i=1}^N$\n", - "\n", - "$$\n", - "\t\\widehat{\\beta} :=\\left( \\dfrac{1}{N} \\sum_{i=1}^N p(X_i) p(X_i)' \\right)^{-1} \\dfrac{1}{N} \\sum_{i=1}^N p(X_i)Y_i(\\widehat{\\eta})\n", - "$$\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "metadata": { - "id": "ibFWChhelwi4" - } - }, - { - "cell_type": "code", - "source": [ - "## load packages\n", - "install.packages(\"foreign\")\n", - "install.packages(\"quantreg\")\n", - "install.packages(\"splines\")\n", - "install.packages(\"lattice\")\n", - "#install.packages(\"mnormt\")\n", - "install.packages(\"Hmisc\")\n", - "install.packages(\"fda\")\n", - "install.packages(\"hdm\")\n", - "install.packages(\"randomForest\")\n", - "install.packages(\"ranger\")\n", - "install.packages(\"sandwich\")\n", - "\n", - "library(foreign)\n", - "library(quantreg)\n", - "library(splines)\n", - "library(lattice)\n", - "#library(mnormt)\n", - "library(Hmisc)\n", - "library(fda)\n", - "library(hdm)\n", - "library(randomForest)\n", - "library(ranger)\n", - "library(sandwich)\n", - "library(ggplot2)\n" - ], - "metadata": { - "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", - "_execution_state": "idle", - "execution": { - "iopub.status.busy": "2021-06-17T23:22:23.184728Z", - "iopub.execute_input": "2021-06-17T23:22:23.212298Z", - "iopub.status.idle": "2021-06-17T23:22:25.857077Z" - }, - "trusted": true, - "id": "Lsd0vLFOlwi4" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "## 401k dataset\n", - "data(pension)\n", - "pension$net_tfa<-pension$net_tfa/10000\n", - "## covariate of interest -- log income --\n", - "pension$inc = log(pension$inc)\n", - "#pension$inc[is.na(pension$inc)]<-0\n", - 
"pension<-pension[!is.na(pension$inc) & pension$inc!=-Inf & pension$inc !=Inf,]\n", - "\n", - "\n", - "## outcome variable -- total net financial assets\n", - "Y=pension$net_tfa\n", - "## binary treatment -- indicator of 401(k) eligibility\n", - "D=pension$e401\n", - "\n", - "\n", - "X=pension$inc\n", - "## target parameter is CATE = E[ Y(1) - Y(0) | X]\n", - "\n", - "\n", - "## raw covariates so that Y(1) and Y(0) are independent of D given Z\n", - "Z = pension[,c(\"age\",\"inc\",\"fsize\",\"educ\",\"male\",\"db\",\"marr\",\"twoearn\",\"pira\",\"hown\",\"hval\",\"hequity\",\"hmort\",\n", - " \"nohs\",\"hs\",\"smcol\")]\n", - "\n", - "\n", - "y_name <- \"net_tfa\";\n", - "d_name <- \"e401\";\n", - "form_z <- \"(poly(age, 6) + poly(inc, 8) + poly(educ, 4) + poly(fsize,2) + as.factor(marr) + as.factor(twoearn) + as.factor(db) + as.factor(pira) + as.factor(hown))^2\";\n", - "\n", - "\n", - "\n", - "cat(sprintf(\"\\n sample size is %g \\n\", length(Y) ))\n", - "cat(sprintf(\"\\n num raw covariates z is %g \\n\", dim(Z)[2] ))" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-06-17T23:22:29.81323Z", - "iopub.execute_input": "2021-06-17T23:22:29.815342Z", - "iopub.status.idle": "2021-06-17T23:22:30.080343Z" - }, - "trusted": true, - "id": "P3fqmaJFlwi6" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "In Step 1, we estimate three functions:\n", - "\n", - "1. probability of treatment assignment $s_0(z)$\n", - "\n", - "2.-3. regression functions $\\mu_0(1,z)$ and $\\mu_0(0,z)$. \n", - "\n", - "We use the cross-fitting procedure with $K=2$ holds. For definition of cross-fitting with $K$ folds, check the sample splitting in ```DML2.for.PLM``` function defined in https://www.kaggle.com/victorchernozhukov/debiased-ml-for-partially-linear-model-in-r\n", - "\n", - "For each function, we try random forest.\n" - ], - "metadata": { - "id": "BMNm-HsYlwi7" - } - }, - { - "cell_type": "markdown", - "source": [ - "First Stage: estimate $\\mu_0(1,z)$ and $\\mu_0(0,z)$ and $s_0(z)$ by lasso" - ], - "metadata": { - "id": "mug_Q4w2lwi7" - } - }, - { - "cell_type": "code", - "source": [ - "\n", - "first_stage_lasso<-function(data,d_name,y_name, form_z, seed=1) {\n", - "\n", - " # Sample size\n", - " N<-dim(data)[1]\n", - " # Estimated regression function in control group\n", - " mu0.hat<-rep(1,N)\n", - " # Estimated regression function in treated group\n", - " mu1.hat<-rep(1,N)\n", - " # Propensity score\n", - " s.hat<-rep(1,N)\n", - " seed=1\n", - " ## define sample splitting\n", - " set.seed(seed)\n", - " inds.train=sample(1:N,floor(N/2))\n", - " inds.eval=setdiff(1:N,inds.train)\n", - "\n", - " print (\"Estimate treatment probability, first half\")\n", - " ## conditional probability of 401 k eligibility (i.e., propensity score) based on random forest\n", - " fitted.lasso.pscore<-rlassologit(as.formula(paste0(d_name,\"~\",form_z )),data=data[inds.train,])\n", - "\n", - " s.hat[inds.eval]<-predict(fitted.lasso.pscore,data[inds.eval,],type=\"response\")\n", - " print (\"Estimate treatment probability, second half\")\n", - " fitted.lasso.pscore<-rlassologit(as.formula(paste0(d_name,\"~\",form_z )),data=data[inds.eval,])\n", - " s.hat[inds.train]<-predict( fitted.lasso.pscore,data[inds.train,],type=\"response\")\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " data1<-data\n", - " data1[,d_name]<-1\n", - "\n", - " data0<-data\n", - " data0[,d_name]<-0\n", - "\n", - " print (\"Estimate expectation function, first half\")\n", - " 
fitted.lasso.mu<-rlasso(as.formula(paste0(y_name,\"~\",d_name,\"+(\",form_z,\")\" )),data=data[inds.train,])\n", - " mu1.hat[inds.eval]<-predict( fitted.lasso.mu,data1[inds.eval,])\n", - " mu0.hat[inds.eval]<-predict( fitted.lasso.mu,data0[inds.eval,])\n", - "\n", - " print (\"Estimate expectation function, second half\")\n", - " fitted.lasso.mu<-rlasso(as.formula(paste0(y_name,\"~\",d_name,\"+(\",form_z,\")\" )),data=data[inds.eval,])\n", - " mu1.hat[inds.train]<-predict( fitted.lasso.mu,data1[inds.train,])\n", - " mu0.hat[inds.train]<-predict( fitted.lasso.mu,data0[inds.train,])\n", - "\n", - " return (list(mu1.hat=mu1.hat,\n", - " mu0.hat=mu0.hat,\n", - " s.hat=s.hat))\n", - "\n", - "}" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-06-17T23:22:34.034028Z", - "iopub.execute_input": "2021-06-17T23:22:34.035487Z", - "iopub.status.idle": "2021-06-17T23:22:34.048805Z" - }, - "trusted": true, - "id": "um5Uxua2lwi7" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "First Stage: estimate $\\mu_0(1,z)$ and $\\mu_0(0,z)$ and $s_0(z)$ by random forest" - ], - "metadata": { - "id": "HCwXUQtUlwi8" - } - }, - { - "cell_type": "code", - "source": [ - "first_stage_rf<-function(Y,D,Z,seed=1) {\n", - "\n", - " # Sample size\n", - " N<-length(D)\n", - " # Estimated regression function in control group\n", - " mu0.hat<-rep(1,N)\n", - " # Estimated regression function in treated group\n", - " mu1.hat<-rep(1,N)\n", - " # Propensity score\n", - " s.hat<-rep(1,N)\n", - "\n", - "\n", - " ## define sample splitting\n", - " set.seed(seed)\n", - " inds.train=sample(1:N,floor(N/2))\n", - " inds.eval=setdiff(1:N,inds.train)\n", - "\n", - " print (\"Estimate treatment probability, first half\")\n", - " ## conditional probability of 401 k eligibility (i.e., propensity score) based on random forest\n", - " D.f<-as.factor(as.character(D))\n", - " fitted.rf.pscore<-randomForest(Z,D.f,subset=inds.train)\n", - " s.hat[inds.eval]<-predict(fitted.rf.pscore,Z[inds.eval,],type=\"prob\")[,2]\n", - " print (\"Estimate treatment probability, second half\")\n", - " fitted.rf<-randomForest(Z,D.f,subset=inds.eval)\n", - " s.hat[inds.train]<-predict(fitted.rf.pscore,Z[inds.train,],type=\"prob\")[,2]\n", - "\n", - " ## conditional expected net financial assets (i.e., regression function) based on random forest\n", - "\n", - " covariates<-cbind(Z,D)\n", - "\n", - " covariates1<-cbind(Z,D=rep(1,N))\n", - " covariates0<-cbind(Z,D=rep(0,N))\n", - "\n", - " print (\"Estimate expectation function, first half\")\n", - " fitted.rf.mu<-randomForest(cbind(Z,D),Y,subset=inds.train)\n", - " mu1.hat[inds.eval]<-predict( fitted.rf.mu,covariates1[inds.eval,])\n", - " mu0.hat[inds.eval]<-predict( fitted.rf.mu,covariates0[inds.eval,])\n", - "\n", - " print (\"Estimate expectation function, second half\")\n", - " fitted.rf.mu<-randomForest(cbind(Z,D),Y,subset=inds.eval)\n", - " mu1.hat[inds.train]<-predict( fitted.rf.mu,covariates1[inds.train,])\n", - " mu0.hat[inds.train]<-predict( fitted.rf.mu,covariates0[inds.train,])\n", - "\n", - " return (list(mu1.hat=mu1.hat,\n", - " mu0.hat=mu0.hat,\n", - " s.hat=s.hat))\n", - "\n", - "}" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-06-17T23:22:39.936225Z", - "iopub.execute_input": "2021-06-17T23:22:39.937654Z", - "iopub.status.idle": "2021-06-17T23:22:39.949402Z" - }, - "trusted": true, - "id": "XXWJvNPTlwi8" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "In Step 
2, we approximate $Y(\\eta_0)$ by a vector of basis functions. There are two use cases:\n", - "****\n", - "2.A. Group Average Treatment Effect, described above\n", - "\n", - "\n", - "2.B. Average Treatment Effect conditional on income value. There are three smoothing options:\n", - "\n", - "1. splines offered in ```least_squares_splines```\n", - "\n", - "2. orthogonal polynomials with the highest degree chosen by cross-validation ```least_squares_series```\n", - "\n", - "3. standard polynomials with the highest degree input by user ```least_squares_series_old```\n", - "\n", - "\n", - "The default option is option 3." - ], - "metadata": { - "trusted": true, - "id": "dlNaseXIlwi8" - } - }, - { - "cell_type": "markdown", - "source": [ - "2.A. The simplest use case of Conditional Average Treatment Effect is GATE, or Group Average Treatment Effect. Partition the support of income as\n", - "\n", - "$$ - \\infty = \\ell_0 < \\ell_1 < \\ell_2 \\dots \\ell_K = \\infty $$\n", - "\n", - "define intervals $I_k = [ \\ell_{k-1}, \\ell_{k})$. Let $X$ be income covariate. For $X$, define a group indicator\n", - "\n", - "$$ G_k(X) = 1[X \\in I_k], $$\n", - "\n", - "and the vector of basis functions\n", - "\n", - "$$ p(X) = (G_1(X), G_2(X), \\dots, G_K(X)) $$\n", - "\n", - "Then, the Best Linear Predictor $\\beta_0$ vector shows the average treatment effect for each group." - ], - "metadata": { - "id": "lg4CWsRqlwi8" - } - }, - { - "cell_type": "code", - "source": [ - "## estimate first stage functions by random forest\n", - "## may take a while\n", - "fs.hat.rf = first_stage_rf(Y,D,Z)" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-06-17T23:22:43.39889Z", - "iopub.execute_input": "2021-06-17T23:22:43.400377Z", - "iopub.status.idle": "2021-06-17T23:25:21.477536Z" - }, - "trusted": true, - "id": "gDrRceialwi9" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "X=pension$inc\n", - "fs.hat<-fs.hat.rf\n", - "min_cutoff=0.01\n", - "# regression function\n", - "mu1.hat<-fs.hat[[\"mu1.hat\"]]\n", - "mu0.hat<-fs.hat[[\"mu0.hat\"]]\n", - "# propensity score\n", - "s.hat<-fs.hat[[\"s.hat\"]]\n", - "s.hat<-sapply(s.hat,max,min_cutoff)\n", - "### Construct Orthogonal Signal\n", - "\n", - "\n", - "RobustSignal<-(Y - mu1.hat)*D/s.hat - (Y - mu0.hat)*(1-D)/(1-s.hat) + mu1.hat - mu0.hat" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-06-17T23:29:29.48508Z", - "iopub.execute_input": "2021-06-17T23:29:29.486762Z", - "iopub.status.idle": "2021-06-17T23:29:29.51917Z" - }, - "trusted": true, - "id": "Y76SLTKPlwi9" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "qtmax <- function(C, S=10000, alpha)\n", - " {;\n", - " p <- nrow(C);\n", - " tmaxs <- apply(abs(matrix(rnorm(p*S), nrow = p, ncol = S)), 2, max);\n", - " return(quantile(tmaxs, 1-alpha));\n", - " };\n", - "\n", - "# This function computes the square root of a symmetric matrix using the spectral decomposition;\n", - "\n", - "\n", - "group_average_treatment_effect<-function(X,Y,max_grid=5,alpha=0.05, B=10000) {\n", - "\n", - " grid<-quantile(X,probs=c((0:max_grid)/max_grid))\n", - " X.raw<-matrix(NA, nrow=length(Y),ncol=length(grid)-1)\n", - "\n", - " for (k in 2:((length(grid)))) {\n", - " X.raw[,k-1]<-sapply(X, function (x) ifelse (x>=grid[k-1] & x=grid[k-1] & x<=grid[k],1,0) )\n", - "\n", - " ols.fit<- lm(Y~X.raw-1)\n", - " coefs <- coef(ols.fit)\n", - " vars <- names(coefs)\n", - " HCV.coefs <- vcovHC(ols.fit, type = 'HC')\n", - " 
coefs.se <- sqrt(diag(HCV.coefs)) # White std errors\n", - " ## this is an identity matrix\n", - " ## qtmax is simplified\n", - " C.coefs <- (diag(1/sqrt(diag(HCV.coefs)))) %*% HCV.coefs %*% (diag(1/sqrt(diag(HCV.coefs))));\n", - "\n", - "\n", - " tes <- coefs\n", - " tes.se <- coefs.se\n", - " tes.cor <- C.coefs\n", - " crit.val <- qtmax(tes.cor,B,alpha);\n", - "\n", - " tes.ucb <- tes + crit.val * tes.se;\n", - " tes.lcb <- tes - crit.val * tes.se;\n", - "\n", - " tes.uci <- tes + qnorm(1-alpha/2) * tes.se;\n", - " tes.lci <- tes + qnorm(alpha/2) * tes.se;\n", - "\n", - "\n", - " return(list(beta.hat=coefs, ghat.lower.point=tes.lci, ghat.upper.point=tes.uci,\n", - " ghat.lower=tes.lcb, ghat.upper= tes.ucb, crit.val=crit.val ))\n", - "}" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-06-17T23:29:31.606456Z", - "iopub.execute_input": "2021-06-17T23:29:31.609202Z", - "iopub.status.idle": "2021-06-17T23:29:31.629475Z" - }, - "trusted": true, - "id": "nODBERqdlwi9" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "res<-group_average_treatment_effect(X=X,Y=RobustSignal)" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-06-17T23:29:33.458131Z", - "iopub.execute_input": "2021-06-17T23:29:33.459626Z", - "iopub.status.idle": "2021-06-17T23:29:34.195401Z" - }, - "trusted": true, - "id": "anNpVKgnlwi9" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "## this code is taken from L1 14.382 taught at MIT\n", - "## author: Mert Demirer\n", - "options(repr.plot.width=10, repr.plot.height=8)\n", - "\n", - "tes<-res$beta.hat\n", - "tes.lci<-res$ghat.lower.point\n", - "tes.uci<-res$ghat.upper.point\n", - "\n", - "tes.lcb<-res$ghat.lower\n", - "tes.ucb<-res$ghat.upper\n", - "tes.lev<-c('0%-20%', '20%-40%','40%-60%','60%-80%','80%-100%')\n", - "\n", - "plot( c(1,5), las = 2, xlim =c(0.6, 5.4), ylim = c(.05, 2.09), type=\"n\",xlab=\"Income group\",\n", - " ylab=\"Average Effect on NET TFA (per 10 K)\", main=\"Group Average Treatment Effects on NET TFA\", xaxt=\"n\");\n", - "axis(1, at=1:5, labels=tes.lev);\n", - "for (i in 1:5)\n", - "{;\n", - " rect(i-0.2, tes.lci[i], i+0.2, tes.uci[i], col = NA, border = \"red\", lwd = 3);\n", - " rect(i-0.2, tes.lcb[i], i+0.2, tes.ucb[i], col = NA, border = 4, lwd = 3 );\n", - " segments(i-0.2, tes[i], i+0.2, tes[i], lwd = 5 );\n", - "};\n", - "abline(h=0);\n", - "\n", - "legend(2.5, 2.0, c('Regression Estimate', '95% Simultaneous Confidence Interval', '95% Pointwise Confidence Interval'), col = c(1,4,2), lwd = c(4,3,3), horiz = F, bty = 'n', cex=0.8);\n", - "\n", - "dev.off()" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-06-17T23:29:36.013118Z", - "iopub.execute_input": "2021-06-17T23:29:36.014791Z", - "iopub.status.idle": "2021-06-17T23:29:36.401451Z" - }, - "trusted": true, - "id": "sdpo7pwMlwi9" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "least_squares_splines<-function(X,Y,max_knot=9,norder,nderiv,...) 
{\n", - " ## Create technical regressors\n", - " cv.bsp<-rep(0,max_knot-1)\n", - " for (knot in 2:max_knot) {\n", - " breaks<- quantile(X, c(0:knot)/knot)\n", - " formula.bsp \t<- Y ~ bsplineS(X, breaks =breaks, norder = norder, nderiv = nderiv)[ ,-1]\n", - " fit\t<- lm(formula.bsp);\n", - " cv.bsp[knot-1]\t\t<- sum( (fit$res / (1 - hatvalues(fit)) )^2);\n", - " }\n", - " ## Number of knots chosen by cross-validation\n", - " cv_knot<-which.min(cv.bsp)+1\n", - " breaks<- quantile(X, c(0:cv_knot)/cv_knot)\n", - " formula.bsp \t<- Y ~ bsplineS(X, breaks =breaks, norder = norder, nderiv = 0)[ ,-1]\n", - " fit\t<- lm(formula.bsp);\n", - "\n", - "\n", - " return(list(cv_knot=cv_knot,fit=fit))\n", - "}\n", - "\n", - "\n", - "least_squares_series<-function(X, Y,max_degree,...) {\n", - "\n", - " cv.pol<-rep(0,max_degree)\n", - " for (degree in 1:max_degree) {\n", - " formula.pol \t<- Y ~ poly(X, degree)\n", - " fit\t<- lm(formula.pol );\n", - " cv.pol[degree]\t\t<- sum( (fit$res / (1 - hatvalues(fit)) )^2);\n", - " }\n", - " ## Number of knots chosen by cross-validation\n", - " cv_degree<-which.min(cv.pol)\n", - " ## Estimate coefficients\n", - " formula.pol \t<- Y ~ poly(X, cv_degree)\n", - " fit\t<- lm(formula.pol);\n", - "\n", - "\n", - " return(list(fit=fit,cv_degree=cv_degree))\n", - "}" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-06-17T23:29:39.594018Z", - "iopub.execute_input": "2021-06-17T23:29:39.595684Z", - "iopub.status.idle": "2021-06-17T23:29:39.61105Z" - }, - "trusted": true, - "id": "_BXi61OQlwi9" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "msqrt <- function(C)\n", - " {;\n", - " C.eig <- eigen(C);\n", - " return(C.eig$vectors %*% diag(sqrt(C.eig$values)) %*% solve(C.eig$vectors));\n", - " };\n", - "\n", - "\n", - "tboot<-function(regressors_grid, Omega.hat ,alpha, B=10000) {\n", - "\n", - "\n", - " numerator_grid<-regressors_grid%*%msqrt( Omega.hat)\n", - " denominator_grid<-sqrt(diag(regressors_grid%*% Omega.hat%*%t(regressors_grid)))\n", - "\n", - " norm_numerator_grid<-numerator_grid\n", - " for (k in 1:dim(numerator_grid)[1]) {\n", - " norm_numerator_grid[k,]<-numerator_grid[k,]/denominator_grid[k]\n", - " }\n", - "\n", - " tmaxs <- apply(abs( norm_numerator_grid%*% matrix(rnorm(dim(numerator_grid)[2]*B), nrow = dim(numerator_grid)[2], ncol = B)), 2, max)\n", - " return(quantile(tmaxs, 1-alpha))\n", - "\n", - "}" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-06-17T23:29:41.532655Z", - "iopub.execute_input": "2021-06-17T23:29:41.534418Z", - "iopub.status.idle": "2021-06-17T23:29:41.549624Z" - }, - "trusted": true, - "id": "uvxphkMUlwi-" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "second_stage<-function(fs.hat,Y,D,X,max_degree=3,norder=4,nderiv=0,ss_method=\"poly\",min_cutoff=0.01,alpha=0.05,eps=0.1,...) 
{\n", - "\n", - " X_grid = seq(min(X),max(X),eps)\n", - " mu1.hat<-fs.hat[[\"mu1.hat\"]]\n", - " mu0.hat<-fs.hat[[\"mu0.hat\"]]\n", - " # propensity score\n", - " s.hat<-fs.hat[[\"s.hat\"]]\n", - " s.hat<-sapply(s.hat,max,min_cutoff)\n", - " ### Construct Orthogonal Signal\n", - "\n", - " RobustSignal<-(Y - mu1.hat)*D/s.hat - (Y - mu0.hat)*(1-D)/(1-s.hat) + mu1.hat - mu0.hat\n", - "\n", - "\n", - "\n", - " # Estimate the target function using least squares series\n", - " if (ss_method == \"ortho_poly\") {\n", - " res<-least_squares_series(X=X,Y=RobustSignal,eps=0.1,max_degree=max_degree)\n", - " fit<-res$fit\n", - " cv_degree<-res$cv_degree\n", - " regressors_grid<-cbind( rep(1,length(X_grid)), poly(X_grid,cv_degree))\n", - "\n", - " }\n", - " if (ss_method == \"splines\") {\n", - "\n", - " res<-least_squares_splines(X=X,Y=RobustSignal,eps=0.1,norder=norder,nderiv=nderiv)\n", - " fit<-res$fit\n", - " cv_knot<-res$cv_knot\n", - " breaks<- quantile(X, c(0:cv_knot)/cv_knot)\n", - " regressors_grid<-cbind( rep(1,length(X_grid)), bsplineS(X_grid, breaks =breaks, norder = norder, nderiv = nderiv)[ ,-1])\n", - " degree=cv_knot\n", - "\n", - "\n", - " }\n", - "\n", - "\n", - " g.hat<-regressors_grid%*%coef(fit)\n", - "\n", - "\n", - " HCV.coefs <- vcovHC(fit, type = 'HC')\n", - " #Omega.hat<-white_vcov(regressors,Y,b.hat=coef(fit))\n", - " standard_error<-sqrt(diag(regressors_grid%*% HCV.coefs%*%t(regressors_grid)))\n", - " ### Lower Pointwise CI\n", - " ghat.lower.point<-g.hat+qnorm(alpha/2)*standard_error\n", - " ### Upper Pointwise CI\n", - " ghat.upper.point<-g.hat+qnorm(1-alpha/2)*standard_error\n", - "\n", - " max_tstat<-tboot(regressors_grid=regressors_grid, Omega.hat=HCV.coefs,alpha=alpha)\n", - "\n", - "\n", - " ## Lower Uniform CI\n", - " ghat.lower<-g.hat-max_tstat*standard_error\n", - " ## Upper Uniform CI\n", - " ghat.upper<-g.hat+max_tstat*standard_error\n", - " return(list(ghat.lower=ghat.lower,g.hat=g.hat, ghat.upper=ghat.upper,fit=fit,ghat.lower.point=ghat.lower.point,\n", - " ghat.upper.point=ghat.upper.point,X_grid=X_grid))\n", - "\n", - "\n", - "\n", - "}" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-06-17T23:29:43.135629Z", - "iopub.execute_input": "2021-06-17T23:29:43.137405Z", - "iopub.status.idle": "2021-06-17T23:29:43.150866Z" - }, - "trusted": true, - "id": "nupH8ItWlwi-" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "make_plot<-function(res,lowy,highy,degree,ss_method = \"series\",uniform=TRUE,...) 
{\n", - "\n", - "\n", - " title=paste0(\"Effect of 401(k) on Net TFA, \", ss_method)\n", - " X_grid=res$X_grid\n", - " len = length(X_grid)\n", - "\n", - "\n", - " if (uniform) {\n", - " group <-c(rep(\"UCI\",len), rep(\"PCI\",len), rep(\"Estimate\",len),rep(\"PCIL\",len),rep(\"UCIL\",len))\n", - " group_type<- c(rep(\"CI\",len), rep(\"CI\",len), rep(\"Estimate\",len),rep(\"CI\",len),rep(\"CI\",len))\n", - " group_ci_type<-c(rep(\"Uniform\",len), rep(\"Point\",len), rep(\"Uniform\",len),rep(\"Point\",len),rep(\"Uniform\",len))\n", - "\n", - " df<-data.frame(income=rep(X_grid,5), outcome = c(res$ghat.lower,res$ghat.lower.point,res$g.hat,res$ghat.upper.point,res$ghat.upper),group=group, group_col = group_type,group_line =group_ci_type )\n", - " p<-ggplot(data=df)+\n", - " aes(x=exp(income),y=outcome,colour=group )+\n", - " theme_bw()+\n", - " xlab(\"Income\")+\n", - " ylab(\"Net TFA, (thousand dollars)\")+\n", - " scale_colour_manual(values=c(\"black\",\"blue\",\"blue\",\"blue\",\"blue\"))+\n", - " theme(plot.title = element_text(hjust = 0.5),text=element_text(size=20, family=\"serif\"))+\n", - " theme(legend.title=element_blank())+\n", - " theme(legend.position=\"none\")+\n", - " ylim(low=lowy,high=highy)+\n", - " geom_line(aes(linetype = group_line),size=1.5)+\n", - " scale_linetype_manual(values=c(\"dashed\",\"solid\"))+\n", - " ggtitle(title)\n", - " }\n", - "\n", - " if (!uniform) {\n", - " group <-c( rep(\"PCI\",len), rep(\"Estimate\",len),rep(\"PCIL\",len))\n", - " group_type<- c(rep(\"CI\",len), rep(\"Estimate\",len),rep(\"CI\",len))\n", - " group_ci_type<-c(rep(\"Point\",len), rep(\"Uniform\",len),rep(\"Point\",len))\n", - "\n", - " df<-data.frame(income=rep(X_grid,3), outcome = c(res$ghat.lower.point,res$g.hat,res$ghat.upper.point),group=group, group_col = group_type,group_line =group_ci_type )\n", - "\n", - " p<-ggplot(data=df)+\n", - " aes(x=exp(income),y=outcome,colour=group )+\n", - " theme_bw()+\n", - " xlab(\"Income\")+\n", - " ylab(\"Net TFA, (thousand dollars)\")+\n", - " scale_colour_manual(values=c(\"black\",\"blue\",\"blue\",\"blue\",\"blue\"))+\n", - " theme(plot.title = element_text(hjust = 0.5),text=element_text(size=20, family=\"serif\"))+\n", - " theme(legend.title=element_blank())+\n", - " theme(legend.position=\"none\")+\n", - " ylim(low=lowy,high=highy)+\n", - " geom_line(aes(linetype = group_line),size=1.5)+\n", - " scale_linetype_manual(values=c(\"dashed\",\"solid\"))+\n", - " ggtitle(title)\n", - "\n", - " }\n", - "\n", - "\n", - "\n", - " return(p)\n", - "}" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-06-17T23:30:32.523395Z", - "iopub.execute_input": "2021-06-17T23:30:32.524854Z", - "iopub.status.idle": "2021-06-17T23:30:32.538133Z" - }, - "trusted": true, - "id": "NB10aQcPlwi-" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "res_ortho_rf_splines=second_stage(fs.hat=fs.hat.rf,X=X,D=D,Y=Y,ss_method=\"splines\",max_degree=3)" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-06-17T23:30:34.152061Z", - "iopub.execute_input": "2021-06-17T23:30:34.153603Z", - "iopub.status.idle": "2021-06-17T23:30:34.373651Z" - }, - "trusted": true, - "id": "A_cHQnAblwi-" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "res_ortho_rf_ortho_poly=second_stage(fs.hat=fs.hat.rf,X=X,D=D,Y=Y,ss_method=\"ortho_poly\",max_degree=3)" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-06-17T23:30:35.857481Z", - "iopub.execute_input": 
"2021-06-17T23:30:35.858912Z", - "iopub.status.idle": "2021-06-17T23:30:35.999597Z" - }, - "trusted": true, - "id": "AGOMgvgglwi-" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#plot findings:\n", - "\n", - "-- black solid line shows estimated function $p(x)' \\widehat{\\beta}$\n", - "\n", - "-- blue dashed lines show pointwise confidence bands for this function" - ], - "metadata": { - "id": "jXpt7WdKlwi-" - } - }, - { - "cell_type": "code", - "source": [ - "p<-make_plot(res_ortho_rf_ortho_poly,ss_method=\"ortho_poly\",uniform=FALSE, lowy=-10,highy=20)\n", - "options(repr.plot.width=15, repr.plot.height=10)\n", - "print(p)" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-06-17T23:30:38.317509Z", - "iopub.execute_input": "2021-06-17T23:30:38.318889Z", - "iopub.status.idle": "2021-06-17T23:30:38.869822Z" - }, - "trusted": true, - "id": "UfwShWdUlwi_" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "plot findings:\n", - "\n", - "-- black solid line shows estimated function $p(x)' \\widehat{\\beta}$\n", - "\n", - "-- blue dashed lines show pointwise confidence bands for this function. I.e., for each fixed point $x_0$, i.e., $x_0=1$, they cover $p(x_0)'\\beta_0$ with probability 0.95\n", - "\n", - "-- blue solid lines show uniform confidence bands for this function. I.e., they cover the whole function $x \\rightarrow p(x)'\\beta_0$ with probability 0.95 on some compact range" - ], - "metadata": { - "id": "nH5ZbKG0lwi_" - } - }, - { - "cell_type": "code", - "source": [ - "p<-make_plot(res_ortho_rf_ortho_poly,ss_method=\"ortho polynomials\",uniform=TRUE,lowy=-10,highy=25)\n", - "options(repr.plot.width=15, repr.plot.height=10)\n", - "print(p)" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-06-17T23:31:40.758163Z", - "iopub.execute_input": "2021-06-17T23:31:40.759621Z", - "iopub.status.idle": "2021-06-17T23:31:41.125637Z" - }, - "trusted": true, - "id": "nDcX_Pkylwi_" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "p<-make_plot(res_ortho_rf_splines,ss_method=\"splines\",uniform=FALSE, lowy=-15,highy=10)\n", - "options(repr.plot.width=15, repr.plot.height=10)\n", - "print(p)" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-06-17T23:30:46.617249Z", - "iopub.execute_input": "2021-06-17T23:30:46.618764Z", - "iopub.status.idle": "2021-06-17T23:30:46.973963Z" - }, - "trusted": true, - "id": "RRrnhWU0lwi_" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "p<-make_plot(res_ortho_rf_splines,ss_method=\"splines\",uniform=TRUE,lowy=-20,highy=20)\n", - "options(repr.plot.width=15, repr.plot.height=10)\n", - "print(p)" - ], - "metadata": { - "execution": { - "iopub.status.busy": "2021-06-17T23:31:26.657033Z", - "iopub.execute_input": "2021-06-17T23:31:26.658474Z", - "iopub.status.idle": "2021-06-17T23:31:27.015569Z" - }, - "trusted": true, - "id": "ShL8EFdTlwi_" - }, - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file + { + "cell_type": "markdown", + "metadata": { + "id": "ibFWChhelwi4" + }, + "source": [ + "Background\n", + "\n", + "The target function is Conditional Average Treatment Effect, defined as\n", + "\n", + "$$ g(x)=E [ Y(1) - Y(0) |X=x], $$\n", + "\n", + "where $Y(1)$ and $Y(0)$ are potential outcomes in treated and control group. 
In our case, $Y(1)$ is the potential Net Financial Assets if a subject is eligible for 401(k), and $Y(0)$ is the potential Net Financial Assets if a subject is ineligible. $X$ is a covariate of interest, in this case, income.\n", + "$ g(x)$ shows expected effect of eligibility on NET TFA for a subject whose income level is $x$.\n", + "\n", + "\n", + "\n", + "If eligibility indicator is independent of $Y(1), Y(0)$, given pre-401-k assignment characteristics $Z$, the function can expressed in terms of observed data (as opposed to hypothetical, or potential outcomes). Observed data consists of realized NET TFA $Y = D Y(1) + (1-D) Y(0)$, eligibility indicator $D$, and covariates $Z$ which includes $X$, income. The expression for $g(x)$ is\n", + "\n", + "$$ g(x) = E [ Y (\\eta_0) \\mid X=x], $$\n", + "where the transformed outcome variable is\n", + "\n", + "$$Y (\\eta) = \\dfrac{D}{s(Z)} \\left( Y - \\mu(1,Z) \\right) - \\dfrac{1-D}{1-s(Z)} \\left( Y - \\mu(0,Z) \\right) + \\mu(1,Z) - \\mu(0,Z),$$\n", + "\n", + "the probability of eligibility is\n", + "\n", + "$$s_0(z) = Pr (D=1 \\mid Z=z),$$\n", + "\n", + "the expected net financial asset given $D =d \\in \\{1,0\\}$ and $Z=z$ is\n", + "\n", + "$$ \\mu(d,z) = E[ Y \\mid Z=z, D=d]. $$\n", + "\n", + "Our goal is to estimate $g(x)$.\n", + "\n", + "\n", + "In step 1, we estimate the unknown functions $s_0(z), \\mu(1,z), \\mu(0,z)$ and plug them into $Y (\\eta)$.\n", + "\n", + "\n", + "In step 2, we approximate the function $g(x)$ by a linear combination of basis functions:\n", + "\n", + "$$ g(x) = p(x)' \\beta_0, $$\n", + "\n", + "\n", + "where $p(x)$ is a vector of polynomials or splines and\n", + "\n", + "$$ \\beta_0 = (E p(X) p(X))^{-1} E p(X) Y (\\eta_0) $$\n", + "\n", + "is the best linear predictor. We report\n", + "\n", + "$$\n", + "\\widehat{g}(x) = p(x)' \\widehat{\\beta},\n", + "$$\n", + "\n", + "where $\\widehat{\\beta}$ is the ordinary least squares estimate of $\\beta_0$ defined on the random sample $(X_i, D_i, Y_i)_{i=1}^N$\n", + "\n", + "$$\n", + "\t\\widehat{\\beta} :=\\left( \\dfrac{1}{N} \\sum_{i=1}^N p(X_i) p(X_i)' \\right)^{-1} \\dfrac{1}{N} \\sum_{i=1}^N p(X_i)Y_i(\\widehat{\\eta})\n", + "$$\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "_execution_state": "idle", + "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", + "id": "Lsd0vLFOlwi4" + }, + "outputs": [], + "source": [ + "## load packages\n", + "install.packages(\"foreign\")\n", + "install.packages(\"quantreg\")\n", + "install.packages(\"splines\")\n", + "install.packages(\"lattice\")\n", + "#install.packages(\"mnormt\")\n", + "install.packages(\"Hmisc\")\n", + "install.packages(\"fda\")\n", + "install.packages(\"hdm\")\n", + "install.packages(\"randomForest\")\n", + "install.packages(\"ranger\")\n", + "install.packages(\"sandwich\")\n", + "\n", + "library(foreign)\n", + "library(quantreg)\n", + "library(splines)\n", + "library(lattice)\n", + "#library(mnormt)\n", + "library(Hmisc)\n", + "library(fda)\n", + "library(hdm)\n", + "library(randomForest)\n", + "library(ranger)\n", + "library(sandwich)\n", + "library(ggplot2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "P3fqmaJFlwi6" + }, + "outputs": [], + "source": [ + "## 401k dataset\n", + "data(pension)\n", + "pension$net_tfa<-pension$net_tfa/10000\n", + "## covariate of interest -- log income --\n", + "pension$inc = log(pension$inc)\n", + "#pension$inc[is.na(pension$inc)]<-0\n", + 
"pension<-pension[!is.na(pension$inc) & pension$inc!=-Inf & pension$inc !=Inf,]\n", + "\n", + "\n", + "## outcome variable -- total net financial assets\n", + "Y=pension$net_tfa\n", + "## binary treatment -- indicator of 401(k) eligibility\n", + "D=pension$e401\n", + "\n", + "\n", + "X=pension$inc\n", + "## target parameter is CATE = E[ Y(1) - Y(0) | X]\n", + "\n", + "\n", + "## raw covariates so that Y(1) and Y(0) are independent of D given Z\n", + "Z = pension[,c(\"age\",\"inc\",\"fsize\",\"educ\",\"male\",\"db\",\"marr\",\"twoearn\",\"pira\",\"hown\",\"hval\",\"hequity\",\"hmort\",\n", + " \"nohs\",\"hs\",\"smcol\")]\n", + "\n", + "\n", + "y_name <- \"net_tfa\";\n", + "d_name <- \"e401\";\n", + "form_z <- \"(poly(age, 6) + poly(inc, 8) + poly(educ, 4) + poly(fsize,2) + as.factor(marr) + as.factor(twoearn) + as.factor(db) + as.factor(pira) + as.factor(hown))^2\";\n", + "\n", + "\n", + "\n", + "cat(sprintf(\"\\n sample size is %g \\n\", length(Y) ))\n", + "cat(sprintf(\"\\n num raw covariates z is %g \\n\", dim(Z)[2] ))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BMNm-HsYlwi7" + }, + "source": [ + "In Step 1, we estimate three functions:\n", + "\n", + "1. probability of treatment assignment $s_0(z)$\n", + "\n", + "2.-3. regression functions $\\mu_0(1,z)$ and $\\mu_0(0,z)$. \n", + "\n", + "We use the cross-fitting procedure with $K=2$ holds. For definition of cross-fitting with $K$ folds, check the sample splitting in ```DML2.for.PLM``` function defined in https://www.kaggle.com/victorchernozhukov/debiased-ml-for-partially-linear-model-in-r\n", + "\n", + "For each function, we try random forest.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mug_Q4w2lwi7" + }, + "source": [ + "First Stage: estimate $\\mu_0(1,z)$ and $\\mu_0(0,z)$ and $s_0(z)$ by lasso" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "um5Uxua2lwi7" + }, + "outputs": [], + "source": [ + "\n", + "first_stage_lasso<-function(data,d_name,y_name, form_z, seed=1) {\n", + "\n", + " # Sample size\n", + " N<-dim(data)[1]\n", + " # Estimated regression function in control group\n", + " mu0.hat<-rep(1,N)\n", + " # Estimated regression function in treated group\n", + " mu1.hat<-rep(1,N)\n", + " # Propensity score\n", + " s.hat<-rep(1,N)\n", + " seed=1\n", + " ## define sample splitting\n", + " set.seed(seed)\n", + " inds.train=sample(1:N,floor(N/2))\n", + " inds.eval=setdiff(1:N,inds.train)\n", + "\n", + " print (\"Estimate treatment probability, first half\")\n", + " ## conditional probability of 401 k eligibility (i.e., propensity score) based on random forest\n", + " fitted.lasso.pscore<-rlassologit(as.formula(paste0(d_name,\"~\",form_z )),data=data[inds.train,])\n", + "\n", + " s.hat[inds.eval]<-predict(fitted.lasso.pscore,data[inds.eval,],type=\"response\")\n", + " print (\"Estimate treatment probability, second half\")\n", + " fitted.lasso.pscore<-rlassologit(as.formula(paste0(d_name,\"~\",form_z )),data=data[inds.eval,])\n", + " s.hat[inds.train]<-predict( fitted.lasso.pscore,data[inds.train,],type=\"response\")\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " data1<-data\n", + " data1[,d_name]<-1\n", + "\n", + " data0<-data\n", + " data0[,d_name]<-0\n", + "\n", + " print (\"Estimate expectation function, first half\")\n", + " fitted.lasso.mu<-rlasso(as.formula(paste0(y_name,\"~\",d_name,\"+(\",form_z,\")\" )),data=data[inds.train,])\n", + " mu1.hat[inds.eval]<-predict( fitted.lasso.mu,data1[inds.eval,])\n", + " mu0.hat[inds.eval]<-predict( 
fitted.lasso.mu,data0[inds.eval,])\n", + "\n", + " print (\"Estimate expectation function, second half\")\n", + " fitted.lasso.mu<-rlasso(as.formula(paste0(y_name,\"~\",d_name,\"+(\",form_z,\")\" )),data=data[inds.eval,])\n", + " mu1.hat[inds.train]<-predict( fitted.lasso.mu,data1[inds.train,])\n", + " mu0.hat[inds.train]<-predict( fitted.lasso.mu,data0[inds.train,])\n", + "\n", + " return (list(mu1.hat=mu1.hat,\n", + " mu0.hat=mu0.hat,\n", + " s.hat=s.hat))\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HCwXUQtUlwi8" + }, + "source": [ + "First Stage: estimate $\\mu_0(1,z)$ and $\\mu_0(0,z)$ and $s_0(z)$ by random forest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XXWJvNPTlwi8" + }, + "outputs": [], + "source": [ + "first_stage_rf<-function(Y,D,Z,seed=1) {\n", + "\n", + " # Sample size\n", + " N<-length(D)\n", + " # Estimated regression function in control group\n", + " mu0.hat<-rep(1,N)\n", + " # Estimated regression function in treated group\n", + " mu1.hat<-rep(1,N)\n", + " # Propensity score\n", + " s.hat<-rep(1,N)\n", + "\n", + "\n", + " ## define sample splitting\n", + " set.seed(seed)\n", + " inds.train=sample(1:N,floor(N/2))\n", + " inds.eval=setdiff(1:N,inds.train)\n", + "\n", + " print (\"Estimate treatment probability, first half\")\n", + " ## conditional probability of 401 k eligibility (i.e., propensity score) based on random forest\n", + " D.f<-as.factor(as.character(D))\n", + " fitted.rf.pscore<-randomForest(Z,D.f,subset=inds.train)\n", + " s.hat[inds.eval]<-predict(fitted.rf.pscore,Z[inds.eval,],type=\"prob\")[,2]\n", + " print (\"Estimate treatment probability, second half\")\n", + " fitted.rf<-randomForest(Z,D.f,subset=inds.eval)\n", + " s.hat[inds.train]<-predict(fitted.rf.pscore,Z[inds.train,],type=\"prob\")[,2]\n", + "\n", + " ## conditional expected net financial assets (i.e., regression function) based on random forest\n", + "\n", + " covariates<-cbind(Z,D)\n", + "\n", + " covariates1<-cbind(Z,D=rep(1,N))\n", + " covariates0<-cbind(Z,D=rep(0,N))\n", + "\n", + " print (\"Estimate expectation function, first half\")\n", + " fitted.rf.mu<-randomForest(cbind(Z,D),Y,subset=inds.train)\n", + " mu1.hat[inds.eval]<-predict( fitted.rf.mu,covariates1[inds.eval,])\n", + " mu0.hat[inds.eval]<-predict( fitted.rf.mu,covariates0[inds.eval,])\n", + "\n", + " print (\"Estimate expectation function, second half\")\n", + " fitted.rf.mu<-randomForest(cbind(Z,D),Y,subset=inds.eval)\n", + " mu1.hat[inds.train]<-predict( fitted.rf.mu,covariates1[inds.train,])\n", + " mu0.hat[inds.train]<-predict( fitted.rf.mu,covariates0[inds.train,])\n", + "\n", + " return (list(mu1.hat=mu1.hat,\n", + " mu0.hat=mu0.hat,\n", + " s.hat=s.hat))\n", + "\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dlNaseXIlwi8" + }, + "source": [ + "In Step 2, we approximate $Y(\\eta_0)$ by a vector of basis functions. There are two use cases:\n", + "****\n", + "2.A. Group Average Treatment Effect, described above\n", + "\n", + "\n", + "2.B. Average Treatment Effect conditional on income value. There are three smoothing options:\n", + "\n", + "1. splines offered in ```least_squares_splines```\n", + "\n", + "2. orthogonal polynomials with the highest degree chosen by cross-validation ```least_squares_series```\n", + "\n", + "3. standard polynomials with the highest degree input by user ```least_squares_series_old```\n", + "\n", + "\n", + "The default option is option 3." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lg4CWsRqlwi8" + }, + "source": [ + "2.A. The simplest use case of Conditional Average Treatment Effect is GATE, or Group Average Treatment Effect. Partition the support of income as\n", + "\n", + "$$ - \\infty = \\ell_0 < \\ell_1 < \\ell_2 \\dots \\ell_K = \\infty $$\n", + "\n", + "define intervals $I_k = [ \\ell_{k-1}, \\ell_{k})$. Let $X$ be income covariate. For $X$, define a group indicator\n", + "\n", + "$$ G_k(X) = 1[X \\in I_k], $$\n", + "\n", + "and the vector of basis functions\n", + "\n", + "$$ p(X) = (G_1(X), G_2(X), \\dots, G_K(X)) $$\n", + "\n", + "Then, the Best Linear Predictor $\\beta_0$ vector shows the average treatment effect for each group." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gDrRceialwi9" + }, + "outputs": [], + "source": [ + "## estimate first stage functions by random forest\n", + "## may take a while\n", + "fs.hat.rf = first_stage_rf(Y,D,Z)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Y76SLTKPlwi9" + }, + "outputs": [], + "source": [ + "X=pension$inc\n", + "fs.hat<-fs.hat.rf\n", + "min_cutoff=0.01\n", + "# regression function\n", + "mu1.hat<-fs.hat[[\"mu1.hat\"]]\n", + "mu0.hat<-fs.hat[[\"mu0.hat\"]]\n", + "# propensity score\n", + "s.hat<-fs.hat[[\"s.hat\"]]\n", + "s.hat<-sapply(s.hat,max,min_cutoff)\n", + "### Construct Orthogonal Signal\n", + "\n", + "\n", + "RobustSignal<-(Y - mu1.hat)*D/s.hat - (Y - mu0.hat)*(1-D)/(1-s.hat) + mu1.hat - mu0.hat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nODBERqdlwi9" + }, + "outputs": [], + "source": [ + "qtmax <- function(C, S=10000, alpha)\n", + " {;\n", + " p <- nrow(C);\n", + " tmaxs <- apply(abs(matrix(rnorm(p*S), nrow = p, ncol = S)), 2, max);\n", + " return(quantile(tmaxs, 1-alpha));\n", + " };\n", + "\n", + "# This function computes the square root of a symmetric matrix using the spectral decomposition;\n", + "\n", + "\n", + "group_average_treatment_effect<-function(X,Y,max_grid=5,alpha=0.05, B=10000) {\n", + "\n", + " grid<-quantile(X,probs=c((0:max_grid)/max_grid))\n", + " X.raw<-matrix(NA, nrow=length(Y),ncol=length(grid)-1)\n", + "\n", + " for (k in 2:((length(grid)))) {\n", + " X.raw[,k-1]<-sapply(X, function (x) ifelse (x>=grid[k-1] & x=grid[k-1] & x<=grid[k],1,0) )\n", + "\n", + " ols.fit<- lm(Y~X.raw-1)\n", + " coefs <- coef(ols.fit)\n", + " vars <- names(coefs)\n", + " HCV.coefs <- vcovHC(ols.fit, type = 'HC')\n", + " coefs.se <- sqrt(diag(HCV.coefs)) # White std errors\n", + " ## this is an identity matrix\n", + " ## qtmax is simplified\n", + " C.coefs <- (diag(1/sqrt(diag(HCV.coefs)))) %*% HCV.coefs %*% (diag(1/sqrt(diag(HCV.coefs))));\n", + "\n", + "\n", + " tes <- coefs\n", + " tes.se <- coefs.se\n", + " tes.cor <- C.coefs\n", + " crit.val <- qtmax(tes.cor,B,alpha);\n", + "\n", + " tes.ucb <- tes + crit.val * tes.se;\n", + " tes.lcb <- tes - crit.val * tes.se;\n", + "\n", + " tes.uci <- tes + qnorm(1-alpha/2) * tes.se;\n", + " tes.lci <- tes + qnorm(alpha/2) * tes.se;\n", + "\n", + "\n", + " return(list(beta.hat=coefs, ghat.lower.point=tes.lci, ghat.upper.point=tes.uci,\n", + " ghat.lower=tes.lcb, ghat.upper= tes.ucb, crit.val=crit.val ))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "anNpVKgnlwi9" + }, + "outputs": [], + "source": [ + "res<-group_average_treatment_effect(X=X,Y=RobustSignal)" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "metadata": { + "id": "sdpo7pwMlwi9" + }, + "outputs": [], + "source": [ + "## this code is taken from L1 14.382 taught at MIT\n", + "## author: Mert Demirer\n", + "options(repr.plot.width=10, repr.plot.height=8)\n", + "\n", + "tes<-res$beta.hat\n", + "tes.lci<-res$ghat.lower.point\n", + "tes.uci<-res$ghat.upper.point\n", + "\n", + "tes.lcb<-res$ghat.lower\n", + "tes.ucb<-res$ghat.upper\n", + "tes.lev<-c('0%-20%', '20%-40%','40%-60%','60%-80%','80%-100%')\n", + "\n", + "plot( c(1,5), las = 2, xlim =c(0.6, 5.4), ylim = c(.05, 2.09), type=\"n\",xlab=\"Income group\",\n", + " ylab=\"Average Effect on NET TFA (per 10 K)\", main=\"Group Average Treatment Effects on NET TFA\", xaxt=\"n\");\n", + "axis(1, at=1:5, labels=tes.lev);\n", + "for (i in 1:5)\n", + "{;\n", + " rect(i-0.2, tes.lci[i], i+0.2, tes.uci[i], col = NA, border = \"red\", lwd = 3);\n", + " rect(i-0.2, tes.lcb[i], i+0.2, tes.ucb[i], col = NA, border = 4, lwd = 3 );\n", + " segments(i-0.2, tes[i], i+0.2, tes[i], lwd = 5 );\n", + "};\n", + "abline(h=0);\n", + "\n", + "legend(2.5, 2.0, c('Regression Estimate', '95% Simultaneous Confidence Interval', '95% Pointwise Confidence Interval'), col = c(1,4,2), lwd = c(4,3,3), horiz = F, bty = 'n', cex=0.8);\n", + "\n", + "dev.off()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_BXi61OQlwi9" + }, + "outputs": [], + "source": [ + "least_squares_splines<-function(X,Y,max_knot=9,norder,nderiv,...) {\n", + " ## Create technical regressors\n", + " cv.bsp<-rep(0,max_knot-1)\n", + " for (knot in 2:max_knot) {\n", + " breaks<- quantile(X, c(0:knot)/knot)\n", + " formula.bsp \t<- Y ~ bsplineS(X, breaks =breaks, norder = norder, nderiv = nderiv)[ ,-1]\n", + " fit\t<- lm(formula.bsp);\n", + " cv.bsp[knot-1]\t\t<- sum( (fit$res / (1 - hatvalues(fit)) )^2);\n", + " }\n", + " ## Number of knots chosen by cross-validation\n", + " cv_knot<-which.min(cv.bsp)+1\n", + " breaks<- quantile(X, c(0:cv_knot)/cv_knot)\n", + " formula.bsp \t<- Y ~ bsplineS(X, breaks =breaks, norder = norder, nderiv = 0)[ ,-1]\n", + " fit\t<- lm(formula.bsp);\n", + "\n", + "\n", + " return(list(cv_knot=cv_knot,fit=fit))\n", + "}\n", + "\n", + "\n", + "least_squares_series<-function(X, Y,max_degree,...) 
{\n", + "\n", + " cv.pol<-rep(0,max_degree)\n", + " for (degree in 1:max_degree) {\n", + " formula.pol \t<- Y ~ poly(X, degree)\n", + " fit\t<- lm(formula.pol );\n", + " cv.pol[degree]\t\t<- sum( (fit$res / (1 - hatvalues(fit)) )^2);\n", + " }\n", + " ## Number of knots chosen by cross-validation\n", + " cv_degree<-which.min(cv.pol)\n", + " ## Estimate coefficients\n", + " formula.pol \t<- Y ~ poly(X, cv_degree)\n", + " fit\t<- lm(formula.pol);\n", + "\n", + "\n", + " return(list(fit=fit,cv_degree=cv_degree))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uvxphkMUlwi-" + }, + "outputs": [], + "source": [ + "msqrt <- function(C)\n", + " {;\n", + " C.eig <- eigen(C);\n", + " return(C.eig$vectors %*% diag(sqrt(C.eig$values)) %*% solve(C.eig$vectors));\n", + " };\n", + "\n", + "\n", + "tboot<-function(regressors_grid, Omega.hat ,alpha, B=10000) {\n", + "\n", + "\n", + " numerator_grid<-regressors_grid%*%msqrt( Omega.hat)\n", + " denominator_grid<-sqrt(diag(regressors_grid%*% Omega.hat%*%t(regressors_grid)))\n", + "\n", + " norm_numerator_grid<-numerator_grid\n", + " for (k in 1:dim(numerator_grid)[1]) {\n", + " norm_numerator_grid[k,]<-numerator_grid[k,]/denominator_grid[k]\n", + " }\n", + "\n", + " tmaxs <- apply(abs( norm_numerator_grid%*% matrix(rnorm(dim(numerator_grid)[2]*B), nrow = dim(numerator_grid)[2], ncol = B)), 2, max)\n", + " return(quantile(tmaxs, 1-alpha))\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nupH8ItWlwi-" + }, + "outputs": [], + "source": [ + "second_stage<-function(fs.hat,Y,D,X,max_degree=3,norder=4,nderiv=0,ss_method=\"poly\",min_cutoff=0.01,alpha=0.05,eps=0.1,...) {\n", + "\n", + " X_grid = seq(min(X),max(X),eps)\n", + " mu1.hat<-fs.hat[[\"mu1.hat\"]]\n", + " mu0.hat<-fs.hat[[\"mu0.hat\"]]\n", + " # propensity score\n", + " s.hat<-fs.hat[[\"s.hat\"]]\n", + " s.hat<-sapply(s.hat,max,min_cutoff)\n", + " ### Construct Orthogonal Signal\n", + "\n", + " RobustSignal<-(Y - mu1.hat)*D/s.hat - (Y - mu0.hat)*(1-D)/(1-s.hat) + mu1.hat - mu0.hat\n", + "\n", + "\n", + "\n", + " # Estimate the target function using least squares series\n", + " if (ss_method == \"ortho_poly\") {\n", + " res<-least_squares_series(X=X,Y=RobustSignal,eps=0.1,max_degree=max_degree)\n", + " fit<-res$fit\n", + " cv_degree<-res$cv_degree\n", + " regressors_grid<-cbind( rep(1,length(X_grid)), poly(X_grid,cv_degree))\n", + "\n", + " }\n", + " if (ss_method == \"splines\") {\n", + "\n", + " res<-least_squares_splines(X=X,Y=RobustSignal,eps=0.1,norder=norder,nderiv=nderiv)\n", + " fit<-res$fit\n", + " cv_knot<-res$cv_knot\n", + " breaks<- quantile(X, c(0:cv_knot)/cv_knot)\n", + " regressors_grid<-cbind( rep(1,length(X_grid)), bsplineS(X_grid, breaks =breaks, norder = norder, nderiv = nderiv)[ ,-1])\n", + " degree=cv_knot\n", + "\n", + "\n", + " }\n", + "\n", + "\n", + " g.hat<-regressors_grid%*%coef(fit)\n", + "\n", + "\n", + " HCV.coefs <- vcovHC(fit, type = 'HC')\n", + " #Omega.hat<-white_vcov(regressors,Y,b.hat=coef(fit))\n", + " standard_error<-sqrt(diag(regressors_grid%*% HCV.coefs%*%t(regressors_grid)))\n", + " ### Lower Pointwise CI\n", + " ghat.lower.point<-g.hat+qnorm(alpha/2)*standard_error\n", + " ### Upper Pointwise CI\n", + " ghat.upper.point<-g.hat+qnorm(1-alpha/2)*standard_error\n", + "\n", + " max_tstat<-tboot(regressors_grid=regressors_grid, Omega.hat=HCV.coefs,alpha=alpha)\n", + "\n", + "\n", + " ## Lower Uniform CI\n", + " ghat.lower<-g.hat-max_tstat*standard_error\n", + " ## 
Upper Uniform CI\n", + " ghat.upper<-g.hat+max_tstat*standard_error\n", + " return(list(ghat.lower=ghat.lower,g.hat=g.hat, ghat.upper=ghat.upper,fit=fit,ghat.lower.point=ghat.lower.point,\n", + " ghat.upper.point=ghat.upper.point,X_grid=X_grid))\n", + "\n", + "\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NB10aQcPlwi-" + }, + "outputs": [], + "source": [ + "make_plot<-function(res,lowy,highy,degree,ss_method = \"series\",uniform=TRUE,...) {\n", + "\n", + "\n", + " title=paste0(\"Effect of 401(k) on Net TFA, \", ss_method)\n", + " X_grid=res$X_grid\n", + " len = length(X_grid)\n", + "\n", + "\n", + " if (uniform) {\n", + " group <-c(rep(\"UCI\",len), rep(\"PCI\",len), rep(\"Estimate\",len),rep(\"PCIL\",len),rep(\"UCIL\",len))\n", + " group_type<- c(rep(\"CI\",len), rep(\"CI\",len), rep(\"Estimate\",len),rep(\"CI\",len),rep(\"CI\",len))\n", + " group_ci_type<-c(rep(\"Uniform\",len), rep(\"Point\",len), rep(\"Uniform\",len),rep(\"Point\",len),rep(\"Uniform\",len))\n", + "\n", + " df<-data.frame(income=rep(X_grid,5), outcome = c(res$ghat.lower,res$ghat.lower.point,res$g.hat,res$ghat.upper.point,res$ghat.upper),group=group, group_col = group_type,group_line =group_ci_type )\n", + " p<-ggplot(data=df)+\n", + " aes(x=exp(income),y=outcome,colour=group )+\n", + " theme_bw()+\n", + " xlab(\"Income\")+\n", + " ylab(\"Net TFA, (thousand dollars)\")+\n", + " scale_colour_manual(values=c(\"black\",\"blue\",\"blue\",\"blue\",\"blue\"))+\n", + " theme(plot.title = element_text(hjust = 0.5),text=element_text(size=20, family=\"serif\"))+\n", + " theme(legend.title=element_blank())+\n", + " theme(legend.position=\"none\")+\n", + " ylim(low=lowy,high=highy)+\n", + " geom_line(aes(linetype = group_line),size=1.5)+\n", + " scale_linetype_manual(values=c(\"dashed\",\"solid\"))+\n", + " ggtitle(title)\n", + " }\n", + "\n", + " if (!uniform) {\n", + " group <-c( rep(\"PCI\",len), rep(\"Estimate\",len),rep(\"PCIL\",len))\n", + " group_type<- c(rep(\"CI\",len), rep(\"Estimate\",len),rep(\"CI\",len))\n", + " group_ci_type<-c(rep(\"Point\",len), rep(\"Uniform\",len),rep(\"Point\",len))\n", + "\n", + " df<-data.frame(income=rep(X_grid,3), outcome = c(res$ghat.lower.point,res$g.hat,res$ghat.upper.point),group=group, group_col = group_type,group_line =group_ci_type )\n", + "\n", + " p<-ggplot(data=df)+\n", + " aes(x=exp(income),y=outcome,colour=group )+\n", + " theme_bw()+\n", + " xlab(\"Income\")+\n", + " ylab(\"Net TFA, (thousand dollars)\")+\n", + " scale_colour_manual(values=c(\"black\",\"blue\",\"blue\",\"blue\",\"blue\"))+\n", + " theme(plot.title = element_text(hjust = 0.5),text=element_text(size=20, family=\"serif\"))+\n", + " theme(legend.title=element_blank())+\n", + " theme(legend.position=\"none\")+\n", + " ylim(low=lowy,high=highy)+\n", + " geom_line(aes(linetype = group_line),size=1.5)+\n", + " scale_linetype_manual(values=c(\"dashed\",\"solid\"))+\n", + " ggtitle(title)\n", + "\n", + " }\n", + "\n", + "\n", + "\n", + " return(p)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "A_cHQnAblwi-" + }, + "outputs": [], + "source": [ + "res_ortho_rf_splines=second_stage(fs.hat=fs.hat.rf,X=X,D=D,Y=Y,ss_method=\"splines\",max_degree=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "AGOMgvgglwi-" + }, + "outputs": [], + "source": [ + "res_ortho_rf_ortho_poly=second_stage(fs.hat=fs.hat.rf,X=X,D=D,Y=Y,ss_method=\"ortho_poly\",max_degree=3)" + ] + }, + { + "cell_type": 
"markdown", + "metadata": { + "id": "jXpt7WdKlwi-" + }, + "source": [ + "#plot findings:\n", + "\n", + "-- black solid line shows estimated function $p(x)' \\widehat{\\beta}$\n", + "\n", + "-- blue dashed lines show pointwise confidence bands for this function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UfwShWdUlwi_" + }, + "outputs": [], + "source": [ + "p<-make_plot(res_ortho_rf_ortho_poly,ss_method=\"ortho_poly\",uniform=FALSE, lowy=-10,highy=20)\n", + "options(repr.plot.width=15, repr.plot.height=10)\n", + "print(p)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nH5ZbKG0lwi_" + }, + "source": [ + "plot findings:\n", + "\n", + "-- black solid line shows estimated function $p(x)' \\widehat{\\beta}$\n", + "\n", + "-- blue dashed lines show pointwise confidence bands for this function. I.e., for each fixed point $x_0$, i.e., $x_0=1$, they cover $p(x_0)'\\beta_0$ with probability 0.95\n", + "\n", + "-- blue solid lines show uniform confidence bands for this function. I.e., they cover the whole function $x \\rightarrow p(x)'\\beta_0$ with probability 0.95 on some compact range" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nDcX_Pkylwi_" + }, + "outputs": [], + "source": [ + "p<-make_plot(res_ortho_rf_ortho_poly,ss_method=\"ortho polynomials\",uniform=TRUE,lowy=-10,highy=25)\n", + "options(repr.plot.width=15, repr.plot.height=10)\n", + "print(p)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RRrnhWU0lwi_" + }, + "outputs": [], + "source": [ + "p<-make_plot(res_ortho_rf_splines,ss_method=\"splines\",uniform=FALSE, lowy=-15,highy=10)\n", + "options(repr.plot.width=15, repr.plot.height=10)\n", + "print(p)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ShL8EFdTlwi_" + }, + "outputs": [], + "source": [ + "p<-make_plot(res_ortho_rf_splines,ss_method=\"splines\",uniform=TRUE,lowy=-20,highy=20)\n", + "options(repr.plot.width=15, repr.plot.height=10)\n", + "print(p)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 976d57f5d8499193fb5546f920563c81a42954a4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 15 Jul 2024 15:47:28 +0000 Subject: [PATCH 078/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in PM4 --- PM4/r-dml-401k.Rmd | 946 ++++ PM4/r-dml-401k.irnb | 4300 ++++++++--------- ...ation-analysis-of-401-k-example-w-dags.Rmd | 195 + ...tion-analysis-of-401-k-example-w-dags.irnb | 1224 +++-- ...d_ml_for_partially_linear_model_growth.Rmd | 185 + ..._ml_for_partially_linear_model_growth.irnb | 715 ++- PM4/r_dml_inference_for_gun_ownership.Rmd | 533 ++ PM4/r_dml_inference_for_gun_ownership.irnb | 1592 +++--- 8 files changed, 5660 insertions(+), 4030 deletions(-) create mode 100644 PM4/r-dml-401k.Rmd create mode 100644 PM4/r-identification-analysis-of-401-k-example-w-dags.Rmd create mode 100644 PM4/r_debiased_ml_for_partially_linear_model_growth.Rmd create mode 100644 PM4/r_dml_inference_for_gun_ownership.Rmd diff --git a/PM4/r-dml-401k.Rmd b/PM4/r-dml-401k.Rmd new file mode 100644 index 00000000..6c30cead --- /dev/null +++ b/PM4/r-dml-401k.Rmd @@ -0,0 
+1,946 @@ +--- +title: An R Markdown document converted from "PM4/r-dml-401k.irnb" +output: html_document +--- + +# Inference on Predictive and Causal Effects in High-Dimensional Nonlinear Models + +## Impact of 401(k) on Financial Wealth + +As a practical illustration of the methods developed in this lecture, we consider estimation of the effect of 401(k) eligibility and participation +on accumulated assets. 401(k) plans are pension accounts sponsored by employers. The key problem in determining the effect of participation in 401(k) plans on accumulated assets is saver heterogeneity coupled with the fact that the decision to enroll in a 401(k) is non-random. It is generally recognized that some people have a higher preference for saving than others. It also seems likely that those individuals with high unobserved preference for saving would be most likely to choose to participate in tax-advantaged retirement savings plans and would tend to have otherwise high amounts of accumulated assets. The presence of unobserved savings preferences with these properties then implies that conventional estimates that do not account for saver heterogeneity and endogeneity of participation will be biased upward, tending to overstate the savings effects of 401(k) participation. + +One can argue that eligibility for enrolling in a 401(k) plan in this data can be taken as exogenous after conditioning on a few observables of which the most important for their argument is income. The basic idea is that, at least around the time 401(k)’s initially became available, people were unlikely to be basing their employment decisions on whether an employer offered a 401(k) but would instead focus on income and other aspects of the job. + +```{r} +install.packages("xtable") +install.packages("hdm") +install.packages("sandwich") +install.packages("ggplot2") +install.packages("randomForest") +install.packages("glmnet") +install.packages("rpart") +install.packages("gbm") + +library(xtable) +library(hdm) +library(sandwich) +library(ggplot2) +library(randomForest) +library(data.table) +library(glmnet) +library(rpart) +library(gbm) +``` + +### Data + +The raw dataset can be found [here](https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/401k.csv). +The data set can be loaded from the `hdm` package for R directly by typing: + + +```{r} +data(pension) +data <- pension +dim(data) +``` + +See the "Details" section on the description of the data set, which can be accessed by + +```{r} +help(pension) +``` + +The data consist of 9,915 observations at the household level drawn from the 1991 Survey of Income and Program Participation (SIPP). All the variables are referred to 1990. We use net financial assets (*net\_tfa*) as the outcome variable, $Y$, in our analysis. The net financial assets are computed as the sum of IRA balances, 401(k) balances, checking accounts, saving bonds, other interest-earning accounts, other interest-earning assets, stocks, and mutual funds less non mortgage debts. + +Among the $9915$ individuals, $3682$ are eligible to participate in the program. The variable *e401* indicates eligibility and *p401* indicates participation, respectively. 
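+A quick tabulation reproduces these counts (a minimal sketch; it assumes the `data` frame loaded from `hdm` above):
+
+```{r}
+# eligibility and participation counts; participation is only possible among the eligible
+table(data$e401)
+table(data$e401, data$p401)
+```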
+ +```{r} +hist_e401 = ggplot(data, aes(x = e401, fill = factor(e401))) + geom_bar() +hist_e401 +``` + +Eligibility is highly associated with financial wealth: + +```{r} +dens_net_tfa = ggplot(data, aes(x = net_tfa, color = factor(e401), fill = factor(e401)) ) + + geom_density() + xlim(c(-20000, 150000)) + + facet_wrap(.~e401) + +dens_net_tfa +``` + +The unconditional APE of e401 is about $19559$: + +```{r} +e1 <- data[data$e401==1,] +e0 <- data[data$e401==0,] +round(mean(e1$net_tfa)-mean(e0$net_tfa),0) +``` + +Among the $3682$ individuals that are eligible, $2594$ decided to participate in the program. The unconditional APE of p401 is about $27372$: + +```{r} +p1 <- data[data$p401==1,] +p0 <- data[data$p401==0,] +round(mean(p1$net_tfa)-mean(p0$net_tfa),0) +``` + +As discussed, these estimates are biased since they do not account for saver heterogeneity and endogeneity of participation. + +```{r} +# outcome variable +y <- data[,'net_tfa'] +# treatment variable +D <- data[,'e401'] +D2 <- data[,"p401"] +D3 <- data[,"a401"] + +columns_to_drop <- c('e401', 'p401', 'a401', 'tw', 'tfa', 'net_tfa', 'tfa_he', + 'hval', 'hmort', 'hequity', + 'nifa', 'net_nifa', 'net_n401', 'ira', + 'dum91', 'icat', 'ecat', 'zhat', + 'i1', 'i2', 'i3', 'i4', 'i5', 'i6', 'i7', + 'a1', 'a2', 'a3', 'a4', 'a5') + +# covariates +X <- data[, !(names(data) %in% columns_to_drop)] +``` + +```{r} +# Constructing the controls +X_formula = "~ 0 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown" +X = as.data.table(model.frame(X_formula, pension)) +head(X) +``` + +## Estimating the ATE of 401(k) Eligibility on Net Financial Assets + +We are interested in valid estimators of the average treatment effect of `e401` and `p401` on `net_tfa`. We start using ML approaches to estimate the function $g_0$ and $m_0$ in the following PLR model: + +\begin{eqnarray} + & Y = D\theta_0 + g_0(X) + \zeta, & E[\zeta \mid D,X]= 0,\\ + & D = m_0(X) + V, & E[V \mid X] = 0. 
+\end{eqnarray} + +## Partially Linear Regression Models (PLR) + +```{r} +DML2.for.PLM <- function(x, d, y, dreg, yreg, nfold=3, method = "regression") { + nobs <- nrow(x) #number of observations + foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices + I <- split(1:nobs, foldid) #split observation indices into folds + ytil <- dtil <- rep(NA, nobs) + cat("fold: ") + for(b in 1:length(I)){ + + if (method == "regression") { + dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out + yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a foldt out + dhat <- predict(dfit, x[I[[b]],], type="response") #predict the left-out fold + yhat <- predict(yfit, x[I[[b]],], type="response") #predict the left-out fold + dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold + ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold + } else if (method == "randomforest") { + dfit <- dreg(x[-I[[b]],], as.factor(d)[-I[[b]]]) #take a fold out + yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a fold out + dhat <- predict(dfit, x[I[[b]],], type="prob")[,2] #predict the left-out fold + yhat <- predict(yfit, x[I[[b]],], type="response") #predict the left-out fold + dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold + ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold + } else if (method == "decisiontrees") { + dfit <- dreg(x[-I[[b]],], as.factor(d)[-I[[b]]]) #take a fold out + yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a fold out + dhat <- predict(dfit, x[I[[b]],])[,2] #predict the left-out fold + yhat <- predict(yfit, x[I[[b]],]) #predict the left-out fold + dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold + ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold + } else if (method == "boostedtrees") { + dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out + yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a fold out + dhat <- predict(dfit, x[I[[b]],], type="response") #predict the left-out fold + yhat <- predict(yfit, x[I[[b]],], type="response") #predict the left-out fold + dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold + ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold + } + cat(b," ") + + } + rfit <- lm(ytil ~ dtil) #estimate the main parameter by regressing one residual on the other + coef.est <- coef(rfit)[2] #extract coefficient + se <- sqrt(vcovHC(rfit)[2,2]) #record robust standard error + cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) #printing output + return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil) ) #save output and residuals +} +``` + +```{r} +summaryPLR <- function(point, stderr, resD, resy, name) { + data <- data.frame( + estimate = point, # point estimate + stderr = stderr, # standard error + lower = point - 1.96 * stderr, # lower end of 95% confidence interval + upper = point + 1.96 * stderr, # upper end of 95% confidence interval + `rmse y` = sqrt(mean(resy^2)), # RMSE of model that predicts outcome y + `rmse D` = sqrt(mean(resD^2)), # RMSE of model that predicts treatment D + `accuracy D` = mean(abs(resD) < 0.5)# binary classification accuracy of model for D + ) + rownames(data) <- name + return(data) +} +``` + +#### Double Lasso with Cross-Fitting + +```{r} +# DML with LassoCV +set.seed(123) +cat(sprintf("\nDML with Lasso CV \n")) + +dreg.lasso.cv <- function(x,d){ cv.glmnet(x, d, family="gaussian", alpha=1, nfolds=5)} +yreg.lasso.cv <- function(x,y){ cv.glmnet(x, y, 
family="gaussian", alpha=1, nfolds=5)} + +DML2.results <- DML2.for.PLM(as.matrix(X), D, y, dreg.lasso.cv, yreg.lasso.cv, nfold=5) + +sum.lasso.cv <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'LassoCV') +tableplr <- data.frame() +tableplr <- rbind(sum.lasso.cv) +tableplr +``` + +```{r} +# Because residuals are output, reconstruct fitted values for use in ensemble +dhat.lasso <- D - DML2.results$dtil +yhat.lasso <- y - DML2.results$ytil +``` + +#### Using a $\ell_2$ Penalized Logistic Regression for D + +Note we are using the $\ell_2$ penalty here. You can use the $\ell_1$ penalty as well, but computation will take longer. + +```{r} +# DML with Lasso/Logistic +set.seed(123) +cat(sprintf("\nDML with Lasso/Logistic \n")) + +dreg.logistic.cv <- function(x,d){cv.glmnet(x, d, family="binomial", alpha=0, nfolds=5)} +yreg.lasso.cv <- function(x,y){cv.glmnet(x, y, family="gaussian", alpha=1, nfolds=5)} + +DML2.results <- DML2.for.PLM(as.matrix(X), D, y, dreg.logistic.cv, yreg.lasso.cv, nfold=5) +sum.lasso_logistic.cv <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'LassoCV/LogisticCV') +tableplr <- rbind(tableplr, sum.lasso_logistic.cv) +tableplr +``` + +```{r} +# Because residuals are output, reconstruct fitted values for use in ensemble +dhat.lasso_logistic <- D - DML2.results$dtil +yhat.lasso_logistic <- y - DML2.results$ytil +``` + +#### Random Forests + +```{r} +# DML with Random Forest +set.seed(123) +cat(sprintf("\nDML with Random Forest \n")) + +dreg.rf <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest +yreg.rf <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest + +DML2.results <- DML2.for.PLM(as.matrix(X), D, y, dreg.rf, yreg.rf, nfold=5, method = "randomforest") +sum.rf <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Random Forest') +tableplr <- rbind(tableplr, sum.rf) +tableplr +``` + +```{r} +# Because residuals are output, reconstruct fitted values for use in ensemble +dhat.rf <- D - DML2.results$dtil +yhat.rf <- y - DML2.results$ytil +``` + +#### Decision Trees + +```{r} +# DML with Decision Trees +set.seed(123) +cat(sprintf("\nDML with Decision Trees \n")) + +dreg.tr <- function(x,d){rpart(as.formula("D~."), cbind(data.frame(D=d),x), method = "class", minbucket=10, cp = 0.001)} +yreg.tr <- function(x,y){rpart(as.formula("y~."), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)} + +DML2.results <- DML2.for.PLM(X, D, y, dreg.tr, yreg.tr, nfold=5, method = "decisiontrees") # decision tree takes in X as dataframe, not matrix/array +sum.tr <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Decision Trees') +tableplr <- rbind(tableplr, sum.tr) +tableplr +``` + +```{r} +# Because residuals are output, reconstruct fitted values for use in ensemble +dhat.tr <- D - DML2.results$dtil +yhat.tr <- y - DML2.results$ytil +``` + + +Ideally, we would do (semi) cross-fitting with AutoML in order to find good first-stage models and re-run DML with these models. Unfortunately this is not easy to do in R. In the case of semi cross-fitting, we can use R's H20 AutoML trained on the entire training set $y\sim X$, $D \sim X$, $Z\sim X$ to determine the best model (eg ensemble), but H20 does not allow you to extract said model so we can re-use that in DML. 
+ +#### Boosted Trees + +```{r} +# DML with Boosted Trees +set.seed(123) +cat(sprintf("\nDML with Boosted Trees \n")) + +# NB: early stopping cannot easily be implemented with gbm +## set n.trees = best, where best <- gbm.perf(dreg.boost, plot.it = FALSE) +dreg.boost <- function(x,d){gbm(as.formula("D~."), cbind(data.frame(D=d),x), distribution= "bernoulli", interaction.depth=2, n.trees=100, shrinkage=.1)} +yreg.boost <- function(x,y){gbm(as.formula("y~."), cbind(data.frame(y=y),x), distribution= "gaussian", interaction.depth=2, n.trees=100, shrinkage=.1)} + +# passing these through regression as type="response", and D should not be factor! +DML2.results = DML2.for.PLM(X, D, y, dreg.boost, yreg.boost, nfold=5, method = "boostedtrees") +sum.boost <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Boosted Trees') +tableplr <- rbind(tableplr, sum.boost) +tableplr +``` + +```{r} +# Because residuals are output, reconstruct fitted values for use in ensemble +dhat.boost <- D - DML2.results$dtil +yhat.boost <- y - DML2.results$ytil +``` + +## Ensembles + +Boosted trees give the best RMSE for both Y and D, so the ensemble based on choosing the best performing prediction rule is identical to boosting in this case. + +```{r} +# Best fit is boosted trees for both D and Y + +sum.best <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Best') +tableplr <- rbind(tableplr, sum.best) +tableplr +``` + +We'll form a model average with unconstrained least squares weights. + +```{r} +# Least squares model average + +ma.dtil <- lm(D~dhat.lasso+dhat.lasso_logistic+dhat.rf+dhat.tr+dhat.boost)$residuals +ma.ytil <- lm(y~yhat.lasso+yhat.lasso_logistic+yhat.rf+yhat.tr+yhat.boost)$residuals + +rfit <- lm(ma.ytil ~ ma.dtil) #estimate the main parameter by regressing one residual on the other +coef.est <- coef(rfit)[2] #extract coefficient +se <- sqrt(vcovHC(rfit)[2,2]) #record robust standard error + +sum.ma <- summaryPLR(coef.est, se, ma.dtil, ma.ytil, name = 'Model Average') +tableplr <- rbind(tableplr, sum.ma) +tableplr +``` + +## Interactive Regression Model (IRM) + +Next, we consider estimation of average treatment effects when treatment effects are fully heterogeneous: + + \begin{eqnarray}\label{eq: HetPL1} + & Y = g_0(D, X) + U, & \quad E[U \mid X, D]= 0,\\ + & D = m_0(X) + V, & \quad E[V\mid X] = 0. +\end{eqnarray} + +To reduce the disproportionate impact of extreme propensity score weights in the interactive model +we trim the propensity scores which are close to the bounds. 
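+Concretely, the function below forms the doubly robust (AIPW) score and averages it over the sample; writing $\hat g(d,X)$ for the two outcome fits and $\hat m(X)$ for the trimmed propensity score, the estimator it returns is
+
+$$
+\hat\theta = \frac{1}{N}\sum_{i=1}^N \left[ \hat g(1,X_i) - \hat g(0,X_i) + \frac{D_i\,(Y_i - \hat g(1,X_i))}{\hat m(X_i)} - \frac{(1-D_i)\,(Y_i - \hat g(0,X_i))}{1-\hat m(X_i)} \right],
+$$
+
+with $\hat m(X_i)$ clipped to $[0.01, 0.99]$ by default, and with standard error equal to the square root of the sample variance of the bracketed term divided by $N$.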
+ +```{r} +DML2.for.IRM <- function(x, d, y, dreg, yreg0, yreg1, trimming=0.01, nfold=5, method="regression") { + yhat0 <- rep(0, length(y)) + yhat1 <- rep(0, length(y)) + Dhat <- rep(0, length(d)) + + nobs <- nrow(x) #number of observations + foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices + I <- split(1:nobs, foldid) #split observation indices into folds + ytil <- dtil <- rep(NA, nobs) + + cat("fold: ") + for(b in 1:length(I)){ + + # define helpful variables + Dnotb = d[-I[[b]]] + Xb = X[I[[b]],] + Xnotb = X[-I[[b]],] + + # training dfs subsetted on the -I[[b]] fold + XD0 = X[-I[[b]],][d[-I[[b]]]==0] + yD0 = y[-I[[b]]][d[-I[[b]]]==0] + XD1 = X[-I[[b]],][d[-I[[b]]]==1] + yD1 = y[-I[[b]]][d[-I[[b]]]==1] + + if (method == "regression") { + yfit0 <- yreg0(as.matrix(XD0), yD0) + yfit1 <- yreg1(as.matrix(XD1), yD1) + yhat0[I[[b]]] <- predict(yfit0, as.matrix(Xb)) # default is type = "response" for glmnet family gaussian + yhat1[I[[b]]] <- predict(yfit1, as.matrix(Xb)) + } else if (method == "randomforest") { + yfit0 <- yreg0(XD0, yD0) + yfit1 <- yreg1(XD1, yD1) + yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = "response" for rf + yhat1[I[[b]]] <- predict(yfit1, Xb) + } else if (method == "decisiontrees") { + yfit0 <- yreg0(XD0, yD0) + yfit1 <- yreg1(XD1, yD1) + yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = "vector" for decision + yhat1[I[[b]]] <- predict(yfit1, Xb) + } else if (method == "boostedtrees") { + yfit0 <- yreg0(as.data.frame(XD0), yD0) + yfit1 <- yreg1(as.data.frame(XD1), yD1) + yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = "response" for boosted + yhat1[I[[b]]] <- predict(yfit1, Xb) + } + + # propensity scores: + if (method == "regression"){ + Dfit_b <- dreg(as.matrix(Xnotb), Dnotb) + Dhat_b <- predict(Dfit_b, as.matrix(Xb), type="response") # default is type="link" for family binomial! 
+ } else if (method == "randomforest") { + Dfit_b <- dreg(Xnotb, as.factor(Dnotb)) + Dhat_b <- predict(Dfit_b, Xb, type = "prob")[,2] + } else if (method == "decisiontrees") { + Dfit_b <- dreg(Xnotb, Dnotb) + Dhat_b <- predict(Dfit_b, Xb)[,2] + } else if (method == "boostedtrees") { + Dfit_b <- dreg(as.data.frame(Xnotb), Dnotb) + Dhat_b <- predict(Dfit_b, Xb, type="response") + } + Dhat_b <- pmax(pmin(Dhat_b, 1 - trimming), trimming) # trimming so scores are between [trimming, (1-trimming)] + Dhat[I[[b]]] <- Dhat_b + + + cat(b," ") + } + + # Prediction of treatment and outcome for observed instrument + yhat <- yhat0 * (1 - D) + yhat1 * D + # residuals + ytil <- y-yhat + dtil <- D-Dhat + # doubly robust quantity for every sample + drhat <- yhat1 - yhat0 + (y-yhat)* (D/Dhat - (1 - D)/(1 - Dhat)) + coef.est <- mean(drhat) + vari <- var(drhat) + se <- sqrt(vari/nrow(X)) + cat("point", coef.est) + cat("se", se) + return(list(coef.est = coef.est, se = se, ytil = ytil, dtil = dtil, drhat = drhat, yhat0 = yhat0, yhat1 = yhat1, dhat = Dhat, yhat = yhat)) +} +``` + +```{r} +summaryIRM <- function(coef.est, se, ytil, dtil, drhat, name) { + summary_data <- data.frame(estimate = coef.est, # point estimate + se = se, # standard error + lower = coef.est - 1.96 * se, # lower end of 95% confidence interval + upper = coef.est + 1.96 * se, # upper end of 95% confidence interval + rmse_y = sqrt(mean(ytil^2)), # res of model that predicts outcome y + rmse_D = sqrt(mean(dtil^2)), # res of model that predicts treatment D + accuracy_D = mean(abs(dtil) < 0.5) # binary classification accuracy of model for D + ) + row.names(summary_data) <- name + return(summary_data) +} +``` + +#### Repeat analysis in the IRM setting. + +```{r} +# DML with Lasso/Logistic +set.seed(123) +cat(sprintf("\nDML with LassoCV/Logistic \n")) + +dreg.lasso.cv <- function(x,d){cv.glmnet(x, d, family="binomial", alpha=0, nfolds=5)} +yreg0.lasso.cv <- function(x,y){cv.glmnet(x, y, family="gaussian", alpha=1, nfolds=5)} +yreg1.lasso.cv <- function(x,y){cv.glmnet(x, y, family="gaussian", alpha=1, nfolds=5)} + +DML2.results <- DML2.for.IRM(X, D, y, dreg.lasso.cv, yreg0.lasso.cv, yreg1.lasso.cv, nfold=5) # more folds seems to help stabilize finite sample performance +sum.lasso.cv <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'LassoCVLogistic') +tableirm <- data.frame() +tableirm <- rbind(sum.lasso.cv) +tableirm + +yhat0.lasso <- DML2.results$yhat0 +yhat1.lasso <- DML2.results$yhat1 +dhat.lasso <- DML2.results$dhat +yhat.lasso <- DML2.results$yhat +``` + +```{r} +# DML with Random Forest +set.seed(123) +cat(sprintf("\nDML with Random Forest \n")) + +dreg.rf <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest +yreg0.rf <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest +yreg1.rf <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest + + +DML2.results <- DML2.for.IRM(as.matrix(X), D, y, dreg.rf, yreg0.rf, yreg1.rf, nfold=5, method = "randomforest") +sum.rf <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'Random Forest') +tableirm <- rbind(tableirm, sum.rf) +tableirm + +yhat0.rf <- DML2.results$yhat0 +yhat1.rf <- DML2.results$yhat1 +dhat.rf <- DML2.results$dhat +yhat.rf <- DML2.results$yhat +``` + +```{r} +# DML with Decision Trees +set.seed(123) +cat(sprintf("\nDML with Decision Trees \n")) + +dreg.tr <- 
function(x,d){rpart(as.formula("D~."), cbind(data.frame(D=d),x), method = "class", minbucket=10, cp = 0.001)} +yreg0.tr <- function(x,y){rpart(as.formula("y~."), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)} +yreg1.tr <- function(x,y){rpart(as.formula("y~."), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)} + +DML2.results <- DML2.for.IRM(X, D, y, dreg.tr, yreg0.tr, yreg1.tr, nfold=5, method = "decisiontrees") +sum.tr <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'Decision Trees') +tableirm <- rbind(tableirm, sum.tr) +tableirm + +yhat0.tr <- DML2.results$yhat0 +yhat1.tr <- DML2.results$yhat1 +dhat.tr <- DML2.results$dhat +yhat.tr <- DML2.results$yhat +``` + +```{r} +# DML with Boosted Trees +set.seed(123) +cat(sprintf("\nDML with Boosted Trees \n")) + +# NB: early stopping cannot easily be implemented with gbm +## set n.trees = best, where best <- gbm.perf(dreg.boost, plot.it = FALSE) +dreg.boost <- function(x,d){gbm(as.formula("D~."), cbind(data.frame(D=d),x), distribution= "bernoulli", interaction.depth=2, n.trees=100, shrinkage=.1)} +yreg0.boost <- function(x,y){gbm(as.formula("y~."), cbind(data.frame(y=y),x), distribution= "gaussian", interaction.depth=2, n.trees=100, shrinkage=.1)} +yreg1.boost <- function(x,y){gbm(as.formula("y~."), cbind(data.frame(y=y),x), distribution= "gaussian", interaction.depth=2, n.trees=100, shrinkage=.1)} + +# passing these through regression as type="response", and D should not be factor! +DML2.results = DML2.for.IRM(X, D, y, dreg.boost, yreg0.boost, yreg1.boost, nfold=5, method = "boostedtrees") +sum.boost <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'Boosted Trees') +tableirm <- rbind(tableirm, sum.boost) +tableirm + +yhat0.boost <- DML2.results$yhat0 +yhat1.boost <- DML2.results$yhat1 +dhat.boost <- DML2.results$dhat +yhat.boost <- DML2.results$yhat +``` + +```{r} +# Ensembles + +# Best +# We'll look at model that does best for Y overall. Could also use different model for Y0 and Y1 +# Here, the best performance for Y is the random forest and for D the boosted tree + +# residuals +ytil <- y-yhat.rf +dtil <- D-dhat.boost +# doubly robust quantity for every sample +drhat <- yhat1.rf - yhat0.rf + (y-yhat.rf)* (D/dhat.boost - (1 - D)/(1 - dhat.boost)) +coef.est <- mean(drhat) +vari <- var(drhat) +se <- sqrt(vari/nrow(X)) + +sum.best <- summaryIRM(coef.est, se, ytil, dtil, drhat, name = 'Best') +tableirm <- rbind(tableirm, sum.best) +tableirm +``` + +```{r} +# Least squares model average +# We'll look at weights that do best job for Y overall. 
Could also use different weights for Y0 and Y1
+
+ma.dw <- lm(D~dhat.lasso+dhat.rf+dhat.tr+dhat.boost)$coef
+ma.yw <- lm(y~yhat.lasso+yhat.rf+yhat.tr+yhat.boost)$coef
+
+Dhats <- cbind(as.matrix(rep(1,nrow(X))),dhat.lasso,dhat.rf,dhat.tr,dhat.boost)
+Y0s <- cbind(as.matrix(rep(1,nrow(X))),yhat0.lasso,yhat0.rf,yhat0.tr,yhat0.boost)
+Y1s <- cbind(as.matrix(rep(1,nrow(X))),yhat1.lasso,yhat1.rf,yhat1.tr,yhat1.boost)
+
+dhat <- Dhats%*%as.matrix(ma.dw)
+yhat0 <- Y0s%*%as.matrix(ma.yw)
+yhat1 <- Y1s%*%as.matrix(ma.yw)
+
+# Predicted outcome under the observed treatment
+yhat <- yhat0 * (1 - D) + yhat1 * D
+# residuals
+ytil <- y-yhat
+dtil <- D-dhat
+# doubly robust quantity for every sample
+drhat <- yhat1 - yhat0 + (y-yhat)* (D/dhat - (1 - D)/(1 - dhat))
+coef.est <- mean(drhat)
+vari <- var(drhat)
+se <- sqrt(vari/nrow(X))
+
+sum.ma <- summaryIRM(coef.est, se, ytil, dtil, drhat, name = 'Model Average')
+tableirm <- rbind(tableirm, sum.ma)
+tableirm
+```
+
+## Double ML package
+
+We are interested in valid estimators of the average treatment effect of `e401` and `p401` on `net_tfa`. There are several packages that can carry out this estimation with a single function call, including `EconML` (Python) and `DoubleML` (Python and R).
+
+We run through PLR and IRM using `DoubleML` below to illustrate. The `DoubleML` package internally builds on `mlr3`. We use the meta package `mlr3` to generate predictions with machine learning methods. A comprehensive introduction and description of the `mlr3` package is provided in the [mlr3book](https://mlr3book.mlr-org.com/). A list of all learners that you can use in `mlr3` can be found [here](https://mlr3extralearners.mlr-org.com/articles/learners/list_learners.html). The entries in the columns *mlr3 Package* and *Packages* indicate which packages must be installed/loaded in your R session.
+
+You can find additional information about `DoubleML` on the package website https://docs.doubleml.org/ and the R documentation page https://docs.doubleml.org/r/stable/.
+
+```{r}
+install.packages("DoubleML")
+install.packages("mlr3learners")
+install.packages("mlr3")
+install.packages("data.table")
+install.packages("randomForest")
+install.packages("ranger")
+
+library(DoubleML)
+library(mlr3learners)
+library(mlr3)
+library(data.table)
+library(randomForest)
+library(ranger)
+```
+
+```{r}
+# Constructing the data (as DoubleMLData)
+formula_flex = "net_tfa ~ e401 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + marr + twoearn + db + pira + hown"
+model_flex = as.data.table(model.frame(formula_flex, pension))
+x_cols = colnames(model_flex)[-c(1,2)]
+data_ml = DoubleMLData$new(model_flex, y_col = "net_tfa", d_cols = "e401", x_cols=x_cols)
+
+
+p <- dim(model_flex)[2]-2
+p
+
+# complex model with two-way interactions
+#data_interactions = fetch_401k(polynomial_features = TRUE, instrument = FALSE)
+```
+
+As mentioned, we use the meta package `mlr3` to generate predictions with machine learning methods; see the [mlr3book](https://mlr3book.mlr-org.com/) and the [list of available learners](https://mlr3extralearners.mlr-org.com/articles/learners/list_learners.html) for details on which packages each learner requires. 
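+
+For illustration, here is a minimal sketch (not part of the original analysis) of how learners from that list are instantiated via `mlr3::lrn()`. The learner keys and tuning arguments are the same ones used in the PLR and IRM sections below; the learner's `$packages` field, part of the standard `mlr3` learner interface, lists the add-on packages it needs (what the *Packages* column of the list refers to).
+
+```{r}
+# Minimal sketch: constructing mlr3 learners by key (the same learners are reused below)
+library(mlr3)
+library(mlr3learners)
+
+lasso_sketch <- lrn("regr.cv_glmnet", nfolds = 5, s = "lambda.min")    # outcome regression learner
+logit_sketch <- lrn("classif.cv_glmnet", nfolds = 5, s = "lambda.min") # propensity score learner
+
+# Packages each learner depends on (here: glmnet and its dependencies)
+lasso_sketch$packages
+logit_sketch$packages
+```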
+ +## Partially Linear Regression Models (PLR) + +```{r} +# Estimating the PLR +lgr::get_logger("mlr3")$set_threshold("warn") +lasso <- lrn("regr.cv_glmnet",nfolds = 5, s = "lambda.min") +lasso_class <- lrn("classif.cv_glmnet", nfolds = 5, s = "lambda.min") + +dml_plr <- DoubleMLPLR$new(data_ml, ml_l = lasso, ml_m = lasso_class, n_folds=5) +dml_plr$fit(store_predictions=TRUE) +dml_plr$summary() +lasso_plr <- dml_plr$coef +lasso_std_plr <- dml_plr$se +``` + +Let us check the predictive performance of this model. + +```{r} +dml_plr$params_names() +g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o +m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o +``` + +```{r} +# cross-fitted RMSE: outcome +y <- as.matrix(pension$net_tfa) # true observations +theta <- as.numeric(dml_plr$coef) # estimated regression coefficient +d <- as.matrix(pension$e401) +predictions_y <- as.matrix(d*theta)+g_hat # predictions for y +lasso_y_rmse <- sqrt(mean((y-predictions_y)^2)) +lasso_y_rmse +``` + +```{r} +# cross-fitted RMSE: treatment +d <- as.matrix(pension$e401) +lasso_d_rmse <- sqrt(mean((d-m_hat)^2)) +lasso_d_rmse + +# cross-fitted ce: treatment +mean(ifelse(m_hat > 0.5, 1, 0) != d) +``` + +Then, we repeat this procedure for various machine learning methods. + +```{r} +# Random Forest +lgr::get_logger("mlr3")$set_threshold("warn") +randomForest <- lrn("regr.ranger") +randomForest_class <- lrn("classif.ranger") + +dml_plr <- DoubleMLPLR$new(data_ml, ml_l = randomForest, ml_m = randomForest_class, n_folds=5) +dml_plr$fit(store_predictions=TRUE) # set store_predictions=TRUE to evaluate the model +dml_plr$summary() +forest_plr <- dml_plr$coef +forest_std_plr <- dml_plr$se +``` + +We can compare the accuracy of this model to the model that has been estimated with lasso. 
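+
+Concretely, the comparison uses the same cross-fitted metrics as for lasso above: with out-of-fold predictions $\hat{g}(X_i)$ and $\hat{m}(X_i)$ and the estimated coefficient $\hat{\theta}$,
+
+$$\text{RMSE}_Y = \sqrt{\frac{1}{n}\sum_{i=1}^n \big(Y_i - \hat{\theta} D_i - \hat{g}(X_i)\big)^2}, \qquad \text{RMSE}_D = \sqrt{\frac{1}{n}\sum_{i=1}^n \big(D_i - \hat{m}(X_i)\big)^2},$$
+
+together with the classification error of the rule $1\{\hat{m}(X_i) > 0.5\}$ for predicting $D_i$.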
+
+```{r}
+# Evaluation predictions
+g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o
+m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o
+theta <- as.numeric(dml_plr$coef) # estimated regression coefficient
+predictions_y <- as.matrix(d*theta)+g_hat # predictions for y
+forest_y_rmse <- sqrt(mean((y-predictions_y)^2))
+forest_y_rmse
+
+# cross-fitted RMSE: treatment
+forest_d_rmse <- sqrt(mean((d-m_hat)^2))
+forest_d_rmse
+
+# cross-fitted ce: treatment
+mean(ifelse(m_hat > 0.5, 1, 0) != d)
+```
+
+```{r}
+# Trees
+lgr::get_logger("mlr3")$set_threshold("warn")
+
+trees <- lrn("regr.rpart")
+trees_class <- lrn("classif.rpart")
+
+dml_plr <- DoubleMLPLR$new(data_ml, ml_l = trees, ml_m = trees_class, n_folds=5)
+dml_plr$fit(store_predictions=TRUE)
+dml_plr$summary()
+tree_plr <- dml_plr$coef
+tree_std_plr <- dml_plr$se
+
+# Evaluation predictions
+g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o
+m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o
+theta <- as.numeric(dml_plr$coef) # estimated regression coefficient
+predictions_y <- as.matrix(d*theta)+g_hat # predictions for y
+tree_y_rmse <- sqrt(mean((y-predictions_y)^2))
+tree_y_rmse
+
+# cross-fitted RMSE: treatment
+tree_d_rmse <- sqrt(mean((d-m_hat)^2))
+tree_d_rmse
+
+# cross-fitted ce: treatment
+mean(ifelse(m_hat > 0.5, 1, 0) != d)
+```
+
+```{r}
+# needed to run boosting
+remotes::install_github("mlr-org/mlr3extralearners")
+install.packages("mlr3extralearners")
+install.packages("mboost")
+library(mlr3extralearners)
+library(mboost)
+```
+
+```{r}
+# Boosting
+boost<- lrn("regr.glmboost")
+boost_class <- lrn("classif.glmboost")
+
+dml_plr <- DoubleMLPLR$new(data_ml, ml_l = boost, ml_m = boost_class, n_folds=5)
+dml_plr$fit(store_predictions=TRUE)
+dml_plr$summary()
+boost_plr <- dml_plr$coef
+boost_std_plr <- dml_plr$se
+
+# Evaluation predictions
+g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o
+m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o
+theta <- as.numeric(dml_plr$coef) # estimated regression coefficient
+predictions_y <- as.matrix(d*theta)+g_hat # predictions for y
+boost_y_rmse <- sqrt(mean((y-predictions_y)^2))
+boost_y_rmse
+
+# cross-fitted RMSE: treatment
+boost_d_rmse <- sqrt(mean((d-m_hat)^2))
+boost_d_rmse
+
+# cross-fitted ce: treatment
+mean(ifelse(m_hat > 0.5, 1, 0) != d)
+```
+
+Let's sum up the results:
+
+```{r}
+table <- matrix(0, 4, 4)
+table[1,1:4] <- c(lasso_plr,forest_plr,tree_plr,boost_plr)
+table[2,1:4] <- c(lasso_std_plr,forest_std_plr,tree_std_plr,boost_std_plr)
+table[3,1:4] <- c(lasso_y_rmse,forest_y_rmse,tree_y_rmse,boost_y_rmse)
+table[4,1:4] <- c(lasso_d_rmse,forest_d_rmse,tree_d_rmse,boost_d_rmse)
+rownames(table) <- c("Estimate","Std.Error","RMSE Y","RMSE D")
+colnames(table) <- c("Lasso","Random Forest","Trees","Boosting")
+tab<- xtable(table, digits = 2)
+tab
+```
+
+The best model with the lowest RMSE is the PLR model estimated via lasso (or boosting, judging by the RMSE for Y). 
It gives the following estimate: + +```{r} +lasso_plr +``` + +## Interactive Regression Model (IRM) + +```{r} +lgr::get_logger("mlr3")$set_threshold("warn") +dml_irm = DoubleMLIRM$new(data_ml, ml_g = lasso, + ml_m = lasso_class, + trimming_threshold = 0.01, n_folds=5) +dml_irm$fit(store_predictions=TRUE) +dml_irm$summary() +lasso_irm <- dml_irm$coef +lasso_std_irm <- dml_irm$se + + +# predictions +dml_irm$params_names() +g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) +g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) +g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0 +m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o +``` + +```{r} +# cross-fitted RMSE: outcome +y <- as.matrix(pension$net_tfa) # true observations +d <- as.matrix(pension$e401) +lasso_y_irm <- sqrt(mean((y-g_hat)^2)) +lasso_y_irm + +# cross-fitted RMSE: treatment +lasso_d_irm <- sqrt(mean((d-m_hat)^2)) +lasso_d_irm + +# cross-fitted ce: treatment +mean(ifelse(m_hat > 0.5, 1, 0) != d) +``` + +```{r} +##### forest ##### + +dml_irm = DoubleMLIRM$new(data_ml, ml_g = randomForest, + ml_m = randomForest_class, + trimming_threshold = 0.01, n_folds=5) +dml_irm$fit(store_predictions=TRUE) +dml_irm$summary() +forest_irm <- dml_irm$coef +forest_std_irm <- dml_plr$se + +# predictions +g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) +g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) +g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0 +m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_0 + +# cross-fitted RMSE: outcome +y <- as.matrix(pension$net_tfa) # true observations +d <- as.matrix(pension$e401) +forest_y_irm <- sqrt(mean((y-g_hat)^2)) +forest_y_irm + +# cross-fitted RMSE: treatment +forest_d_irm <- sqrt(mean((d-m_hat)^2)) +forest_d_irm + +# cross-fitted ce: treatment +mean(ifelse(m_hat > 0.5, 1, 0) != d) + +##### trees ##### + +dml_irm <- DoubleMLIRM$new(data_ml, ml_g = trees, ml_m = trees_class, + trimming_threshold = 0.01, n_folds=5) +dml_irm$fit(store_predictions=TRUE) +dml_irm$summary() +tree_irm <- dml_irm$coef +tree_std_irm <- dml_irm$se + +# predictions +g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) +g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) +g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0 +m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o + +# cross-fitted RMSE: outcome +y <- as.matrix(pension$net_tfa) # true observations +d <- as.matrix(pension$e401) +tree_y_irm <- sqrt(mean((y-g_hat)^2)) +tree_y_irm + +# cross-fitted RMSE: treatment +tree_d_irm <- sqrt(mean((d-m_hat)^2)) +tree_d_irm + +# cross-fitted ce: treatment +mean(ifelse(m_hat > 0.5, 1, 0) != d) + + +##### boosting ##### + +dml_irm <- DoubleMLIRM$new(data_ml, ml_g = boost, ml_m = boost_class, + trimming_threshold = 0.01, n_folds=5) +dml_irm$fit(store_predictions=TRUE) +dml_irm$summary() +boost_irm <- dml_irm$coef +boost_std_irm <- dml_irm$se + +# predictions +g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) +g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) +g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0 +m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o + +# cross-fitted RMSE: outcome +y <- as.matrix(pension$net_tfa) # true observations +d <- as.matrix(pension$e401) +boost_y_irm <- sqrt(mean((y-g_hat)^2)) +boost_y_irm + +# cross-fitted RMSE: treatment +boost_d_irm <- 
sqrt(mean((d-m_hat)^2)) +boost_d_irm + +# cross-fitted ce: treatment +mean(ifelse(m_hat > 0.5, 1, 0) != d) +``` + +```{r} +table <- matrix(0, 4, 4) +table[1,1:4] <- c(lasso_irm,forest_irm,tree_irm,boost_irm) +table[2,1:4] <- c(lasso_std_irm,forest_std_irm,tree_std_irm,boost_std_irm) +table[3,1:4] <- c(lasso_y_irm,forest_y_irm,tree_y_irm,boost_y_irm) +table[4,1:4] <- c(lasso_d_irm,forest_d_irm,tree_d_irm,boost_d_irm) +rownames(table) <- c("Estimate","Std.Error","RMSE Y","RMSE D") +colnames(table) <- c("Lasso","Random Forest","Trees","Boosting") +tab<- xtable(table, digits = 2) +tab +``` + +Here, Random Forest gives the best prediction rule for $g_0$ and Lasso the best prediction rule for $m_0$, respectively. Let us fit the IRM model using the best ML method for each equation to get a final estimate for the treatment effect of eligibility. + +```{r} +lgr::get_logger("mlr3")$set_threshold("warn") +dml_irm = DoubleMLIRM$new(data_ml, ml_g = randomForest, + ml_m = lasso_class, + trimming_threshold = 0.01, n_folds=5) +dml_irm$fit(store_predictions=TRUE) +dml_irm$summary() +best_irm <- dml_irm$coef +best_std_irm <- dml_irm$se +``` + +These estimates that flexibly account for confounding are +substantially attenuated relative to the baseline estimate (*19559*) that does not account for confounding. They suggest much smaller causal effects of 401(k) eligiblity on financial asset holdings. + diff --git a/PM4/r-dml-401k.irnb b/PM4/r-dml-401k.irnb index 69a8587d..a67f68de 100644 --- a/PM4/r-dml-401k.irnb +++ b/PM4/r-dml-401k.irnb @@ -1,2225 +1,2081 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "f02fa044", - "metadata": { - "papermill": { - "duration": 0.012988, - "end_time": "2022-04-19T09:06:48.772902", - "exception": false, - "start_time": "2022-04-19T09:06:48.759914", - "status": "completed" - }, - "tags": [], - "id": "f02fa044" - }, - "source": [ - "# Inference on Predictive and Causal Effects in High-Dimensional Nonlinear Models" - ] - }, - { - "cell_type": "markdown", - "id": "23154404", - "metadata": { - "papermill": { - "duration": 0.009437, - "end_time": "2022-04-19T09:06:48.791895", - "exception": false, - "start_time": "2022-04-19T09:06:48.782458", - "status": "completed" - }, - "tags": [], - "id": "23154404" - }, - "source": [ - "## Impact of 401(k) on Financial Wealth\n", - "\n", - "As a practical illustration of the methods developed in this lecture, we consider estimation of the effect of 401(k) eligibility and participation\n", - "on accumulated assets. 401(k) plans are pension accounts sponsored by employers. The key problem in determining the effect of participation in 401(k) plans on accumulated assets is saver heterogeneity coupled with the fact that the decision to enroll in a 401(k) is non-random. It is generally recognized that some people have a higher preference for saving than others. It also seems likely that those individuals with high unobserved preference for saving would be most likely to choose to participate in tax-advantaged retirement savings plans and would tend to have otherwise high amounts of accumulated assets. 
The presence of unobserved savings preferences with these properties then implies that conventional estimates that do not account for saver heterogeneity and endogeneity of participation will be biased upward, tending to overstate the savings effects of 401(k) participation.\n", - "\n", - "One can argue that eligibility for enrolling in a 401(k) plan in this data can be taken as exogenous after conditioning on a few observables of which the most important for their argument is income. The basic idea is that, at least around the time 401(k)’s initially became available, people were unlikely to be basing their employment decisions on whether an employer offered a 401(k) but would instead focus on income and other aspects of the job." - ] - }, - { - "cell_type": "code", - "source": [ - "install.packages(\"xtable\")\n", - "install.packages(\"hdm\")\n", - "install.packages(\"sandwich\")\n", - "install.packages(\"ggplot2\")\n", - "install.packages(\"randomForest\")\n", - "install.packages(\"glmnet\")\n", - "install.packages(\"rpart\")\n", - "install.packages(\"gbm\")\n", - "\n", - "library(xtable)\n", - "library(hdm)\n", - "library(sandwich)\n", - "library(ggplot2)\n", - "library(randomForest)\n", - "library(data.table)\n", - "library(glmnet)\n", - "library(rpart)\n", - "library(gbm)" - ], - "metadata": { - "id": "KmAkbDiVE7wm" - }, - "id": "KmAkbDiVE7wm", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "id": "7e23cba0", - "metadata": { - "papermill": { - "duration": 0.009588, - "end_time": "2022-04-19T09:06:48.810853", - "exception": false, - "start_time": "2022-04-19T09:06:48.801265", - "status": "completed" - }, - "tags": [], - "id": "7e23cba0" - }, - "source": [ - "### Data\n", - "\n", - "The raw dataset can be found [here](https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/401k.csv).\n", - "The data set can be loaded from the `hdm` package for R directly by typing:\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c442abdc", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:06:48.833250Z", - "iopub.status.busy": "2022-04-19T09:06:48.831101Z", - "iopub.status.idle": "2022-04-19T09:06:49.281559Z", - "shell.execute_reply": "2022-04-19T09:06:49.279778Z" - }, - "papermill": { - "duration": 0.46397, - "end_time": "2022-04-19T09:06:49.283933", - "exception": false, - "start_time": "2022-04-19T09:06:48.819963", - "status": "completed" - }, - "tags": [], - "id": "c442abdc" - }, - "outputs": [], - "source": [ - "data(pension)\n", - "data <- pension\n", - "dim(data)" - ] - }, - { - "cell_type": "markdown", - "id": "e47fa9d3", - "metadata": { - "papermill": { - "duration": 0.009462, - "end_time": "2022-04-19T09:06:49.302928", - "exception": false, - "start_time": "2022-04-19T09:06:49.293466", - "status": "completed" - }, - "tags": [], - "id": "e47fa9d3" - }, - "source": [ - "See the \"Details\" section on the description of the data set, which can be accessed by\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "00e04b82", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:06:49.394579Z", - "iopub.status.busy": "2022-04-19T09:06:49.323826Z", - "iopub.status.idle": "2022-04-19T09:06:49.662556Z", - "shell.execute_reply": "2022-04-19T09:06:49.660433Z" - }, - "papermill": { - "duration": 0.35227, - "end_time": "2022-04-19T09:06:49.664810", - "exception": false, - "start_time": "2022-04-19T09:06:49.312540", - "status": "completed" - }, - "tags": [], 
- "id": "00e04b82" - }, - "outputs": [], - "source": [ - "help(pension)" - ] - }, - { - "cell_type": "markdown", - "id": "24b41e4a", - "metadata": { - "papermill": { - "duration": 0.009357, - "end_time": "2022-04-19T09:06:49.683784", - "exception": false, - "start_time": "2022-04-19T09:06:49.674427", - "status": "completed" - }, - "tags": [], - "id": "24b41e4a" - }, - "source": [ - "The data consist of 9,915 observations at the household level drawn from the 1991 Survey of Income and Program Participation (SIPP). All the variables are referred to 1990. We use net financial assets (*net\\_tfa*) as the outcome variable, $Y$, in our analysis. The net financial assets are computed as the sum of IRA balances, 401(k) balances, checking accounts, saving bonds, other interest-earning accounts, other interest-earning assets, stocks, and mutual funds less non mortgage debts." - ] - }, - { - "cell_type": "markdown", - "id": "ed9d4e82", - "metadata": { - "papermill": { - "duration": 0.009242, - "end_time": "2022-04-19T09:06:49.702401", - "exception": false, - "start_time": "2022-04-19T09:06:49.693159", - "status": "completed" - }, - "tags": [], - "id": "ed9d4e82" - }, - "source": [ - "Among the $9915$ individuals, $3682$ are eligible to participate in the program. The variable *e401* indicates eligibility and *p401* indicates participation, respectively." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "63519184", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:06:49.724951Z", - "iopub.status.busy": "2022-04-19T09:06:49.723401Z", - "iopub.status.idle": "2022-04-19T09:06:50.327963Z", - "shell.execute_reply": "2022-04-19T09:06:50.326306Z" - }, - "papermill": { - "duration": 0.618528, - "end_time": "2022-04-19T09:06:50.330218", - "exception": false, - "start_time": "2022-04-19T09:06:49.711690", - "status": "completed" - }, - "tags": [], - "id": "63519184" - }, - "outputs": [], - "source": [ - "hist_e401 = ggplot(data, aes(x = e401, fill = factor(e401))) + geom_bar()\n", - "hist_e401" - ] - }, - { - "cell_type": "markdown", - "id": "823d2628", - "metadata": { - "papermill": { - "duration": 0.009686, - "end_time": "2022-04-19T09:06:50.349766", - "exception": false, - "start_time": "2022-04-19T09:06:50.340080", - "status": "completed" - }, - "tags": [], - "id": "823d2628" - }, - "source": [ - "Eligibility is highly associated with financial wealth:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d8faf9c", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:06:50.372330Z", - "iopub.status.busy": "2022-04-19T09:06:50.370847Z", - "iopub.status.idle": "2022-04-19T09:06:50.912011Z", - "shell.execute_reply": "2022-04-19T09:06:50.910336Z" - }, - "papermill": { - "duration": 0.554613, - "end_time": "2022-04-19T09:06:50.914133", - "exception": false, - "start_time": "2022-04-19T09:06:50.359520", - "status": "completed" - }, - "tags": [], - "id": "5d8faf9c" - }, - "outputs": [], - "source": [ - "dens_net_tfa = ggplot(data, aes(x = net_tfa, color = factor(e401), fill = factor(e401)) ) +\n", - " geom_density() + xlim(c(-20000, 150000)) +\n", - " facet_wrap(.~e401)\n", - "\n", - "dens_net_tfa" - ] - }, - { - "cell_type": "markdown", - "id": "0f4f86a7", - "metadata": { - "papermill": { - "duration": 0.010335, - "end_time": "2022-04-19T09:06:50.935024", - "exception": false, - "start_time": "2022-04-19T09:06:50.924689", - "status": "completed" - }, - "tags": [], - "id": "0f4f86a7" - }, - "source": [ - "The unconditional APE of 
e401 is about $19559$:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "836c6af7", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:06:50.959110Z", - "iopub.status.busy": "2022-04-19T09:06:50.957519Z", - "iopub.status.idle": "2022-04-19T09:06:50.981194Z", - "shell.execute_reply": "2022-04-19T09:06:50.979530Z" - }, - "papermill": { - "duration": 0.038096, - "end_time": "2022-04-19T09:06:50.983602", - "exception": false, - "start_time": "2022-04-19T09:06:50.945506", - "status": "completed" - }, - "tags": [], - "id": "836c6af7" - }, - "outputs": [], - "source": [ - "e1 <- data[data$e401==1,]\n", - "e0 <- data[data$e401==0,]\n", - "round(mean(e1$net_tfa)-mean(e0$net_tfa),0)" - ] - }, - { - "cell_type": "markdown", - "id": "22b09926", - "metadata": { - "papermill": { - "duration": 0.01047, - "end_time": "2022-04-19T09:06:51.004618", - "exception": false, - "start_time": "2022-04-19T09:06:50.994148", - "status": "completed" - }, - "tags": [], - "id": "22b09926" - }, - "source": [ - "Among the $3682$ individuals that are eligible, $2594$ decided to participate in the program. The unconditional APE of p401 is about $27372$:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e78aaa58", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:06:51.029140Z", - "iopub.status.busy": "2022-04-19T09:06:51.027462Z", - "iopub.status.idle": "2022-04-19T09:06:51.052361Z", - "shell.execute_reply": "2022-04-19T09:06:51.050591Z" - }, - "papermill": { - "duration": 0.039305, - "end_time": "2022-04-19T09:06:51.054616", - "exception": false, - "start_time": "2022-04-19T09:06:51.015311", - "status": "completed" - }, - "tags": [], - "id": "e78aaa58" - }, - "outputs": [], - "source": [ - "p1 <- data[data$p401==1,]\n", - "p0 <- data[data$p401==0,]\n", - "round(mean(p1$net_tfa)-mean(p0$net_tfa),0)" - ] - }, - { - "cell_type": "markdown", - "id": "e0af3c81", - "metadata": { - "papermill": { - "duration": 0.010831, - "end_time": "2022-04-19T09:06:51.076114", - "exception": false, - "start_time": "2022-04-19T09:06:51.065283", - "status": "completed" - }, - "tags": [], - "id": "e0af3c81" - }, - "source": [ - "As discussed, these estimates are biased since they do not account for saver heterogeneity and endogeneity of participation." 
- ] - }, - { - "cell_type": "code", - "source": [ - "# outcome variable\n", - "y <- data[,'net_tfa']\n", - "# treatment variable\n", - "D <- data[,'e401']\n", - "D2 <- data[,\"p401\"]\n", - "D3 <- data[,\"a401\"]\n", - "\n", - "columns_to_drop <- c('e401', 'p401', 'a401', 'tw', 'tfa', 'net_tfa', 'tfa_he',\n", - " 'hval', 'hmort', 'hequity',\n", - " 'nifa', 'net_nifa', 'net_n401', 'ira',\n", - " 'dum91', 'icat', 'ecat', 'zhat',\n", - " 'i1', 'i2', 'i3', 'i4', 'i5', 'i6', 'i7',\n", - " 'a1', 'a2', 'a3', 'a4', 'a5')\n", - "\n", - "# covariates\n", - "X <- data[, !(names(data) %in% columns_to_drop)]" - ], - "metadata": { - "id": "1hBrSMQGzZBR" - }, - "id": "1hBrSMQGzZBR", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Constructing the controls\n", - "X_formula = \"~ 0 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown\"\n", - "X = as.data.table(model.frame(X_formula, pension))\n", - "head(X)" - ], - "metadata": { - "id": "DD0Hwcb6z4u5" - }, - "id": "DD0Hwcb6z4u5", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Estimating the ATE of 401(k) Eligibility on Net Financial Assets" - ], - "metadata": { - "id": "MZThhulbKA9W" - }, - "id": "MZThhulbKA9W" - }, - { - "cell_type": "markdown", - "source": [ - "We are interested in valid estimators of the average treatment effect of `e401` and `p401` on `net_tfa`. We start using ML approaches to estimate the function $g_0$ and $m_0$ in the following PLR model:" - ], - "metadata": { - "id": "UuYqY89D0pvs" - }, - "id": "UuYqY89D0pvs" - }, - { - "cell_type": "markdown", - "source": [ - "\\begin{eqnarray}\n", - " & Y = D\\theta_0 + g_0(X) + \\zeta, & E[\\zeta \\mid D,X]= 0,\\\\\n", - " & D = m_0(X) + V, & E[V \\mid X] = 0.\n", - "\\end{eqnarray}" - ], - "metadata": { - "id": "vEAeB2ih0r8B" - }, - "id": "vEAeB2ih0r8B" - }, - { - "cell_type": "markdown", - "id": "cde447aa", - "metadata": { - "papermill": { - "duration": 0.011129, - "end_time": "2022-04-19T09:07:12.117442", - "exception": false, - "start_time": "2022-04-19T09:07:12.106313", - "status": "completed" - }, - "tags": [], - "id": "cde447aa" - }, - "source": [ - "## Partially Linear Regression Models (PLR)" - ] - }, - { - "cell_type": "code", - "source": [ - "DML2.for.PLM <- function(x, d, y, dreg, yreg, nfold=3, method = \"regression\") {\n", - " nobs <- nrow(x) #number of observations\n", - " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices\n", - " I <- split(1:nobs, foldid) #split observation indices into folds\n", - " ytil <- dtil <- rep(NA, nobs)\n", - " cat(\"fold: \")\n", - " for(b in 1:length(I)){\n", - "\n", - " if (method == \"regression\") {\n", - " dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out\n", - " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a foldt out\n", - " dhat <- predict(dfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", - " yhat <- predict(yfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", - " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold\n", - " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold\n", - " } else if (method == \"randomforest\") {\n", - " dfit <- dreg(x[-I[[b]],], as.factor(d)[-I[[b]]]) #take a fold out\n", - " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a fold out\n", - " dhat <- predict(dfit, x[I[[b]],], type=\"prob\")[,2] #predict the 
left-out fold\n", - " yhat <- predict(yfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", - " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold\n", - " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold\n", - " } else if (method == \"decisiontrees\") {\n", - " dfit <- dreg(x[-I[[b]],], as.factor(d)[-I[[b]]]) #take a fold out\n", - " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a fold out\n", - " dhat <- predict(dfit, x[I[[b]],])[,2] #predict the left-out fold\n", - " yhat <- predict(yfit, x[I[[b]],]) #predict the left-out fold\n", - " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold\n", - " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold\n", - " } else if (method == \"boostedtrees\") {\n", - " dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out\n", - " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a fold out\n", - " dhat <- predict(dfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", - " yhat <- predict(yfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", - " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold\n", - " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold\n", - " }\n", - " cat(b,\" \")\n", - "\n", - " }\n", - " rfit <- lm(ytil ~ dtil) #estimate the main parameter by regressing one residual on the other\n", - " coef.est <- coef(rfit)[2] #extract coefficient\n", - " se <- sqrt(vcovHC(rfit)[2,2]) #record robust standard error\n", - " cat(sprintf(\"\\ncoef (se) = %g (%g)\\n\", coef.est , se)) #printing output\n", - " return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil) ) #save output and residuals\n", - "}\n" - ], - "metadata": { - "id": "tqFlcClUNr9Z" - }, - "id": "tqFlcClUNr9Z", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "summaryPLR <- function(point, stderr, resD, resy, name) {\n", - " data <- data.frame(\n", - " estimate = point, # point estimate\n", - " stderr = stderr, # standard error\n", - " lower = point - 1.96 * stderr, # lower end of 95% confidence interval\n", - " upper = point + 1.96 * stderr, # upper end of 95% confidence interval\n", - " `rmse y` = sqrt(mean(resy^2)), # RMSE of model that predicts outcome y\n", - " `rmse D` = sqrt(mean(resD^2)), # RMSE of model that predicts treatment D\n", - " `accuracy D` = mean(abs(resD) < 0.5)# binary classification accuracy of model for D\n", - " )\n", - " rownames(data) <- name\n", - " return(data)\n", - "}" - ], - "metadata": { - "id": "sS0P4CVySjDP" - }, - "id": "sS0P4CVySjDP", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#### Double Lasso with Cross-Fitting" - ], - "metadata": { - "id": "pdGcjnngSn5Q" - }, - "id": "pdGcjnngSn5Q" - }, - { - "cell_type": "code", - "source": [ - "# DML with LassoCV\n", - "set.seed(123)\n", - "cat(sprintf(\"\\nDML with Lasso CV \\n\"))\n", - "\n", - "dreg.lasso.cv <- function(x,d){ cv.glmnet(x, d, family=\"gaussian\", alpha=1, nfolds=5)}\n", - "yreg.lasso.cv <- function(x,y){ cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", - "\n", - "DML2.results <- DML2.for.PLM(as.matrix(X), D, y, dreg.lasso.cv, yreg.lasso.cv, nfold=5)\n", - "\n", - "sum.lasso.cv <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'LassoCV')\n", - "tableplr <- data.frame()\n", - "tableplr <- rbind(sum.lasso.cv)\n", - "tableplr" - ], - "metadata": { - "id": "LOVuR5QO1bkB" - 
}, - "id": "LOVuR5QO1bkB", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Because residuals are output, reconstruct fitted values for use in ensemble\n", - "dhat.lasso <- D - DML2.results$dtil\n", - "yhat.lasso <- y - DML2.results$ytil" - ], - "metadata": { - "id": "KatOw36Z0ghO" - }, - "id": "KatOw36Z0ghO", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#### Using a $\\ell_2$ Penalized Logistic Regression for D\n", - "\n", - "Note we are using the $\\ell_2$ penalty here. You can use the $\\ell_1$ penalty as well, but computation will take longer." - ], - "metadata": { - "id": "4wvLEj12SpDf" - }, - "id": "4wvLEj12SpDf" - }, - { - "cell_type": "code", - "source": [ - "# DML with Lasso/Logistic\n", - "set.seed(123)\n", - "cat(sprintf(\"\\nDML with Lasso/Logistic \\n\"))\n", - "\n", - "dreg.logistic.cv <- function(x,d){cv.glmnet(x, d, family=\"binomial\", alpha=0, nfolds=5)}\n", - "yreg.lasso.cv <- function(x,y){cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", - "\n", - "DML2.results <- DML2.for.PLM(as.matrix(X), D, y, dreg.logistic.cv, yreg.lasso.cv, nfold=5)\n", - "sum.lasso_logistic.cv <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'LassoCV/LogisticCV')\n", - "tableplr <- rbind(tableplr, sum.lasso_logistic.cv)\n", - "tableplr" - ], - "metadata": { - "id": "b9Nvp5ZlSuwB" - }, - "id": "b9Nvp5ZlSuwB", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Because residuals are output, reconstruct fitted values for use in ensemble\n", - "dhat.lasso_logistic <- D - DML2.results$dtil\n", - "yhat.lasso_logistic <- y - DML2.results$ytil" - ], - "metadata": { - "id": "hJqMdcZV05lr" - }, - "id": "hJqMdcZV05lr", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#### Random Forests" - ], - "metadata": { - "id": "txyv6IDXSu64" - }, - "id": "txyv6IDXSu64" - }, - { - "cell_type": "code", - "source": [ - "# DML with Random Forest\n", - "set.seed(123)\n", - "cat(sprintf(\"\\nDML with Random Forest \\n\"))\n", - "\n", - "dreg.rf <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest\n", - "yreg.rf <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest\n", - "\n", - "DML2.results <- DML2.for.PLM(as.matrix(X), D, y, dreg.rf, yreg.rf, nfold=5, method = \"randomforest\")\n", - "sum.rf <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Random Forest')\n", - "tableplr <- rbind(tableplr, sum.rf)\n", - "tableplr" - ], - "metadata": { - "id": "nt0oTHTfSwMr" - }, - "id": "nt0oTHTfSwMr", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Because residuals are output, reconstruct fitted values for use in ensemble\n", - "dhat.rf <- D - DML2.results$dtil\n", - "yhat.rf <- y - DML2.results$ytil" - ], - "metadata": { - "id": "TG476dPX1BI_" - }, - "id": "TG476dPX1BI_", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#### Decision Trees" - ], - "metadata": { - "id": "k8EFP-w_SwXZ" - }, - "id": "k8EFP-w_SwXZ" - }, - { - "cell_type": "code", - "source": [ - "# DML with Decision Trees\n", - "set.seed(123)\n", - "cat(sprintf(\"\\nDML with Decision Trees \\n\"))\n", - "\n", - "dreg.tr <- function(x,d){rpart(as.formula(\"D~.\"), cbind(data.frame(D=d),x), method = \"class\", minbucket=10, cp = 0.001)}\n", 
- "yreg.tr <- function(x,y){rpart(as.formula(\"y~.\"), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)}\n", - "\n", - "DML2.results <- DML2.for.PLM(X, D, y, dreg.tr, yreg.tr, nfold=5, method = \"decisiontrees\") # decision tree takes in X as dataframe, not matrix/array\n", - "sum.tr <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Decision Trees')\n", - "tableplr <- rbind(tableplr, sum.tr)\n", - "tableplr" - ], - "metadata": { - "id": "3Nu4daQRSyRb" - }, - "id": "3Nu4daQRSyRb", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Because residuals are output, reconstruct fitted values for use in ensemble\n", - "dhat.tr <- D - DML2.results$dtil\n", - "yhat.tr <- y - DML2.results$ytil" - ], - "metadata": { - "id": "RnCGwVbN1KJJ" - }, - "id": "RnCGwVbN1KJJ", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "\n", - "Ideally, we would do (semi) cross-fitting with AutoML in order to find good first-stage models and re-run DML with these models. Unfortunately this is not easy to do in R. In the case of semi cross-fitting, we can use R's H20 AutoML trained on the entire training set $y\\sim X$, $D \\sim X$, $Z\\sim X$ to determine the best model (eg ensemble), but H20 does not allow you to extract said model so we can re-use that in DML." - ], - "metadata": { - "id": "jODHt0hjntdP" - }, - "id": "jODHt0hjntdP" - }, - { - "cell_type": "markdown", - "source": [ - "#### Boosted Trees" - ], - "metadata": { - "id": "SaPGNW0SSxWk" - }, - "id": "SaPGNW0SSxWk" - }, - { - "cell_type": "code", - "source": [ - "# DML with Boosted Trees\n", - "set.seed(123)\n", - "cat(sprintf(\"\\nDML with Boosted Trees \\n\"))\n", - "\n", - "# NB: early stopping cannot easily be implemented with gbm\n", - "## set n.trees = best, where best <- gbm.perf(dreg.boost, plot.it = FALSE)\n", - "dreg.boost <- function(x,d){gbm(as.formula(\"D~.\"), cbind(data.frame(D=d),x), distribution= \"bernoulli\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", - "yreg.boost <- function(x,y){gbm(as.formula(\"y~.\"), cbind(data.frame(y=y),x), distribution= \"gaussian\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", - "\n", - "# passing these through regression as type=\"response\", and D should not be factor!\n", - "DML2.results = DML2.for.PLM(X, D, y, dreg.boost, yreg.boost, nfold=5, method = \"boostedtrees\")\n", - "sum.boost <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Boosted Trees')\n", - "tableplr <- rbind(tableplr, sum.boost)\n", - "tableplr" - ], - "metadata": { - "id": "Ekg5qeEOSxep" - }, - "id": "Ekg5qeEOSxep", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Because residuals are output, reconstruct fitted values for use in ensemble\n", - "dhat.boost <- D - DML2.results$dtil\n", - "yhat.boost <- y - DML2.results$ytil" - ], - "metadata": { - "id": "WSyqSd5Z1hne" - }, - "id": "WSyqSd5Z1hne", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Ensembles" - ], - "metadata": { - "id": "7UZphpPS10Hz" - }, - "id": "7UZphpPS10Hz" - }, - { - "cell_type": "markdown", - "source": [ - "Boosted trees give the best RMSE for both Y and D, so the ensemble based on choosing the best performing prediction rule is identical to boosting in this case." 
- ], - "metadata": { - "id": "Hqsqpgs6C4fJ" - }, - "id": "Hqsqpgs6C4fJ" - }, - { - "cell_type": "code", - "source": [ - "# Best fit is boosted trees for both D and Y\n", - "\n", - "sum.best <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Best')\n", - "tableplr <- rbind(tableplr, sum.best)\n", - "tableplr" - ], - "metadata": { - "id": "gDrZqZXR12hA" - }, - "id": "gDrZqZXR12hA", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "We'll form a model average with unconstrained least squares weights." - ], - "metadata": { - "id": "pG8mmrQw2GRC" - }, - "id": "pG8mmrQw2GRC" - }, - { - "cell_type": "code", - "source": [ - "# Least squares model average\n", - "\n", - "ma.dtil <- lm(D~dhat.lasso+dhat.lasso_logistic+dhat.rf+dhat.tr+dhat.boost)$residuals\n", - "ma.ytil <- lm(y~yhat.lasso+yhat.lasso_logistic+yhat.rf+yhat.tr+yhat.boost)$residuals\n", - "\n", - "rfit <- lm(ma.ytil ~ ma.dtil) #estimate the main parameter by regressing one residual on the other\n", - "coef.est <- coef(rfit)[2] #extract coefficient\n", - "se <- sqrt(vcovHC(rfit)[2,2]) #record robust standard error\n", - "\n", - "sum.ma <- summaryPLR(coef.est, se, ma.dtil, ma.ytil, name = 'Model Average')\n", - "tableplr <- rbind(tableplr, sum.ma)\n", - "tableplr\n" - ], - "metadata": { - "id": "Pkg7pw5h2N0z" - }, - "id": "Pkg7pw5h2N0z", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "id": "67fa5873", - "metadata": { - "papermill": { - "duration": 0.013657, - "end_time": "2022-04-19T09:10:00.718448", - "exception": false, - "start_time": "2022-04-19T09:10:00.704791", - "status": "completed" - }, - "tags": [], - "id": "67fa5873" - }, - "source": [ - "## Interactive Regression Model (IRM)" - ] - }, - { - "cell_type": "markdown", - "id": "86393e4c", - "metadata": { - "papermill": { - "duration": 0.013488, - "end_time": "2022-04-19T09:10:00.745538", - "exception": false, - "start_time": "2022-04-19T09:10:00.732050", - "status": "completed" - }, - "tags": [], - "id": "86393e4c" - }, - "source": [ - "Next, we consider estimation of average treatment effects when treatment effects are fully heterogeneous:" - ] - }, - { - "cell_type": "markdown", - "id": "830bb508", - "metadata": { - "papermill": { - "duration": 0.013695, - "end_time": "2022-04-19T09:10:00.772756", - "exception": false, - "start_time": "2022-04-19T09:10:00.759061", - "status": "completed" - }, - "tags": [], - "id": "830bb508" - }, - "source": [ - " \\begin{eqnarray}\\label{eq: HetPL1}\n", - " & Y = g_0(D, X) + U, & \\quad E[U \\mid X, D]= 0,\\\\\n", - " & D = m_0(X) + V, & \\quad E[V\\mid X] = 0.\n", - "\\end{eqnarray}" - ] - }, - { - "cell_type": "markdown", - "id": "9e5ec32b", - "metadata": { - "papermill": { - "duration": 0.013592, - "end_time": "2022-04-19T09:10:00.799889", - "exception": false, - "start_time": "2022-04-19T09:10:00.786297", - "status": "completed" - }, - "tags": [], - "id": "9e5ec32b" - }, - "source": [ - "To reduce the disproportionate impact of extreme propensity score weights in the interactive model\n", - "we trim the propensity scores which are close to the bounds." 
- ] - }, - { - "cell_type": "code", - "source": [ - "DML2.for.IRM <- function(x, d, y, dreg, yreg0, yreg1, trimming=0.01, nfold=5, method=\"regression\") {\n", - " yhat0 <- rep(0, length(y))\n", - " yhat1 <- rep(0, length(y))\n", - " Dhat <- rep(0, length(d))\n", - "\n", - " nobs <- nrow(x) #number of observations\n", - " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices\n", - " I <- split(1:nobs, foldid) #split observation indices into folds\n", - " ytil <- dtil <- rep(NA, nobs)\n", - "\n", - " cat(\"fold: \")\n", - " for(b in 1:length(I)){\n", - "\n", - " # define helpful variables\n", - " Dnotb = d[-I[[b]]]\n", - " Xb = X[I[[b]],]\n", - " Xnotb = X[-I[[b]],]\n", - "\n", - " # training dfs subsetted on the -I[[b]] fold\n", - " XD0 = X[-I[[b]],][d[-I[[b]]]==0]\n", - " yD0 = y[-I[[b]]][d[-I[[b]]]==0]\n", - " XD1 = X[-I[[b]],][d[-I[[b]]]==1]\n", - " yD1 = y[-I[[b]]][d[-I[[b]]]==1]\n", - "\n", - " if (method == \"regression\") {\n", - " yfit0 <- yreg0(as.matrix(XD0), yD0)\n", - " yfit1 <- yreg1(as.matrix(XD1), yD1)\n", - " yhat0[I[[b]]] <- predict(yfit0, as.matrix(Xb)) # default is type = \"response\" for glmnet family gaussian\n", - " yhat1[I[[b]]] <- predict(yfit1, as.matrix(Xb))\n", - " } else if (method == \"randomforest\") {\n", - " yfit0 <- yreg0(XD0, yD0)\n", - " yfit1 <- yreg1(XD1, yD1)\n", - " yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = \"response\" for rf\n", - " yhat1[I[[b]]] <- predict(yfit1, Xb)\n", - " } else if (method == \"decisiontrees\") {\n", - " yfit0 <- yreg0(XD0, yD0)\n", - " yfit1 <- yreg1(XD1, yD1)\n", - " yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = \"vector\" for decision\n", - " yhat1[I[[b]]] <- predict(yfit1, Xb)\n", - " } else if (method == \"boostedtrees\") {\n", - " yfit0 <- yreg0(as.data.frame(XD0), yD0)\n", - " yfit1 <- yreg1(as.data.frame(XD1), yD1)\n", - " yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = \"response\" for boosted\n", - " yhat1[I[[b]]] <- predict(yfit1, Xb)\n", - " }\n", - "\n", - " # propensity scores:\n", - " if (method == \"regression\"){\n", - " Dfit_b <- dreg(as.matrix(Xnotb), Dnotb)\n", - " Dhat_b <- predict(Dfit_b, as.matrix(Xb), type=\"response\") # default is type=\"link\" for family binomial!\n", - " } else if (method == \"randomforest\") {\n", - " Dfit_b <- dreg(Xnotb, as.factor(Dnotb))\n", - " Dhat_b <- predict(Dfit_b, Xb, type = \"prob\")[,2]\n", - " } else if (method == \"decisiontrees\") {\n", - " Dfit_b <- dreg(Xnotb, Dnotb)\n", - " Dhat_b <- predict(Dfit_b, Xb)[,2]\n", - " } else if (method == \"boostedtrees\") {\n", - " Dfit_b <- dreg(as.data.frame(Xnotb), Dnotb)\n", - " Dhat_b <- predict(Dfit_b, Xb, type=\"response\")\n", - " }\n", - " Dhat_b <- pmax(pmin(Dhat_b, 1 - trimming), trimming) # trimming so scores are between [trimming, (1-trimming)]\n", - " Dhat[I[[b]]] <- Dhat_b\n", - "\n", - "\n", - " cat(b,\" \")\n", - " }\n", - "\n", - " # Prediction of treatment and outcome for observed instrument\n", - " yhat <- yhat0 * (1 - D) + yhat1 * D\n", - " # residuals\n", - " ytil <- y-yhat\n", - " dtil <- D-Dhat\n", - " # doubly robust quantity for every sample\n", - " drhat <- yhat1 - yhat0 + (y-yhat)* (D/Dhat - (1 - D)/(1 - Dhat))\n", - " coef.est <- mean(drhat)\n", - " vari <- var(drhat)\n", - " se <- sqrt(vari/nrow(X))\n", - " cat(\"point\", coef.est)\n", - " cat(\"se\", se)\n", - " return(list(coef.est = coef.est, se = se, ytil = ytil, dtil = dtil, drhat = drhat, yhat0 = yhat0, yhat1 = yhat1, dhat = Dhat, yhat = yhat))\n", - "}" - ], - "metadata": { - 
"id": "-hCmnqC-N0nS" - }, - "id": "-hCmnqC-N0nS", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "summaryIRM <- function(coef.est, se, ytil, dtil, drhat, name) {\n", - " summary_data <- data.frame(estimate = coef.est, # point estimate\n", - " se = se, # standard error\n", - " lower = coef.est - 1.96 * se, # lower end of 95% confidence interval\n", - " upper = coef.est + 1.96 * se, # upper end of 95% confidence interval\n", - " rmse_y = sqrt(mean(ytil^2)), # res of model that predicts outcome y\n", - " rmse_D = sqrt(mean(dtil^2)), # res of model that predicts treatment D\n", - " accuracy_D = mean(abs(dtil) < 0.5) # binary classification accuracy of model for D\n", - " )\n", - " row.names(summary_data) <- name\n", - " return(summary_data)\n", - "}" - ], - "metadata": { - "id": "bCj1D8_MSg09" - }, - "id": "bCj1D8_MSg09", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#### Repeat analysis in the IRM setting." - ], - "metadata": { - "id": "6mCdfifchkgZ" - }, - "id": "6mCdfifchkgZ" - }, - { - "cell_type": "code", - "source": [ - "# DML with Lasso/Logistic\n", - "set.seed(123)\n", - "cat(sprintf(\"\\nDML with LassoCV/Logistic \\n\"))\n", - "\n", - "dreg.lasso.cv <- function(x,d){cv.glmnet(x, d, family=\"binomial\", alpha=0, nfolds=5)}\n", - "yreg0.lasso.cv <- function(x,y){cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", - "yreg1.lasso.cv <- function(x,y){cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", - "\n", - "DML2.results <- DML2.for.IRM(X, D, y, dreg.lasso.cv, yreg0.lasso.cv, yreg1.lasso.cv, nfold=5) # more folds seems to help stabilize finite sample performance\n", - "sum.lasso.cv <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'LassoCVLogistic')\n", - "tableirm <- data.frame()\n", - "tableirm <- rbind(sum.lasso.cv)\n", - "tableirm\n", - "\n", - "yhat0.lasso <- DML2.results$yhat0\n", - "yhat1.lasso <- DML2.results$yhat1\n", - "dhat.lasso <- DML2.results$dhat\n", - "yhat.lasso <- DML2.results$yhat" - ], - "metadata": { - "id": "AUiHMoNTvo-m" - }, - "id": "AUiHMoNTvo-m", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# DML with Random Forest\n", - "set.seed(123)\n", - "cat(sprintf(\"\\nDML with Random Forest \\n\"))\n", - "\n", - "dreg.rf <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest\n", - "yreg0.rf <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest\n", - "yreg1.rf <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest\n", - "\n", - "\n", - "DML2.results <- DML2.for.IRM(as.matrix(X), D, y, dreg.rf, yreg0.rf, yreg1.rf, nfold=5, method = \"randomforest\")\n", - "sum.rf <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'Random Forest')\n", - "tableirm <- rbind(tableirm, sum.rf)\n", - "tableirm\n", - "\n", - "yhat0.rf <- DML2.results$yhat0\n", - "yhat1.rf <- DML2.results$yhat1\n", - "dhat.rf <- DML2.results$dhat\n", - "yhat.rf <- DML2.results$yhat" - ], - "metadata": { - "id": "JPABXLYyvyqy" - }, - "id": "JPABXLYyvyqy", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# DML with Decision Trees\n", - "set.seed(123)\n", - "cat(sprintf(\"\\nDML with Decision Trees \\n\"))\n", - "\n", - "dreg.tr <- function(x,d){rpart(as.formula(\"D~.\"), cbind(data.frame(D=d),x), method = 
\"class\", minbucket=10, cp = 0.001)}\n", - "yreg0.tr <- function(x,y){rpart(as.formula(\"y~.\"), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)}\n", - "yreg1.tr <- function(x,y){rpart(as.formula(\"y~.\"), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)}\n", - "\n", - "DML2.results <- DML2.for.IRM(X, D, y, dreg.tr, yreg0.tr, yreg1.tr, nfold=5, method = \"decisiontrees\")\n", - "sum.tr <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'Decision Trees')\n", - "tableirm <- rbind(tableirm, sum.tr)\n", - "tableirm\n", - "\n", - "yhat0.tr <- DML2.results$yhat0\n", - "yhat1.tr <- DML2.results$yhat1\n", - "dhat.tr <- DML2.results$dhat\n", - "yhat.tr <- DML2.results$yhat" - ], - "metadata": { - "id": "SukZCfEbvyzC" - }, - "id": "SukZCfEbvyzC", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# DML with Boosted Trees\n", - "set.seed(123)\n", - "cat(sprintf(\"\\nDML with Boosted Trees \\n\"))\n", - "\n", - "# NB: early stopping cannot easily be implemented with gbm\n", - "## set n.trees = best, where best <- gbm.perf(dreg.boost, plot.it = FALSE)\n", - "dreg.boost <- function(x,d){gbm(as.formula(\"D~.\"), cbind(data.frame(D=d),x), distribution= \"bernoulli\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", - "yreg0.boost <- function(x,y){gbm(as.formula(\"y~.\"), cbind(data.frame(y=y),x), distribution= \"gaussian\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", - "yreg1.boost <- function(x,y){gbm(as.formula(\"y~.\"), cbind(data.frame(y=y),x), distribution= \"gaussian\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", - "\n", - "# passing these through regression as type=\"response\", and D should not be factor!\n", - "DML2.results = DML2.for.IRM(X, D, y, dreg.boost, yreg0.boost, yreg1.boost, nfold=5, method = \"boostedtrees\")\n", - "sum.boost <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'Boosted Trees')\n", - "tableirm <- rbind(tableirm, sum.boost)\n", - "tableirm\n", - "\n", - "yhat0.boost <- DML2.results$yhat0\n", - "yhat1.boost <- DML2.results$yhat1\n", - "dhat.boost <- DML2.results$dhat\n", - "yhat.boost <- DML2.results$yhat" - ], - "metadata": { - "id": "bTfgiCabvy6f" - }, - "id": "bTfgiCabvy6f", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Ensembles\n", - "\n", - "# Best\n", - "# We'll look at model that does best for Y overall. Could also use different model for Y0 and Y1\n", - "# Here, the best performance for Y is the random forest and for D the boosted tree\n", - "\n", - "# residuals\n", - "ytil <- y-yhat.rf\n", - "dtil <- D-dhat.boost\n", - "# doubly robust quantity for every sample\n", - "drhat <- yhat1.rf - yhat0.rf + (y-yhat.rf)* (D/dhat.boost - (1 - D)/(1 - dhat.boost))\n", - "coef.est <- mean(drhat)\n", - "vari <- var(drhat)\n", - "se <- sqrt(vari/nrow(X))\n", - "\n", - "sum.best <- summaryIRM(coef.est, se, ytil, dtil, drhat, name = 'Best')\n", - "tableirm <- rbind(tableirm, sum.best)\n", - "tableirm" - ], - "metadata": { - "id": "7rxqwK-R4Z2q" - }, - "id": "7rxqwK-R4Z2q", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Least squares model average\n", - "# We'll look at weights that do best job for Y overall. 
Could also use different weights for Y0 and Y1\n", - "\n", - "ma.dw <- lm(D~dhat.lasso+dhat.rf+dhat.tr+dhat.boost)$coef\n", - "ma.yw <- lm(y~yhat.lasso+yhat.rf+yhat.tr+yhat.boost)$coef\n", - "\n", - "Dhats <- cbind(as.matrix(rep(1,nrow(X))),dhat.lasso,dhat.rf,dhat.tr,dhat.boost)\n", - "Y0s <- cbind(as.matrix(rep(1,nrow(X))),yhat0.lasso,yhat0.rf,yhat0.tr,yhat0.boost)\n", - "Y1s <- cbind(as.matrix(rep(1,nrow(X))),yhat1.lasso,yhat1.rf,yhat1.tr,yhat1.boost)\n", - "\n", - "dhat <- Dhats%*%as.matrix(ma.dw)\n", - "yhat0 <- Y0s%*%as.matrix(ma.yw)\n", - "yhat1 <- Y1s%*%as.matrix(ma.yw)\n", - "\n", - "# Prediction of treatment and outcome for observed instrument\n", - "yhat <- yhat0 * (1 - D) + yhat1 * D\n", - "# residuals\n", - "ytil <- y-yhat\n", - "dtil <- D-dhat\n", - "# doubly robust quantity for every sample\n", - "drhat <- yhat1 - yhat0 + (y-yhat)* (D/dhat - (1 - D)/(1 - dhat))\n", - "coef.est <- mean(drhat)\n", - "vari <- var(drhat)\n", - "se <- sqrt(vari/nrow(X))\n", - "\n", - "sum.ma <- summaryIRM(coef.est, se, ytil, dtil, drhat, name = 'Model Average')\n", - "tableirm <- rbind(tableirm, sum.ma)\n", - "tableirm\n" - ], - "metadata": { - "id": "0-c3NI0fCfqg" - }, - "id": "0-c3NI0fCfqg", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "id": "01de9f24", - "metadata": { - "papermill": { - "duration": 0.010725, - "end_time": "2022-04-19T09:06:51.098483", - "exception": false, - "start_time": "2022-04-19T09:06:51.087758", - "status": "completed" - }, - "tags": [], - "id": "01de9f24" - }, - "source": [ - "## Double ML package" - ] - }, - { - "cell_type": "markdown", - "id": "6cdc366f", - "metadata": { - "papermill": { - "duration": 0.010679, - "end_time": "2022-04-19T09:06:51.119780", - "exception": false, - "start_time": "2022-04-19T09:06:51.109101", - "status": "completed" - }, - "tags": [], - "id": "6cdc366f" - }, - "source": [ - "We are interested in valid estimators of the average treatment effect of `e401` and `p401` on `net_tfa`. There exist nice packages out there that can help us do our estimation with the simple call of a function. Such packages include `EconML` (Python) and `DoubleML` (Python and R).\n", - "\n", - "We run through PLR and IRM using `DoubleML` below to illustrate. The `DoubleML` package internally builds on `mlr3`. We use the meta package `mlr3` to generate predictions with machine learning methods. A comprehensive introduction and description of the `mlr3` package is provided in the [mlr3book](https://mlr3book.mlr-org.com/). A list of all learners that you can use in `mlr3` can be found [here](https://mlr3extralearners.mlr-org.com/articles/learners/list_learners.html). The entry in the columns *mlr3 Package* and *Packages* indicate which packages must be installed/loaded in your R session.\n", - "\n", - "You find additional information about `DoubleML` on the package on the package website https://docs.doubleml.org/ and the R documentation page https://docs.doubleml.org/r/stable/." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2846a36a", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:06:51.144230Z", - "iopub.status.busy": "2022-04-19T09:06:51.142682Z", - "iopub.status.idle": "2022-04-19T09:07:11.366508Z", - "shell.execute_reply": "2022-04-19T09:07:11.364676Z" - }, - "papermill": { - "duration": 20.239271, - "end_time": "2022-04-19T09:07:11.369618", - "exception": false, - "start_time": "2022-04-19T09:06:51.130347", - "status": "completed" - }, - "tags": [], - "id": "2846a36a" - }, - "outputs": [], - "source": [ - "install.packages(\"DoubleML\")\n", - "install.packages(\"mlr3learners\")\n", - "install.packages(\"mlr3\")\n", - "install.packages(\"data.table\")\n", - "install.packages(\"randomForest\")\n", - "install.packages(\"ranger\")\n", - "\n", - "library(DoubleML)\n", - "library(mlr3learners)\n", - "library(mlr3)\n", - "library(data.table)\n", - "library(randomForest)\n", - "library(ranger)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a141248", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:07:12.009260Z", - "iopub.status.busy": "2022-04-19T09:07:12.007545Z", - "iopub.status.idle": "2022-04-19T09:07:12.092289Z", - "shell.execute_reply": "2022-04-19T09:07:12.090545Z" - }, - "papermill": { - "duration": 0.100382, - "end_time": "2022-04-19T09:07:12.094585", - "exception": false, - "start_time": "2022-04-19T09:07:11.994203", - "status": "completed" - }, - "tags": [], - "id": "2a141248" - }, - "outputs": [], - "source": [ - "# Constructing the data (as DoubleMLData)\n", - "formula_flex = \"net_tfa ~ e401 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + marr + twoearn + db + pira + hown\"\n", - "model_flex = as.data.table(model.frame(formula_flex, pension))\n", - "x_cols = colnames(model_flex)[-c(1,2)]\n", - "data_ml = DoubleMLData$new(model_flex, y_col = \"net_tfa\", d_cols = \"e401\", x_cols=x_cols)\n", - "\n", - "\n", - "p <- dim(model_flex)[2]-2\n", - "p\n", - "\n", - "# complex model with two-way interactions\n", - "#data_interactions = fetch_401k(polynomial_features = TRUE, instrument = FALSE)\n" - ] - }, - { - "cell_type": "markdown", - "id": "2e1c9339", - "metadata": { - "papermill": { - "duration": 0.010825, - "end_time": "2022-04-19T09:07:11.938062", - "exception": false, - "start_time": "2022-04-19T09:07:11.927237", - "status": "completed" - }, - "tags": [], - "id": "2e1c9339" - }, - "source": [ - "As mentioned, in the tutorial we use the meta package `mlr3` to generate predictions with machine learning methods. A comprehensive introduction and description of the `mlr3` package is provided in the [mlr3book](https://mlr3book.mlr-org.com/). A list of all learners that you can use in `mlr3` can be found [here](https://mlr3extralearners.mlr-org.com/articles/learners/list_learners.html). The entry in the columns *mlr3 Package* and *Packages* indicate which packages must be installed/loaded in your R session." 
- ] - }, - { - "cell_type": "markdown", - "source": [ - "## Partially Linear Regression Models (PLR)" - ], - "metadata": { - "id": "Cwmd7ELXKeIg" - }, - "id": "Cwmd7ELXKeIg" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a48e367d", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:07:12.188025Z", - "iopub.status.busy": "2022-04-19T09:07:12.186501Z", - "iopub.status.idle": "2022-04-19T09:07:34.643694Z", - "shell.execute_reply": "2022-04-19T09:07:34.641926Z" - }, - "papermill": { - "duration": 22.473331, - "end_time": "2022-04-19T09:07:34.646865", - "exception": false, - "start_time": "2022-04-19T09:07:12.173534", - "status": "completed" - }, - "tags": [], - "id": "a48e367d" - }, - "outputs": [], - "source": [ - "# Estimating the PLR\n", - "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", - "lasso <- lrn(\"regr.cv_glmnet\",nfolds = 5, s = \"lambda.min\")\n", - "lasso_class <- lrn(\"classif.cv_glmnet\", nfolds = 5, s = \"lambda.min\")\n", - "\n", - "dml_plr <- DoubleMLPLR$new(data_ml, ml_l = lasso, ml_m = lasso_class, n_folds=5)\n", - "dml_plr$fit(store_predictions=TRUE)\n", - "dml_plr$summary()\n", - "lasso_plr <- dml_plr$coef\n", - "lasso_std_plr <- dml_plr$se" - ] - }, - { - "cell_type": "markdown", - "id": "135275dc", - "metadata": { - "papermill": { - "duration": 0.011132, - "end_time": "2022-04-19T09:07:34.670166", - "exception": false, - "start_time": "2022-04-19T09:07:34.659034", - "status": "completed" - }, - "tags": [], - "id": "135275dc" - }, - "source": [ - "Let us check the predictive performance of this model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6d83bbe", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:07:34.695782Z", - "iopub.status.busy": "2022-04-19T09:07:34.694101Z", - "iopub.status.idle": "2022-04-19T09:07:34.716746Z", - "shell.execute_reply": "2022-04-19T09:07:34.714842Z" - }, - "papermill": { - "duration": 0.038389, - "end_time": "2022-04-19T09:07:34.719637", - "exception": false, - "start_time": "2022-04-19T09:07:34.681248", - "status": "completed" - }, - "tags": [], - "id": "e6d83bbe" - }, - "outputs": [], - "source": [ - "dml_plr$params_names()\n", - "g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o\n", - "m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "32c894fa", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:07:34.745097Z", - "iopub.status.busy": "2022-04-19T09:07:34.743570Z", - "iopub.status.idle": "2022-04-19T09:07:34.771488Z", - "shell.execute_reply": "2022-04-19T09:07:34.769302Z" - }, - "papermill": { - "duration": 0.043342, - "end_time": "2022-04-19T09:07:34.774113", - "exception": false, - "start_time": "2022-04-19T09:07:34.730771", - "status": "completed" - }, - "tags": [], - "id": "32c894fa" - }, - "outputs": [], - "source": [ - "# cross-fitted RMSE: outcome\n", - "y <- as.matrix(pension$net_tfa) # true observations\n", - "theta <- as.numeric(dml_plr$coef) # estimated regression coefficient\n", - "d <- as.matrix(pension$e401)\n", - "predictions_y <- as.matrix(d*theta)+g_hat # predictions for y\n", - "lasso_y_rmse <- sqrt(mean((y-predictions_y)^2))\n", - "lasso_y_rmse" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "da5b9334", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:07:34.800143Z", - "iopub.status.busy": "2022-04-19T09:07:34.798474Z", - "iopub.status.idle": 
"2022-04-19T09:07:34.826393Z", - "shell.execute_reply": "2022-04-19T09:07:34.824605Z" - }, - "papermill": { - "duration": 0.04333, - "end_time": "2022-04-19T09:07:34.828718", - "exception": false, - "start_time": "2022-04-19T09:07:34.785388", - "status": "completed" - }, - "tags": [], - "id": "da5b9334" - }, - "outputs": [], - "source": [ - "# cross-fitted RMSE: treatment\n", - "d <- as.matrix(pension$e401)\n", - "lasso_d_rmse <- sqrt(mean((d-m_hat)^2))\n", - "lasso_d_rmse\n", - "\n", - "# cross-fitted ce: treatment\n", - "mean(ifelse(m_hat > 0.5, 1, 0) != d)" - ] - }, - { - "cell_type": "markdown", - "id": "c1481527", - "metadata": { - "papermill": { - "duration": 0.011351, - "end_time": "2022-04-19T09:07:34.851558", - "exception": false, - "start_time": "2022-04-19T09:07:34.840207", - "status": "completed" - }, - "tags": [], - "id": "c1481527" - }, - "source": [ - "Then, we repeat this procedure for various machine learning methods." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dac2d0fc", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:07:34.878296Z", - "iopub.status.busy": "2022-04-19T09:07:34.876743Z", - "iopub.status.idle": "2022-04-19T09:08:35.899658Z", - "shell.execute_reply": "2022-04-19T09:08:35.896930Z" - }, - "papermill": { - "duration": 61.046861, - "end_time": "2022-04-19T09:08:35.910116", - "exception": false, - "start_time": "2022-04-19T09:07:34.863255", - "status": "completed" - }, - "tags": [], - "id": "dac2d0fc" - }, - "outputs": [], - "source": [ - "# Random Forest\n", - "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", - "randomForest <- lrn(\"regr.ranger\")\n", - "randomForest_class <- lrn(\"classif.ranger\")\n", - "\n", - "dml_plr <- DoubleMLPLR$new(data_ml, ml_l = randomForest, ml_m = randomForest_class, n_folds=5)\n", - "dml_plr$fit(store_predictions=TRUE) # set store_predictions=TRUE to evaluate the model\n", - "dml_plr$summary()\n", - "forest_plr <- dml_plr$coef\n", - "forest_std_plr <- dml_plr$se" - ] - }, - { - "cell_type": "markdown", - "id": "c7c614e6", - "metadata": { - "papermill": { - "duration": 0.011382, - "end_time": "2022-04-19T09:08:35.932891", - "exception": false, - "start_time": "2022-04-19T09:08:35.921509", - "status": "completed" - }, - "tags": [], - "id": "c7c614e6" - }, - "source": [ - "We can compare the accuracy of this model to the model that has been estimated with lasso." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f8af1a74", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:08:35.993655Z", - "iopub.status.busy": "2022-04-19T09:08:35.992094Z", - "iopub.status.idle": "2022-04-19T09:08:36.034450Z", - "shell.execute_reply": "2022-04-19T09:08:36.032502Z" - }, - "papermill": { - "duration": 0.092847, - "end_time": "2022-04-19T09:08:36.037154", - "exception": false, - "start_time": "2022-04-19T09:08:35.944307", - "status": "completed" - }, - "tags": [], - "id": "f8af1a74" - }, - "outputs": [], - "source": [ - "# Evaluation predictions\n", - "g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o\n", - "m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o\n", - "theta <- as.numeric(dml_plr$coef) # estimated regression coefficient\n", - "predictions_y <- as.matrix(d*theta)+g_hat # predictions for y\n", - "forest_y_rmse <- sqrt(mean((y-predictions_y)^2))\n", - "forest_y_rmse\n", - "\n", - "# cross-fitted RMSE: treatment\n", - "forest_d_rmse <- sqrt(mean((d-m_hat)^2))\n", - "forest_d_rmse\n", - "\n", - "# cross-fitted ce: treatment\n", - "mean(ifelse(m_hat > 0.5, 1, 0) != d)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "61a94dff", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:08:36.064976Z", - "iopub.status.busy": "2022-04-19T09:08:36.063494Z", - "iopub.status.idle": "2022-04-19T09:08:37.268375Z", - "shell.execute_reply": "2022-04-19T09:08:37.266452Z" - }, - "papermill": { - "duration": 1.221303, - "end_time": "2022-04-19T09:08:37.271202", - "exception": false, - "start_time": "2022-04-19T09:08:36.049899", - "status": "completed" - }, - "tags": [], - "id": "61a94dff" - }, - "outputs": [], - "source": [ - "# Trees\n", - "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", - "\n", - "trees <- lrn(\"regr.rpart\")\n", - "trees_class <- lrn(\"classif.rpart\")\n", - "\n", - "dml_plr <- DoubleMLPLR$new(data_ml, ml_l = trees, ml_m = trees_class, n_folds=5)\n", - "dml_plr$fit(store_predictions=TRUE)\n", - "dml_plr$summary()\n", - "tree_plr <- dml_plr$coef\n", - "tree_std_plr <- dml_plr$se\n", - "\n", - "# Evaluation predictions\n", - "g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o\n", - "m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o\n", - "theta <- as.numeric(dml_plr$coef) # estimated regression coefficient\n", - "predictions_y <- as.matrix(d*theta)+g_hat # predictions for y\n", - "tree_y_rmse <- sqrt(mean((y-predictions_y)^2))\n", - "tree_y_rmse\n", - "\n", - "# cross-fitted RMSE: treatment\n", - "tree_d_rmse <- sqrt(mean((d-m_hat)^2))\n", - "tree_d_rmse\n", - "\n", - "# cross-fitted ce: treatment\n", - "mean(ifelse(m_hat > 0.5, 1, 0) != d)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "885c94eb", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:08:37.299832Z", - "iopub.status.busy": "2022-04-19T09:08:37.297991Z", - "iopub.status.idle": "2022-04-19T09:09:58.069060Z", - "shell.execute_reply": "2022-04-19T09:09:58.066595Z" - }, - "papermill": { - "duration": 80.788944, - "end_time": "2022-04-19T09:09:58.072665", - "exception": false, - "start_time": "2022-04-19T09:08:37.283721", - "status": "completed" - }, - "tags": [], - "id": "885c94eb" - }, - "outputs": [], - "source": [ - "# needed to run boosting\n", - "remotes::install_github(\"mlr-org/mlr3extralearners\")\n", - "install.packages(\"mlr3extralearners\")\n", - "install.packages(\"mboost\")\n", - 
"library(mlr3extralearners)\n", - "library(mboost)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0372eefe", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:09:58.106809Z", - "iopub.status.busy": "2022-04-19T09:09:58.105019Z", - "iopub.status.idle": "2022-04-19T09:10:00.492368Z", - "shell.execute_reply": "2022-04-19T09:10:00.490644Z" - }, - "papermill": { - "duration": 2.404791, - "end_time": "2022-04-19T09:10:00.494687", - "exception": false, - "start_time": "2022-04-19T09:09:58.089896", - "status": "completed" - }, - "tags": [], - "id": "0372eefe" - }, - "outputs": [], - "source": [ - "# Boosting\n", - "boost<- lrn(\"regr.glmboost\")\n", - "boost_class <- lrn(\"classif.glmboost\")\n", - "\n", - "dml_plr <- DoubleMLPLR$new(data_ml, ml_l = boost, ml_m = boost_class, n_folds=5)\n", - "dml_plr$fit(store_predictions=TRUE)\n", - "dml_plr$summary()\n", - "boost_plr <- dml_plr$coef\n", - "boost_std_plr <- dml_plr$se\n", - "\n", - "# Evaluation predictions\n", - "g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o\n", - "m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o\n", - "theta <- as.numeric(dml_plr$coef) # estimated regression coefficient\n", - "predictions_y <- as.matrix(d*theta)+g_hat # predictions for y\n", - "boost_y_rmse <- sqrt(mean((y-predictions_y)^2))\n", - "boost_y_rmse\n", - "\n", - "# cross-fitted RMSE: treatment\n", - "boost_d_rmse <- sqrt(mean((d-m_hat)^2))\n", - "boost_d_rmse\n", - "\n", - "# cross-fitted ce: treatment\n", - "mean(ifelse(m_hat > 0.5, 1, 0) != d)" - ] - }, - { - "cell_type": "markdown", - "id": "ffa1e35a", - "metadata": { - "papermill": { - "duration": 0.013161, - "end_time": "2022-04-19T09:10:00.521404", - "exception": false, - "start_time": "2022-04-19T09:10:00.508243", - "status": "completed" - }, - "tags": [], - "id": "ffa1e35a" - }, - "source": [ - "Let's sum up the results:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d322c48a", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:10:00.551222Z", - "iopub.status.busy": "2022-04-19T09:10:00.549553Z", - "iopub.status.idle": "2022-04-19T09:10:00.613728Z", - "shell.execute_reply": "2022-04-19T09:10:00.611934Z" - }, - "papermill": { - "duration": 0.081341, - "end_time": "2022-04-19T09:10:00.615953", - "exception": false, - "start_time": "2022-04-19T09:10:00.534612", - "status": "completed" - }, - "tags": [], - "id": "d322c48a" - }, - "outputs": [], - "source": [ - "table <- matrix(0, 4, 4)\n", - "table[1,1:4] <- c(lasso_plr,forest_plr,tree_plr,boost_plr)\n", - "table[2,1:4] <- c(lasso_std_plr,forest_std_plr,tree_std_plr,boost_std_plr)\n", - "table[3,1:4] <- c(lasso_y_rmse,forest_y_rmse,tree_y_rmse,boost_y_rmse)\n", - "table[4,1:4] <- c(lasso_d_rmse,forest_d_rmse,tree_d_rmse,boost_d_rmse)\n", - "rownames(table) <- c(\"Estimate\",\"Std.Error\",\"RMSE Y\",\"RMSE D\")\n", - "colnames(table) <- c(\"Lasso\",\"Random Forest\",\"Trees\",\"Boosting\")\n", - "tab<- xtable(table, digits = 2)\n", - "tab" - ] - }, - { - "cell_type": "markdown", - "id": "e8e9ffc8", - "metadata": { - "papermill": { - "duration": 0.013424, - "end_time": "2022-04-19T09:10:00.642931", - "exception": false, - "start_time": "2022-04-19T09:10:00.629507", - "status": "completed" - }, - "tags": [], - "id": "e8e9ffc8" - }, - "source": [ - "The best model with lowest RMSE is the PLR model estimated via lasso (or boosting based on the RSME Y). 
It gives the following estimate:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "33fcc2b4", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:10:00.673204Z", - "iopub.status.busy": "2022-04-19T09:10:00.671479Z", - "iopub.status.idle": "2022-04-19T09:10:00.688330Z", - "shell.execute_reply": "2022-04-19T09:10:00.686511Z" - }, - "papermill": { - "duration": 0.034272, - "end_time": "2022-04-19T09:10:00.690621", - "exception": false, - "start_time": "2022-04-19T09:10:00.656349", - "status": "completed" - }, - "tags": [], - "id": "33fcc2b4" - }, - "outputs": [], - "source": [ - "lasso_plr" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Interactive Regression Model (IRM)" - ], - "metadata": { - "id": "Ebrv1spfKWxH" - }, - "id": "Ebrv1spfKWxH" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9a7410a9", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:10:00.830326Z", - "iopub.status.busy": "2022-04-19T09:10:00.828843Z", - "iopub.status.idle": "2022-04-19T09:10:18.931564Z", - "shell.execute_reply": "2022-04-19T09:10:18.929808Z" - }, - "papermill": { - "duration": 18.121031, - "end_time": "2022-04-19T09:10:18.934550", - "exception": false, - "start_time": "2022-04-19T09:10:00.813519", - "status": "completed" - }, - "tags": [], - "id": "9a7410a9" - }, - "outputs": [], - "source": [ - "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", - "dml_irm = DoubleMLIRM$new(data_ml, ml_g = lasso,\n", - " ml_m = lasso_class,\n", - " trimming_threshold = 0.01, n_folds=5)\n", - "dml_irm$fit(store_predictions=TRUE)\n", - "dml_irm$summary()\n", - "lasso_irm <- dml_irm$coef\n", - "lasso_std_irm <- dml_irm$se\n", - "\n", - "\n", - "# predictions\n", - "dml_irm$params_names()\n", - "g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X)\n", - "g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X)\n", - "g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0\n", - "m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1a34a9e8", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:10:18.965991Z", - "iopub.status.busy": "2022-04-19T09:10:18.964496Z", - "iopub.status.idle": "2022-04-19T09:10:18.999103Z", - "shell.execute_reply": "2022-04-19T09:10:18.997500Z" - }, - "papermill": { - "duration": 0.052429, - "end_time": "2022-04-19T09:10:19.001172", - "exception": false, - "start_time": "2022-04-19T09:10:18.948743", - "status": "completed" - }, - "tags": [], - "id": "1a34a9e8" - }, - "outputs": [], - "source": [ - "# cross-fitted RMSE: outcome\n", - "y <- as.matrix(pension$net_tfa) # true observations\n", - "d <- as.matrix(pension$e401)\n", - "lasso_y_irm <- sqrt(mean((y-g_hat)^2))\n", - "lasso_y_irm\n", - "\n", - "# cross-fitted RMSE: treatment\n", - "lasso_d_irm <- sqrt(mean((d-m_hat)^2))\n", - "lasso_d_irm\n", - "\n", - "# cross-fitted ce: treatment\n", - "mean(ifelse(m_hat > 0.5, 1, 0) != d)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d0c93355", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:10:19.032688Z", - "iopub.status.busy": "2022-04-19T09:10:19.031200Z", - "iopub.status.idle": "2022-04-19T09:11:17.780954Z", - "shell.execute_reply": "2022-04-19T09:11:17.777618Z" - }, - "papermill": { - "duration": 58.769131, - "end_time": "2022-04-19T09:11:17.784652", - "exception": false, - "start_time": 
"2022-04-19T09:10:19.015521", - "status": "completed" - }, - "tags": [], - "id": "d0c93355" - }, - "outputs": [], - "source": [ - "##### forest #####\n", - "\n", - "dml_irm = DoubleMLIRM$new(data_ml, ml_g = randomForest,\n", - " ml_m = randomForest_class,\n", - " trimming_threshold = 0.01, n_folds=5)\n", - "dml_irm$fit(store_predictions=TRUE)\n", - "dml_irm$summary()\n", - "forest_irm <- dml_irm$coef\n", - "forest_std_irm <- dml_plr$se\n", - "\n", - "# predictions\n", - "g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X)\n", - "g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X)\n", - "g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0\n", - "m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_0\n", - "\n", - "# cross-fitted RMSE: outcome\n", - "y <- as.matrix(pension$net_tfa) # true observations\n", - "d <- as.matrix(pension$e401)\n", - "forest_y_irm <- sqrt(mean((y-g_hat)^2))\n", - "forest_y_irm\n", - "\n", - "# cross-fitted RMSE: treatment\n", - "forest_d_irm <- sqrt(mean((d-m_hat)^2))\n", - "forest_d_irm\n", - "\n", - "# cross-fitted ce: treatment\n", - "mean(ifelse(m_hat > 0.5, 1, 0) != d)\n", - "\n", - "##### trees #####\n", - "\n", - "dml_irm <- DoubleMLIRM$new(data_ml, ml_g = trees, ml_m = trees_class,\n", - " trimming_threshold = 0.01, n_folds=5)\n", - "dml_irm$fit(store_predictions=TRUE)\n", - "dml_irm$summary()\n", - "tree_irm <- dml_irm$coef\n", - "tree_std_irm <- dml_irm$se\n", - "\n", - "# predictions\n", - "g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X)\n", - "g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X)\n", - "g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0\n", - "m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o\n", - "\n", - "# cross-fitted RMSE: outcome\n", - "y <- as.matrix(pension$net_tfa) # true observations\n", - "d <- as.matrix(pension$e401)\n", - "tree_y_irm <- sqrt(mean((y-g_hat)^2))\n", - "tree_y_irm\n", - "\n", - "# cross-fitted RMSE: treatment\n", - "tree_d_irm <- sqrt(mean((d-m_hat)^2))\n", - "tree_d_irm\n", - "\n", - "# cross-fitted ce: treatment\n", - "mean(ifelse(m_hat > 0.5, 1, 0) != d)\n", - "\n", - "\n", - "##### boosting #####\n", - "\n", - "dml_irm <- DoubleMLIRM$new(data_ml, ml_g = boost, ml_m = boost_class,\n", - " trimming_threshold = 0.01, n_folds=5)\n", - "dml_irm$fit(store_predictions=TRUE)\n", - "dml_irm$summary()\n", - "boost_irm <- dml_irm$coef\n", - "boost_std_irm <- dml_irm$se\n", - "\n", - "# predictions\n", - "g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X)\n", - "g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X)\n", - "g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0\n", - "m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o\n", - "\n", - "# cross-fitted RMSE: outcome\n", - "y <- as.matrix(pension$net_tfa) # true observations\n", - "d <- as.matrix(pension$e401)\n", - "boost_y_irm <- sqrt(mean((y-g_hat)^2))\n", - "boost_y_irm\n", - "\n", - "# cross-fitted RMSE: treatment\n", - "boost_d_irm <- sqrt(mean((d-m_hat)^2))\n", - "boost_d_irm\n", - "\n", - "# cross-fitted ce: treatment\n", - "mean(ifelse(m_hat > 0.5, 1, 0) != d)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bf344442", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:11:17.820803Z", - "iopub.status.busy": "2022-04-19T09:11:17.819244Z", - "iopub.status.idle": "2022-04-19T09:11:17.858729Z", - "shell.execute_reply": 
"2022-04-19T09:11:17.856801Z" - }, - "papermill": { - "duration": 0.059911, - "end_time": "2022-04-19T09:11:17.861698", - "exception": false, - "start_time": "2022-04-19T09:11:17.801787", - "status": "completed" - }, - "tags": [], - "id": "bf344442" - }, - "outputs": [], - "source": [ - "table <- matrix(0, 4, 4)\n", - "table[1,1:4] <- c(lasso_irm,forest_irm,tree_irm,boost_irm)\n", - "table[2,1:4] <- c(lasso_std_irm,forest_std_irm,tree_std_irm,boost_std_irm)\n", - "table[3,1:4] <- c(lasso_y_irm,forest_y_irm,tree_y_irm,boost_y_irm)\n", - "table[4,1:4] <- c(lasso_d_irm,forest_d_irm,tree_d_irm,boost_d_irm)\n", - "rownames(table) <- c(\"Estimate\",\"Std.Error\",\"RMSE Y\",\"RMSE D\")\n", - "colnames(table) <- c(\"Lasso\",\"Random Forest\",\"Trees\",\"Boosting\")\n", - "tab<- xtable(table, digits = 2)\n", - "tab" - ] - }, - { - "cell_type": "markdown", - "id": "cddc45ff", - "metadata": { - "papermill": { - "duration": 0.015454, - "end_time": "2022-04-19T09:11:17.892511", - "exception": false, - "start_time": "2022-04-19T09:11:17.877057", - "status": "completed" - }, - "tags": [], - "id": "cddc45ff" - }, - "source": [ - "Here, Random Forest gives the best prediction rule for $g_0$ and Lasso the best prediction rule for $m_0$, respectively. Let us fit the IRM model using the best ML method for each equation to get a final estimate for the treatment effect of eligibility." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9d4b8690", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-19T09:11:17.926513Z", - "iopub.status.busy": "2022-04-19T09:11:17.924857Z", - "iopub.status.idle": "2022-04-19T09:12:00.841438Z", - "shell.execute_reply": "2022-04-19T09:12:00.838577Z" - }, - "papermill": { - "duration": 42.950051, - "end_time": "2022-04-19T09:12:00.858025", - "exception": false, - "start_time": "2022-04-19T09:11:17.907974", - "status": "completed" - }, - "tags": [], - "id": "9d4b8690" - }, - "outputs": [], - "source": [ - "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", - "dml_irm = DoubleMLIRM$new(data_ml, ml_g = randomForest,\n", - " ml_m = lasso_class,\n", - " trimming_threshold = 0.01, n_folds=5)\n", - "dml_irm$fit(store_predictions=TRUE)\n", - "dml_irm$summary()\n", - "best_irm <- dml_irm$coef\n", - "best_std_irm <- dml_irm$se" - ] - }, - { - "cell_type": "markdown", - "id": "92a77dd6", - "metadata": { - "papermill": { - "duration": 0.015461, - "end_time": "2022-04-19T09:12:00.888702", - "exception": false, - "start_time": "2022-04-19T09:12:00.873241", - "status": "completed" - }, - "tags": [], - "id": "92a77dd6" - }, - "source": [ - "These estimates that flexibly account for confounding are\n", - "substantially attenuated relative to the baseline estimate (*19559*) that does not account for confounding. They suggest much smaller causal effects of 401(k) eligiblity on financial asset holdings." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.0.5" - }, + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": { + "id": "f02fa044", "papermill": { - "default_parameters": {}, - "duration": 427.936706, - "end_time": "2022-04-19T09:13:53.230849", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2022-04-19T09:06:45.294143", - "version": "2.3.4" - }, - "colab": { - "provenance": [] - } + "duration": 0.012988, + "end_time": "2022-04-19T09:06:48.772902", + "exception": false, + "start_time": "2022-04-19T09:06:48.759914", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Inference on Predictive and Causal Effects in High-Dimensional Nonlinear Models" + ] + }, + { + "cell_type": "markdown", + "id": "1", + "metadata": { + "id": "23154404", + "papermill": { + "duration": 0.009437, + "end_time": "2022-04-19T09:06:48.791895", + "exception": false, + "start_time": "2022-04-19T09:06:48.782458", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Impact of 401(k) on Financial Wealth\n", + "\n", + "As a practical illustration of the methods developed in this lecture, we consider estimation of the effect of 401(k) eligibility and participation\n", + "on accumulated assets. 401(k) plans are pension accounts sponsored by employers. The key problem in determining the effect of participation in 401(k) plans on accumulated assets is saver heterogeneity coupled with the fact that the decision to enroll in a 401(k) is non-random. It is generally recognized that some people have a higher preference for saving than others. It also seems likely that those individuals with high unobserved preference for saving would be most likely to choose to participate in tax-advantaged retirement savings plans and would tend to have otherwise high amounts of accumulated assets. The presence of unobserved savings preferences with these properties then implies that conventional estimates that do not account for saver heterogeneity and endogeneity of participation will be biased upward, tending to overstate the savings effects of 401(k) participation.\n", + "\n", + "One can argue that eligibility for enrolling in a 401(k) plan in this data can be taken as exogenous after conditioning on a few observables of which the most important for their argument is income. The basic idea is that, at least around the time 401(k)’s initially became available, people were unlikely to be basing their employment decisions on whether an employer offered a 401(k) but would instead focus on income and other aspects of the job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": { + "id": "KmAkbDiVE7wm" + }, + "outputs": [], + "source": [ + "install.packages(\"xtable\")\n", + "install.packages(\"hdm\")\n", + "install.packages(\"sandwich\")\n", + "install.packages(\"ggplot2\")\n", + "install.packages(\"randomForest\")\n", + "install.packages(\"glmnet\")\n", + "install.packages(\"rpart\")\n", + "install.packages(\"gbm\")\n", + "\n", + "library(xtable)\n", + "library(hdm)\n", + "library(sandwich)\n", + "library(ggplot2)\n", + "library(randomForest)\n", + "library(data.table)\n", + "library(glmnet)\n", + "library(rpart)\n", + "library(gbm)" + ] + }, + { + "cell_type": "markdown", + "id": "3", + "metadata": { + "id": "7e23cba0", + "papermill": { + "duration": 0.009588, + "end_time": "2022-04-19T09:06:48.810853", + "exception": false, + "start_time": "2022-04-19T09:06:48.801265", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "### Data\n", + "\n", + "The raw dataset can be found [here](https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/401k.csv).\n", + "The data set can be loaded from the `hdm` package for R directly by typing:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4", + "metadata": { + "id": "c442abdc", + "papermill": { + "duration": 0.46397, + "end_time": "2022-04-19T09:06:49.283933", + "exception": false, + "start_time": "2022-04-19T09:06:48.819963", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "data(pension)\n", + "data <- pension\n", + "dim(data)" + ] + }, + { + "cell_type": "markdown", + "id": "5", + "metadata": { + "id": "e47fa9d3", + "papermill": { + "duration": 0.009462, + "end_time": "2022-04-19T09:06:49.302928", + "exception": false, + "start_time": "2022-04-19T09:06:49.293466", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "See the \"Details\" section on the description of the data set, which can be accessed by\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6", + "metadata": { + "id": "00e04b82", + "papermill": { + "duration": 0.35227, + "end_time": "2022-04-19T09:06:49.664810", + "exception": false, + "start_time": "2022-04-19T09:06:49.312540", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "help(pension)" + ] + }, + { + "cell_type": "markdown", + "id": "7", + "metadata": { + "id": "24b41e4a", + "papermill": { + "duration": 0.009357, + "end_time": "2022-04-19T09:06:49.683784", + "exception": false, + "start_time": "2022-04-19T09:06:49.674427", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "The data consist of 9,915 observations at the household level drawn from the 1991 Survey of Income and Program Participation (SIPP). All the variables are referred to 1990. We use net financial assets (*net\\_tfa*) as the outcome variable, $Y$, in our analysis. The net financial assets are computed as the sum of IRA balances, 401(k) balances, checking accounts, saving bonds, other interest-earning accounts, other interest-earning assets, stocks, and mutual funds less non mortgage debts." + ] + }, + { + "cell_type": "markdown", + "id": "8", + "metadata": { + "id": "ed9d4e82", + "papermill": { + "duration": 0.009242, + "end_time": "2022-04-19T09:06:49.702401", + "exception": false, + "start_time": "2022-04-19T09:06:49.693159", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Among the $9915$ individuals, $3682$ are eligible to participate in the program. 
The variable *e401* indicates eligibility and *p401* indicates participation, respectively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9", + "metadata": { + "id": "63519184", + "papermill": { + "duration": 0.618528, + "end_time": "2022-04-19T09:06:50.330218", + "exception": false, + "start_time": "2022-04-19T09:06:49.711690", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "hist_e401 = ggplot(data, aes(x = e401, fill = factor(e401))) + geom_bar()\n", + "hist_e401" + ] + }, + { + "cell_type": "markdown", + "id": "10", + "metadata": { + "id": "823d2628", + "papermill": { + "duration": 0.009686, + "end_time": "2022-04-19T09:06:50.349766", + "exception": false, + "start_time": "2022-04-19T09:06:50.340080", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Eligibility is highly associated with financial wealth:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11", + "metadata": { + "id": "5d8faf9c", + "papermill": { + "duration": 0.554613, + "end_time": "2022-04-19T09:06:50.914133", + "exception": false, + "start_time": "2022-04-19T09:06:50.359520", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dens_net_tfa = ggplot(data, aes(x = net_tfa, color = factor(e401), fill = factor(e401)) ) +\n", + " geom_density() + xlim(c(-20000, 150000)) +\n", + " facet_wrap(.~e401)\n", + "\n", + "dens_net_tfa" + ] + }, + { + "cell_type": "markdown", + "id": "12", + "metadata": { + "id": "0f4f86a7", + "papermill": { + "duration": 0.010335, + "end_time": "2022-04-19T09:06:50.935024", + "exception": false, + "start_time": "2022-04-19T09:06:50.924689", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "The unconditional APE of e401 is about $19559$:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13", + "metadata": { + "id": "836c6af7", + "papermill": { + "duration": 0.038096, + "end_time": "2022-04-19T09:06:50.983602", + "exception": false, + "start_time": "2022-04-19T09:06:50.945506", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "e1 <- data[data$e401==1,]\n", + "e0 <- data[data$e401==0,]\n", + "round(mean(e1$net_tfa)-mean(e0$net_tfa),0)" + ] + }, + { + "cell_type": "markdown", + "id": "14", + "metadata": { + "id": "22b09926", + "papermill": { + "duration": 0.01047, + "end_time": "2022-04-19T09:06:51.004618", + "exception": false, + "start_time": "2022-04-19T09:06:50.994148", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Among the $3682$ individuals that are eligible, $2594$ decided to participate in the program. 
The unconditional APE of p401 is about $27372$:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15", + "metadata": { + "id": "e78aaa58", + "papermill": { + "duration": 0.039305, + "end_time": "2022-04-19T09:06:51.054616", + "exception": false, + "start_time": "2022-04-19T09:06:51.015311", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "p1 <- data[data$p401==1,]\n", + "p0 <- data[data$p401==0,]\n", + "round(mean(p1$net_tfa)-mean(p0$net_tfa),0)" + ] + }, + { + "cell_type": "markdown", + "id": "16", + "metadata": { + "id": "e0af3c81", + "papermill": { + "duration": 0.010831, + "end_time": "2022-04-19T09:06:51.076114", + "exception": false, + "start_time": "2022-04-19T09:06:51.065283", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "As discussed, these estimates are biased since they do not account for saver heterogeneity and endogeneity of participation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17", + "metadata": { + "id": "1hBrSMQGzZBR" + }, + "outputs": [], + "source": [ + "# outcome variable\n", + "y <- data[,'net_tfa']\n", + "# treatment variable\n", + "D <- data[,'e401']\n", + "D2 <- data[,\"p401\"]\n", + "D3 <- data[,\"a401\"]\n", + "\n", + "columns_to_drop <- c('e401', 'p401', 'a401', 'tw', 'tfa', 'net_tfa', 'tfa_he',\n", + " 'hval', 'hmort', 'hequity',\n", + " 'nifa', 'net_nifa', 'net_n401', 'ira',\n", + " 'dum91', 'icat', 'ecat', 'zhat',\n", + " 'i1', 'i2', 'i3', 'i4', 'i5', 'i6', 'i7',\n", + " 'a1', 'a2', 'a3', 'a4', 'a5')\n", + "\n", + "# covariates\n", + "X <- data[, !(names(data) %in% columns_to_drop)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18", + "metadata": { + "id": "DD0Hwcb6z4u5" + }, + "outputs": [], + "source": [ + "# Constructing the controls\n", + "X_formula = \"~ 0 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown\"\n", + "X = as.data.table(model.frame(X_formula, pension))\n", + "head(X)" + ] + }, + { + "cell_type": "markdown", + "id": "19", + "metadata": { + "id": "MZThhulbKA9W" + }, + "source": [ + "## Estimating the ATE of 401(k) Eligibility on Net Financial Assets" + ] + }, + { + "cell_type": "markdown", + "id": "20", + "metadata": { + "id": "UuYqY89D0pvs" + }, + "source": [ + "We are interested in valid estimators of the average treatment effect of `e401` and `p401` on `net_tfa`. 
We start using ML approaches to estimate the function $g_0$ and $m_0$ in the following PLR model:" + ] + }, + { + "cell_type": "markdown", + "id": "21", + "metadata": { + "id": "vEAeB2ih0r8B" + }, + "source": [ + "\\begin{eqnarray}\n", + " & Y = D\\theta_0 + g_0(X) + \\zeta, & E[\\zeta \\mid D,X]= 0,\\\\\n", + " & D = m_0(X) + V, & E[V \\mid X] = 0.\n", + "\\end{eqnarray}" + ] + }, + { + "cell_type": "markdown", + "id": "22", + "metadata": { + "id": "cde447aa", + "papermill": { + "duration": 0.011129, + "end_time": "2022-04-19T09:07:12.117442", + "exception": false, + "start_time": "2022-04-19T09:07:12.106313", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Partially Linear Regression Models (PLR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23", + "metadata": { + "id": "tqFlcClUNr9Z" + }, + "outputs": [], + "source": [ + "DML2.for.PLM <- function(x, d, y, dreg, yreg, nfold=3, method = \"regression\") {\n", + " nobs <- nrow(x) #number of observations\n", + " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices\n", + " I <- split(1:nobs, foldid) #split observation indices into folds\n", + " ytil <- dtil <- rep(NA, nobs)\n", + " cat(\"fold: \")\n", + " for(b in 1:length(I)){\n", + "\n", + " if (method == \"regression\") {\n", + " dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out\n", + " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a foldt out\n", + " dhat <- predict(dfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", + " yhat <- predict(yfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", + " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold\n", + " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold\n", + " } else if (method == \"randomforest\") {\n", + " dfit <- dreg(x[-I[[b]],], as.factor(d)[-I[[b]]]) #take a fold out\n", + " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a fold out\n", + " dhat <- predict(dfit, x[I[[b]],], type=\"prob\")[,2] #predict the left-out fold\n", + " yhat <- predict(yfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", + " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold\n", + " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold\n", + " } else if (method == \"decisiontrees\") {\n", + " dfit <- dreg(x[-I[[b]],], as.factor(d)[-I[[b]]]) #take a fold out\n", + " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a fold out\n", + " dhat <- predict(dfit, x[I[[b]],])[,2] #predict the left-out fold\n", + " yhat <- predict(yfit, x[I[[b]],]) #predict the left-out fold\n", + " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold\n", + " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold\n", + " } else if (method == \"boostedtrees\") {\n", + " dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out\n", + " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a fold out\n", + " dhat <- predict(dfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", + " yhat <- predict(yfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", + " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold\n", + " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold\n", + " }\n", + " cat(b,\" \")\n", + "\n", + " }\n", + " rfit <- lm(ytil ~ dtil) #estimate the main parameter by regressing one residual on the other\n", + " coef.est <- coef(rfit)[2] #extract coefficient\n", + " se <- 
sqrt(vcovHC(rfit)[2,2]) #record robust standard error\n", + " cat(sprintf(\"\\ncoef (se) = %g (%g)\\n\", coef.est , se)) #printing output\n", + " return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil) ) #save output and residuals\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24", + "metadata": { + "id": "sS0P4CVySjDP" + }, + "outputs": [], + "source": [ + "summaryPLR <- function(point, stderr, resD, resy, name) {\n", + " data <- data.frame(\n", + " estimate = point, # point estimate\n", + " stderr = stderr, # standard error\n", + " lower = point - 1.96 * stderr, # lower end of 95% confidence interval\n", + " upper = point + 1.96 * stderr, # upper end of 95% confidence interval\n", + " `rmse y` = sqrt(mean(resy^2)), # RMSE of model that predicts outcome y\n", + " `rmse D` = sqrt(mean(resD^2)), # RMSE of model that predicts treatment D\n", + " `accuracy D` = mean(abs(resD) < 0.5)# binary classification accuracy of model for D\n", + " )\n", + " rownames(data) <- name\n", + " return(data)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "25", + "metadata": { + "id": "pdGcjnngSn5Q" + }, + "source": [ + "#### Double Lasso with Cross-Fitting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26", + "metadata": { + "id": "LOVuR5QO1bkB" + }, + "outputs": [], + "source": [ + "# DML with LassoCV\n", + "set.seed(123)\n", + "cat(sprintf(\"\\nDML with Lasso CV \\n\"))\n", + "\n", + "dreg.lasso.cv <- function(x,d){ cv.glmnet(x, d, family=\"gaussian\", alpha=1, nfolds=5)}\n", + "yreg.lasso.cv <- function(x,y){ cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", + "\n", + "DML2.results <- DML2.for.PLM(as.matrix(X), D, y, dreg.lasso.cv, yreg.lasso.cv, nfold=5)\n", + "\n", + "sum.lasso.cv <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'LassoCV')\n", + "tableplr <- data.frame()\n", + "tableplr <- rbind(sum.lasso.cv)\n", + "tableplr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27", + "metadata": { + "id": "KatOw36Z0ghO" + }, + "outputs": [], + "source": [ + "# Because residuals are output, reconstruct fitted values for use in ensemble\n", + "dhat.lasso <- D - DML2.results$dtil\n", + "yhat.lasso <- y - DML2.results$ytil" + ] + }, + { + "cell_type": "markdown", + "id": "28", + "metadata": { + "id": "4wvLEj12SpDf" + }, + "source": [ + "#### Using a $\\ell_2$ Penalized Logistic Regression for D\n", + "\n", + "Note we are using the $\\ell_2$ penalty here. You can use the $\\ell_1$ penalty as well, but computation will take longer." 
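For concreteness, here is a minimal sketch (not part of the original notebook) of how the penalty choice enters `cv.glmnet`: `alpha = 0` gives the $\ell_2$ (ridge) penalized logistic regression used in the next cell, while `alpha = 1` gives the $\ell_1$ (lasso) version, which takes longer to cross-validate on this design matrix. The learner names below are hypothetical; either function can be passed to `DML2.for.PLM` as the treatment learner.

# Sketch (assumed helper names): penalized logistic propensity models via glmnet's alpha argument
library(glmnet)
dreg.ridge.logistic <- function(x, d) cv.glmnet(x, d, family = "binomial", alpha = 0, nfolds = 5)  # l2 (ridge) penalty
dreg.lasso.logistic <- function(x, d) cv.glmnet(x, d, family = "binomial", alpha = 1, nfolds = 5)  # l1 (lasso) penalty, slower
# Example usage with the cross-fitting function defined above:
# DML2.results <- DML2.for.PLM(as.matrix(X), D, y, dreg.ridge.logistic, yreg.lasso.cv, nfold = 5)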
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29", + "metadata": { + "id": "b9Nvp5ZlSuwB" + }, + "outputs": [], + "source": [ + "# DML with Lasso/Logistic\n", + "set.seed(123)\n", + "cat(sprintf(\"\\nDML with Lasso/Logistic \\n\"))\n", + "\n", + "dreg.logistic.cv <- function(x,d){cv.glmnet(x, d, family=\"binomial\", alpha=0, nfolds=5)}\n", + "yreg.lasso.cv <- function(x,y){cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", + "\n", + "DML2.results <- DML2.for.PLM(as.matrix(X), D, y, dreg.logistic.cv, yreg.lasso.cv, nfold=5)\n", + "sum.lasso_logistic.cv <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'LassoCV/LogisticCV')\n", + "tableplr <- rbind(tableplr, sum.lasso_logistic.cv)\n", + "tableplr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30", + "metadata": { + "id": "hJqMdcZV05lr" + }, + "outputs": [], + "source": [ + "# Because residuals are output, reconstruct fitted values for use in ensemble\n", + "dhat.lasso_logistic <- D - DML2.results$dtil\n", + "yhat.lasso_logistic <- y - DML2.results$ytil" + ] + }, + { + "cell_type": "markdown", + "id": "31", + "metadata": { + "id": "txyv6IDXSu64" + }, + "source": [ + "#### Random Forests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32", + "metadata": { + "id": "nt0oTHTfSwMr" + }, + "outputs": [], + "source": [ + "# DML with Random Forest\n", + "set.seed(123)\n", + "cat(sprintf(\"\\nDML with Random Forest \\n\"))\n", + "\n", + "dreg.rf <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest\n", + "yreg.rf <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest\n", + "\n", + "DML2.results <- DML2.for.PLM(as.matrix(X), D, y, dreg.rf, yreg.rf, nfold=5, method = \"randomforest\")\n", + "sum.rf <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Random Forest')\n", + "tableplr <- rbind(tableplr, sum.rf)\n", + "tableplr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33", + "metadata": { + "id": "TG476dPX1BI_" + }, + "outputs": [], + "source": [ + "# Because residuals are output, reconstruct fitted values for use in ensemble\n", + "dhat.rf <- D - DML2.results$dtil\n", + "yhat.rf <- y - DML2.results$ytil" + ] + }, + { + "cell_type": "markdown", + "id": "34", + "metadata": { + "id": "k8EFP-w_SwXZ" + }, + "source": [ + "#### Decision Trees" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35", + "metadata": { + "id": "3Nu4daQRSyRb" + }, + "outputs": [], + "source": [ + "# DML with Decision Trees\n", + "set.seed(123)\n", + "cat(sprintf(\"\\nDML with Decision Trees \\n\"))\n", + "\n", + "dreg.tr <- function(x,d){rpart(as.formula(\"D~.\"), cbind(data.frame(D=d),x), method = \"class\", minbucket=10, cp = 0.001)}\n", + "yreg.tr <- function(x,y){rpart(as.formula(\"y~.\"), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)}\n", + "\n", + "DML2.results <- DML2.for.PLM(X, D, y, dreg.tr, yreg.tr, nfold=5, method = \"decisiontrees\") # decision tree takes in X as dataframe, not matrix/array\n", + "sum.tr <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Decision Trees')\n", + "tableplr <- rbind(tableplr, sum.tr)\n", + "tableplr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36", + "metadata": { + "id": "RnCGwVbN1KJJ" + }, + "outputs": [], + "source": [ + "# Because residuals are output, reconstruct fitted 
values for use in ensemble\n", + "dhat.tr <- D - DML2.results$dtil\n", + "yhat.tr <- y - DML2.results$ytil" + ] + }, + { + "cell_type": "markdown", + "id": "37", + "metadata": { + "id": "jODHt0hjntdP" + }, + "source": [ + "\n", + "Ideally, we would do (semi) cross-fitting with AutoML in order to find good first-stage models and re-run DML with these models. Unfortunately, this is not easy to do in R. In the case of semi cross-fitting, we can use R's H2O AutoML, trained on the entire training set $y\sim X$, $D \sim X$, $Z\sim X$, to determine the best model (e.g., an ensemble), but H2O does not allow us to extract the selected model so that we could re-use it in DML." + ] + }, + { + "cell_type": "markdown", + "id": "38", + "metadata": { + "id": "SaPGNW0SSxWk" + }, + "source": [ + "#### Boosted Trees" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39", + "metadata": { + "id": "Ekg5qeEOSxep" + }, + "outputs": [], + "source": [ + "# DML with Boosted Trees\n", + "set.seed(123)\n", + "cat(sprintf(\"\\nDML with Boosted Trees \\n\"))\n", + "\n", + "# NB: early stopping cannot easily be implemented with gbm\n", + "## set n.trees = best, where best <- gbm.perf(dreg.boost, plot.it = FALSE)\n", + "dreg.boost <- function(x,d){gbm(as.formula(\"D~.\"), cbind(data.frame(D=d),x), distribution= \"bernoulli\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", + "yreg.boost <- function(x,y){gbm(as.formula(\"y~.\"), cbind(data.frame(y=y),x), distribution= \"gaussian\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", + "\n", + "# passing these through regression as type=\"response\", and D should not be factor!\n", + "DML2.results = DML2.for.PLM(X, D, y, dreg.boost, yreg.boost, nfold=5, method = \"boostedtrees\")\n", + "sum.boost <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Boosted Trees')\n", + "tableplr <- rbind(tableplr, sum.boost)\n", + "tableplr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40", + "metadata": { + "id": "WSyqSd5Z1hne" + }, + "outputs": [], + "source": [ + "# Because residuals are output, reconstruct fitted values for use in ensemble\n", + "dhat.boost <- D - DML2.results$dtil\n", + "yhat.boost <- y - DML2.results$ytil" + ] + }, + { + "cell_type": "markdown", + "id": "41", + "metadata": { + "id": "7UZphpPS10Hz" + }, + "source": [ + "## Ensembles" + ] + }, + { + "cell_type": "markdown", + "id": "42", + "metadata": { + "id": "Hqsqpgs6C4fJ" + }, + "source": [ + "Boosted trees give the best RMSE for both Y and D, so the ensemble based on choosing the best performing prediction rule is identical to boosting in this case." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43", + "metadata": { + "id": "gDrZqZXR12hA" + }, + "outputs": [], + "source": [ + "# Best fit is boosted trees for both D and Y\n", + "\n", + "sum.best <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Best')\n", + "tableplr <- rbind(tableplr, sum.best)\n", + "tableplr" + ] + }, + { + "cell_type": "markdown", + "id": "44", + "metadata": { + "id": "pG8mmrQw2GRC" + }, + "source": [ + "We'll form a model average with unconstrained least squares weights."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45", + "metadata": { + "id": "Pkg7pw5h2N0z" + }, + "outputs": [], + "source": [ + "# Least squares model average\n", + "\n", + "ma.dtil <- lm(D~dhat.lasso+dhat.lasso_logistic+dhat.rf+dhat.tr+dhat.boost)$residuals\n", + "ma.ytil <- lm(y~yhat.lasso+yhat.lasso_logistic+yhat.rf+yhat.tr+yhat.boost)$residuals\n", + "\n", + "rfit <- lm(ma.ytil ~ ma.dtil) #estimate the main parameter by regressing one residual on the other\n", + "coef.est <- coef(rfit)[2] #extract coefficient\n", + "se <- sqrt(vcovHC(rfit)[2,2]) #record robust standard error\n", + "\n", + "sum.ma <- summaryPLR(coef.est, se, ma.dtil, ma.ytil, name = 'Model Average')\n", + "tableplr <- rbind(tableplr, sum.ma)\n", + "tableplr\n" + ] + }, + { + "cell_type": "markdown", + "id": "46", + "metadata": { + "id": "67fa5873", + "papermill": { + "duration": 0.013657, + "end_time": "2022-04-19T09:10:00.718448", + "exception": false, + "start_time": "2022-04-19T09:10:00.704791", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Interactive Regression Model (IRM)" + ] + }, + { + "cell_type": "markdown", + "id": "47", + "metadata": { + "id": "86393e4c", + "papermill": { + "duration": 0.013488, + "end_time": "2022-04-19T09:10:00.745538", + "exception": false, + "start_time": "2022-04-19T09:10:00.732050", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Next, we consider estimation of average treatment effects when treatment effects are fully heterogeneous:" + ] + }, + { + "cell_type": "markdown", + "id": "48", + "metadata": { + "id": "830bb508", + "papermill": { + "duration": 0.013695, + "end_time": "2022-04-19T09:10:00.772756", + "exception": false, + "start_time": "2022-04-19T09:10:00.759061", + "status": "completed" + }, + "tags": [] + }, + "source": [ + " \\begin{eqnarray}\\label{eq: HetPL1}\n", + " & Y = g_0(D, X) + U, & \\quad E[U \\mid X, D]= 0,\\\\\n", + " & D = m_0(X) + V, & \\quad E[V\\mid X] = 0.\n", + "\\end{eqnarray}" + ] + }, + { + "cell_type": "markdown", + "id": "49", + "metadata": { + "id": "9e5ec32b", + "papermill": { + "duration": 0.013592, + "end_time": "2022-04-19T09:10:00.799889", + "exception": false, + "start_time": "2022-04-19T09:10:00.786297", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "To reduce the disproportionate impact of extreme propensity score weights in the interactive model\n", + "we trim the propensity scores which are close to the bounds." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50", + "metadata": { + "id": "-hCmnqC-N0nS" + }, + "outputs": [], + "source": [ + "DML2.for.IRM <- function(x, d, y, dreg, yreg0, yreg1, trimming=0.01, nfold=5, method=\"regression\") {\n", + " yhat0 <- rep(0, length(y))\n", + " yhat1 <- rep(0, length(y))\n", + " Dhat <- rep(0, length(d))\n", + "\n", + " nobs <- nrow(x) #number of observations\n", + " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices\n", + " I <- split(1:nobs, foldid) #split observation indices into folds\n", + " ytil <- dtil <- rep(NA, nobs)\n", + "\n", + " cat(\"fold: \")\n", + " for(b in 1:length(I)){\n", + "\n", + " # define helpful variables\n", + " Dnotb = d[-I[[b]]]\n", + " Xb = X[I[[b]],]\n", + " Xnotb = X[-I[[b]],]\n", + "\n", + " # training dfs subsetted on the -I[[b]] fold\n", + " XD0 = X[-I[[b]],][d[-I[[b]]]==0]\n", + " yD0 = y[-I[[b]]][d[-I[[b]]]==0]\n", + " XD1 = X[-I[[b]],][d[-I[[b]]]==1]\n", + " yD1 = y[-I[[b]]][d[-I[[b]]]==1]\n", + "\n", + " if (method == \"regression\") {\n", + " yfit0 <- yreg0(as.matrix(XD0), yD0)\n", + " yfit1 <- yreg1(as.matrix(XD1), yD1)\n", + " yhat0[I[[b]]] <- predict(yfit0, as.matrix(Xb)) # default is type = \"response\" for glmnet family gaussian\n", + " yhat1[I[[b]]] <- predict(yfit1, as.matrix(Xb))\n", + " } else if (method == \"randomforest\") {\n", + " yfit0 <- yreg0(XD0, yD0)\n", + " yfit1 <- yreg1(XD1, yD1)\n", + " yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = \"response\" for rf\n", + " yhat1[I[[b]]] <- predict(yfit1, Xb)\n", + " } else if (method == \"decisiontrees\") {\n", + " yfit0 <- yreg0(XD0, yD0)\n", + " yfit1 <- yreg1(XD1, yD1)\n", + " yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = \"vector\" for decision\n", + " yhat1[I[[b]]] <- predict(yfit1, Xb)\n", + " } else if (method == \"boostedtrees\") {\n", + " yfit0 <- yreg0(as.data.frame(XD0), yD0)\n", + " yfit1 <- yreg1(as.data.frame(XD1), yD1)\n", + " yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = \"response\" for boosted\n", + " yhat1[I[[b]]] <- predict(yfit1, Xb)\n", + " }\n", + "\n", + " # propensity scores:\n", + " if (method == \"regression\"){\n", + " Dfit_b <- dreg(as.matrix(Xnotb), Dnotb)\n", + " Dhat_b <- predict(Dfit_b, as.matrix(Xb), type=\"response\") # default is type=\"link\" for family binomial!\n", + " } else if (method == \"randomforest\") {\n", + " Dfit_b <- dreg(Xnotb, as.factor(Dnotb))\n", + " Dhat_b <- predict(Dfit_b, Xb, type = \"prob\")[,2]\n", + " } else if (method == \"decisiontrees\") {\n", + " Dfit_b <- dreg(Xnotb, Dnotb)\n", + " Dhat_b <- predict(Dfit_b, Xb)[,2]\n", + " } else if (method == \"boostedtrees\") {\n", + " Dfit_b <- dreg(as.data.frame(Xnotb), Dnotb)\n", + " Dhat_b <- predict(Dfit_b, Xb, type=\"response\")\n", + " }\n", + " Dhat_b <- pmax(pmin(Dhat_b, 1 - trimming), trimming) # trimming so scores are between [trimming, (1-trimming)]\n", + " Dhat[I[[b]]] <- Dhat_b\n", + "\n", + "\n", + " cat(b,\" \")\n", + " }\n", + "\n", + " # Prediction of treatment and outcome for observed instrument\n", + " yhat <- yhat0 * (1 - D) + yhat1 * D\n", + " # residuals\n", + " ytil <- y-yhat\n", + " dtil <- D-Dhat\n", + " # doubly robust quantity for every sample\n", + " drhat <- yhat1 - yhat0 + (y-yhat)* (D/Dhat - (1 - D)/(1 - Dhat))\n", + " coef.est <- mean(drhat)\n", + " vari <- var(drhat)\n", + " se <- sqrt(vari/nrow(X))\n", + " cat(\"point\", coef.est)\n", + " cat(\"se\", se)\n", + " return(list(coef.est = coef.est, se = se, ytil = ytil, dtil = dtil, 
drhat = drhat, yhat0 = yhat0, yhat1 = yhat1, dhat = Dhat, yhat = yhat))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51", + "metadata": { + "id": "bCj1D8_MSg09" + }, + "outputs": [], + "source": [ + "summaryIRM <- function(coef.est, se, ytil, dtil, drhat, name) {\n", + " summary_data <- data.frame(estimate = coef.est, # point estimate\n", + " se = se, # standard error\n", + " lower = coef.est - 1.96 * se, # lower end of 95% confidence interval\n", + " upper = coef.est + 1.96 * se, # upper end of 95% confidence interval\n", + " rmse_y = sqrt(mean(ytil^2)), # res of model that predicts outcome y\n", + " rmse_D = sqrt(mean(dtil^2)), # res of model that predicts treatment D\n", + " accuracy_D = mean(abs(dtil) < 0.5) # binary classification accuracy of model for D\n", + " )\n", + " row.names(summary_data) <- name\n", + " return(summary_data)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "52", + "metadata": { + "id": "6mCdfifchkgZ" + }, + "source": [ + "#### Repeat analysis in the IRM setting." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53", + "metadata": { + "id": "AUiHMoNTvo-m" + }, + "outputs": [], + "source": [ + "# DML with Lasso/Logistic\n", + "set.seed(123)\n", + "cat(sprintf(\"\\nDML with LassoCV/Logistic \\n\"))\n", + "\n", + "dreg.lasso.cv <- function(x,d){cv.glmnet(x, d, family=\"binomial\", alpha=0, nfolds=5)}\n", + "yreg0.lasso.cv <- function(x,y){cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", + "yreg1.lasso.cv <- function(x,y){cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", + "\n", + "DML2.results <- DML2.for.IRM(X, D, y, dreg.lasso.cv, yreg0.lasso.cv, yreg1.lasso.cv, nfold=5) # more folds seems to help stabilize finite sample performance\n", + "sum.lasso.cv <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'LassoCVLogistic')\n", + "tableirm <- data.frame()\n", + "tableirm <- rbind(sum.lasso.cv)\n", + "tableirm\n", + "\n", + "yhat0.lasso <- DML2.results$yhat0\n", + "yhat1.lasso <- DML2.results$yhat1\n", + "dhat.lasso <- DML2.results$dhat\n", + "yhat.lasso <- DML2.results$yhat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54", + "metadata": { + "id": "JPABXLYyvyqy" + }, + "outputs": [], + "source": [ + "# DML with Random Forest\n", + "set.seed(123)\n", + "cat(sprintf(\"\\nDML with Random Forest \\n\"))\n", + "\n", + "dreg.rf <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest\n", + "yreg0.rf <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest\n", + "yreg1.rf <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest\n", + "\n", + "\n", + "DML2.results <- DML2.for.IRM(as.matrix(X), D, y, dreg.rf, yreg0.rf, yreg1.rf, nfold=5, method = \"randomforest\")\n", + "sum.rf <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'Random Forest')\n", + "tableirm <- rbind(tableirm, sum.rf)\n", + "tableirm\n", + "\n", + "yhat0.rf <- DML2.results$yhat0\n", + "yhat1.rf <- DML2.results$yhat1\n", + "dhat.rf <- DML2.results$dhat\n", + "yhat.rf <- DML2.results$yhat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55", + "metadata": { + "id": "SukZCfEbvyzC" + }, + "outputs": [], + "source": [ + "# DML with Decision Trees\n", + "set.seed(123)\n", + "cat(sprintf(\"\\nDML with Decision Trees \\n\"))\n", + "\n", + "dreg.tr <- 
function(x,d){rpart(as.formula(\"D~.\"), cbind(data.frame(D=d),x), method = \"class\", minbucket=10, cp = 0.001)}\n", + "yreg0.tr <- function(x,y){rpart(as.formula(\"y~.\"), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)}\n", + "yreg1.tr <- function(x,y){rpart(as.formula(\"y~.\"), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)}\n", + "\n", + "DML2.results <- DML2.for.IRM(X, D, y, dreg.tr, yreg0.tr, yreg1.tr, nfold=5, method = \"decisiontrees\")\n", + "sum.tr <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'Decision Trees')\n", + "tableirm <- rbind(tableirm, sum.tr)\n", + "tableirm\n", + "\n", + "yhat0.tr <- DML2.results$yhat0\n", + "yhat1.tr <- DML2.results$yhat1\n", + "dhat.tr <- DML2.results$dhat\n", + "yhat.tr <- DML2.results$yhat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56", + "metadata": { + "id": "bTfgiCabvy6f" + }, + "outputs": [], + "source": [ + "# DML with Boosted Trees\n", + "set.seed(123)\n", + "cat(sprintf(\"\\nDML with Boosted Trees \\n\"))\n", + "\n", + "# NB: early stopping cannot easily be implemented with gbm\n", + "## set n.trees = best, where best <- gbm.perf(dreg.boost, plot.it = FALSE)\n", + "dreg.boost <- function(x,d){gbm(as.formula(\"D~.\"), cbind(data.frame(D=d),x), distribution= \"bernoulli\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", + "yreg0.boost <- function(x,y){gbm(as.formula(\"y~.\"), cbind(data.frame(y=y),x), distribution= \"gaussian\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", + "yreg1.boost <- function(x,y){gbm(as.formula(\"y~.\"), cbind(data.frame(y=y),x), distribution= \"gaussian\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", + "\n", + "# passing these through regression as type=\"response\", and D should not be factor!\n", + "DML2.results = DML2.for.IRM(X, D, y, dreg.boost, yreg0.boost, yreg1.boost, nfold=5, method = \"boostedtrees\")\n", + "sum.boost <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'Boosted Trees')\n", + "tableirm <- rbind(tableirm, sum.boost)\n", + "tableirm\n", + "\n", + "yhat0.boost <- DML2.results$yhat0\n", + "yhat1.boost <- DML2.results$yhat1\n", + "dhat.boost <- DML2.results$dhat\n", + "yhat.boost <- DML2.results$yhat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57", + "metadata": { + "id": "7rxqwK-R4Z2q" + }, + "outputs": [], + "source": [ + "# Ensembles\n", + "\n", + "# Best\n", + "# We'll look at model that does best for Y overall. Could also use different model for Y0 and Y1\n", + "# Here, the best performance for Y is the random forest and for D the boosted tree\n", + "\n", + "# residuals\n", + "ytil <- y-yhat.rf\n", + "dtil <- D-dhat.boost\n", + "# doubly robust quantity for every sample\n", + "drhat <- yhat1.rf - yhat0.rf + (y-yhat.rf)* (D/dhat.boost - (1 - D)/(1 - dhat.boost))\n", + "coef.est <- mean(drhat)\n", + "vari <- var(drhat)\n", + "se <- sqrt(vari/nrow(X))\n", + "\n", + "sum.best <- summaryIRM(coef.est, se, ytil, dtil, drhat, name = 'Best')\n", + "tableirm <- rbind(tableirm, sum.best)\n", + "tableirm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58", + "metadata": { + "id": "0-c3NI0fCfqg" + }, + "outputs": [], + "source": [ + "# Least squares model average\n", + "# We'll look at weights that do best job for Y overall. 
Could also use different weights for Y0 and Y1\n", + "\n", + "ma.dw <- lm(D~dhat.lasso+dhat.rf+dhat.tr+dhat.boost)$coef\n", + "ma.yw <- lm(y~yhat.lasso+yhat.rf+yhat.tr+yhat.boost)$coef\n", + "\n", + "Dhats <- cbind(as.matrix(rep(1,nrow(X))),dhat.lasso,dhat.rf,dhat.tr,dhat.boost)\n", + "Y0s <- cbind(as.matrix(rep(1,nrow(X))),yhat0.lasso,yhat0.rf,yhat0.tr,yhat0.boost)\n", + "Y1s <- cbind(as.matrix(rep(1,nrow(X))),yhat1.lasso,yhat1.rf,yhat1.tr,yhat1.boost)\n", + "\n", + "dhat <- Dhats%*%as.matrix(ma.dw)\n", + "yhat0 <- Y0s%*%as.matrix(ma.yw)\n", + "yhat1 <- Y1s%*%as.matrix(ma.yw)\n", + "\n", + "# Prediction of treatment and outcome for observed instrument\n", + "yhat <- yhat0 * (1 - D) + yhat1 * D\n", + "# residuals\n", + "ytil <- y-yhat\n", + "dtil <- D-dhat\n", + "# doubly robust quantity for every sample\n", + "drhat <- yhat1 - yhat0 + (y-yhat)* (D/dhat - (1 - D)/(1 - dhat))\n", + "coef.est <- mean(drhat)\n", + "vari <- var(drhat)\n", + "se <- sqrt(vari/nrow(X))\n", + "\n", + "sum.ma <- summaryIRM(coef.est, se, ytil, dtil, drhat, name = 'Model Average')\n", + "tableirm <- rbind(tableirm, sum.ma)\n", + "tableirm\n" + ] + }, + { + "cell_type": "markdown", + "id": "59", + "metadata": { + "id": "01de9f24", + "papermill": { + "duration": 0.010725, + "end_time": "2022-04-19T09:06:51.098483", + "exception": false, + "start_time": "2022-04-19T09:06:51.087758", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Double ML package" + ] + }, + { + "cell_type": "markdown", + "id": "60", + "metadata": { + "id": "6cdc366f", + "papermill": { + "duration": 0.010679, + "end_time": "2022-04-19T09:06:51.119780", + "exception": false, + "start_time": "2022-04-19T09:06:51.109101", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We are interested in valid estimators of the average treatment effect of `e401` and `p401` on `net_tfa`. There exist nice packages out there that can help us do our estimation with the simple call of a function. Such packages include `EconML` (Python) and `DoubleML` (Python and R).\n", + "\n", + "We run through PLR and IRM using `DoubleML` below to illustrate. The `DoubleML` package internally builds on `mlr3`. We use the meta package `mlr3` to generate predictions with machine learning methods. A comprehensive introduction and description of the `mlr3` package is provided in the [mlr3book](https://mlr3book.mlr-org.com/). A list of all learners that you can use in `mlr3` can be found [here](https://mlr3extralearners.mlr-org.com/articles/learners/list_learners.html). The entry in the columns *mlr3 Package* and *Packages* indicate which packages must be installed/loaded in your R session.\n", + "\n", + "You find additional information about `DoubleML` on the package on the package website https://docs.doubleml.org/ and the R documentation page https://docs.doubleml.org/r/stable/." 
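For orientation, and consistent with the hand-rolled `DML2.for.IRM` estimator and the doubly robust quantity `drhat` computed above, the two targets estimated with `DoubleML` below can be summarized as follows (standard DML notation; here $\ell(X)=E[Y\mid X]$ corresponds to `ml_l`, $m(X)=E[D\mid X]$ to `ml_m`, and $g(d,X)=E[Y\mid D=d,X]$ to `ml_g0`/`ml_g1`, all cross-fitted):

$$\text{PLR (partialling-out score): } \hat\theta \text{ solves } \frac{1}{n}\sum_{i=1}^n \big(Y_i - \hat\ell(X_i) - \theta\,(D_i - \hat m(X_i))\big)\,\big(D_i - \hat m(X_i)\big) = 0,$$

$$\text{IRM (ATE): } \hat\theta = \frac{1}{n}\sum_{i=1}^n \left[\hat g(1,X_i) - \hat g(0,X_i) + \frac{D_i\,\big(Y_i - \hat g(1,X_i)\big)}{\hat m(X_i)} - \frac{(1-D_i)\,\big(Y_i - \hat g(0,X_i)\big)}{1-\hat m(X_i)}\right].$$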
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61", + "metadata": { + "id": "2846a36a", + "papermill": { + "duration": 20.239271, + "end_time": "2022-04-19T09:07:11.369618", + "exception": false, + "start_time": "2022-04-19T09:06:51.130347", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "install.packages(\"DoubleML\")\n", + "install.packages(\"mlr3learners\")\n", + "install.packages(\"mlr3\")\n", + "install.packages(\"data.table\")\n", + "install.packages(\"randomForest\")\n", + "install.packages(\"ranger\")\n", + "\n", + "library(DoubleML)\n", + "library(mlr3learners)\n", + "library(mlr3)\n", + "library(data.table)\n", + "library(randomForest)\n", + "library(ranger)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62", + "metadata": { + "id": "2a141248", + "papermill": { + "duration": 0.100382, + "end_time": "2022-04-19T09:07:12.094585", + "exception": false, + "start_time": "2022-04-19T09:07:11.994203", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Constructing the data (as DoubleMLData)\n", + "formula_flex = \"net_tfa ~ e401 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + marr + twoearn + db + pira + hown\"\n", + "model_flex = as.data.table(model.frame(formula_flex, pension))\n", + "x_cols = colnames(model_flex)[-c(1,2)]\n", + "data_ml = DoubleMLData$new(model_flex, y_col = \"net_tfa\", d_cols = \"e401\", x_cols=x_cols)\n", + "\n", + "\n", + "p <- dim(model_flex)[2]-2\n", + "p\n", + "\n", + "# complex model with two-way interactions\n", + "#data_interactions = fetch_401k(polynomial_features = TRUE, instrument = FALSE)\n" + ] + }, + { + "cell_type": "markdown", + "id": "63", + "metadata": { + "id": "2e1c9339", + "papermill": { + "duration": 0.010825, + "end_time": "2022-04-19T09:07:11.938062", + "exception": false, + "start_time": "2022-04-19T09:07:11.927237", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "As mentioned, in the tutorial we use the meta package `mlr3` to generate predictions with machine learning methods. A comprehensive introduction and description of the `mlr3` package is provided in the [mlr3book](https://mlr3book.mlr-org.com/). A list of all learners that you can use in `mlr3` can be found [here](https://mlr3extralearners.mlr-org.com/articles/learners/list_learners.html). The entry in the columns *mlr3 Package* and *Packages* indicate which packages must be installed/loaded in your R session." 
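A quick way to browse the learners registered in the current session is via the `mlr_learners` dictionary. This is a small sketch, not part of the original notebook; the exact columns returned and the pattern-filtering behavior may differ slightly across `mlr3` versions.

```r
library(mlr3)
library(mlr3learners)
library(data.table)

# table of learners available in this session, with the packages they require
learners_tbl <- as.data.table(mlr_learners)
learners_tbl[, c("key", "packages")]

# keys of regression and classification learners (regex filtering on the dictionary)
mlr_learners$keys("^regr")
mlr_learners$keys("^classif")

# construct a learner by key and set hyperparameters, as done in the cells below
lasso <- lrn("regr.cv_glmnet", nfolds = 5, s = "lambda.min")
```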
+ ] + }, + { + "cell_type": "markdown", + "id": "64", + "metadata": { + "id": "Cwmd7ELXKeIg" + }, + "source": [ + "## Partially Linear Regression Models (PLR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65", + "metadata": { + "id": "a48e367d", + "papermill": { + "duration": 22.473331, + "end_time": "2022-04-19T09:07:34.646865", + "exception": false, + "start_time": "2022-04-19T09:07:12.173534", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Estimating the PLR\n", + "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", + "lasso <- lrn(\"regr.cv_glmnet\",nfolds = 5, s = \"lambda.min\")\n", + "lasso_class <- lrn(\"classif.cv_glmnet\", nfolds = 5, s = \"lambda.min\")\n", + "\n", + "dml_plr <- DoubleMLPLR$new(data_ml, ml_l = lasso, ml_m = lasso_class, n_folds=5)\n", + "dml_plr$fit(store_predictions=TRUE)\n", + "dml_plr$summary()\n", + "lasso_plr <- dml_plr$coef\n", + "lasso_std_plr <- dml_plr$se" + ] + }, + { + "cell_type": "markdown", + "id": "66", + "metadata": { + "id": "135275dc", + "papermill": { + "duration": 0.011132, + "end_time": "2022-04-19T09:07:34.670166", + "exception": false, + "start_time": "2022-04-19T09:07:34.659034", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Let us check the predictive performance of this model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67", + "metadata": { + "id": "e6d83bbe", + "papermill": { + "duration": 0.038389, + "end_time": "2022-04-19T09:07:34.719637", + "exception": false, + "start_time": "2022-04-19T09:07:34.681248", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dml_plr$params_names()\n", + "g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o\n", + "m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68", + "metadata": { + "id": "32c894fa", + "papermill": { + "duration": 0.043342, + "end_time": "2022-04-19T09:07:34.774113", + "exception": false, + "start_time": "2022-04-19T09:07:34.730771", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# cross-fitted RMSE: outcome\n", + "y <- as.matrix(pension$net_tfa) # true observations\n", + "theta <- as.numeric(dml_plr$coef) # estimated regression coefficient\n", + "d <- as.matrix(pension$e401)\n", + "predictions_y <- as.matrix(d*theta)+g_hat # predictions for y\n", + "lasso_y_rmse <- sqrt(mean((y-predictions_y)^2))\n", + "lasso_y_rmse" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69", + "metadata": { + "id": "da5b9334", + "papermill": { + "duration": 0.04333, + "end_time": "2022-04-19T09:07:34.828718", + "exception": false, + "start_time": "2022-04-19T09:07:34.785388", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# cross-fitted RMSE: treatment\n", + "d <- as.matrix(pension$e401)\n", + "lasso_d_rmse <- sqrt(mean((d-m_hat)^2))\n", + "lasso_d_rmse\n", + "\n", + "# cross-fitted ce: treatment\n", + "mean(ifelse(m_hat > 0.5, 1, 0) != d)" + ] + }, + { + "cell_type": "markdown", + "id": "70", + "metadata": { + "id": "c1481527", + "papermill": { + "duration": 0.011351, + "end_time": "2022-04-19T09:07:34.851558", + "exception": false, + "start_time": "2022-04-19T09:07:34.840207", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Then, we repeat this procedure for various machine learning methods." 
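Since the same three fit measures (cross-fitted RMSE for the outcome, RMSE for the treatment, and classification error for the treatment) are recomputed for each learner below, a small helper can reduce repetition. The function `plr_fit_stats()` is a hypothetical convenience wrapper, not part of the original notebook; it assumes the `DoubleMLPLR` object was fitted with `store_predictions = TRUE` and simply mirrors the calculations shown above.

```r
# Hypothetical helper: cross-fitted fit measures for a fitted DoubleMLPLR object.
# y and d are the observed outcome and treatment (as constructed from the pension data).
plr_fit_stats <- function(dml_plr, y, d) {
  g_hat <- as.matrix(dml_plr$predictions$ml_l)     # cross-fitted predictions of ell_0(X)
  m_hat <- as.matrix(dml_plr$predictions$ml_m)     # cross-fitted predictions of m_0(X)
  theta <- as.numeric(dml_plr$coef)                # estimated treatment coefficient
  pred_y <- as.matrix(d * theta) + g_hat           # implied predictions for y, as above
  c(
    rmse_y = sqrt(mean((y - pred_y)^2)),           # cross-fitted RMSE: outcome
    rmse_d = sqrt(mean((d - m_hat)^2)),            # cross-fitted RMSE: treatment
    ce_d   = mean(ifelse(m_hat > 0.5, 1, 0) != d)  # cross-fitted classification error: treatment
  )
}

# Example usage (after dml_plr$fit(store_predictions = TRUE)):
# plr_fit_stats(dml_plr, y = as.matrix(pension$net_tfa), d = as.matrix(pension$e401))
```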
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71", + "metadata": { + "id": "dac2d0fc", + "papermill": { + "duration": 61.046861, + "end_time": "2022-04-19T09:08:35.910116", + "exception": false, + "start_time": "2022-04-19T09:07:34.863255", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Random Forest\n", + "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", + "randomForest <- lrn(\"regr.ranger\")\n", + "randomForest_class <- lrn(\"classif.ranger\")\n", + "\n", + "dml_plr <- DoubleMLPLR$new(data_ml, ml_l = randomForest, ml_m = randomForest_class, n_folds=5)\n", + "dml_plr$fit(store_predictions=TRUE) # set store_predictions=TRUE to evaluate the model\n", + "dml_plr$summary()\n", + "forest_plr <- dml_plr$coef\n", + "forest_std_plr <- dml_plr$se" + ] + }, + { + "cell_type": "markdown", + "id": "72", + "metadata": { + "id": "c7c614e6", + "papermill": { + "duration": 0.011382, + "end_time": "2022-04-19T09:08:35.932891", + "exception": false, + "start_time": "2022-04-19T09:08:35.921509", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We can compare the accuracy of this model to the model that has been estimated with lasso." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73", + "metadata": { + "id": "f8af1a74", + "papermill": { + "duration": 0.092847, + "end_time": "2022-04-19T09:08:36.037154", + "exception": false, + "start_time": "2022-04-19T09:08:35.944307", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Evaluation predictions\n", + "g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o\n", + "m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o\n", + "theta <- as.numeric(dml_plr$coef) # estimated regression coefficient\n", + "predictions_y <- as.matrix(d*theta)+g_hat # predictions for y\n", + "forest_y_rmse <- sqrt(mean((y-predictions_y)^2))\n", + "forest_y_rmse\n", + "\n", + "# cross-fitted RMSE: treatment\n", + "forest_d_rmse <- sqrt(mean((d-m_hat)^2))\n", + "forest_d_rmse\n", + "\n", + "# cross-fitted ce: treatment\n", + "mean(ifelse(m_hat > 0.5, 1, 0) != d)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74", + "metadata": { + "id": "61a94dff", + "papermill": { + "duration": 1.221303, + "end_time": "2022-04-19T09:08:37.271202", + "exception": false, + "start_time": "2022-04-19T09:08:36.049899", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Trees\n", + "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", + "\n", + "trees <- lrn(\"regr.rpart\")\n", + "trees_class <- lrn(\"classif.rpart\")\n", + "\n", + "dml_plr <- DoubleMLPLR$new(data_ml, ml_l = trees, ml_m = trees_class, n_folds=5)\n", + "dml_plr$fit(store_predictions=TRUE)\n", + "dml_plr$summary()\n", + "tree_plr <- dml_plr$coef\n", + "tree_std_plr <- dml_plr$se\n", + "\n", + "# Evaluation predictions\n", + "g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o\n", + "m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o\n", + "theta <- as.numeric(dml_plr$coef) # estimated regression coefficient\n", + "predictions_y <- as.matrix(d*theta)+g_hat # predictions for y\n", + "tree_y_rmse <- sqrt(mean((y-predictions_y)^2))\n", + "tree_y_rmse\n", + "\n", + "# cross-fitted RMSE: treatment\n", + "tree_d_rmse <- sqrt(mean((d-m_hat)^2))\n", + "tree_d_rmse\n", + "\n", + "# cross-fitted ce: treatment\n", + "mean(ifelse(m_hat > 0.5, 1, 0) != d)" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "75", + "metadata": { + "id": "885c94eb", + "papermill": { + "duration": 80.788944, + "end_time": "2022-04-19T09:09:58.072665", + "exception": false, + "start_time": "2022-04-19T09:08:37.283721", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# needed to run boosting\n", + "remotes::install_github(\"mlr-org/mlr3extralearners\")\n", + "install.packages(\"mlr3extralearners\")\n", + "install.packages(\"mboost\")\n", + "library(mlr3extralearners)\n", + "library(mboost)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76", + "metadata": { + "id": "0372eefe", + "papermill": { + "duration": 2.404791, + "end_time": "2022-04-19T09:10:00.494687", + "exception": false, + "start_time": "2022-04-19T09:09:58.089896", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Boosting\n", + "boost<- lrn(\"regr.glmboost\")\n", + "boost_class <- lrn(\"classif.glmboost\")\n", + "\n", + "dml_plr <- DoubleMLPLR$new(data_ml, ml_l = boost, ml_m = boost_class, n_folds=5)\n", + "dml_plr$fit(store_predictions=TRUE)\n", + "dml_plr$summary()\n", + "boost_plr <- dml_plr$coef\n", + "boost_std_plr <- dml_plr$se\n", + "\n", + "# Evaluation predictions\n", + "g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o\n", + "m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o\n", + "theta <- as.numeric(dml_plr$coef) # estimated regression coefficient\n", + "predictions_y <- as.matrix(d*theta)+g_hat # predictions for y\n", + "boost_y_rmse <- sqrt(mean((y-predictions_y)^2))\n", + "boost_y_rmse\n", + "\n", + "# cross-fitted RMSE: treatment\n", + "boost_d_rmse <- sqrt(mean((d-m_hat)^2))\n", + "boost_d_rmse\n", + "\n", + "# cross-fitted ce: treatment\n", + "mean(ifelse(m_hat > 0.5, 1, 0) != d)" + ] + }, + { + "cell_type": "markdown", + "id": "77", + "metadata": { + "id": "ffa1e35a", + "papermill": { + "duration": 0.013161, + "end_time": "2022-04-19T09:10:00.521404", + "exception": false, + "start_time": "2022-04-19T09:10:00.508243", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Let's sum up the results:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78", + "metadata": { + "id": "d322c48a", + "papermill": { + "duration": 0.081341, + "end_time": "2022-04-19T09:10:00.615953", + "exception": false, + "start_time": "2022-04-19T09:10:00.534612", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "table <- matrix(0, 4, 4)\n", + "table[1,1:4] <- c(lasso_plr,forest_plr,tree_plr,boost_plr)\n", + "table[2,1:4] <- c(lasso_std_plr,forest_std_plr,tree_std_plr,boost_std_plr)\n", + "table[3,1:4] <- c(lasso_y_rmse,forest_y_rmse,tree_y_rmse,boost_y_rmse)\n", + "table[4,1:4] <- c(lasso_d_rmse,forest_d_rmse,tree_d_rmse,boost_d_rmse)\n", + "rownames(table) <- c(\"Estimate\",\"Std.Error\",\"RMSE Y\",\"RMSE D\")\n", + "colnames(table) <- c(\"Lasso\",\"Random Forest\",\"Trees\",\"Boosting\")\n", + "tab<- xtable(table, digits = 2)\n", + "tab" + ] + }, + { + "cell_type": "markdown", + "id": "79", + "metadata": { + "id": "e8e9ffc8", + "papermill": { + "duration": 0.013424, + "end_time": "2022-04-19T09:10:00.642931", + "exception": false, + "start_time": "2022-04-19T09:10:00.629507", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "The best model with lowest RMSE is the PLR model estimated via lasso (or boosting based on the RSME Y). 
It gives the following estimate:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80", + "metadata": { + "id": "33fcc2b4", + "papermill": { + "duration": 0.034272, + "end_time": "2022-04-19T09:10:00.690621", + "exception": false, + "start_time": "2022-04-19T09:10:00.656349", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "lasso_plr" + ] + }, + { + "cell_type": "markdown", + "id": "81", + "metadata": { + "id": "Ebrv1spfKWxH" + }, + "source": [ + "## Interactive Regression Model (IRM)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82", + "metadata": { + "id": "9a7410a9", + "papermill": { + "duration": 18.121031, + "end_time": "2022-04-19T09:10:18.934550", + "exception": false, + "start_time": "2022-04-19T09:10:00.813519", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", + "dml_irm = DoubleMLIRM$new(data_ml, ml_g = lasso,\n", + " ml_m = lasso_class,\n", + " trimming_threshold = 0.01, n_folds=5)\n", + "dml_irm$fit(store_predictions=TRUE)\n", + "dml_irm$summary()\n", + "lasso_irm <- dml_irm$coef\n", + "lasso_std_irm <- dml_irm$se\n", + "\n", + "\n", + "# predictions\n", + "dml_irm$params_names()\n", + "g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X)\n", + "g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X)\n", + "g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0\n", + "m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83", + "metadata": { + "id": "1a34a9e8", + "papermill": { + "duration": 0.052429, + "end_time": "2022-04-19T09:10:19.001172", + "exception": false, + "start_time": "2022-04-19T09:10:18.948743", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# cross-fitted RMSE: outcome\n", + "y <- as.matrix(pension$net_tfa) # true observations\n", + "d <- as.matrix(pension$e401)\n", + "lasso_y_irm <- sqrt(mean((y-g_hat)^2))\n", + "lasso_y_irm\n", + "\n", + "# cross-fitted RMSE: treatment\n", + "lasso_d_irm <- sqrt(mean((d-m_hat)^2))\n", + "lasso_d_irm\n", + "\n", + "# cross-fitted ce: treatment\n", + "mean(ifelse(m_hat > 0.5, 1, 0) != d)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "84", + "metadata": { + "id": "d0c93355", + "papermill": { + "duration": 58.769131, + "end_time": "2022-04-19T09:11:17.784652", + "exception": false, + "start_time": "2022-04-19T09:10:19.015521", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "##### forest #####\n", + "\n", + "dml_irm = DoubleMLIRM$new(data_ml, ml_g = randomForest,\n", + " ml_m = randomForest_class,\n", + " trimming_threshold = 0.01, n_folds=5)\n", + "dml_irm$fit(store_predictions=TRUE)\n", + "dml_irm$summary()\n", + "forest_irm <- dml_irm$coef\n", + "forest_std_irm <- dml_plr$se\n", + "\n", + "# predictions\n", + "g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X)\n", + "g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X)\n", + "g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0\n", + "m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_0\n", + "\n", + "# cross-fitted RMSE: outcome\n", + "y <- as.matrix(pension$net_tfa) # true observations\n", + "d <- as.matrix(pension$e401)\n", + "forest_y_irm <- sqrt(mean((y-g_hat)^2))\n", + "forest_y_irm\n", + "\n", + "# cross-fitted RMSE: treatment\n", 
+ "forest_d_irm <- sqrt(mean((d-m_hat)^2))\n", + "forest_d_irm\n", + "\n", + "# cross-fitted ce: treatment\n", + "mean(ifelse(m_hat > 0.5, 1, 0) != d)\n", + "\n", + "##### trees #####\n", + "\n", + "dml_irm <- DoubleMLIRM$new(data_ml, ml_g = trees, ml_m = trees_class,\n", + " trimming_threshold = 0.01, n_folds=5)\n", + "dml_irm$fit(store_predictions=TRUE)\n", + "dml_irm$summary()\n", + "tree_irm <- dml_irm$coef\n", + "tree_std_irm <- dml_irm$se\n", + "\n", + "# predictions\n", + "g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X)\n", + "g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X)\n", + "g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0\n", + "m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o\n", + "\n", + "# cross-fitted RMSE: outcome\n", + "y <- as.matrix(pension$net_tfa) # true observations\n", + "d <- as.matrix(pension$e401)\n", + "tree_y_irm <- sqrt(mean((y-g_hat)^2))\n", + "tree_y_irm\n", + "\n", + "# cross-fitted RMSE: treatment\n", + "tree_d_irm <- sqrt(mean((d-m_hat)^2))\n", + "tree_d_irm\n", + "\n", + "# cross-fitted ce: treatment\n", + "mean(ifelse(m_hat > 0.5, 1, 0) != d)\n", + "\n", + "\n", + "##### boosting #####\n", + "\n", + "dml_irm <- DoubleMLIRM$new(data_ml, ml_g = boost, ml_m = boost_class,\n", + " trimming_threshold = 0.01, n_folds=5)\n", + "dml_irm$fit(store_predictions=TRUE)\n", + "dml_irm$summary()\n", + "boost_irm <- dml_irm$coef\n", + "boost_std_irm <- dml_irm$se\n", + "\n", + "# predictions\n", + "g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X)\n", + "g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X)\n", + "g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0\n", + "m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o\n", + "\n", + "# cross-fitted RMSE: outcome\n", + "y <- as.matrix(pension$net_tfa) # true observations\n", + "d <- as.matrix(pension$e401)\n", + "boost_y_irm <- sqrt(mean((y-g_hat)^2))\n", + "boost_y_irm\n", + "\n", + "# cross-fitted RMSE: treatment\n", + "boost_d_irm <- sqrt(mean((d-m_hat)^2))\n", + "boost_d_irm\n", + "\n", + "# cross-fitted ce: treatment\n", + "mean(ifelse(m_hat > 0.5, 1, 0) != d)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85", + "metadata": { + "id": "bf344442", + "papermill": { + "duration": 0.059911, + "end_time": "2022-04-19T09:11:17.861698", + "exception": false, + "start_time": "2022-04-19T09:11:17.801787", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "table <- matrix(0, 4, 4)\n", + "table[1,1:4] <- c(lasso_irm,forest_irm,tree_irm,boost_irm)\n", + "table[2,1:4] <- c(lasso_std_irm,forest_std_irm,tree_std_irm,boost_std_irm)\n", + "table[3,1:4] <- c(lasso_y_irm,forest_y_irm,tree_y_irm,boost_y_irm)\n", + "table[4,1:4] <- c(lasso_d_irm,forest_d_irm,tree_d_irm,boost_d_irm)\n", + "rownames(table) <- c(\"Estimate\",\"Std.Error\",\"RMSE Y\",\"RMSE D\")\n", + "colnames(table) <- c(\"Lasso\",\"Random Forest\",\"Trees\",\"Boosting\")\n", + "tab<- xtable(table, digits = 2)\n", + "tab" + ] + }, + { + "cell_type": "markdown", + "id": "86", + "metadata": { + "id": "cddc45ff", + "papermill": { + "duration": 0.015454, + "end_time": "2022-04-19T09:11:17.892511", + "exception": false, + "start_time": "2022-04-19T09:11:17.877057", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Here, Random Forest gives the best prediction rule for $g_0$ and Lasso the best prediction rule for $m_0$, respectively. 
Let us fit the IRM model using the best ML method for each equation to get a final estimate for the treatment effect of eligibility." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87", + "metadata": { + "id": "9d4b8690", + "papermill": { + "duration": 42.950051, + "end_time": "2022-04-19T09:12:00.858025", + "exception": false, + "start_time": "2022-04-19T09:11:17.907974", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", + "dml_irm = DoubleMLIRM$new(data_ml, ml_g = randomForest,\n", + " ml_m = lasso_class,\n", + " trimming_threshold = 0.01, n_folds=5)\n", + "dml_irm$fit(store_predictions=TRUE)\n", + "dml_irm$summary()\n", + "best_irm <- dml_irm$coef\n", + "best_std_irm <- dml_irm$se" + ] + }, + { + "cell_type": "markdown", + "id": "88", + "metadata": { + "id": "92a77dd6", + "papermill": { + "duration": 0.015461, + "end_time": "2022-04-19T09:12:00.888702", + "exception": false, + "start_time": "2022-04-19T09:12:00.873241", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "These estimates that flexibly account for confounding are\n", + "substantially attenuated relative to the baseline estimate (*19559*) that does not account for confounding. They suggest much smaller causal effects of 401(k) eligiblity on financial asset holdings." + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.0.5" }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "papermill": { + "default_parameters": {}, + "duration": 427.936706, + "end_time": "2022-04-19T09:13:53.230849", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2022-04-19T09:06:45.294143", + "version": "2.3.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/PM4/r-identification-analysis-of-401-k-example-w-dags.Rmd b/PM4/r-identification-analysis-of-401-k-example-w-dags.Rmd new file mode 100644 index 00000000..bf3f40c1 --- /dev/null +++ b/PM4/r-identification-analysis-of-401-k-example-w-dags.Rmd @@ -0,0 +1,195 @@ +--- +title: An R Markdown document converted from "PM4/r-identification-analysis-of-401-k-example-w-dags.irnb" +output: html_document +--- + +# Using Dagitty in the Analysis of Impact of 401(k) on Net Financial Wealth + +```{r} +#install and load package +install.packages("dagitty") +install.packages("ggdag") +library(dagitty) +library(ggdag) +``` + +# Graphs for 401(K) Analsyis + +Here we have + * $Y$ -- net financial assets; + * $X$ -- worker characteristics (income, family size, other retirement plans; see lecture notes for details); + * $F$ -- latent (unobserved) firm characteristics + * $D$ -- 401(K) eligibility, deterimined by $F$ and $X$ + +**One graph (where F determines X):** + +```{r} +#generate a DAGs and plot them + +G1 = dagitty('dag{ +Y [outcome,pos="4, 0"] +D [exposure,pos="0, 0"] +X [confounder, pos="2,-2"] +F [uobserved, pos="0, -1"] +D -> Y +X -> D +F -> X +F -> D +X -> Y}') + + +ggdag(G1)+ theme_dag() +``` + +**List minimal adjustment sets to identify causal effects $D \to Y$** + + +```{r} +adjustmentSets( G1, "D", "Y",effect="total" ) +``` + +**What is the underlying principle?** + +Here 
conditioning on X blocks backdoor paths from Y to D (Pearl). Dagitty correctly finds X (and does many more correct decisions when we consider more elaborate structures. Why do we want to consider more elaborate structures? The very empirical problem requires us to do so!) + +**Another Graph (wherere $X$ determines $F$):** + +```{r} +#generate a couple of DAGs and plot them + +G2 = dagitty('dag{ +Y [outcome,pos="4, 0"] +D [exposure,pos="0, 0"] +X [confounder, pos="2,-2"] +F [uobserved, pos="0, -1"] +D -> Y +X -> D +X -> F +F -> D +X -> Y}') + + +ggdag(G2)+ theme_dag() +``` + +```{r} +adjustmentSets( G2, "D", "Y", effect="total" ) +``` + +**One more graph (encompassing the previous ones), where (F, X) are jointly determined by latent factors $A$.** + +We can allow in fact the whole triple (D, F, X) to be jointly determined by latent factors $A$. + +This is much more realistic graph to consider. + +```{r} +G3 = dagitty('dag{ +Y [outcome,pos="4, 0"] +D [exposure,pos="0, 0"] +X [confounder, pos="2,-2"] +F [unobserved, pos="0, -1"] +A [unobserved, pos="-1, -1"] +D -> Y +X -> D +F -> D +A -> F +A -> X +A -> D +X -> Y}') + +adjustmentSets( G3, "D", "Y", effect="total" ) + +ggdag(G3)+ theme_dag() +``` + +# Threat to Identification: + +What if $F$ also directly affects $Y$? (Note that there are no valid adjustment sets in this case.) + +```{r} +G4 = dagitty('dag{ +Y [outcome,pos="4, 0"] +D [exposure,pos="0, 0"] +X [confounder, pos="2,-2"] +F [unobserved, pos="0, -1"] +A [unobserved, pos="-1, -1"] +D -> Y +X -> D +F -> D +A -> F +A -> X +A -> D +F -> Y +X -> Y}') + + +ggdag(G4)+ theme_dag() +``` + +```{r} +adjustmentSets( G4, "D", "Y",effect="total" ) +``` + +**Note that no output means that there is no valid adustment set (among observed variables).** + +**How can F affect Y directly? Is it reasonable?** + +Introduce Match Amount $M$. The match amount is a potential important mediator (why mediator?). $M$ is not observed. Luckily, adjusting for $X$ still works if there is no arrow $F \to M$. + +```{r} +G5 = dagitty('dag{ +Y [outcome,pos="4, 0"] +D [exposure,pos="0, 0"] +X [confounder, pos="2,-2"] +F [unobserved, pos="0, -1"] +A [unobserved, pos="-1, -1"] +M [unobserved, pos="2, -.5"] +D -> Y +X -> D +F -> D +A -> F +A -> X +A -> D +D -> M +M -> Y +X -> M +X -> Y}') + +print( adjustmentSets( G5, "D", "Y",effect="total" ) ) + +ggdag(G5)+ theme_dag() +``` + +If there is an $F \to M$ arrow, then adjusting for $X$ is not sufficient. + +```{r} +G6 = dagitty('dag{ +Y [outcome,pos="4, 0"] +D [exposure,pos="0, 0"] +X [confounder, pos="2,-2"] +F [unobserved, pos="0, -1"] +A [unobserved, pos="-1, -1"] +M [uobserved, pos="2, -.5"] +D -> Y +X -> D +F -> D +A -> F +A -> X +D -> M +F -> M +A -> D +M -> Y +X -> M +X -> Y}') + +print( adjustmentSets( G6, "D", "Y" ),effect="total" ) + +ggdag(G6)+ theme_dag() +``` + +Again, note that no output was returned for the adjustment set. There is no valid adjustment set here. + + # Question: + +Given the analysis above, do you find the adjustment for workers' characteristics a credible strategy to identify the causal (total effect) of 401 (k) elligibility on net financial wealth? 
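As a complementary check on the graphs above, one can ask dagitty which D–Y paths remain open after conditioning on X, and which conditional independencies a graph implies. The sketch below uses the `paths()` and `impliedConditionalIndependencies()` functions from the dagitty R package and refers to the graph objects `G3` and `G4` defined earlier; it is an illustrative aside rather than part of the original notebook.

```r
# Paths from D to Y in G4 (the graph with F -> Y) and whether they are open given X.
# The causal path D -> Y is open by construction; the other open paths
# (through the unobserved F and A) are the backdoor paths that make
# adjustment for X insufficient here.
p <- paths(G4, "D", "Y", Z = "X")
p$paths[p$open]

# Testable implications of G3, the graph in which adjusting for X
# does identify the total effect of D on Y.
impliedConditionalIndependencies(G3)
```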
+ diff --git a/PM4/r-identification-analysis-of-401-k-example-w-dags.irnb b/PM4/r-identification-analysis-of-401-k-example-w-dags.irnb index fc387283..b9e05adc 100644 --- a/PM4/r-identification-analysis-of-401-k-example-w-dags.irnb +++ b/PM4/r-identification-analysis-of-401-k-example-w-dags.irnb @@ -1,670 +1,610 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.011558, - "end_time": "2021-04-20T21:06:30.038601", - "exception": false, - "start_time": "2021-04-20T21:06:30.027043", - "status": "completed" - }, - "tags": [], - "id": "bhmud6TUPIE1" - }, - "source": [ - "# Using Dagitty in the Analysis of Impact of 401(k) on Net Financial Wealth\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "_execution_state": "idle", - "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", - "execution": { - "iopub.execute_input": "2021-04-20T21:06:30.088435Z", - "iopub.status.busy": "2021-04-20T21:06:30.086566Z", - "iopub.status.idle": "2021-04-20T21:06:55.456113Z", - "shell.execute_reply": "2021-04-20T21:06:55.454477Z" - }, - "papermill": { - "duration": 25.408317, - "end_time": "2021-04-20T21:06:55.456487", - "exception": false, - "start_time": "2021-04-20T21:06:30.048170", - "status": "completed" - }, - "tags": [], - "id": "t1xb29BvPIE4" - }, - "outputs": [], - "source": [ - "#install and load package\n", - "install.packages(\"dagitty\")\n", - "install.packages(\"ggdag\")\n", - "library(dagitty)\n", - "library(ggdag)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.010708, - "end_time": "2021-04-20T21:06:55.479480", - "exception": false, - "start_time": "2021-04-20T21:06:55.468772", - "status": "completed" - }, - "tags": [], - "id": "C9Uqph8wPIE7" - }, - "source": [ - "# Graphs for 401(K) Analsyis\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.010554, - "end_time": "2021-04-20T21:06:55.500684", - "exception": false, - "start_time": "2021-04-20T21:06:55.490130", - "status": "completed" - }, - "tags": [], - "id": "B2_fiQOEPIE7" - }, - "source": [ - "Here we have\n", - " * $Y$ -- net financial assets;\n", - " * $X$ -- worker characteristics (income, family size, other retirement plans; see lecture notes for details);\n", - " * $F$ -- latent (unobserved) firm characteristics\n", - " * $D$ -- 401(K) eligibility, deterimined by $F$ and $X$" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "bhmud6TUPIE1", + "papermill": { + "duration": 0.011558, + "end_time": "2021-04-20T21:06:30.038601", + "exception": false, + "start_time": "2021-04-20T21:06:30.027043", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.010886, - "end_time": "2021-04-20T21:06:55.522231", - "exception": false, - "start_time": "2021-04-20T21:06:55.511345", - "status": "completed" - }, - "tags": [], - "id": "-U6yQWBgPIE7" - }, - "source": [ - "**One graph (where F determines X):**\n" - ] + "tags": [] + }, + "source": [ + "# Using Dagitty in the Analysis of Impact of 401(k) on Net Financial Wealth\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "_execution_state": "idle", + "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", + "id": "t1xb29BvPIE4", + "papermill": { + "duration": 25.408317, + "end_time": "2021-04-20T21:06:55.456487", + "exception": false, + "start_time": "2021-04-20T21:06:30.048170", + "status": "completed" }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2021-04-20T21:06:55.576427Z", - "iopub.status.busy": "2021-04-20T21:06:55.547730Z", - "iopub.status.idle": "2021-04-20T21:06:56.492498Z", - "shell.execute_reply": "2021-04-20T21:06:56.492869Z" - }, - "papermill": { - "duration": 0.960286, - "end_time": "2021-04-20T21:06:56.493094", - "exception": false, - "start_time": "2021-04-20T21:06:55.532808", - "status": "completed" - }, - "tags": [], - "id": "WmA30w14PIE7" - }, - "outputs": [], - "source": [ - "#generate a DAGs and plot them\n", - "\n", - "G1 = dagitty('dag{\n", - "Y [outcome,pos=\"4, 0\"]\n", - "D [exposure,pos=\"0, 0\"]\n", - "X [confounder, pos=\"2,-2\"]\n", - "F [uobserved, pos=\"0, -1\"]\n", - "D -> Y\n", - "X -> D\n", - "F -> X\n", - "F -> D\n", - "X -> Y}')\n", - "\n", - "\n", - "ggdag(G1)+ theme_dag()" - ] + "tags": [] + }, + "outputs": [], + "source": [ + "#install and load package\n", + "install.packages(\"dagitty\")\n", + "install.packages(\"ggdag\")\n", + "library(dagitty)\n", + "library(ggdag)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "C9Uqph8wPIE7", + "papermill": { + "duration": 0.010708, + "end_time": "2021-04-20T21:06:55.479480", + "exception": false, + "start_time": "2021-04-20T21:06:55.468772", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.011939, - "end_time": "2021-04-20T21:06:56.517753", - "exception": false, - "start_time": "2021-04-20T21:06:56.505814", - "status": "completed" - }, - "tags": [], - "id": "xwy7BUBGPIE7" - }, - "source": [ - "**List minimal adjustment sets to identify causal effects $D \\to Y$**\n", - "\n" - ] + "tags": [] + }, + "source": [ + "# Graphs for 401(K) Analsyis\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "B2_fiQOEPIE7", + "papermill": { + "duration": 0.010554, + "end_time": "2021-04-20T21:06:55.500684", + "exception": false, + "start_time": "2021-04-20T21:06:55.490130", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2021-04-20T21:06:56.546113Z", - "iopub.status.busy": "2021-04-20T21:06:56.544950Z", - "iopub.status.idle": "2021-04-20T21:06:56.576357Z", - "shell.execute_reply": "2021-04-20T21:06:56.575272Z" - }, - "papermill": { - "duration": 0.047008, - "end_time": "2021-04-20T21:06:56.576507", - "exception": false, - "start_time": "2021-04-20T21:06:56.529499", - "status": "completed" - }, - "tags": [], - "id": "MGD0C6SbPIE8" - }, - "outputs": [], - "source": [ - "adjustmentSets( G1, \"D\", \"Y\",effect=\"total\" )" - ] + "tags": [] + }, + "source": [ + "Here we have\n", + " * $Y$ -- net financial assets;\n", + " * $X$ -- worker characteristics (income, family size, other retirement plans; see lecture notes for details);\n", + " * $F$ -- latent (unobserved) firm characteristics\n", + " * $D$ -- 401(K) eligibility, deterimined by $F$ and $X$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-U6yQWBgPIE7", + "papermill": { + "duration": 0.010886, + "end_time": "2021-04-20T21:06:55.522231", + "exception": false, + "start_time": "2021-04-20T21:06:55.511345", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.012126, - "end_time": "2021-04-20T21:06:56.600815", - "exception": false, - "start_time": "2021-04-20T21:06:56.588689", - "status": "completed" - }, - "tags": [], - "id": "gkQCVQpoPIE8" - }, - "source": [ - "**What is the underlying 
principle?**\n", - "\n", - "Here conditioning on X blocks backdoor paths from Y to D (Pearl). Dagitty correctly finds X (and does many more correct decisions when we consider more elaborate structures. Why do we want to consider more elaborate structures? The very empirical problem requires us to do so!)" - ] + "tags": [] + }, + "source": [ + "**One graph (where F determines X):**\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WmA30w14PIE7", + "papermill": { + "duration": 0.960286, + "end_time": "2021-04-20T21:06:56.493094", + "exception": false, + "start_time": "2021-04-20T21:06:55.532808", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.012073, - "end_time": "2021-04-20T21:06:56.625022", - "exception": false, - "start_time": "2021-04-20T21:06:56.612949", - "status": "completed" - }, - "tags": [], - "id": "SyCZsVL2PIE8" - }, - "source": [ - "**Another Graph (wherere $X$ determines $F$):**" - ] + "tags": [] + }, + "outputs": [], + "source": [ + "#generate a DAGs and plot them\n", + "\n", + "G1 = dagitty('dag{\n", + "Y [outcome,pos=\"4, 0\"]\n", + "D [exposure,pos=\"0, 0\"]\n", + "X [confounder, pos=\"2,-2\"]\n", + "F [uobserved, pos=\"0, -1\"]\n", + "D -> Y\n", + "X -> D\n", + "F -> X\n", + "F -> D\n", + "X -> Y}')\n", + "\n", + "\n", + "ggdag(G1)+ theme_dag()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xwy7BUBGPIE7", + "papermill": { + "duration": 0.011939, + "end_time": "2021-04-20T21:06:56.517753", + "exception": false, + "start_time": "2021-04-20T21:06:56.505814", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2021-04-20T21:06:56.654884Z", - "iopub.status.busy": "2021-04-20T21:06:56.653684Z", - "iopub.status.idle": "2021-04-20T21:06:57.016513Z", - "shell.execute_reply": "2021-04-20T21:06:57.017098Z" - }, - "papermill": { - "duration": 0.380125, - "end_time": "2021-04-20T21:06:57.017337", - "exception": false, - "start_time": "2021-04-20T21:06:56.637212", - "status": "completed" - }, - "tags": [], - "id": "6l83sAd8PIE8" - }, - "outputs": [], - "source": [ - "#generate a couple of DAGs and plot them\n", - "\n", - "G2 = dagitty('dag{\n", - "Y [outcome,pos=\"4, 0\"]\n", - "D [exposure,pos=\"0, 0\"]\n", - "X [confounder, pos=\"2,-2\"]\n", - "F [uobserved, pos=\"0, -1\"]\n", - "D -> Y\n", - "X -> D\n", - "X -> F\n", - "F -> D\n", - "X -> Y}')\n", - "\n", - "\n", - "ggdag(G2)+ theme_dag()" - ] + "tags": [] + }, + "source": [ + "**List minimal adjustment sets to identify causal effects $D \\to Y$**\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MGD0C6SbPIE8", + "papermill": { + "duration": 0.047008, + "end_time": "2021-04-20T21:06:56.576507", + "exception": false, + "start_time": "2021-04-20T21:06:56.529499", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2021-04-20T21:06:57.060217Z", - "iopub.status.busy": "2021-04-20T21:06:57.058501Z", - "iopub.status.idle": "2021-04-20T21:06:57.094192Z", - "shell.execute_reply": "2021-04-20T21:06:57.092241Z" - }, - "papermill": { - "duration": 0.056051, - "end_time": "2021-04-20T21:06:57.094387", - "exception": false, - "start_time": "2021-04-20T21:06:57.038336", - "status": "completed" - }, - "tags": [], - "id": "qXhEMpdXPIE9" - }, - "outputs": [], - "source": [ - "adjustmentSets( G2, \"D\", \"Y\", 
effect=\"total\" )\n" - ] + "tags": [] + }, + "outputs": [], + "source": [ + "adjustmentSets( G1, \"D\", \"Y\",effect=\"total\" )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gkQCVQpoPIE8", + "papermill": { + "duration": 0.012126, + "end_time": "2021-04-20T21:06:56.600815", + "exception": false, + "start_time": "2021-04-20T21:06:56.588689", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.014464, - "end_time": "2021-04-20T21:06:57.130426", - "exception": false, - "start_time": "2021-04-20T21:06:57.115962", - "status": "completed" - }, - "tags": [], - "id": "A6kFc2R1PIE9" - }, - "source": [ - "**One more graph (encompassing the previous ones), where (F, X) are jointly determined by latent factors $A$.**\n", - "\n", - "We can allow in fact the whole triple (D, F, X) to be jointly determined by latent factors $A$." - ] + "tags": [] + }, + "source": [ + "**What is the underlying principle?**\n", + "\n", + "Here conditioning on X blocks backdoor paths from Y to D (Pearl). Dagitty correctly finds X (and does many more correct decisions when we consider more elaborate structures. Why do we want to consider more elaborate structures? The very empirical problem requires us to do so!)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SyCZsVL2PIE8", + "papermill": { + "duration": 0.012073, + "end_time": "2021-04-20T21:06:56.625022", + "exception": false, + "start_time": "2021-04-20T21:06:56.612949", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.014208, - "end_time": "2021-04-20T21:06:57.159030", - "exception": false, - "start_time": "2021-04-20T21:06:57.144822", - "status": "completed" - }, - "tags": [], - "id": "HJYKqepTPIE-" - }, - "source": [ - "This is much more realistic graph to consider." 
- ] + "tags": [] + }, + "source": [ + "**Another Graph (wherere $X$ determines $F$):**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6l83sAd8PIE8", + "papermill": { + "duration": 0.380125, + "end_time": "2021-04-20T21:06:57.017337", + "exception": false, + "start_time": "2021-04-20T21:06:56.637212", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2021-04-20T21:06:57.194754Z", - "iopub.status.busy": "2021-04-20T21:06:57.192851Z", - "iopub.status.idle": "2021-04-20T21:06:57.567286Z", - "shell.execute_reply": "2021-04-20T21:06:57.565727Z" - }, - "papermill": { - "duration": 0.393034, - "end_time": "2021-04-20T21:06:57.567429", - "exception": false, - "start_time": "2021-04-20T21:06:57.174395", - "status": "completed" - }, - "tags": [], - "id": "7O0MbTDCPIE-" - }, - "outputs": [], - "source": [ - "G3 = dagitty('dag{\n", - "Y [outcome,pos=\"4, 0\"]\n", - "D [exposure,pos=\"0, 0\"]\n", - "X [confounder, pos=\"2,-2\"]\n", - "F [unobserved, pos=\"0, -1\"]\n", - "A [unobserved, pos=\"-1, -1\"]\n", - "D -> Y\n", - "X -> D\n", - "F -> D\n", - "A -> F\n", - "A -> X\n", - "A -> D\n", - "X -> Y}')\n", - "\n", - "adjustmentSets( G3, \"D\", \"Y\", effect=\"total\" )\n", - "\n", - "ggdag(G3)+ theme_dag()" - ] + "tags": [] + }, + "outputs": [], + "source": [ + "#generate a couple of DAGs and plot them\n", + "\n", + "G2 = dagitty('dag{\n", + "Y [outcome,pos=\"4, 0\"]\n", + "D [exposure,pos=\"0, 0\"]\n", + "X [confounder, pos=\"2,-2\"]\n", + "F [uobserved, pos=\"0, -1\"]\n", + "D -> Y\n", + "X -> D\n", + "X -> F\n", + "F -> D\n", + "X -> Y}')\n", + "\n", + "\n", + "ggdag(G2)+ theme_dag()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qXhEMpdXPIE9", + "papermill": { + "duration": 0.056051, + "end_time": "2021-04-20T21:06:57.094387", + "exception": false, + "start_time": "2021-04-20T21:06:57.038336", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.016255, - "end_time": "2021-04-20T21:06:57.599705", - "exception": false, - "start_time": "2021-04-20T21:06:57.583450", - "status": "completed" - }, - "tags": [], - "id": "Mjus09VjPIE-" - }, - "source": [ - "# Threat to Identification:\n", - "\n", - "What if $F$ also directly affects $Y$? 
(Note that there are no valid adjustment sets in this case.)" - ] + "tags": [] + }, + "outputs": [], + "source": [ + "adjustmentSets( G2, \"D\", \"Y\", effect=\"total\" )\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A6kFc2R1PIE9", + "papermill": { + "duration": 0.014464, + "end_time": "2021-04-20T21:06:57.130426", + "exception": false, + "start_time": "2021-04-20T21:06:57.115962", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2021-04-20T21:06:57.637392Z", - "iopub.status.busy": "2021-04-20T21:06:57.635609Z", - "iopub.status.idle": "2021-04-20T21:06:58.024479Z", - "shell.execute_reply": "2021-04-20T21:06:58.022247Z" - }, - "papermill": { - "duration": 0.40878, - "end_time": "2021-04-20T21:06:58.024720", - "exception": false, - "start_time": "2021-04-20T21:06:57.615940", - "status": "completed" - }, - "tags": [], - "id": "NjqrWLhXPIE-" - }, - "outputs": [], - "source": [ - "G4 = dagitty('dag{\n", - "Y [outcome,pos=\"4, 0\"]\n", - "D [exposure,pos=\"0, 0\"]\n", - "X [confounder, pos=\"2,-2\"]\n", - "F [unobserved, pos=\"0, -1\"]\n", - "A [unobserved, pos=\"-1, -1\"]\n", - "D -> Y\n", - "X -> D\n", - "F -> D\n", - "A -> F\n", - "A -> X\n", - "A -> D\n", - "F -> Y\n", - "X -> Y}')\n", - "\n", - "\n", - "ggdag(G4)+ theme_dag()" - ] + "tags": [] + }, + "source": [ + "**One more graph (encompassing the previous ones), where (F, X) are jointly determined by latent factors $A$.**\n", + "\n", + "We can allow in fact the whole triple (D, F, X) to be jointly determined by latent factors $A$." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HJYKqepTPIE-", + "papermill": { + "duration": 0.014208, + "end_time": "2021-04-20T21:06:57.159030", + "exception": false, + "start_time": "2021-04-20T21:06:57.144822", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2021-04-20T21:06:58.065152Z", - "iopub.status.busy": "2021-04-20T21:06:58.063890Z", - "iopub.status.idle": "2021-04-20T21:06:58.089473Z", - "shell.execute_reply": "2021-04-20T21:06:58.088587Z" - }, - "papermill": { - "duration": 0.046811, - "end_time": "2021-04-20T21:06:58.089616", - "exception": false, - "start_time": "2021-04-20T21:06:58.042805", - "status": "completed" - }, - "tags": [], - "id": "BfQZaKqFPIE_" - }, - "outputs": [], - "source": [ - "adjustmentSets( G4, \"D\", \"Y\",effect=\"total\" )" - ] + "tags": [] + }, + "source": [ + "This is much more realistic graph to consider." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7O0MbTDCPIE-", + "papermill": { + "duration": 0.393034, + "end_time": "2021-04-20T21:06:57.567429", + "exception": false, + "start_time": "2021-04-20T21:06:57.174395", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.017351, - "end_time": "2021-04-20T21:06:58.124909", - "exception": false, - "start_time": "2021-04-20T21:06:58.107558", - "status": "completed" - }, - "tags": [], - "id": "1Vz8Ply-PIE_" - }, - "source": [ - "**Note that no output means that there is no valid adustment set (among observed variables).**" - ] + "tags": [] + }, + "outputs": [], + "source": [ + "G3 = dagitty('dag{\n", + "Y [outcome,pos=\"4, 0\"]\n", + "D [exposure,pos=\"0, 0\"]\n", + "X [confounder, pos=\"2,-2\"]\n", + "F [unobserved, pos=\"0, -1\"]\n", + "A [unobserved, pos=\"-1, -1\"]\n", + "D -> Y\n", + "X -> D\n", + "F -> D\n", + "A -> F\n", + "A -> X\n", + "A -> D\n", + "X -> Y}')\n", + "\n", + "adjustmentSets( G3, \"D\", \"Y\", effect=\"total\" )\n", + "\n", + "ggdag(G3)+ theme_dag()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mjus09VjPIE-", + "papermill": { + "duration": 0.016255, + "end_time": "2021-04-20T21:06:57.599705", + "exception": false, + "start_time": "2021-04-20T21:06:57.583450", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.017324, - "end_time": "2021-04-20T21:06:58.159612", - "exception": false, - "start_time": "2021-04-20T21:06:58.142288", - "status": "completed" - }, - "tags": [], - "id": "yF5-jrbKPIE_" - }, - "source": [ - "**How can F affect Y directly? Is it reasonable?**" - ] + "tags": [] + }, + "source": [ + "# Threat to Identification:\n", + "\n", + "What if $F$ also directly affects $Y$? (Note that there are no valid adjustment sets in this case.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NjqrWLhXPIE-", + "papermill": { + "duration": 0.40878, + "end_time": "2021-04-20T21:06:58.024720", + "exception": false, + "start_time": "2021-04-20T21:06:57.615940", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.017002, - "end_time": "2021-04-20T21:06:58.193722", - "exception": false, - "start_time": "2021-04-20T21:06:58.176720", - "status": "completed" - }, - "tags": [], - "id": "Tn6bYdP6PIE_" - }, - "source": [ - "Introduce Match Amount $M$. The match amount is a potential important mediator (why mediator?). $M$ is not observed. Luckily, adjusting for $X$ still works if there is no arrow $F \\to M$." 
- ] + "tags": [] + }, + "outputs": [], + "source": [ + "G4 = dagitty('dag{\n", + "Y [outcome,pos=\"4, 0\"]\n", + "D [exposure,pos=\"0, 0\"]\n", + "X [confounder, pos=\"2,-2\"]\n", + "F [unobserved, pos=\"0, -1\"]\n", + "A [unobserved, pos=\"-1, -1\"]\n", + "D -> Y\n", + "X -> D\n", + "F -> D\n", + "A -> F\n", + "A -> X\n", + "A -> D\n", + "F -> Y\n", + "X -> Y}')\n", + "\n", + "\n", + "ggdag(G4)+ theme_dag()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BfQZaKqFPIE_", + "papermill": { + "duration": 0.046811, + "end_time": "2021-04-20T21:06:58.089616", + "exception": false, + "start_time": "2021-04-20T21:06:58.042805", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2021-04-20T21:06:58.234225Z", - "iopub.status.busy": "2021-04-20T21:06:58.232383Z", - "iopub.status.idle": "2021-04-20T21:06:58.596392Z", - "shell.execute_reply": "2021-04-20T21:06:58.594515Z" - }, - "papermill": { - "duration": 0.385562, - "end_time": "2021-04-20T21:06:58.596529", - "exception": false, - "start_time": "2021-04-20T21:06:58.210967", - "status": "completed" - }, - "tags": [], - "id": "hZVzQJjbPIFA" - }, - "outputs": [], - "source": [ - "G5 = dagitty('dag{\n", - "Y [outcome,pos=\"4, 0\"]\n", - "D [exposure,pos=\"0, 0\"]\n", - "X [confounder, pos=\"2,-2\"]\n", - "F [unobserved, pos=\"0, -1\"]\n", - "A [unobserved, pos=\"-1, -1\"]\n", - "M [unobserved, pos=\"2, -.5\"]\n", - "D -> Y\n", - "X -> D\n", - "F -> D\n", - "A -> F\n", - "A -> X\n", - "A -> D\n", - "D -> M\n", - "M -> Y\n", - "X -> M\n", - "X -> Y}')\n", - "\n", - "print( adjustmentSets( G5, \"D\", \"Y\",effect=\"total\" ) )\n", - "\n", - "ggdag(G5)+ theme_dag()" - ] + "tags": [] + }, + "outputs": [], + "source": [ + "adjustmentSets( G4, \"D\", \"Y\",effect=\"total\" )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1Vz8Ply-PIE_", + "papermill": { + "duration": 0.017351, + "end_time": "2021-04-20T21:06:58.124909", + "exception": false, + "start_time": "2021-04-20T21:06:58.107558", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.019066, - "end_time": "2021-04-20T21:06:58.635211", - "exception": false, - "start_time": "2021-04-20T21:06:58.616145", - "status": "completed" - }, - "tags": [], - "id": "o9-rScGhPIFA" - }, - "source": [ - "If there is an $F \\to M$ arrow, then adjusting for $X$ is not sufficient." 
- ] + "tags": [] + }, + "source": [ + "**Note that no output means that there is no valid adustment set (among observed variables).**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yF5-jrbKPIE_", + "papermill": { + "duration": 0.017324, + "end_time": "2021-04-20T21:06:58.159612", + "exception": false, + "start_time": "2021-04-20T21:06:58.142288", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2021-04-20T21:06:58.679151Z", - "iopub.status.busy": "2021-04-20T21:06:58.677648Z", - "iopub.status.idle": "2021-04-20T21:06:59.019377Z", - "shell.execute_reply": "2021-04-20T21:06:59.018555Z" - }, - "papermill": { - "duration": 0.365274, - "end_time": "2021-04-20T21:06:59.019538", - "exception": false, - "start_time": "2021-04-20T21:06:58.654264", - "status": "completed" - }, - "tags": [], - "id": "IGI7piKFPIFA" - }, - "outputs": [], - "source": [ - "G6 = dagitty('dag{\n", - "Y [outcome,pos=\"4, 0\"]\n", - "D [exposure,pos=\"0, 0\"]\n", - "X [confounder, pos=\"2,-2\"]\n", - "F [unobserved, pos=\"0, -1\"]\n", - "A [unobserved, pos=\"-1, -1\"]\n", - "M [uobserved, pos=\"2, -.5\"]\n", - "D -> Y\n", - "X -> D\n", - "F -> D\n", - "A -> F\n", - "A -> X\n", - "D -> M\n", - "F -> M\n", - "A -> D\n", - "M -> Y\n", - "X -> M\n", - "X -> Y}')\n", - "\n", - "print( adjustmentSets( G6, \"D\", \"Y\" ),effect=\"total\" )\n", - "\n", - "ggdag(G6)+ theme_dag()" - ] + "tags": [] + }, + "source": [ + "**How can F affect Y directly? Is it reasonable?**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Tn6bYdP6PIE_", + "papermill": { + "duration": 0.017002, + "end_time": "2021-04-20T21:06:58.193722", + "exception": false, + "start_time": "2021-04-20T21:06:58.176720", + "status": "completed" }, - { - "cell_type": "markdown", - "source": [ - "Again, note that no output was returned for the adjustment set. There is no valid adjustment set here." - ], - "metadata": { - "id": "9DlpbknyvuI0" - } + "tags": [] + }, + "source": [ + "Introduce Match Amount $M$. The match amount is a potential important mediator (why mediator?). $M$ is not observed. Luckily, adjusting for $X$ still works if there is no arrow $F \\to M$." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "hZVzQJjbPIFA", + "papermill": { + "duration": 0.385562, + "end_time": "2021-04-20T21:06:58.596529", + "exception": false, + "start_time": "2021-04-20T21:06:58.210967", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.020751, - "end_time": "2021-04-20T21:06:59.062138", - "exception": false, - "start_time": "2021-04-20T21:06:59.041387", - "status": "completed" - }, - "tags": [], - "id": "oMDvjvSNPIFA" - }, - "source": [ - " # Question:\n", - "\n", - "Given the analysis above, do you find the adjustment for workers' characteristics a credible strategy to identify the causal (total effect) of 401 (k) elligibility on net financial wealth?\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" + "tags": [] + }, + "outputs": [], + "source": [ + "G5 = dagitty('dag{\n", + "Y [outcome,pos=\"4, 0\"]\n", + "D [exposure,pos=\"0, 0\"]\n", + "X [confounder, pos=\"2,-2\"]\n", + "F [unobserved, pos=\"0, -1\"]\n", + "A [unobserved, pos=\"-1, -1\"]\n", + "M [unobserved, pos=\"2, -.5\"]\n", + "D -> Y\n", + "X -> D\n", + "F -> D\n", + "A -> F\n", + "A -> X\n", + "A -> D\n", + "D -> M\n", + "M -> Y\n", + "X -> M\n", + "X -> Y}')\n", + "\n", + "print( adjustmentSets( G5, \"D\", \"Y\",effect=\"total\" ) )\n", + "\n", + "ggdag(G5)+ theme_dag()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o9-rScGhPIFA", + "papermill": { + "duration": 0.019066, + "end_time": "2021-04-20T21:06:58.635211", + "exception": false, + "start_time": "2021-04-20T21:06:58.616145", + "status": "completed" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "3.6.3" + "tags": [] + }, + "source": [ + "If there is an $F \\to M$ arrow, then adjusting for $X$ is not sufficient." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IGI7piKFPIFA", + "papermill": { + "duration": 0.365274, + "end_time": "2021-04-20T21:06:59.019538", + "exception": false, + "start_time": "2021-04-20T21:06:58.654264", + "status": "completed" }, + "tags": [] + }, + "outputs": [], + "source": [ + "G6 = dagitty('dag{\n", + "Y [outcome,pos=\"4, 0\"]\n", + "D [exposure,pos=\"0, 0\"]\n", + "X [confounder, pos=\"2,-2\"]\n", + "F [unobserved, pos=\"0, -1\"]\n", + "A [unobserved, pos=\"-1, -1\"]\n", + "M [uobserved, pos=\"2, -.5\"]\n", + "D -> Y\n", + "X -> D\n", + "F -> D\n", + "A -> F\n", + "A -> X\n", + "D -> M\n", + "F -> M\n", + "A -> D\n", + "M -> Y\n", + "X -> M\n", + "X -> Y}')\n", + "\n", + "print( adjustmentSets( G6, \"D\", \"Y\" ),effect=\"total\" )\n", + "\n", + "ggdag(G6)+ theme_dag()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9DlpbknyvuI0" + }, + "source": [ + "Again, note that no output was returned for the adjustment set. There is no valid adjustment set here." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oMDvjvSNPIFA", "papermill": { - "default_parameters": {}, - "duration": 32.278732, - "end_time": "2021-04-20T21:06:59.193141", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2021-04-20T21:06:26.914409", - "version": "2.2.2" + "duration": 0.020751, + "end_time": "2021-04-20T21:06:59.062138", + "exception": false, + "start_time": "2021-04-20T21:06:59.041387", + "status": "completed" }, - "colab": { - "provenance": [] - } + "tags": [] + }, + "source": [ + " # Question:\n", + "\n", + "Given the analysis above, do you find the adjustment for workers' characteristics a credible strategy to identify the causal (total effect) of 401 (k) elligibility on net financial wealth?\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.6.3" }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "papermill": { + "default_parameters": {}, + "duration": 32.278732, + "end_time": "2021-04-20T21:06:59.193141", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2021-04-20T21:06:26.914409", + "version": "2.2.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/PM4/r_debiased_ml_for_partially_linear_model_growth.Rmd b/PM4/r_debiased_ml_for_partially_linear_model_growth.Rmd new file mode 100644 index 00000000..406f0f9d --- /dev/null +++ b/PM4/r_debiased_ml_for_partially_linear_model_growth.Rmd @@ -0,0 +1,185 @@ +--- +title: An R Markdown document converted from "PM4/r_debiased_ml_for_partially_linear_model_growth.irnb" +output: html_document +--- + +# Double/Debiased Machine Learning for the Partially Linear Regression Model + +This is a simple implementation of Debiased Machine Learning for the Partially Linear Regression Model, which provides an application of DML inference to determine the causal effect of countries' intitial wealth on the rate of economic growth. + + +Reference: + +- https://arxiv.org/abs/1608.00060 +- https://www.amazon.com/Business-Data-Science-Combining-Accelerate/dp/1260452778 + +The code is based on the book. 
+ +```{r} +install.packages("xtable") +install.packages("hdm") +install.packages("randomForest") +install.packages("glmnet") +install.packages("sandwich") + +library(xtable) +library(randomForest) +library(hdm) +library(glmnet) +library(sandwich) + +set.seed(1) +``` + +```{r} +file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/GrowthData.csv" +data <- read.csv(file) +data <- subset(data, select = -1) # get rid of index column +head(data) +dim(data) +``` + +```{r} +y = as.matrix(data[,1]) # outcome: growth rate +d = as.matrix(data[,3]) # treatment: initial wealth +x = as.matrix(data[,-c(1,2,3)]) # controls: country characteristics + +# some summary statistics +cat(sprintf("\nThe length of y is %g \n", length(y) )) +cat(sprintf("\nThe number of features in x is %g \n", dim(x)[2] )) + +lres=summary(lm(y~d +x))$coef[2,1:2] +cat(sprintf("\nNaive OLS that uses all features w/o cross-fitting Y ~ D+X yields: \ncoef (se) = %g (%g)\n", lres[1] , lres[2])) +``` + +# DML algorithm + +Here we perform estimation and inference of the predictive coefficient $\alpha$ in the partially linear statistical model, +$$ +Y = D\alpha + g(X) + U, \quad E (U | D, X) = 0. +$$ +For $\tilde Y = Y- E(Y|X)$ and $\tilde D= D- E(D|X)$, we can write +$$ +\tilde Y = \alpha \tilde D + U, \quad E (U |\tilde D) =0. +$$ +Parameter $\alpha$ is then estimated using a cross-fitting approach to obtain the residuals $\tilde D$ and $\tilde Y$. +The algorithm consumes $Y, D, X$, and machine learning methods for learning the residuals $\tilde Y$ and $\tilde D$, where +the residuals are obtained by cross-validation (cross-fitting). + +The statistical parameter $\alpha$ has a causal interpretation of being the effect of $D$ on $Y$ in the causal DAG $$ D\to Y, \quad X\to (D,Y)$$ or the counterfactual outcome model with conditionally exogenous (conditionally random) assignment of treatment $D$ given $X$: +$$ +Y(d) = d\alpha + g(X) + U(d),\quad U(d) \text{ indep } D |X, \quad Y = Y(D), \quad U = U(D). +$$ + +```{r} +DML2.for.PLM <- function(x, d, y, dreg, yreg, nfold=2) { + nobs <- nrow(x) #number of observations + foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices + I <- split(1:nobs, foldid) #split observation indices into folds + ytil <- dtil <- rep(NA, nobs) + cat("fold: ") + for(b in 1:length(I)){ + dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out + yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a fold out + dhat <- predict(dfit, x[I[[b]],], type="response") #predict the left-out fold + yhat <- predict(yfit, x[I[[b]],], type="response") #predict the left-out fold + dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold + ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residual for the left-out fold + cat(b," ") + } + rfit <- lm(ytil ~ dtil) #estimate the main parameter by regressing one residual on the other + coef.est <- coef(rfit)[2] #extract coefficient + se <- sqrt(vcovHC(rfit)[2,2]) #record robust standard error + cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) #printing output + return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil) ) #save output and residuals +} +``` + +We now run through DML using the following first-stage models: + 1. OLS + 2. (Rigorous) Lasso + 3. Random Forests + 4.
Mix of Random Forest and Lasso + +```{r} +#DML with OLS +cat(sprintf("\nDML with OLS w/o feature selection \n")) +dreg <- function(x,d){ glmnet(x, d, lambda = 0) } #ML method= OLS using glmnet; using lm gives bugs +yreg <- function(x,y){ glmnet(x, y, lambda = 0) } #ML method = OLS +DML2.OLS = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10) + + +#DML with Lasso: +cat(sprintf("\nDML with Lasso \n")) +dreg <- function(x,d){ rlasso(x,d, post=FALSE) } #ML method= lasso from hdm +yreg <- function(x,y){ rlasso(x,y, post=FALSE) } #ML method = lasso from hdm +DML2.lasso = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10) + + +#DML with Random Forest: +cat(sprintf("\nDML with Random Forest \n")) +dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest +yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest +DML2.RF = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10) + +#DML MIX: +cat(sprintf("\nDML with Lasso for D and Random Forest for Y \n")) +dreg <- function(x,d){ rlasso(x,d, post=FALSE) } #ML method=Forest +yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest +DML2.mix = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10) +``` + +Now we examine the RMSE of D and Y to see which method performs well in the first-stage. We print all results below in the following table: + +```{r} +prRes.D<- c( mean((DML2.OLS$dtil)^2), mean((DML2.lasso$dtil)^2), mean((DML2.RF$dtil)^2), mean((DML2.mix$dtil)^2)); +prRes.Y<- c(mean((DML2.OLS$ytil)^2), mean((DML2.lasso$ytil)^2),mean((DML2.RF$ytil)^2),mean((DML2.mix$ytil)^2)); +prRes<- rbind(sqrt(prRes.D), sqrt(prRes.Y)); +rownames(prRes)<- c("RMSE D", "RMSE Y"); +colnames(prRes)<- c("OLS", "Lasso", "RF", "Mix") +``` + +```{r} +table <- matrix(0,4,4) + +# Point Estimate +table[1,1] <- as.numeric(DML2.OLS$coef.est) +table[2,1] <- as.numeric(DML2.lasso$coef.est) +table[3,1] <- as.numeric(DML2.RF$coef.est) +table[4,1] <- as.numeric(DML2.mix$coef.est) + +# SE +table[1,2] <- as.numeric(DML2.OLS$se) +table[2,2] <- as.numeric(DML2.lasso$se) +table[3,2] <- as.numeric(DML2.RF$se) +table[4,2] <- as.numeric(DML2.mix$se) + +# RMSE Y +table[1,3] <- as.numeric(prRes[2,1]) +table[2,3] <- as.numeric(prRes[2,2]) +table[3,3] <- as.numeric(prRes[2,3]) +table[4,3] <- as.numeric(prRes[2,4]) + +# RMSE D +table[1,4] <- as.numeric(prRes[1,1]) +table[2,4] <- as.numeric(prRes[1,2]) +table[3,4] <- as.numeric(prRes[1,3]) +table[4,4] <- as.numeric(prRes[1,4]) + + + +# print results +colnames(table) <- c("Estimate","Standard Error", "RMSE Y", "RMSE D") +rownames(table) <- c("OLS", "Lasso", "RF", "RF/Lasso Mix") +table +``` + +```{r} +print(table, digit=3) +``` + +```{r} +tab<- xtable(table, digits=3) +print(tab, type="latex") +``` + diff --git a/PM4/r_debiased_ml_for_partially_linear_model_growth.irnb b/PM4/r_debiased_ml_for_partially_linear_model_growth.irnb index bd1946ab..26f2cc70 100644 --- a/PM4/r_debiased_ml_for_partially_linear_model_growth.irnb +++ b/PM4/r_debiased_ml_for_partially_linear_model_growth.irnb @@ -1,379 +1,354 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.005873, - "end_time": "2021-02-13T17:32:35.332473", - "exception": false, - "start_time": "2021-02-13T17:32:35.326600", - "status": "completed" - }, - "tags": [], - "id": "pb2sy6fp2HrD" - }, - "source": [ - "# Double/Debiased Machine Learning for the Partially Linear Regression Model\n", - "\n", - "This is a simple implementation of Debiased Machine Learning for the Partially Linear Regression Model, which provides an application of DML inference to determine the causal effect 
of countries' intitial wealth on the rate of economic growth.\n", - "\n", - "\n", - "Reference:\n", - "\n", - "- https://arxiv.org/abs/1608.00060\n", - "- https://www.amazon.com/Business-Data-Science-Combining-Accelerate/dp/1260452778\n", - "\n", - "The code is based on the book." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "execution": { - "iopub.execute_input": "2021-02-13T17:32:35.378544Z", - "iopub.status.busy": "2021-02-13T17:32:35.375857Z", - "iopub.status.idle": "2021-02-13T17:33:08.345196Z", - "shell.execute_reply": "2021-02-13T17:33:08.343218Z" - }, - "papermill": { - "duration": 33.00131, - "end_time": "2021-02-13T17:33:08.345470", - "exception": false, - "start_time": "2021-02-13T17:32:35.344160", - "status": "completed" - }, - "tags": [], - "id": "cbZVJg6j2HrH" - }, - "outputs": [], - "source": [ - "install.packages(\"xtable\")\n", - "install.packages(\"hdm\")\n", - "install.packages(\"randomForest\")\n", - "install.packages(\"glmnet\")\n", - "install.packages(\"sandwich\")\n", - "\n", - "library(xtable)\n", - "library(randomForest)\n", - "library(hdm)\n", - "library(glmnet)\n", - "library(sandwich)\n", - "\n", - "set.seed(1)" - ] - }, - { - "cell_type": "code", - "source": [ - "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/GrowthData.csv\"\n", - "data <- read.csv(file)\n", - "data <- subset(data, select = -1) # get rid of index column\n", - "head(data)\n", - "dim(data)" - ], - "metadata": { - "id": "1I8mEMEM33fS" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "y = as.matrix(data[,1]) # outcome: growth rate\n", - "d = as.matrix(data[,3]) # treatment: initial wealth\n", - "x = as.matrix(data[,-c(1,2,3)]) # controls: country characteristics\n", - "\n", - "# some summary statistics\n", - "cat(sprintf(\"\\nThe length of y is %g \\n\", length(y) ))\n", - "cat(sprintf(\"\\nThe number of features in x is %g \\n\", dim(x)[2] ))\n", - "\n", - "lres=summary(lm(y~d +x))$coef[2,1:2]\n", - "cat(sprintf(\"\\nNaive OLS that uses all features w/o cross-fitting Y ~ D+X yields: \\ncoef (se) = %g (%g)\\n\", lres[1] , lres[2]))" - ], - "metadata": { - "id": "hiwEVN6i4FIH" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.003613, - "end_time": "2021-02-13T17:32:35.340781", - "exception": false, - "start_time": "2021-02-13T17:32:35.337168", - "status": "completed" - }, - "tags": [], - "id": "OLPq-MxG2HrH" - }, - "source": [ - "# DML algorithm\n", - "\n", - "Here we perform estimation and inference of predictive coefficient $\\alpha$ in the partially linear statistical model,\n", - "$$\n", - "Y = D\\alpha + g(X) + U, \\quad E (U | D, X) = 0.\n", - "$$\n", - "For $\\tilde Y = Y- E(Y|X)$ and $\\tilde D= D- E(D|X)$, we can write\n", - "$$\n", - "\\tilde Y = \\alpha \\tilde D + U, \\quad E (U |\\tilde D) =0.\n", - "$$\n", - "Parameter $\\alpha$ is then estimated using cross-fitting approach to obtain the residuals $\\tilde D$ and $\\tilde Y$.\n", - "The algorithm comsumes $Y, D, X$, and machine learning methods for learning the residuals $\\tilde Y$ and $\\tilde D$, where\n", - "the residuals are obtained by cross-validation (cross-fitting).\n", - "\n", - "The statistical parameter $\\alpha$ has a causal interpretation of being the effect of $D$ on $Y$ in the causal DAG $$ D\\to Y, \\quad X\\to (D,Y)$$ or the counterfactual outcome model with conditionally exogenous 
(conditionally random) assignment of treatment $D$ given $X$:\n", - "$$\n", - "Y(d) = d\\alpha + g(X) + U(d),\\quad U(d) \\text{ indep } D |X, \\quad Y = Y(D), \\quad U = U(D).\n", - "$$\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2021-02-13T17:33:08.406107Z", - "iopub.status.busy": "2021-02-13T17:33:08.360037Z", - "iopub.status.idle": "2021-02-13T17:33:08.423732Z", - "shell.execute_reply": "2021-02-13T17:33:08.422038Z" - }, - "papermill": { - "duration": 0.07299, - "end_time": "2021-02-13T17:33:08.423882", - "exception": false, - "start_time": "2021-02-13T17:33:08.350892", - "status": "completed" - }, - "tags": [], - "id": "2HTwak-I2HrK" - }, - "outputs": [], - "source": [ - "DML2.for.PLM <- function(x, d, y, dreg, yreg, nfold=2) {\n", - " nobs <- nrow(x) #number of observations\n", - " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices\n", - " I <- split(1:nobs, foldid) #split observation indices into folds\n", - " ytil <- dtil <- rep(NA, nobs)\n", - " cat(\"fold: \")\n", - " for(b in 1:length(I)){\n", - " dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out\n", - " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a foldt out\n", - " dhat <- predict(dfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", - " yhat <- predict(yfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", - " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold\n", - " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold\n", - " cat(b,\" \")\n", - " }\n", - " rfit <- lm(ytil ~ dtil) #estimate the main parameter by regressing one residual on the other\n", - " coef.est <- coef(rfit)[2] #extract coefficient\n", - " se <- sqrt(vcovHC(rfit)[2,2]) #record robust standard error\n", - " cat(sprintf(\"\\ncoef (se) = %g (%g)\\n\", coef.est , se)) #printing output\n", - " return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil) ) #save output and residuals\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "source": [ - "We now run through DML using as first stage models:\n", - " 1. OLS\n", - " 2. (Rigorous) Lasso\n", - " 3. Random Forests\n", - " 4. 
Mix of Random Forest and Lasso" - ], - "metadata": { - "id": "IZ7zixqZCE3s" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2021-02-13T17:33:08.438648Z", - "iopub.status.busy": "2021-02-13T17:33:08.436622Z", - "iopub.status.idle": "2021-02-13T17:33:27.087838Z", - "shell.execute_reply": "2021-02-13T17:33:27.086924Z" - }, - "papermill": { - "duration": 18.659748, - "end_time": "2021-02-13T17:33:27.088051", - "exception": false, - "start_time": "2021-02-13T17:33:08.428303", - "status": "completed" - }, - "tags": [], - "id": "6-yAopZY2HrK" - }, - "outputs": [], - "source": [ - "#DML with OLS\n", - "cat(sprintf(\"\\nDML with OLS w/o feature selection \\n\"))\n", - "dreg <- function(x,d){ glmnet(x, d, lambda = 0) } #ML method= OLS using glmnet; using lm gives bugs\n", - "yreg <- function(x,y){ glmnet(x, y, lambda = 0) } #ML method = OLS\n", - "DML2.OLS = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)\n", - "\n", - "\n", - "#DML with Lasso:\n", - "cat(sprintf(\"\\nDML with Lasso \\n\"))\n", - "dreg <- function(x,d){ rlasso(x,d, post=FALSE) } #ML method= lasso from hdm\n", - "yreg <- function(x,y){ rlasso(x,y, post=FALSE) } #ML method = lasso from hdm\n", - "DML2.lasso = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)\n", - "\n", - "\n", - "#DML with Random Forest:\n", - "cat(sprintf(\"\\nDML with Random Forest \\n\"))\n", - "dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest\n", - "yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest\n", - "DML2.RF = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)\n", - "\n", - "#DML MIX:\n", - "cat(sprintf(\"\\nDML with Lasso for D and Random Forest for Y \\n\"))\n", - "dreg <- function(x,d){ rlasso(x,d, post=FALSE) } #ML method=Forest\n", - "yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest\n", - "DML2.mix = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)\n" - ] - }, - { - "cell_type": "markdown", - "source": [ - "Now we examine the RMSE of D and Y to see which method performs well in the first-stage. 
We print all results below in the following table:" - ], - "metadata": { - "id": "KvnMvuYo_gm1" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2021-02-13T17:33:27.115049Z", - "iopub.status.busy": "2021-02-13T17:33:27.113322Z", - "iopub.status.idle": "2021-02-13T17:33:27.269334Z", - "shell.execute_reply": "2021-02-13T17:33:27.268359Z" - }, - "papermill": { - "duration": 0.169839, - "end_time": "2021-02-13T17:33:27.269543", - "exception": false, - "start_time": "2021-02-13T17:33:27.099704", - "status": "completed" - }, - "tags": [], - "id": "JLXHbUwN2HrL" - }, - "outputs": [], - "source": [ - "prRes.D<- c( mean((DML2.OLS$dtil)^2), mean((DML2.lasso$dtil)^2), mean((DML2.RF$dtil)^2), mean((DML2.mix$dtil)^2));\n", - "prRes.Y<- c(mean((DML2.OLS$ytil)^2), mean((DML2.lasso$ytil)^2),mean((DML2.RF$ytil)^2),mean((DML2.mix$ytil)^2));\n", - "prRes<- rbind(sqrt(prRes.D), sqrt(prRes.Y));\n", - "rownames(prRes)<- c(\"RMSE D\", \"RMSE Y\");\n", - "colnames(prRes)<- c(\"OLS\", \"Lasso\", \"RF\", \"Mix\")" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "pb2sy6fp2HrD", + "papermill": { + "duration": 0.005873, + "end_time": "2021-02-13T17:32:35.332473", + "exception": false, + "start_time": "2021-02-13T17:32:35.326600", + "status": "completed" }, - { - "cell_type": "code", - "source": [ - "table <- matrix(0,4,4)\n", - "\n", - "# Point Estimate\n", - "table[1,1] <- as.numeric(DML2.OLS$coef.est)\n", - "table[2,1] <- as.numeric(DML2.lasso$coef.est)\n", - "table[3,1] <- as.numeric(DML2.RF$coef.est)\n", - "table[4,1] <- as.numeric(DML2.mix$coef.est)\n", - "\n", - "# SE\n", - "table[1,2] <- as.numeric(DML2.OLS$se)\n", - "table[2,2] <- as.numeric(DML2.lasso$se)\n", - "table[3,2] <- as.numeric(DML2.RF$se)\n", - "table[4,2] <- as.numeric(DML2.mix$se)\n", - "\n", - "# RMSE Y\n", - "table[1,3] <- as.numeric(prRes[2,1])\n", - "table[2,3] <- as.numeric(prRes[2,2])\n", - "table[3,3] <- as.numeric(prRes[2,3])\n", - "table[4,3] <- as.numeric(prRes[2,4])\n", - "\n", - "# RMSE D\n", - "table[1,4] <- as.numeric(prRes[1,1])\n", - "table[2,4] <- as.numeric(prRes[1,2])\n", - "table[3,4] <- as.numeric(prRes[1,3])\n", - "table[4,4] <- as.numeric(prRes[1,4])\n", - "\n", - "\n", - "\n", - "# print results\n", - "colnames(table) <- c(\"Estimate\",\"Standard Error\", \"RMSE Y\", \"RMSE D\")\n", - "rownames(table) <- c(\"OLS\", \"Lasso\", \"RF\", \"RF/Lasso Mix\")\n", - "table" - ], - "metadata": { - "id": "a7WC2-_6_wMl" - }, - "execution_count": null, - "outputs": [] + "tags": [] + }, + "source": [ + "# Double/Debiased Machine Learning for the Partially Linear Regression Model\n", + "\n", + "This is a simple implementation of Debiased Machine Learning for the Partially Linear Regression Model, which provides an application of DML inference to determine the causal effect of countries' intitial wealth on the rate of economic growth.\n", + "\n", + "\n", + "Reference:\n", + "\n", + "- https://arxiv.org/abs/1608.00060\n", + "- https://www.amazon.com/Business-Data-Science-Combining-Accelerate/dp/1260452778\n", + "\n", + "The code is based on the book." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cbZVJg6j2HrH", + "papermill": { + "duration": 33.00131, + "end_time": "2021-02-13T17:33:08.345470", + "exception": false, + "start_time": "2021-02-13T17:32:35.344160", + "status": "completed" }, - { - "cell_type": "code", - "source": [ - "print(table, digit=3)" - ], - "metadata": { - "id": "zZMbnWTE_yYd" - }, - "execution_count": null, - "outputs": [] + "tags": [] + }, + "outputs": [], + "source": [ + "install.packages(\"xtable\")\n", + "install.packages(\"hdm\")\n", + "install.packages(\"randomForest\")\n", + "install.packages(\"glmnet\")\n", + "install.packages(\"sandwich\")\n", + "\n", + "library(xtable)\n", + "library(randomForest)\n", + "library(hdm)\n", + "library(glmnet)\n", + "library(sandwich)\n", + "\n", + "set.seed(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1I8mEMEM33fS" + }, + "outputs": [], + "source": [ + "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/GrowthData.csv\"\n", + "data <- read.csv(file)\n", + "data <- subset(data, select = -1) # get rid of index column\n", + "head(data)\n", + "dim(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "hiwEVN6i4FIH" + }, + "outputs": [], + "source": [ + "y = as.matrix(data[,1]) # outcome: growth rate\n", + "d = as.matrix(data[,3]) # treatment: initial wealth\n", + "x = as.matrix(data[,-c(1,2,3)]) # controls: country characteristics\n", + "\n", + "# some summary statistics\n", + "cat(sprintf(\"\\nThe length of y is %g \\n\", length(y) ))\n", + "cat(sprintf(\"\\nThe number of features in x is %g \\n\", dim(x)[2] ))\n", + "\n", + "lres=summary(lm(y~d +x))$coef[2,1:2]\n", + "cat(sprintf(\"\\nNaive OLS that uses all features w/o cross-fitting Y ~ D+X yields: \\ncoef (se) = %g (%g)\\n\", lres[1] , lres[2]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OLPq-MxG2HrH", + "papermill": { + "duration": 0.003613, + "end_time": "2021-02-13T17:32:35.340781", + "exception": false, + "start_time": "2021-02-13T17:32:35.337168", + "status": "completed" }, - { - "cell_type": "code", - "source": [ - "tab<- xtable(table, digits=3)\n", - "print(tab, type=\"latex\")" - ], - "metadata": { - "id": "pNDCsznF_zyR" - }, - "execution_count": null, - "outputs": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" + "tags": [] + }, + "source": [ + "# DML algorithm\n", + "\n", + "Here we perform estimation and inference of predictive coefficient $\\alpha$ in the partially linear statistical model,\n", + "$$\n", + "Y = D\\alpha + g(X) + U, \\quad E (U | D, X) = 0.\n", + "$$\n", + "For $\\tilde Y = Y- E(Y|X)$ and $\\tilde D= D- E(D|X)$, we can write\n", + "$$\n", + "\\tilde Y = \\alpha \\tilde D + U, \\quad E (U |\\tilde D) =0.\n", + "$$\n", + "Parameter $\\alpha$ is then estimated using cross-fitting approach to obtain the residuals $\\tilde D$ and $\\tilde Y$.\n", + "The algorithm comsumes $Y, D, X$, and machine learning methods for learning the residuals $\\tilde Y$ and $\\tilde D$, where\n", + "the residuals are obtained by cross-validation (cross-fitting).\n", + "\n", + "The statistical parameter $\\alpha$ has a causal interpretation of being the effect of $D$ on $Y$ in the causal DAG $$ D\\to Y, \\quad X\\to (D,Y)$$ or the counterfactual outcome model with conditionally exogenous (conditionally random) assignment of treatment $D$ given $X$:\n", + "$$\n", + "Y(d) = d\\alpha + 
g(X) + U(d),\\quad U(d) \\text{ indep } D |X, \\quad Y = Y(D), \\quad U = U(D).\n", + "$$\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2HTwak-I2HrK", + "papermill": { + "duration": 0.07299, + "end_time": "2021-02-13T17:33:08.423882", + "exception": false, + "start_time": "2021-02-13T17:33:08.350892", + "status": "completed" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "3.6.3" + "tags": [] + }, + "outputs": [], + "source": [ + "DML2.for.PLM <- function(x, d, y, dreg, yreg, nfold=2) {\n", + " nobs <- nrow(x) #number of observations\n", + " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices\n", + " I <- split(1:nobs, foldid) #split observation indices into folds\n", + " ytil <- dtil <- rep(NA, nobs)\n", + " cat(\"fold: \")\n", + " for(b in 1:length(I)){\n", + " dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out\n", + " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a foldt out\n", + " dhat <- predict(dfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", + " yhat <- predict(yfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", + " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold\n", + " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold\n", + " cat(b,\" \")\n", + " }\n", + " rfit <- lm(ytil ~ dtil) #estimate the main parameter by regressing one residual on the other\n", + " coef.est <- coef(rfit)[2] #extract coefficient\n", + " se <- sqrt(vcovHC(rfit)[2,2]) #record robust standard error\n", + " cat(sprintf(\"\\ncoef (se) = %g (%g)\\n\", coef.est , se)) #printing output\n", + " return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil) ) #save output and residuals\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IZ7zixqZCE3s" + }, + "source": [ + "We now run through DML using as first stage models:\n", + " 1. OLS\n", + " 2. (Rigorous) Lasso\n", + " 3. Random Forests\n", + " 4. 
Mix of Random Forest and Lasso" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6-yAopZY2HrK", + "papermill": { + "duration": 18.659748, + "end_time": "2021-02-13T17:33:27.088051", + "exception": false, + "start_time": "2021-02-13T17:33:08.428303", + "status": "completed" }, + "tags": [] + }, + "outputs": [], + "source": [ + "#DML with OLS\n", + "cat(sprintf(\"\\nDML with OLS w/o feature selection \\n\"))\n", + "dreg <- function(x,d){ glmnet(x, d, lambda = 0) } #ML method= OLS using glmnet; using lm gives bugs\n", + "yreg <- function(x,y){ glmnet(x, y, lambda = 0) } #ML method = OLS\n", + "DML2.OLS = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)\n", + "\n", + "\n", + "#DML with Lasso:\n", + "cat(sprintf(\"\\nDML with Lasso \\n\"))\n", + "dreg <- function(x,d){ rlasso(x,d, post=FALSE) } #ML method= lasso from hdm\n", + "yreg <- function(x,y){ rlasso(x,y, post=FALSE) } #ML method = lasso from hdm\n", + "DML2.lasso = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)\n", + "\n", + "\n", + "#DML with Random Forest:\n", + "cat(sprintf(\"\\nDML with Random Forest \\n\"))\n", + "dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest\n", + "yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest\n", + "DML2.RF = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)\n", + "\n", + "#DML MIX:\n", + "cat(sprintf(\"\\nDML with Lasso for D and Random Forest for Y \\n\"))\n", + "dreg <- function(x,d){ rlasso(x,d, post=FALSE) } #ML method=Forest\n", + "yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest\n", + "DML2.mix = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KvnMvuYo_gm1" + }, + "source": [ + "Now we examine the RMSE of D and Y to see which method performs well in the first-stage. 
We print all results below in the following table:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JLXHbUwN2HrL", "papermill": { - "duration": 55.808166, - "end_time": "2021-02-13T17:33:28.352086", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2021-02-13T17:32:32.543920", - "version": "2.1.3" + "duration": 0.169839, + "end_time": "2021-02-13T17:33:27.269543", + "exception": false, + "start_time": "2021-02-13T17:33:27.099704", + "status": "completed" }, - "colab": { - "provenance": [] - } + "tags": [] + }, + "outputs": [], + "source": [ + "prRes.D<- c( mean((DML2.OLS$dtil)^2), mean((DML2.lasso$dtil)^2), mean((DML2.RF$dtil)^2), mean((DML2.mix$dtil)^2));\n", + "prRes.Y<- c(mean((DML2.OLS$ytil)^2), mean((DML2.lasso$ytil)^2),mean((DML2.RF$ytil)^2),mean((DML2.mix$ytil)^2));\n", + "prRes<- rbind(sqrt(prRes.D), sqrt(prRes.Y));\n", + "rownames(prRes)<- c(\"RMSE D\", \"RMSE Y\");\n", + "colnames(prRes)<- c(\"OLS\", \"Lasso\", \"RF\", \"Mix\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "a7WC2-_6_wMl" + }, + "outputs": [], + "source": [ + "table <- matrix(0,4,4)\n", + "\n", + "# Point Estimate\n", + "table[1,1] <- as.numeric(DML2.OLS$coef.est)\n", + "table[2,1] <- as.numeric(DML2.lasso$coef.est)\n", + "table[3,1] <- as.numeric(DML2.RF$coef.est)\n", + "table[4,1] <- as.numeric(DML2.mix$coef.est)\n", + "\n", + "# SE\n", + "table[1,2] <- as.numeric(DML2.OLS$se)\n", + "table[2,2] <- as.numeric(DML2.lasso$se)\n", + "table[3,2] <- as.numeric(DML2.RF$se)\n", + "table[4,2] <- as.numeric(DML2.mix$se)\n", + "\n", + "# RMSE Y\n", + "table[1,3] <- as.numeric(prRes[2,1])\n", + "table[2,3] <- as.numeric(prRes[2,2])\n", + "table[3,3] <- as.numeric(prRes[2,3])\n", + "table[4,3] <- as.numeric(prRes[2,4])\n", + "\n", + "# RMSE D\n", + "table[1,4] <- as.numeric(prRes[1,1])\n", + "table[2,4] <- as.numeric(prRes[1,2])\n", + "table[3,4] <- as.numeric(prRes[1,3])\n", + "table[4,4] <- as.numeric(prRes[1,4])\n", + "\n", + "\n", + "\n", + "# print results\n", + "colnames(table) <- c(\"Estimate\",\"Standard Error\", \"RMSE Y\", \"RMSE D\")\n", + "rownames(table) <- c(\"OLS\", \"Lasso\", \"RF\", \"RF/Lasso Mix\")\n", + "table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zZMbnWTE_yYd" + }, + "outputs": [], + "source": [ + "print(table, digit=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pNDCsznF_zyR" + }, + "outputs": [], + "source": [ + "tab<- xtable(table, digits=3)\n", + "print(tab, type=\"latex\")" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.6.3" }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "papermill": { + "duration": 55.808166, + "end_time": "2021-02-13T17:33:28.352086", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2021-02-13T17:32:32.543920", + "version": "2.1.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/PM4/r_dml_inference_for_gun_ownership.Rmd b/PM4/r_dml_inference_for_gun_ownership.Rmd new file mode 
100644 index 00000000..3a60890b --- /dev/null +++ b/PM4/r_dml_inference_for_gun_ownership.Rmd @@ -0,0 +1,533 @@ +--- +title: An R Markdown document converted from "PM4/r_dml_inference_for_gun_ownership.irnb" +output: html_document +--- + +# A Case Study: The Effect of Gun Ownership on Gun-Homicide Rates + +We consider the problem of estimating the effect of gun ownership on the homicide rate. For this purpose, we perform inference on $\beta$ in the following the partially linear model: +$$ +Y_{j, t}=\beta D_{j,(t-1)}+g\left(X_{j, t}, \bar{X}_j, \bar{X}_t, X_{j, 0}, Y_{j, 0}, t\right)+\epsilon_{j, t} +$$ +$Y_{j, t}$ is the log homicide rate in county $j$ at time $t. D_{j, t-1}$ is the log fraction of suicides committed with a firearm in county $j$ at time $t-1$, which we use as a proxy for gun ownership $G_{j, t}$, which is not observed. $X_{j, t}$ is a set of demographic and economic characteristics of county $j$ at time $t$. We use $\bar{X}_j$ to denote the within county average of $X_{j, t}$ and $\bar{X}_t$ to denote the within time period average of $X_{j, t} . X_{j, 0}$ and $Y_{j, 0}$ denote initial conditions in county $j$. We use $Z_{j, t}$ to denote the set of observed control variables $\left\{X_{j, t}, \bar{X}_j, \bar{X}_t, X_{j, 0}, Y_{j, 0}, t\right\}$, so that our model is + +$$ + Y_{i,t} = \beta D_{i,(t-1)} + g(Z_{i,t}) + \epsilon_{i,t}. +$$ + +## Data + +$Y_{j,t}$ is the log homicide rate in county $j$ at time $t$, $D_{j, t-1}$ is the log fraction of suicides committed with a firearm in county $j$ at time $t-1$, which we use as a proxy for gun ownership, and $Z_{j,t}$ is a set of demographic and economic characteristics of county $j$ at time $t$. Assuming the firearm suicide rate is a good proxy for gun ownership, the parameter $\beta$ is the effect of gun ownership on homicide rates, controlling for county-level demographic and economic characteristics. + +The sample covers 195 large United States counties between the years 1980 through 1999, giving us 3900 observations. + +```{r} +install.packages("glmnet") +install.packages("randomForest") +install.packages("xgboost") +install.packages("keras") +install.packages("tensorflow") +install.packages("xtable") +install.packages("dplyr") +install.packages("sandwich") + +library(glmnet) +library(randomForest) +library(xgboost) +library(keras) +library(tensorflow) +library(xtable) +library(dplyr) +library(sandwich) +``` + +```{r} +file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/gun_clean.csv" +data <- read.csv(file) +dim(data) +``` + +## Preprocessing + +To attempt to flexibly account for fixed heterogeneity across counties, common time factors, and deterministic time trends, we include county-level averages, time period averages, initial conditions, and the time index as additional control variables. This strategy is related to strategies for addressing latent sources of heterogeneity via conditioning. + +We first reweight time and county variables as the data are population weighted. + +```{r} +# Note: These data are population weighted. Specifically, +# looking at the JBES replication files, they seem to be multiplied +# by sqrt((1/T sum_t population_{j,t})/100000). To get the +# unweighted variables need to divide by this number - which we can +# get from the time effects. We are mostly just going to use the weighted +# variables as inputs - except for time and county. We'll take +# cross-sectional and time series means of these weighted variables +# as well. 
Note that there is nothing wrong with this, but it does not +# reproduce a weighted regression in a setting where covariates may +# enter nonlinearly and flexibly. + +## County FE +county.vars <- select(data, starts_with('X_J')) + +## Time variables and population weights +# Pull out time variables +time.vars <- select(data, starts_with('X_T')) + +# Use these to construct population weights +popweights <- rowSums(time.vars) + +# Unweighted time variables +time.vars <- time.vars/popweights + +# For any columns with only zero (like the first one), just drop +time.vars <- time.vars[, colSums(time.vars != 0) > 0] + +# Create time index +time.ind <- rowSums(time.vars*(seq(1:20))) +``` + +Now we create initial conditions, county-level averages, and time period averages. + +```{r} + ###### Create new data frame with variables we'll use + +# Function to find variable names +varlist <- function (df=NULL,type=c("numeric","factor","character"), pattern="", exclude=NULL) { + vars <- character(0) + if (any(type %in% "numeric")) { + vars <- c(vars,names(df)[sapply(df,is.numeric)]) + } + if (any(type %in% "factor")) { + vars <- c(vars,names(df)[sapply(df,is.factor)]) + } + if (any(type %in% "character")) { + vars <- c(vars,names(df)[sapply(df,is.character)]) + } + vars[(!vars %in% exclude) & grepl(vars,pattern=pattern)] +} + +# census control variables +census <- NULL +census_var <- c("^AGE", "^BN", "^BP", "^BZ", "^ED", "^EL","^HI", "^HS", "^INC", "^LF", "^LN", "^PI", "^PO", "^PP", "^PV", "^SPR", "^VS") + +for(i in 1:length(census_var)){ + census <- append(census, varlist(data, pattern=census_var[i])) +} + +# other control variables +X1 <- c("logrobr", "logburg", "burg_missing", "robrate_missing") +X2 <- c("newblack", "newfhh", "newmove", "newdens", "newmal") + +# "treatment" variable +d <- "logfssl" + +# outcome variable +y <- "logghomr" + +# new data frame for time index +usedata <- as.data.frame(time.ind) +colnames(usedata) <- "time.ind" +usedata[,"weights"] <- popweights + +varlist <- c(y,d,X1,X2,census) +for(i in 1:length(varlist)){ + usedata[, varlist[i]] <- data[,varlist[i]] +} + +####################### Construct county specific means, +# time specific means, initial conditions + +# Initial conditions +varlist0 <- c(y,X1,X2,census) +for(i in 1:length(varlist0)){ + usedata[, paste(varlist0[i],"0" , sep="")] <- kronecker(usedata[time.ind == 1,varlist0[i]], + rep(1,20)) +} + +# County means +varlistJ <- c(X1,X2,census) +county.vars <- as.matrix(county.vars) +for(i in 1:length(varlistJ)){ + usedata[, paste(varlistJ[i],"J" , sep="")] <- + county.vars%*%qr.solve(county.vars,as.matrix(usedata[,varlistJ[i]])) +} + +# Time means +time.vars <- as.matrix(time.vars) +for(i in 1:length(varlistJ)){ + usedata[, paste(varlistJ[i],"T" , sep="")] <- + time.vars%*%qr.solve(time.vars,as.matrix(usedata[,varlistJ[i]])) +} +``` + +# Estimation + + +## Baseline OLS Estimates + +After preprocessing the data, as a baseline model, we first look at simple regression of $Y_{j,t}$ on $D_{j,t-1}$ without controls in the full data set. + +```{r} +# Simple regression +lm0 <- lm(logghomr ~ logfssl, data = usedata) +vc0 <- vcovHC(lm0) +cat("Baseline OLS:",lm0$coefficients[2]," (",sqrt(vc0[2,2]),")\n") +# Confidence Interval with HC3 covariance +tt <- qt(c(0.025,0.975),summary(lm0)$df[2]) +se <- sqrt(diag(vc0)) +ci <- coef(lm0) + se %o% tt +cat("2.5%: ", ci[2,1],"97.5%: ", ci[2,2]) +``` + +The point estimate is $0.302$ with the confidence interval ranging from 0.277 to 0.327. 
This +suggests that increases in gun ownership rates are related to gun homicide rates - if gun ownership increases by 1% then the predicted gun homicide rate goes up by 0.3%, without controlling for counties' characteristics. + + +Next we estimate with the baseline set of controls. + +```{r} +# Regression on baseline controls +varlist <- c(d,X1,X2,census) +lmC <- lm(paste("logghomr ~",paste(varlist, collapse = "+")), data = usedata) +vcC <- vcovHC(lmC) +cat("OLS with Controls:",lmC$coefficients["logfssl"]," (",sqrt(vcC["logfssl","logfssl"]),")\n") +``` + + + +We can also run our regression with time and space averages as controls. + +```{r} +# Regression on time and cross sectional averages +varlistX <- c(X1,X2,census) +varlistMeans <- c(d,X1,X2,census) +for(i in 1:length(varlistX)){ + varlistMeans <- c(varlistMeans,paste(varlistX[i],"J" , sep="")) +} +for(i in 1:length(varlistX)){ + varlistMeans <- c(varlistMeans,paste(varlistX[i],"T" , sep="")) +} +lmM <- lm(paste("logghomr ~",paste(varlistMeans, collapse = "+")), data = usedata) +vcM <- vcovHC(lmM) +cat("OLS with Averages:",lmM$coefficients["logfssl"]," (",sqrt(vcM["logfssl","logfssl"]),")\n") +``` + +Since our goal is to estimate the effect of gun ownership after controlling for a rich set of county characteristics, we now include all controls. + +```{r} +# Regression on all controls +lmA <- lm(logghomr ~ ., data = usedata) +vcA <- vcovHC(lmA) +cat("OLS All:",lmA$coefficients["logfssl"]," (",sqrt(vcA["logfssl","logfssl"]),")\n") +``` + +After controlling for a rich set of characteristics, the point estimate of gun ownership attenuates to 0.179. + +***NB***: In the background, `lm()` is dropping variables based on collinearity diagnostics. These depend on system linear algebra routines and can lead to large differences in high-dimensional or other ill-conditioned settings when using otherwise identical code across languages and/or machines. + +Now we turn to our double machine learning framework, employing linear and flexible estimation methods with cross-fitting. + +## DML Estimates + +We perform inference on $\beta$ in the following partially linear model: + $$ +Y_{j, t}=\beta D_{j,(t-1)}+g(Z_{j,t})+\epsilon_{j, t}. +$$ +In the first stage, using cross-fitting, we employ modern regression methods to build estimators $\hat \ell(Z_{j,t})$ and $\hat m(Z_{j,t})$, where +- $\ell(Z_{j,t}):=E(Y_{j,t}|Z_{j,t})$ +- $m(Z_{j,t}):=E(D_{j,t}|Z_{j,t})$ + +Using these, we obtain the estimates of the residualized quantities +- $\tilde Y_{j,t} = Y_{j,t}- E(Y_{j,t}|Z_{j,t})$ +- $\tilde D_{j,t}= D_{j,t}- E(D_{j,t}|Z_{j,t})$ + +Using these residualized quantities, we note our model can be written as +$$ +\tilde Y_{j,t} = \beta \tilde D_{j,t} + \epsilon_{j,t}, \quad E (\epsilon_{j,t} |\tilde D_{j,t}) =0. +$$ +In the final stage, using ordinary least squares of $\tilde Y_{j,t}$ on $\tilde D_{j,t}$, we obtain the +estimate of $\beta$. + +In the following, we consider 10 different methods for the first-stage models for $\ell(\cdot)$ and $m(\cdot)$ covering linear, penalized linear, and flexible methods. We also report the first-stage RMSE scores for estimating $Y$ and $D$. + +```{r} +# NB: this cell takes > 3 hours to run on colab. To reduce computation time, +# reduce the number of cross-fitting folds. Note this may affect stability +# of estimates.
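+#
+# What this cell does: for each of the Kf cross-fitting folds, ten learners are
+# fit on the remaining folds and used to predict logghomr (Y) and logfssl (D)
+# on the held-out fold; the cross-fitted residuals stored in yhat.r and dhat.r
+# feed the final-stage OLS of the Y-residuals on the D-residuals further below.
+# One way to cut the runtime would be to lower Kf (defined just below), at the
+# cost of noisier cross-fitted residuals.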
+ +set.seed(123) + +# Cross-fitting +n <- nrow(usedata) +Kf <- 5 # Number of cross-fitting folds +sampleframe <- rep(1:Kf, ceiling(n/Kf)) +cvgroup <- sample(sampleframe, size=n, replace = FALSE) # Cross-fitting groups + +# Initialize variables for cross-fit predictions +yhat.r <- matrix(NA,n,10) # Going to consider 10 learners +dhat.r <- matrix(NA,n,10) + +# Cross-fitting loop +for(k in 1:Kf) { + cat("fold: ", k,"\n") + indk <- cvgroup == k + + ktrain <- usedata[!indk,] + ktest <- usedata[indk,] + + #### Simple regression models #### + + # Simple regression + yhat.r[indk,1] <- ktest$logghomr - mean(ktrain$logghomr) + dhat.r[indk,1] <- ktest$logfssl - mean(ktrain$logfssl) + + # Baseline controls + varlist <- c(X1,X2,census) + lmyk.C <- lm(paste("logghomr ~",paste(varlist, collapse = "+")), data = ktrain) + yhat.r[indk,2] <- ktest$logghomr - predict(lmyk.C, ktest) + lmdk.C <- lm(paste("logfssl ~",paste(varlist, collapse = "+")), data = ktrain) + dhat.r[indk,2] <- ktest$logfssl - predict(lmdk.C, ktest) + + # All controls + lmyk.A <- lm(logghomr ~ .-logfssl, data = ktrain) + yhat.r[indk,3] <- ktest$logghomr - predict(lmyk.A, ktest) + lmdk.A <- lm(logfssl ~ .-logghomr, data = ktrain) + dhat.r[indk,3] <- ktest$logfssl - predict(lmdk.A, ktest) + + #### Penalized Linear Models #### + + # Lasso - default CV tuning + ytrain <- as.matrix(usedata[!indk,"logghomr"]) + dtrain <- as.matrix(usedata[!indk,"logfssl"]) + xtrain <- as.matrix(usedata[!indk,!names(usedata) %in% + c("logghomr", "logfssl")]) + ytest <- as.matrix(usedata[indk,"logghomr"]) + dtest <- as.matrix(usedata[indk,"logfssl"]) + xtest <- as.matrix(usedata[indk,!names(usedata) %in% + c("logghomr", "logfssl")]) + + lassoyk <- cv.glmnet(xtrain,ytrain) + yhat.r[indk,4] <- ytest - predict(lassoyk, newx = xtest, s = "lambda.min") + + lassodk <- cv.glmnet(xtrain,dtrain) + dhat.r[indk,4] <- dtest - predict(lassodk, newx = xtest, s = "lambda.min") + + # Ridge + ridgeyk <- cv.glmnet(xtrain,ytrain,alpha = 0) + yhat.r[indk,5] <- ytest - predict(ridgeyk, newx = xtest, s = "lambda.min") + + ridgedk <- cv.glmnet(xtrain,dtrain, alpha = 0) + dhat.r[indk,5] <- dtest - predict(ridgedk, newx = xtest, s = "lambda.min") + + # EN, .5 - no cv over alpha + enyk <- cv.glmnet(xtrain,ytrain,alpha = .5) + yhat.r[indk,6] <- ytest - predict(enyk, newx = xtest, s = "lambda.min") + + endk <- cv.glmnet(xtrain,dtrain, alpha = .5) + dhat.r[indk,6] <- dtest - predict(endk, newx = xtest, s = "lambda.min") + + #### Flexible regression models #### + + # Random forest + rfyk <- randomForest(logghomr ~ .-logfssl, data = ktrain) + yhat.r[indk,7] <- ktest$logghomr - predict(rfyk, ktest) + rfdk <- randomForest(logfssl ~ .-logghomr, data = ktrain) + dhat.r[indk,7] <- ktest$logfssl - predict(rfdk, ktest) + + # Boosted tree - depth 4 + xgb_train.y = xgb.DMatrix(data = as.matrix(usedata[!indk,!names(usedata) %in% + c("logghomr", "logfssl")]), + label = as.matrix(usedata[!indk,"logghomr"])) + xgb_test.y = xgb.DMatrix(data = as.matrix(usedata[indk,!names(usedata) %in% + c("logghomr", "logfssl")]), + label = as.matrix(usedata[indk,"logghomr"])) + xgb_train.d = xgb.DMatrix(data = as.matrix(usedata[!indk,!names(usedata) %in% + c("logghomr", "logfssl")]), + label = as.matrix(usedata[!indk,"logfssl"])) + xgb_test.d = xgb.DMatrix(data = as.matrix(usedata[indk,!names(usedata) %in% + c("logghomr", "logfssl")]), + label = as.matrix(usedata[indk,"logfssl"])) + + byk = xgb.cv(data = xgb_train.y, + nrounds = 1000, verbose = 0, eta = .1, max_depth = 4, nfold = 5) + best.iter = 
which.min(as.matrix(byk$evaluation_log[,4])) + byk = xgboost(data = xgb_train.y, + nrounds = 1000, verbose = 0, eta = .1, max_depth = 4) + yhat.r[indk,8] = ktest$logghomr - predict(byk, newdata = xgb_test.y, + iterationrange = c(1,(best.iter+1))) + + bdk = xgb.cv(data = xgb_train.d, + nrounds = 1000, verbose = 0, eta = .1, max_depth = 4, nfold = 5) + best.iter = which.min(as.matrix(bdk$evaluation_log[,4])) + bdk = xgboost(data = xgb_train.d, + nrounds = 1000, verbose = 0, eta = .1, max_depth = 4) + dhat.r[indk,8] = ktest$logfssl - predict(bdk, newdata = xgb_test.d, + iterationrange = c(1,(best.iter+1))) + + #### Neural Networks #### + + # normalize the covariate data + mean <- apply(xtrain, 2, mean) + std <- apply(xtrain, 2, sd) + xtrainNN <- scale(xtrain, center = mean, scale = std) + xtestNN <- scale(xtest, center = mean, scale = std) + + xtestNN <- xtestNN[,which(!is.nan(colMeans(xtrainNN)))] + xtrainNN <- xtrainNN[,which(!is.nan(colMeans(xtrainNN)))] + + # DNN 50/50/50/50, .5 dropout + NNmodely <- keras_model_sequential() + NNmodely %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>% + layer_dropout(rate = .5) %>% + layer_dense(units = 50, activation = 'relu') %>% + layer_dropout(rate = .5) %>% + layer_dense(units = 50, activation = 'relu') %>% + layer_dropout(rate = .5) %>% + layer_dense(units = 50, activation = 'relu') %>% + layer_dropout(rate = .5) %>% + layer_dense(units = 1) + + NNmodely %>% compile( + loss = "mse", + optimizer = optimizer_rmsprop()) + + fit.NNmodely <- NNmodely %>% fit( + xtrainNN, ytrain, + epochs = 200, batch_size = 200, + validation_split = .2, verbose = 0 + ) + yhat.r[indk,9] <- ktest$logghomr - predict(NNmodely, xtestNN) + + NNmodeld <- keras_model_sequential() + NNmodeld %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>% + layer_dropout(rate = .5) %>% + layer_dense(units = 50, activation = 'relu') %>% + layer_dropout(rate = .5) %>% + layer_dense(units = 50, activation = 'relu') %>% + layer_dropout(rate = .5) %>% + layer_dense(units = 50, activation = 'relu') %>% + layer_dropout(rate = .5) %>% + layer_dense(units = 1) + + NNmodeld %>% compile( + loss = "mse", + optimizer = optimizer_rmsprop()) + + fit.NNmodeld <- NNmodeld %>% fit( + xtrainNN, dtrain, + epochs = 200, batch_size = 200, + validation_split = .2, verbose = 0 + ) + dhat.r[indk,9] <- ktest$logfssl - predict(NNmodeld, xtestNN) + + # DNN 50/50/50/50, early stopping + NNmodely <- keras_model_sequential() + NNmodely %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>% + layer_dense(units = 50, activation = 'relu') %>% + layer_dense(units = 50, activation = 'relu') %>% + layer_dense(units = 50, activation = 'relu') %>% + layer_dense(units = 1) + + NNmodely %>% compile( + loss = "mse", + optimizer = optimizer_rmsprop()) + + early.stop <- callback_early_stopping(monitor = "val_loss", patience = 25, + restore_best_weights = TRUE) + + fit.NNmodely <- NNmodely %>% fit( + xtrainNN, ytrain, + epochs = 200, batch_size = 200, + validation_split = .2, verbose = 0, + callbacks = list(early.stop) + ) + yhat.r[indk,10] <- ktest$logghomr - predict(NNmodely, xtestNN) + + NNmodeld <- keras_model_sequential() + NNmodeld %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>% + layer_dense(units = 50, activation = 'relu') %>% + layer_dense(units = 50, activation = 'relu') %>% + layer_dense(units = 50, activation = 'relu') %>% + layer_dense(units = 1) + + NNmodeld %>% compile( + loss = "mse", 
+ optimizer = optimizer_rmsprop()) + + early.stop <- callback_early_stopping(monitor = "val_loss", patience = 25, + restore_best_weights = TRUE) + + fit.NNmodeld <- NNmodeld %>% fit( + xtrainNN, dtrain, + epochs = 200, batch_size = 200, + validation_split = .2, verbose = 0, + callbacks = list(early.stop) + ) + dhat.r[indk,10] <- ktest$logfssl - predict(NNmodeld, xtestNN) + +} + +################################################################################ +# Predictions done, now DML + +RMSE.y <- sqrt(colMeans(yhat.r^2)) +RMSE.d <- sqrt(colMeans(dhat.r^2)) + +# dml coefficient estimates +b.dml <- rep(NA,10) +s.dml <- rep(NA,10) +for(k in 1:10){ + lm.k <- lm(yhat.r[,k] ~ dhat.r[,k]-1) + v.k <- vcovHC(lm.k) + b.dml[k] <- lm.k$coefficients + s.dml[k] <- sqrt(v.k) +} + +# "best" coefficient estimate +lm.k <- lm(yhat.r[,which.min(RMSE.y)] ~ dhat.r[,which.min(RMSE.d)]-1) +v.k <- vcovHC(lm.k) +b.dml[11] <- lm.k$coefficients +s.dml[11] <- sqrt(v.k) + +# ls model average +yhat <- usedata$logghomr - yhat.r +dhat <- usedata$logfssl - dhat.r + +ma.y <- lm(usedata$logghomr ~ yhat-1) +ma.d <- lm(usedata$logfssl ~ dhat-1) +weights.y <- ma.y$coefficients +weights.d <- ma.d$coefficients +lm.k <- lm(ma.y$residuals ~ ma.d$residuals-1) +v.k <- vcovHC(lm.k) +b.dml[12] <- lm.k$coefficients +s.dml[12] <- sqrt(v.k) + +## Display results +table1 <- matrix(0, 10, 2) +table1[,1] <- RMSE.y +table1[,2] <- RMSE.d +colnames(table1)<- c("RMSE Y","RMSE D") +rownames(table1)<- c("OLS - No Controls", "OLS - Basic", "OLS - All", + "Lasso (CV)", "Ridge (CV)", "Elastic Net (.5,CV)", + "Random Forest","Boosted trees - depth 4", + "DNN - 50/50/50/50, dropout", "DNN - 50/50/50/50, early stopping") +tab1<- xtable(table1, digits =c(0,4,4)) +tab1 + +table2 <- matrix(0, 12, 2) +table2[,1] <- b.dml +table2[,2] <- s.dml +colnames(table2)<- c("Point Estimate","Std. 
Error") +rownames(table2)<- c("OLS - No Controls", "OLS - Basic", "OLS - All", + "Lasso (CV)", "Ridge (CV)", "Elastic Net (.5,CV)", + "Random Forest","Boosted trees - depth 4", + "DNN - 50/50/50/50, dropout", "DNN - 50/50/50/50, early stopping", + "Best","Least Squares Model Average") +tab2<- xtable(table2, digits =c(0,4,4)) +tab2 +``` + +```{r} +print(xtable(table1,type="latex")) +print(xtable(table2,type="latex")) +``` + diff --git a/PM4/r_dml_inference_for_gun_ownership.irnb b/PM4/r_dml_inference_for_gun_ownership.irnb index 260958dd..28e94869 100644 --- a/PM4/r_dml_inference_for_gun_ownership.irnb +++ b/PM4/r_dml_inference_for_gun_ownership.irnb @@ -1,807 +1,807 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "a19sSgshu-SA", + "papermill": { + "duration": 0.024906, + "end_time": "2021-07-23T16:17:55.704014", + "exception": false, + "start_time": "2021-07-23T16:17:55.679108", + "status": "completed" }, - "kernelspec": { - "name": "ir", - "display_name": "R" - }, - "language_info": { - "name": "R" - } + "tags": [] + }, + "source": [ + "# A Case Study: The Effect of Gun Ownership on Gun-Homicide Rates" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.024906, - "end_time": "2021-07-23T16:17:55.704014", - "exception": false, - "start_time": "2021-07-23T16:17:55.679108", - "status": "completed" - }, - "tags": [], - "id": "a19sSgshu-SA" - }, - "source": [ - "# A Case Study: The Effect of Gun Ownership on Gun-Homicide Rates" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.024533, - "end_time": "2021-07-23T16:17:55.753444", - "exception": false, - "start_time": "2021-07-23T16:17:55.728911", - "status": "completed" - }, - "tags": [], - "id": "R4ZopCRVu-SA" - }, - "source": [ - "We consider the problem of estimating the effect of gun ownership on the homicide rate. For this purpose, we perform inference on $\\beta$ in the following the partially linear model:\n", - "$$\n", - "Y_{j, t}=\\beta D_{j,(t-1)}+g\\left(X_{j, t}, \\bar{X}_j, \\bar{X}_t, X_{j, 0}, Y_{j, 0}, t\\right)+\\epsilon_{j, t}\n", - "$$\n", - "$Y_{j, t}$ is the log homicide rate in county $j$ at time $t. D_{j, t-1}$ is the log fraction of suicides committed with a firearm in county $j$ at time $t-1$, which we use as a proxy for gun ownership $G_{j, t}$, which is not observed. $X_{j, t}$ is a set of demographic and economic characteristics of county $j$ at time $t$. We use $\\bar{X}_j$ to denote the within county average of $X_{j, t}$ and $\\bar{X}_t$ to denote the within time period average of $X_{j, t} . X_{j, 0}$ and $Y_{j, 0}$ denote initial conditions in county $j$. 
We use $Z_{j, t}$ to denote the set of observed control variables $\\left\\{X_{j, t}, \\bar{X}_j, \\bar{X}_t, X_{j, 0}, Y_{j, 0}, t\\right\\}$, so that our model is\n", - "\n", - "$$\n", - " Y_{i,t} = \\beta D_{i,(t-1)} + g(Z_{i,t}) + \\epsilon_{i,t}.\n", - "$$" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.024711, - "end_time": "2021-07-23T16:17:55.803109", - "exception": false, - "start_time": "2021-07-23T16:17:55.778398", - "status": "completed" - }, - "tags": [], - "id": "ubu-QI2Ju-SB" - }, - "source": [ - "## Data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.025115, - "end_time": "2021-07-23T16:17:55.854426", - "exception": false, - "start_time": "2021-07-23T16:17:55.829311", - "status": "completed" - }, - "tags": [], - "id": "fV3y0eiCu-SB" - }, - "source": [ - "$Y_{j,t}$ is the log homicide rate in county $j$ at time $t$, $D_{j, t-1}$ is the log fraction of suicides committed with a firearm in county $j$ at time $t-1$, which we use as a proxy for gun ownership, and $Z_{j,t}$ is a set of demographic and economic characteristics of county $j$ at time $t$. Assuming the firearm suicide rate is a good proxy for gun ownership, the parameter $\\beta$ is the effect of gun ownership on homicide rates, controlling for county-level demographic and economic characteristics.\n", - "\n", - "The sample covers 195 large United States counties between the years 1980 through 1999, giving us 3900 observations." - ] - }, - { - "cell_type": "code", - "source": [ - "install.packages(\"glmnet\")\n", - "install.packages(\"randomForest\")\n", - "install.packages(\"xgboost\")\n", - "install.packages(\"keras\")\n", - "install.packages(\"tensorflow\")\n", - "install.packages(\"xtable\")\n", - "install.packages(\"dplyr\")\n", - "install.packages(\"sandwich\")\n", - "\n", - "library(glmnet)\n", - "library(randomForest)\n", - "library(xgboost)\n", - "library(keras)\n", - "library(tensorflow)\n", - "library(xtable)\n", - "library(dplyr)\n", - "library(sandwich)" - ], - "metadata": { - "id": "nIdoZ226yN1a" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/gun_clean.csv\"\n", - "data <- read.csv(file)\n", - "dim(data)" - ], - "metadata": { - "id": "WHTx8goy46e9" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.025977, - "end_time": "2021-07-23T16:17:57.064860", - "exception": false, - "start_time": "2021-07-23T16:17:57.038883", - "status": "completed" - }, - "tags": [], - "id": "TkxefAQ7u-SD" - }, - "source": [ - "## Preprocessing\n", - "\n", - "To attempt to flexibly account for fixed heterogeneity across counties, common time factors, and deterministic time trends, we include county-level averages, time period averages, initial conditions, and the time index as additional control variables. This strategy is related to strategies for addressing latent sources of heterogeneity via conditioning." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.024998, - "end_time": "2021-07-23T16:17:57.115009", - "exception": false, - "start_time": "2021-07-23T16:17:57.090011", - "status": "completed" - }, - "tags": [], - "id": "FR0sUlnYu-SD" - }, - "source": [ - "We first reweight time and county variables as the data are population weighted." 
- ] - }, - { - "cell_type": "code", - "source": [ - "# Note: These data are population weighted. Specifically,\n", - "# looking at the JBES replication files, they seem to be multiplied\n", - "# by sqrt((1/T sum_t population_{j,t})/100000). To get the\n", - "# unweighted variables need to divide by this number - which we can\n", - "# get from the time effects. We are mostly just going to use the weighted\n", - "# variables as inputs - except for time and county. We'll take\n", - "# cross-sectional and time series means of these weighted variables\n", - "# as well. Note that there is nothing wrong with this, but it does not\n", - "# reproduce a weighted regression in a setting where covariates may\n", - "# enter nonlinearly and flexibly.\n", - "\n", - "## County FE\n", - "county.vars <- select(data, starts_with('X_J'))\n", - "\n", - "## Time variables and population weights\n", - "# Pull out time variables\n", - "time.vars <- select(data, starts_with('X_T'))\n", - "\n", - "# Use these to construct population weights\n", - "popweights <- rowSums(time.vars)\n", - "\n", - "# Unweighted time variables\n", - "time.vars <- time.vars/popweights\n", - "\n", - "# For any columns with only zero (like the first one), just drop\n", - "time.vars <- time.vars[, colSums(time.vars != 0) > 0]\n", - "\n", - "# Create time index\n", - "time.ind <- rowSums(time.vars*(seq(1:20)))" - ], - "metadata": { - "id": "no2XXU9F460B" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Now we create initial conditions, county-level averages, and time period averages." - ], - "metadata": { - "id": "pKPGeFnurGys" - } - }, - { - "cell_type": "code", - "source": [ - " ###### Create new data frame with variables we'll use\n", - "\n", - "# Function to find variable names\n", - "varlist <- function (df=NULL,type=c(\"numeric\",\"factor\",\"character\"), pattern=\"\", exclude=NULL) {\n", - " vars <- character(0)\n", - " if (any(type %in% \"numeric\")) {\n", - " vars <- c(vars,names(df)[sapply(df,is.numeric)])\n", - " }\n", - " if (any(type %in% \"factor\")) {\n", - " vars <- c(vars,names(df)[sapply(df,is.factor)])\n", - " }\n", - " if (any(type %in% \"character\")) {\n", - " vars <- c(vars,names(df)[sapply(df,is.character)])\n", - " }\n", - " vars[(!vars %in% exclude) & grepl(vars,pattern=pattern)]\n", - "}\n", - "\n", - "# census control variables\n", - "census <- NULL\n", - "census_var <- c(\"^AGE\", \"^BN\", \"^BP\", \"^BZ\", \"^ED\", \"^EL\",\"^HI\", \"^HS\", \"^INC\", \"^LF\", \"^LN\", \"^PI\", \"^PO\", \"^PP\", \"^PV\", \"^SPR\", \"^VS\")\n", - "\n", - "for(i in 1:length(census_var)){\n", - " census <- append(census, varlist(data, pattern=census_var[i]))\n", - "}\n", - "\n", - "# other control variables\n", - "X1 <- c(\"logrobr\", \"logburg\", \"burg_missing\", \"robrate_missing\")\n", - "X2 <- c(\"newblack\", \"newfhh\", \"newmove\", \"newdens\", \"newmal\")\n", - "\n", - "# \"treatment\" variable\n", - "d <- \"logfssl\"\n", - "\n", - "# outcome variable\n", - "y <- \"logghomr\"\n", - "\n", - "# new data frame for time index\n", - "usedata <- as.data.frame(time.ind)\n", - "colnames(usedata) <- \"time.ind\"\n", - "usedata[,\"weights\"] <- popweights\n", - "\n", - "varlist <- c(y,d,X1,X2,census)\n", - "for(i in 1:length(varlist)){\n", - " usedata[, varlist[i]] <- data[,varlist[i]]\n", - "}\n", - "\n", - "####################### Construct county specific means,\n", - "# time specific means, initial conditions\n", - "\n", - "# Initial conditions\n", - "varlist0 <- 
c(y,X1,X2,census)\n", - "for(i in 1:length(varlist0)){\n", - " usedata[, paste(varlist0[i],\"0\" , sep=\"\")] <- kronecker(usedata[time.ind == 1,varlist0[i]],\n", - " rep(1,20))\n", - "}\n", - "\n", - "# County means\n", - "varlistJ <- c(X1,X2,census)\n", - "county.vars <- as.matrix(county.vars)\n", - "for(i in 1:length(varlistJ)){\n", - " usedata[, paste(varlistJ[i],\"J\" , sep=\"\")] <-\n", - " county.vars%*%qr.solve(county.vars,as.matrix(usedata[,varlistJ[i]]))\n", - "}\n", - "\n", - "# Time means\n", - "time.vars <- as.matrix(time.vars)\n", - "for(i in 1:length(varlistJ)){\n", - " usedata[, paste(varlistJ[i],\"T\" , sep=\"\")] <-\n", - " time.vars%*%qr.solve(time.vars,as.matrix(usedata[,varlistJ[i]]))\n", - "}" - ], - "metadata": { - "id": "0yv3j0wJ464e" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.02615, - "end_time": "2021-07-23T16:18:24.461261", - "exception": false, - "start_time": "2021-07-23T16:18:24.435111", - "status": "completed" - }, - "tags": [], - "id": "s7ngh8j2u-SF" - }, - "source": [ - "# Estimation\n", - "\n" - ] + { + "cell_type": "markdown", + "metadata": { + "id": "R4ZopCRVu-SA", + "papermill": { + "duration": 0.024533, + "end_time": "2021-07-23T16:17:55.753444", + "exception": false, + "start_time": "2021-07-23T16:17:55.728911", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.02615, - "end_time": "2021-07-23T16:18:24.513673", - "exception": false, - "start_time": "2021-07-23T16:18:24.487523", - "status": "completed" - }, - "tags": [], - "id": "d-qK9imxu-SF" - }, - "source": [ - "## Baseline OLS Estimates" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "papermill": { - "duration": 0.027888, - "end_time": "2021-07-23T16:18:24.568278", - "exception": false, - "start_time": "2021-07-23T16:18:24.540390", - "status": "completed" - }, - "tags": [], - "id": "tiBCuqUdu-SG" - }, - "source": [ - "After preprocessing the data, as a baseline model, we first look at simple regression of $Y_{j,t}$ on $D_{j,t-1}$ without controls in the full data set." - ] - }, - { - "cell_type": "code", - "source": [ - "# Simple regression\n", - "lm0 <- lm(logghomr ~ logfssl, data = usedata)\n", - "vc0 <- vcovHC(lm0)\n", - "cat(\"Baseline OLS:\",lm0$coefficients[2],\" (\",sqrt(vc0[2,2]),\")\\n\")\n", - "# Confidence Interval with HC3 covariance\n", - "tt <- qt(c(0.025,0.975),summary(lm0)$df[2])\n", - "se <- sqrt(diag(vc0))\n", - "ci <- coef(lm0) + se %o% tt\n", - "cat(\"2.5%: \", ci[2,1],\"97.5%: \", ci[2,2])" - ], - "metadata": { - "id": "yX0GRnnlryxu" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "The point estimate is $0.302$ with the confidence interval ranging from 0.277 to 0.327. This\n", - "suggests that increases in gun ownership rates are related to gun homicide rates - if gun ownership increases by 1% then the predicted gun homicide rate goes up by 0.3%, without controlling for counties' characteristics.\n", - "\n" - ], - "metadata": { - "id": "WfcEZxr7rxB2" - } - }, - { - "cell_type": "markdown", - "source": [ - "Next we estimate with the baseline set of controls." 
- ], - "metadata": { - "id": "SCc5D-QhNIsG" - } - }, - { - "cell_type": "code", - "source": [ - "# Regression on baseline controls\n", - "varlist <- c(d,X1,X2,census)\n", - "lmC <- lm(paste(\"logghomr ~\",paste(varlist, collapse = \"+\")), data = usedata)\n", - "vcC <- vcovHC(lmC)\n", - "cat(\"OLS with Controls:\",lmC$coefficients[\"logfssl\"],\" (\",sqrt(vcC[\"logfssl\",\"logfssl\"]),\")\\n\")" - ], - "metadata": { - "id": "ljFlAr5Isjzd" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "\n", - "\n", - "We can also run our regression with time and space averages as controls." - ], - "metadata": { - "id": "e-b9PUBBs2rE" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "iOFCWtUKyFK2" - }, - "outputs": [], - "source": [ - "# Regression on time and cross sectional averages\n", - "varlistX <- c(X1,X2,census)\n", - "varlistMeans <- c(d,X1,X2,census)\n", - "for(i in 1:length(varlistX)){\n", - " varlistMeans <- c(varlistMeans,paste(varlistX[i],\"J\" , sep=\"\"))\n", - "}\n", - "for(i in 1:length(varlistX)){\n", - " varlistMeans <- c(varlistMeans,paste(varlistX[i],\"T\" , sep=\"\"))\n", - "}\n", - "lmM <- lm(paste(\"logghomr ~\",paste(varlistMeans, collapse = \"+\")), data = usedata)\n", - "vcM <- vcovHC(lmM)\n", - "cat(\"OLS with Averages:\",lmM$coefficients[\"logfssl\"],\" (\",sqrt(vcM[\"logfssl\",\"logfssl\"]),\")\\n\")" - ] + "tags": [] + }, + "source": [ + "We consider the problem of estimating the effect of gun ownership on the homicide rate. For this purpose, we perform inference on $\\beta$ in the following the partially linear model:\n", + "$$\n", + "Y_{j, t}=\\beta D_{j,(t-1)}+g\\left(X_{j, t}, \\bar{X}_j, \\bar{X}_t, X_{j, 0}, Y_{j, 0}, t\\right)+\\epsilon_{j, t}\n", + "$$\n", + "$Y_{j, t}$ is the log homicide rate in county $j$ at time $t. D_{j, t-1}$ is the log fraction of suicides committed with a firearm in county $j$ at time $t-1$, which we use as a proxy for gun ownership $G_{j, t}$, which is not observed. $X_{j, t}$ is a set of demographic and economic characteristics of county $j$ at time $t$. We use $\\bar{X}_j$ to denote the within county average of $X_{j, t}$ and $\\bar{X}_t$ to denote the within time period average of $X_{j, t} . X_{j, 0}$ and $Y_{j, 0}$ denote initial conditions in county $j$. We use $Z_{j, t}$ to denote the set of observed control variables $\\left\\{X_{j, t}, \\bar{X}_j, \\bar{X}_t, X_{j, 0}, Y_{j, 0}, t\\right\\}$, so that our model is\n", + "\n", + "$$\n", + " Y_{i,t} = \\beta D_{i,(t-1)} + g(Z_{i,t}) + \\epsilon_{i,t}.\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ubu-QI2Ju-SB", + "papermill": { + "duration": 0.024711, + "end_time": "2021-07-23T16:17:55.803109", + "exception": false, + "start_time": "2021-07-23T16:17:55.778398", + "status": "completed" }, - { - "cell_type": "markdown", - "source": [ - "Since our goal is to estimate the effect of gun ownership after controlling for a rich set county characteristics, we now include all controls." 
- ], - "metadata": { - "id": "EdhH_81itPev" - } + "tags": [] + }, + "source": [ + "## Data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fV3y0eiCu-SB", + "papermill": { + "duration": 0.025115, + "end_time": "2021-07-23T16:17:55.854426", + "exception": false, + "start_time": "2021-07-23T16:17:55.829311", + "status": "completed" }, - { - "cell_type": "code", - "source": [ - "# Regression on all controls\n", - "lmA <- lm(logghomr ~ ., data = usedata)\n", - "vcA <- vcovHC(lmA)\n", - "cat(\"OLS All:\",lmA$coefficients[\"logfssl\"],\" (\",sqrt(vcA[\"logfssl\",\"logfssl\"]),\")\\n\")" - ], - "metadata": { - "id": "wBMWYpbBtKzy" - }, - "execution_count": null, - "outputs": [] + "tags": [] + }, + "source": [ + "$Y_{j,t}$ is the log homicide rate in county $j$ at time $t$, $D_{j, t-1}$ is the log fraction of suicides committed with a firearm in county $j$ at time $t-1$, which we use as a proxy for gun ownership, and $Z_{j,t}$ is a set of demographic and economic characteristics of county $j$ at time $t$. Assuming the firearm suicide rate is a good proxy for gun ownership, the parameter $\\beta$ is the effect of gun ownership on homicide rates, controlling for county-level demographic and economic characteristics.\n", + "\n", + "The sample covers 195 large United States counties between the years 1980 through 1999, giving us 3900 observations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nIdoZ226yN1a" + }, + "outputs": [], + "source": [ + "install.packages(\"glmnet\")\n", + "install.packages(\"randomForest\")\n", + "install.packages(\"xgboost\")\n", + "install.packages(\"keras\")\n", + "install.packages(\"tensorflow\")\n", + "install.packages(\"xtable\")\n", + "install.packages(\"dplyr\")\n", + "install.packages(\"sandwich\")\n", + "\n", + "library(glmnet)\n", + "library(randomForest)\n", + "library(xgboost)\n", + "library(keras)\n", + "library(tensorflow)\n", + "library(xtable)\n", + "library(dplyr)\n", + "library(sandwich)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WHTx8goy46e9" + }, + "outputs": [], + "source": [ + "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/gun_clean.csv\"\n", + "data <- read.csv(file)\n", + "dim(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TkxefAQ7u-SD", + "papermill": { + "duration": 0.025977, + "end_time": "2021-07-23T16:17:57.064860", + "exception": false, + "start_time": "2021-07-23T16:17:57.038883", + "status": "completed" }, - { - "cell_type": "markdown", - "source": [ - "After controlling for a rich set of characteristics, the point estimate of gun ownership attenuates to 0.179.\n", - "\n", - "***NB***: In the background, `lm()` is dropping variables based on collinearity diagnostics. These depend on system linear algebra routines and can lead to large differences in high-dimensional or other ill-conditioned settings when using otherwise identical code across languages and/or machines.\n", - "\n", - "Now we turn to our double machine learning framework, employing linear and flexible estimation methods with cross-fitting." - ], - "metadata": { - "id": "b60ollfHydRw" - } + "tags": [] + }, + "source": [ + "## Preprocessing\n", + "\n", + "To attempt to flexibly account for fixed heterogeneity across counties, common time factors, and deterministic time trends, we include county-level averages, time period averages, initial conditions, and the time index as additional control variables. 
This strategy is related to strategies for addressing latent sources of heterogeneity via conditioning." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FR0sUlnYu-SD", + "papermill": { + "duration": 0.024998, + "end_time": "2021-07-23T16:17:57.115009", + "exception": false, + "start_time": "2021-07-23T16:17:57.090011", + "status": "completed" }, - { - "cell_type": "markdown", - "source": [ - "## DML Estimates\n", - "\n", - "We perform inference on $\\beta$ in the following the partially linear model:\n", - " $$\n", - "Y_{j, t}=\\beta D_{j,(t-1)}+g(Z_{j,t})+\\epsilon_{j, t}.\n", - "$$\n", - "In the first stage, using cross-fitting, we employ modern regression methods to build estimators $\\hat \\ell(Z_{j,t})$ and $\\hat m(Z_{j,t})$, where\n", - "- $\\ell(Z_{j,t}):=E(Y_{j,t}|Z_{j,t})$\n", - "- $m(Z_{j,t}):=E(D_{j,t}|Z_{j,t})$\n", - "\n", - "Using these, we obtain the estimates of the residualized quantities\n", - "- $\\tilde Y_{j,t} = Y_{j,t}- E(Y_{j,t}|Z_{j,t})$\n", - "- $\\tilde D_{j,t}= D_{j,t}- E(D_{j,t}|Z_{j,t})$\n", - "\n", - "Using these residualized quantities, we note our model can be written as\n", - "$$\n", - "\\tilde Y_{j,t} = \\beta \\tilde D_{j,t} + \\epsilon_{j,t}, \\quad E (\\epsilon_{j,t} |\\tilde D_{j,t}) =0.\n", - "$$\n", - "In the final stage, using ordinary least squares of $\\tilde Y_{j,t}$ on $\\tilde D_{j,t}$, we obtain the\n", - "estimate of $\\beta$." - ], - "metadata": { - "id": "702RF417z6-1" - } + "tags": [] + }, + "source": [ + "We first reweight time and county variables as the data are population weighted." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "no2XXU9F460B" + }, + "outputs": [], + "source": [ + "# Note: These data are population weighted. Specifically,\n", + "# looking at the JBES replication files, they seem to be multiplied\n", + "# by sqrt((1/T sum_t population_{j,t})/100000). To get the\n", + "# unweighted variables need to divide by this number - which we can\n", + "# get from the time effects. We are mostly just going to use the weighted\n", + "# variables as inputs - except for time and county. We'll take\n", + "# cross-sectional and time series means of these weighted variables\n", + "# as well. Note that there is nothing wrong with this, but it does not\n", + "# reproduce a weighted regression in a setting where covariates may\n", + "# enter nonlinearly and flexibly.\n", + "\n", + "## County FE\n", + "county.vars <- select(data, starts_with('X_J'))\n", + "\n", + "## Time variables and population weights\n", + "# Pull out time variables\n", + "time.vars <- select(data, starts_with('X_T'))\n", + "\n", + "# Use these to construct population weights\n", + "popweights <- rowSums(time.vars)\n", + "\n", + "# Unweighted time variables\n", + "time.vars <- time.vars/popweights\n", + "\n", + "# For any columns with only zero (like the first one), just drop\n", + "time.vars <- time.vars[, colSums(time.vars != 0) > 0]\n", + "\n", + "# Create time index\n", + "time.ind <- rowSums(time.vars*(seq(1:20)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pKPGeFnurGys" + }, + "source": [ + "Now we create initial conditions, county-level averages, and time period averages." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0yv3j0wJ464e" + }, + "outputs": [], + "source": [ + " ###### Create new data frame with variables we'll use\n", + "\n", + "# Function to find variable names\n", + "varlist <- function (df=NULL,type=c(\"numeric\",\"factor\",\"character\"), pattern=\"\", exclude=NULL) {\n", + " vars <- character(0)\n", + " if (any(type %in% \"numeric\")) {\n", + " vars <- c(vars,names(df)[sapply(df,is.numeric)])\n", + " }\n", + " if (any(type %in% \"factor\")) {\n", + " vars <- c(vars,names(df)[sapply(df,is.factor)])\n", + " }\n", + " if (any(type %in% \"character\")) {\n", + " vars <- c(vars,names(df)[sapply(df,is.character)])\n", + " }\n", + " vars[(!vars %in% exclude) & grepl(vars,pattern=pattern)]\n", + "}\n", + "\n", + "# census control variables\n", + "census <- NULL\n", + "census_var <- c(\"^AGE\", \"^BN\", \"^BP\", \"^BZ\", \"^ED\", \"^EL\",\"^HI\", \"^HS\", \"^INC\", \"^LF\", \"^LN\", \"^PI\", \"^PO\", \"^PP\", \"^PV\", \"^SPR\", \"^VS\")\n", + "\n", + "for(i in 1:length(census_var)){\n", + " census <- append(census, varlist(data, pattern=census_var[i]))\n", + "}\n", + "\n", + "# other control variables\n", + "X1 <- c(\"logrobr\", \"logburg\", \"burg_missing\", \"robrate_missing\")\n", + "X2 <- c(\"newblack\", \"newfhh\", \"newmove\", \"newdens\", \"newmal\")\n", + "\n", + "# \"treatment\" variable\n", + "d <- \"logfssl\"\n", + "\n", + "# outcome variable\n", + "y <- \"logghomr\"\n", + "\n", + "# new data frame for time index\n", + "usedata <- as.data.frame(time.ind)\n", + "colnames(usedata) <- \"time.ind\"\n", + "usedata[,\"weights\"] <- popweights\n", + "\n", + "varlist <- c(y,d,X1,X2,census)\n", + "for(i in 1:length(varlist)){\n", + " usedata[, varlist[i]] <- data[,varlist[i]]\n", + "}\n", + "\n", + "####################### Construct county specific means,\n", + "# time specific means, initial conditions\n", + "\n", + "# Initial conditions\n", + "varlist0 <- c(y,X1,X2,census)\n", + "for(i in 1:length(varlist0)){\n", + " usedata[, paste(varlist0[i],\"0\" , sep=\"\")] <- kronecker(usedata[time.ind == 1,varlist0[i]],\n", + " rep(1,20))\n", + "}\n", + "\n", + "# County means\n", + "varlistJ <- c(X1,X2,census)\n", + "county.vars <- as.matrix(county.vars)\n", + "for(i in 1:length(varlistJ)){\n", + " usedata[, paste(varlistJ[i],\"J\" , sep=\"\")] <-\n", + " county.vars%*%qr.solve(county.vars,as.matrix(usedata[,varlistJ[i]]))\n", + "}\n", + "\n", + "# Time means\n", + "time.vars <- as.matrix(time.vars)\n", + "for(i in 1:length(varlistJ)){\n", + " usedata[, paste(varlistJ[i],\"T\" , sep=\"\")] <-\n", + " time.vars%*%qr.solve(time.vars,as.matrix(usedata[,varlistJ[i]]))\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s7ngh8j2u-SF", + "papermill": { + "duration": 0.02615, + "end_time": "2021-07-23T16:18:24.461261", + "exception": false, + "start_time": "2021-07-23T16:18:24.435111", + "status": "completed" }, - { - "cell_type": "markdown", - "source": [ - "In the following, we consider 10 different methods for the first-stage models for $\\ell(\\cdot)$ and $m(\\cdot)$ covering linear, penalized linear, and flexible methods. We also report the first-stage RMSE scores for estimating $Y$ and $D$." 
- ], - "metadata": { - "id": "Y1rLIZVx1LNv" - } + "tags": [] + }, + "source": [ + "# Estimation\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d-qK9imxu-SF", + "papermill": { + "duration": 0.02615, + "end_time": "2021-07-23T16:18:24.513673", + "exception": false, + "start_time": "2021-07-23T16:18:24.487523", + "status": "completed" }, - { - "cell_type": "code", - "source": [ - "# NB: this cell takes > 3 hours to runon colab. To reduce computation time,\n", - "# reduce the number of cross-fitting folds. Note this may affect stability\n", - "# of estimates.\n", - "\n", - "set.seed(123)\n", - "\n", - "# Cross-fitting\n", - "n <- nrow(usedata)\n", - "Kf <- 5 # Number of cross-fitting folds\n", - "sampleframe <- rep(1:Kf, ceiling(n/Kf))\n", - "cvgroup <- sample(sampleframe, size=n, replace = FALSE) # Cross-fitting groups\n", - "\n", - "# Initialize variables for cross-fit predictions\n", - "yhat.r <- matrix(NA,n,10) # Going to consider 10 learners\n", - "dhat.r <- matrix(NA,n,10)\n", - "\n", - "# Cross-fitting loop\n", - "for(k in 1:Kf) {\n", - " cat(\"fold: \", k,\"\\n\")\n", - " indk <- cvgroup == k\n", - "\n", - " ktrain <- usedata[!indk,]\n", - " ktest <- usedata[indk,]\n", - "\n", - " #### Simple regression models ####\n", - "\n", - " # Simple regression\n", - " yhat.r[indk,1] <- ktest$logghomr - mean(ktrain$logghomr)\n", - " dhat.r[indk,1] <- ktest$logfssl - mean(ktrain$logfssl)\n", - "\n", - " # Baseline controls\n", - " varlist <- c(X1,X2,census)\n", - " lmyk.C <- lm(paste(\"logghomr ~\",paste(varlist, collapse = \"+\")), data = ktrain)\n", - " yhat.r[indk,2] <- ktest$logghomr - predict(lmyk.C, ktest)\n", - " lmdk.C <- lm(paste(\"logfssl ~\",paste(varlist, collapse = \"+\")), data = ktrain)\n", - " dhat.r[indk,2] <- ktest$logfssl - predict(lmdk.C, ktest)\n", - "\n", - " # All controls\n", - " lmyk.A <- lm(logghomr ~ .-logfssl, data = ktrain)\n", - " yhat.r[indk,3] <- ktest$logghomr - predict(lmyk.A, ktest)\n", - " lmdk.A <- lm(logfssl ~ .-logghomr, data = ktrain)\n", - " dhat.r[indk,3] <- ktest$logfssl - predict(lmdk.A, ktest)\n", - "\n", - " #### Penalized Linear Models ####\n", - "\n", - " # Lasso - default CV tuning\n", - " ytrain <- as.matrix(usedata[!indk,\"logghomr\"])\n", - " dtrain <- as.matrix(usedata[!indk,\"logfssl\"])\n", - " xtrain <- as.matrix(usedata[!indk,!names(usedata) %in%\n", - " c(\"logghomr\", \"logfssl\")])\n", - " ytest <- as.matrix(usedata[indk,\"logghomr\"])\n", - " dtest <- as.matrix(usedata[indk,\"logfssl\"])\n", - " xtest <- as.matrix(usedata[indk,!names(usedata) %in%\n", - " c(\"logghomr\", \"logfssl\")])\n", - "\n", - " lassoyk <- cv.glmnet(xtrain,ytrain)\n", - " yhat.r[indk,4] <- ytest - predict(lassoyk, newx = xtest, s = \"lambda.min\")\n", - "\n", - " lassodk <- cv.glmnet(xtrain,dtrain)\n", - " dhat.r[indk,4] <- dtest - predict(lassodk, newx = xtest, s = \"lambda.min\")\n", - "\n", - " # Ridge\n", - " ridgeyk <- cv.glmnet(xtrain,ytrain,alpha = 0)\n", - " yhat.r[indk,5] <- ytest - predict(ridgeyk, newx = xtest, s = \"lambda.min\")\n", - "\n", - " ridgedk <- cv.glmnet(xtrain,dtrain, alpha = 0)\n", - " dhat.r[indk,5] <- dtest - predict(ridgedk, newx = xtest, s = \"lambda.min\")\n", - "\n", - " # EN, .5 - no cv over alpha\n", - " enyk <- cv.glmnet(xtrain,ytrain,alpha = .5)\n", - " yhat.r[indk,6] <- ytest - predict(enyk, newx = xtest, s = \"lambda.min\")\n", - "\n", - " endk <- cv.glmnet(xtrain,dtrain, alpha = .5)\n", - " dhat.r[indk,6] <- dtest - predict(endk, newx = xtest, s = \"lambda.min\")\n", - "\n", - " #### Flexible 
regression models ####\n", - "\n", - " # Random forest\n", - " rfyk <- randomForest(logghomr ~ .-logfssl, data = ktrain)\n", - " yhat.r[indk,7] <- ktest$logghomr - predict(rfyk, ktest)\n", - " rfdk <- randomForest(logfssl ~ .-logghomr, data = ktrain)\n", - " dhat.r[indk,7] <- ktest$logfssl - predict(rfdk, ktest)\n", - "\n", - " # Boosted tree - depth 4\n", - " xgb_train.y = xgb.DMatrix(data = as.matrix(usedata[!indk,!names(usedata) %in%\n", - " c(\"logghomr\", \"logfssl\")]),\n", - " label = as.matrix(usedata[!indk,\"logghomr\"]))\n", - " xgb_test.y = xgb.DMatrix(data = as.matrix(usedata[indk,!names(usedata) %in%\n", - " c(\"logghomr\", \"logfssl\")]),\n", - " label = as.matrix(usedata[indk,\"logghomr\"]))\n", - " xgb_train.d = xgb.DMatrix(data = as.matrix(usedata[!indk,!names(usedata) %in%\n", - " c(\"logghomr\", \"logfssl\")]),\n", - " label = as.matrix(usedata[!indk,\"logfssl\"]))\n", - " xgb_test.d = xgb.DMatrix(data = as.matrix(usedata[indk,!names(usedata) %in%\n", - " c(\"logghomr\", \"logfssl\")]),\n", - " label = as.matrix(usedata[indk,\"logfssl\"]))\n", - "\n", - " byk = xgb.cv(data = xgb_train.y,\n", - " nrounds = 1000, verbose = 0, eta = .1, max_depth = 4, nfold = 5)\n", - " best.iter = which.min(as.matrix(byk$evaluation_log[,4]))\n", - " byk = xgboost(data = xgb_train.y,\n", - " nrounds = 1000, verbose = 0, eta = .1, max_depth = 4)\n", - " yhat.r[indk,8] = ktest$logghomr - predict(byk, newdata = xgb_test.y,\n", - " iterationrange = c(1,(best.iter+1)))\n", - "\n", - " bdk = xgb.cv(data = xgb_train.d,\n", - " nrounds = 1000, verbose = 0, eta = .1, max_depth = 4, nfold = 5)\n", - " best.iter = which.min(as.matrix(bdk$evaluation_log[,4]))\n", - " bdk = xgboost(data = xgb_train.d,\n", - " nrounds = 1000, verbose = 0, eta = .1, max_depth = 4)\n", - " dhat.r[indk,8] = ktest$logfssl - predict(bdk, newdata = xgb_test.d,\n", - " iterationrange = c(1,(best.iter+1)))\n", - "\n", - " #### Neural Networks ####\n", - "\n", - " # normalize the covariate data\n", - " mean <- apply(xtrain, 2, mean)\n", - " std <- apply(xtrain, 2, sd)\n", - " xtrainNN <- scale(xtrain, center = mean, scale = std)\n", - " xtestNN <- scale(xtest, center = mean, scale = std)\n", - "\n", - " xtestNN <- xtestNN[,which(!is.nan(colMeans(xtrainNN)))]\n", - " xtrainNN <- xtrainNN[,which(!is.nan(colMeans(xtrainNN)))]\n", - "\n", - " # DNN 50/50/50/50, .5 dropout\n", - " NNmodely <- keras_model_sequential()\n", - " NNmodely %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>%\n", - " layer_dropout(rate = .5) %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", - " layer_dropout(rate = .5) %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", - " layer_dropout(rate = .5) %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", - " layer_dropout(rate = .5) %>%\n", - " layer_dense(units = 1)\n", - "\n", - " NNmodely %>% compile(\n", - " loss = \"mse\",\n", - " optimizer = optimizer_rmsprop())\n", - "\n", - " fit.NNmodely <- NNmodely %>% fit(\n", - " xtrainNN, ytrain,\n", - " epochs = 200, batch_size = 200,\n", - " validation_split = .2, verbose = 0\n", - " )\n", - " yhat.r[indk,9] <- ktest$logghomr - predict(NNmodely, xtestNN)\n", - "\n", - " NNmodeld <- keras_model_sequential()\n", - " NNmodeld %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>%\n", - " layer_dropout(rate = .5) %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", - " layer_dropout(rate = .5) %>%\n", - " layer_dense(units = 50, activation = 'relu') 
%>%\n", - " layer_dropout(rate = .5) %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", - " layer_dropout(rate = .5) %>%\n", - " layer_dense(units = 1)\n", - "\n", - " NNmodeld %>% compile(\n", - " loss = \"mse\",\n", - " optimizer = optimizer_rmsprop())\n", - "\n", - " fit.NNmodeld <- NNmodeld %>% fit(\n", - " xtrainNN, dtrain,\n", - " epochs = 200, batch_size = 200,\n", - " validation_split = .2, verbose = 0\n", - " )\n", - " dhat.r[indk,9] <- ktest$logfssl - predict(NNmodeld, xtestNN)\n", - "\n", - " # DNN 50/50/50/50, early stopping\n", - " NNmodely <- keras_model_sequential()\n", - " NNmodely %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", - " layer_dense(units = 1)\n", - "\n", - " NNmodely %>% compile(\n", - " loss = \"mse\",\n", - " optimizer = optimizer_rmsprop())\n", - "\n", - " early.stop <- callback_early_stopping(monitor = \"val_loss\", patience = 25,\n", - " restore_best_weights = TRUE)\n", - "\n", - " fit.NNmodely <- NNmodely %>% fit(\n", - " xtrainNN, ytrain,\n", - " epochs = 200, batch_size = 200,\n", - " validation_split = .2, verbose = 0,\n", - " callbacks = list(early.stop)\n", - " )\n", - " yhat.r[indk,10] <- ktest$logghomr - predict(NNmodely, xtestNN)\n", - "\n", - " NNmodeld <- keras_model_sequential()\n", - " NNmodeld %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", - " layer_dense(units = 1)\n", - "\n", - " NNmodeld %>% compile(\n", - " loss = \"mse\",\n", - " optimizer = optimizer_rmsprop())\n", - "\n", - " early.stop <- callback_early_stopping(monitor = \"val_loss\", patience = 25,\n", - " restore_best_weights = TRUE)\n", - "\n", - " fit.NNmodeld <- NNmodeld %>% fit(\n", - " xtrainNN, dtrain,\n", - " epochs = 200, batch_size = 200,\n", - " validation_split = .2, verbose = 0,\n", - " callbacks = list(early.stop)\n", - " )\n", - " dhat.r[indk,10] <- ktest$logfssl - predict(NNmodeld, xtestNN)\n", - "\n", - "}\n", - "\n", - "################################################################################\n", - "# Predictions done, now DML\n", - "\n", - "RMSE.y <- sqrt(colMeans(yhat.r^2))\n", - "RMSE.d <- sqrt(colMeans(dhat.r^2))\n", - "\n", - "# dml coefficient estimates\n", - "b.dml <- rep(NA,10)\n", - "s.dml <- rep(NA,10)\n", - "for(k in 1:10){\n", - " lm.k <- lm(yhat.r[,k] ~ dhat.r[,k]-1)\n", - " v.k <- vcovHC(lm.k)\n", - " b.dml[k] <- lm.k$coefficients\n", - " s.dml[k] <- sqrt(v.k)\n", - "}\n", - "\n", - "# \"best\" coefficient estimate\n", - "lm.k <- lm(yhat.r[,which.min(RMSE.y)] ~ dhat.r[,which.min(RMSE.d)]-1)\n", - "v.k <- vcovHC(lm.k)\n", - "b.dml[11] <- lm.k$coefficients\n", - "s.dml[11] <- sqrt(v.k)\n", - "\n", - "# ls model average\n", - "yhat <- usedata$logghomr - yhat.r\n", - "dhat <- usedata$logfssl - dhat.r\n", - "\n", - "ma.y <- lm(usedata$logghomr ~ yhat-1)\n", - "ma.d <- lm(usedata$logfssl ~ dhat-1)\n", - "weights.y <- ma.y$coefficients\n", - "weights.d <- ma.d$coefficients\n", - "lm.k <- lm(ma.y$residuals ~ ma.d$residuals-1)\n", - "v.k <- vcovHC(lm.k)\n", - "b.dml[12] <- lm.k$coefficients\n", - "s.dml[12] <- sqrt(v.k)\n", - "\n", - "## Display results\n", - "table1 <- matrix(0, 10, 2)\n", - "table1[,1] <- RMSE.y\n", - 
"table1[,2] <- RMSE.d\n", - "colnames(table1)<- c(\"RMSE Y\",\"RMSE D\")\n", - "rownames(table1)<- c(\"OLS - No Controls\", \"OLS - Basic\", \"OLS - All\",\n", - " \"Lasso (CV)\", \"Ridge (CV)\", \"Elastic Net (.5,CV)\",\n", - " \"Random Forest\",\"Boosted trees - depth 4\",\n", - " \"DNN - 50/50/50/50, dropout\", \"DNN - 50/50/50/50, early stopping\")\n", - "tab1<- xtable(table1, digits =c(0,4,4))\n", - "tab1\n", - "\n", - "table2 <- matrix(0, 12, 2)\n", - "table2[,1] <- b.dml\n", - "table2[,2] <- s.dml\n", - "colnames(table2)<- c(\"Point Estimate\",\"Std. Error\")\n", - "rownames(table2)<- c(\"OLS - No Controls\", \"OLS - Basic\", \"OLS - All\",\n", - " \"Lasso (CV)\", \"Ridge (CV)\", \"Elastic Net (.5,CV)\",\n", - " \"Random Forest\",\"Boosted trees - depth 4\",\n", - " \"DNN - 50/50/50/50, dropout\", \"DNN - 50/50/50/50, early stopping\",\n", - " \"Best\",\"Least Squares Model Average\")\n", - "tab2<- xtable(table2, digits =c(0,4,4))\n", - "tab2" - ], - "metadata": { - "id": "u8n1149MolrR" - }, - "execution_count": null, - "outputs": [] + "tags": [] + }, + "source": [ + "## Baseline OLS Estimates" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tiBCuqUdu-SG", + "papermill": { + "duration": 0.027888, + "end_time": "2021-07-23T16:18:24.568278", + "exception": false, + "start_time": "2021-07-23T16:18:24.540390", + "status": "completed" }, - { - "cell_type": "code", - "source": [ - "print(xtable(table1,type=\"latex\"))\n", - "print(xtable(table2,type=\"latex\"))" - ], - "metadata": { - "id": "FjJjD8gRURmc" - }, - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file + "tags": [] + }, + "source": [ + "After preprocessing the data, as a baseline model, we first look at simple regression of $Y_{j,t}$ on $D_{j,t-1}$ without controls in the full data set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yX0GRnnlryxu" + }, + "outputs": [], + "source": [ + "# Simple regression\n", + "lm0 <- lm(logghomr ~ logfssl, data = usedata)\n", + "vc0 <- vcovHC(lm0)\n", + "cat(\"Baseline OLS:\",lm0$coefficients[2],\" (\",sqrt(vc0[2,2]),\")\\n\")\n", + "# Confidence Interval with HC3 covariance\n", + "tt <- qt(c(0.025,0.975),summary(lm0)$df[2])\n", + "se <- sqrt(diag(vc0))\n", + "ci <- coef(lm0) + se %o% tt\n", + "cat(\"2.5%: \", ci[2,1],\"97.5%: \", ci[2,2])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WfcEZxr7rxB2" + }, + "source": [ + "The point estimate is $0.302$ with the confidence interval ranging from 0.277 to 0.327. This\n", + "suggests that increases in gun ownership rates are related to gun homicide rates - if gun ownership increases by 1% then the predicted gun homicide rate goes up by 0.3%, without controlling for counties' characteristics.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SCc5D-QhNIsG" + }, + "source": [ + "Next we estimate with the baseline set of controls." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ljFlAr5Isjzd" + }, + "outputs": [], + "source": [ + "# Regression on baseline controls\n", + "varlist <- c(d,X1,X2,census)\n", + "lmC <- lm(paste(\"logghomr ~\",paste(varlist, collapse = \"+\")), data = usedata)\n", + "vcC <- vcovHC(lmC)\n", + "cat(\"OLS with Controls:\",lmC$coefficients[\"logfssl\"],\" (\",sqrt(vcC[\"logfssl\",\"logfssl\"]),\")\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e-b9PUBBs2rE" + }, + "source": [ + "\n", + "\n", + "We can also run our regression with time and space averages as controls." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iOFCWtUKyFK2" + }, + "outputs": [], + "source": [ + "# Regression on time and cross sectional averages\n", + "varlistX <- c(X1,X2,census)\n", + "varlistMeans <- c(d,X1,X2,census)\n", + "for(i in 1:length(varlistX)){\n", + " varlistMeans <- c(varlistMeans,paste(varlistX[i],\"J\" , sep=\"\"))\n", + "}\n", + "for(i in 1:length(varlistX)){\n", + " varlistMeans <- c(varlistMeans,paste(varlistX[i],\"T\" , sep=\"\"))\n", + "}\n", + "lmM <- lm(paste(\"logghomr ~\",paste(varlistMeans, collapse = \"+\")), data = usedata)\n", + "vcM <- vcovHC(lmM)\n", + "cat(\"OLS with Averages:\",lmM$coefficients[\"logfssl\"],\" (\",sqrt(vcM[\"logfssl\",\"logfssl\"]),\")\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EdhH_81itPev" + }, + "source": [ + "Since our goal is to estimate the effect of gun ownership after controlling for a rich set county characteristics, we now include all controls." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBMWYpbBtKzy" + }, + "outputs": [], + "source": [ + "# Regression on all controls\n", + "lmA <- lm(logghomr ~ ., data = usedata)\n", + "vcA <- vcovHC(lmA)\n", + "cat(\"OLS All:\",lmA$coefficients[\"logfssl\"],\" (\",sqrt(vcA[\"logfssl\",\"logfssl\"]),\")\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b60ollfHydRw" + }, + "source": [ + "After controlling for a rich set of characteristics, the point estimate of gun ownership attenuates to 0.179.\n", + "\n", + "***NB***: In the background, `lm()` is dropping variables based on collinearity diagnostics. These depend on system linear algebra routines and can lead to large differences in high-dimensional or other ill-conditioned settings when using otherwise identical code across languages and/or machines.\n", + "\n", + "Now we turn to our double machine learning framework, employing linear and flexible estimation methods with cross-fitting." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "702RF417z6-1" + }, + "source": [ + "## DML Estimates\n", + "\n", + "We perform inference on $\\beta$ in the following the partially linear model:\n", + " $$\n", + "Y_{j, t}=\\beta D_{j,(t-1)}+g(Z_{j,t})+\\epsilon_{j, t}.\n", + "$$\n", + "In the first stage, using cross-fitting, we employ modern regression methods to build estimators $\\hat \\ell(Z_{j,t})$ and $\\hat m(Z_{j,t})$, where\n", + "- $\\ell(Z_{j,t}):=E(Y_{j,t}|Z_{j,t})$\n", + "- $m(Z_{j,t}):=E(D_{j,t}|Z_{j,t})$\n", + "\n", + "Using these, we obtain the estimates of the residualized quantities\n", + "- $\\tilde Y_{j,t} = Y_{j,t}- E(Y_{j,t}|Z_{j,t})$\n", + "- $\\tilde D_{j,t}= D_{j,t}- E(D_{j,t}|Z_{j,t})$\n", + "\n", + "Using these residualized quantities, we note our model can be written as\n", + "$$\n", + "\\tilde Y_{j,t} = \\beta \\tilde D_{j,t} + \\epsilon_{j,t}, \\quad E (\\epsilon_{j,t} |\\tilde D_{j,t}) =0.\n", + "$$\n", + "In the final stage, using ordinary least squares of $\\tilde Y_{j,t}$ on $\\tilde D_{j,t}$, we obtain the\n", + "estimate of $\\beta$." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y1rLIZVx1LNv" + }, + "source": [ + "In the following, we consider 10 different methods for the first-stage models for $\\ell(\\cdot)$ and $m(\\cdot)$ covering linear, penalized linear, and flexible methods. We also report the first-stage RMSE scores for estimating $Y$ and $D$." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "u8n1149MolrR" + }, + "outputs": [], + "source": [ + "# NB: this cell takes > 3 hours to runon colab. To reduce computation time,\n", + "# reduce the number of cross-fitting folds. Note this may affect stability\n", + "# of estimates.\n", + "\n", + "set.seed(123)\n", + "\n", + "# Cross-fitting\n", + "n <- nrow(usedata)\n", + "Kf <- 5 # Number of cross-fitting folds\n", + "sampleframe <- rep(1:Kf, ceiling(n/Kf))\n", + "cvgroup <- sample(sampleframe, size=n, replace = FALSE) # Cross-fitting groups\n", + "\n", + "# Initialize variables for cross-fit predictions\n", + "yhat.r <- matrix(NA,n,10) # Going to consider 10 learners\n", + "dhat.r <- matrix(NA,n,10)\n", + "\n", + "# Cross-fitting loop\n", + "for(k in 1:Kf) {\n", + " cat(\"fold: \", k,\"\\n\")\n", + " indk <- cvgroup == k\n", + "\n", + " ktrain <- usedata[!indk,]\n", + " ktest <- usedata[indk,]\n", + "\n", + " #### Simple regression models ####\n", + "\n", + " # Simple regression\n", + " yhat.r[indk,1] <- ktest$logghomr - mean(ktrain$logghomr)\n", + " dhat.r[indk,1] <- ktest$logfssl - mean(ktrain$logfssl)\n", + "\n", + " # Baseline controls\n", + " varlist <- c(X1,X2,census)\n", + " lmyk.C <- lm(paste(\"logghomr ~\",paste(varlist, collapse = \"+\")), data = ktrain)\n", + " yhat.r[indk,2] <- ktest$logghomr - predict(lmyk.C, ktest)\n", + " lmdk.C <- lm(paste(\"logfssl ~\",paste(varlist, collapse = \"+\")), data = ktrain)\n", + " dhat.r[indk,2] <- ktest$logfssl - predict(lmdk.C, ktest)\n", + "\n", + " # All controls\n", + " lmyk.A <- lm(logghomr ~ .-logfssl, data = ktrain)\n", + " yhat.r[indk,3] <- ktest$logghomr - predict(lmyk.A, ktest)\n", + " lmdk.A <- lm(logfssl ~ .-logghomr, data = ktrain)\n", + " dhat.r[indk,3] <- ktest$logfssl - predict(lmdk.A, ktest)\n", + "\n", + " #### Penalized Linear Models ####\n", + "\n", + " # Lasso - default CV tuning\n", + " ytrain <- as.matrix(usedata[!indk,\"logghomr\"])\n", + " dtrain <- as.matrix(usedata[!indk,\"logfssl\"])\n", + " xtrain <- as.matrix(usedata[!indk,!names(usedata) %in%\n", + " 
c(\"logghomr\", \"logfssl\")])\n", + " ytest <- as.matrix(usedata[indk,\"logghomr\"])\n", + " dtest <- as.matrix(usedata[indk,\"logfssl\"])\n", + " xtest <- as.matrix(usedata[indk,!names(usedata) %in%\n", + " c(\"logghomr\", \"logfssl\")])\n", + "\n", + " lassoyk <- cv.glmnet(xtrain,ytrain)\n", + " yhat.r[indk,4] <- ytest - predict(lassoyk, newx = xtest, s = \"lambda.min\")\n", + "\n", + " lassodk <- cv.glmnet(xtrain,dtrain)\n", + " dhat.r[indk,4] <- dtest - predict(lassodk, newx = xtest, s = \"lambda.min\")\n", + "\n", + " # Ridge\n", + " ridgeyk <- cv.glmnet(xtrain,ytrain,alpha = 0)\n", + " yhat.r[indk,5] <- ytest - predict(ridgeyk, newx = xtest, s = \"lambda.min\")\n", + "\n", + " ridgedk <- cv.glmnet(xtrain,dtrain, alpha = 0)\n", + " dhat.r[indk,5] <- dtest - predict(ridgedk, newx = xtest, s = \"lambda.min\")\n", + "\n", + " # EN, .5 - no cv over alpha\n", + " enyk <- cv.glmnet(xtrain,ytrain,alpha = .5)\n", + " yhat.r[indk,6] <- ytest - predict(enyk, newx = xtest, s = \"lambda.min\")\n", + "\n", + " endk <- cv.glmnet(xtrain,dtrain, alpha = .5)\n", + " dhat.r[indk,6] <- dtest - predict(endk, newx = xtest, s = \"lambda.min\")\n", + "\n", + " #### Flexible regression models ####\n", + "\n", + " # Random forest\n", + " rfyk <- randomForest(logghomr ~ .-logfssl, data = ktrain)\n", + " yhat.r[indk,7] <- ktest$logghomr - predict(rfyk, ktest)\n", + " rfdk <- randomForest(logfssl ~ .-logghomr, data = ktrain)\n", + " dhat.r[indk,7] <- ktest$logfssl - predict(rfdk, ktest)\n", + "\n", + " # Boosted tree - depth 4\n", + " xgb_train.y = xgb.DMatrix(data = as.matrix(usedata[!indk,!names(usedata) %in%\n", + " c(\"logghomr\", \"logfssl\")]),\n", + " label = as.matrix(usedata[!indk,\"logghomr\"]))\n", + " xgb_test.y = xgb.DMatrix(data = as.matrix(usedata[indk,!names(usedata) %in%\n", + " c(\"logghomr\", \"logfssl\")]),\n", + " label = as.matrix(usedata[indk,\"logghomr\"]))\n", + " xgb_train.d = xgb.DMatrix(data = as.matrix(usedata[!indk,!names(usedata) %in%\n", + " c(\"logghomr\", \"logfssl\")]),\n", + " label = as.matrix(usedata[!indk,\"logfssl\"]))\n", + " xgb_test.d = xgb.DMatrix(data = as.matrix(usedata[indk,!names(usedata) %in%\n", + " c(\"logghomr\", \"logfssl\")]),\n", + " label = as.matrix(usedata[indk,\"logfssl\"]))\n", + "\n", + " byk = xgb.cv(data = xgb_train.y,\n", + " nrounds = 1000, verbose = 0, eta = .1, max_depth = 4, nfold = 5)\n", + " best.iter = which.min(as.matrix(byk$evaluation_log[,4]))\n", + " byk = xgboost(data = xgb_train.y,\n", + " nrounds = 1000, verbose = 0, eta = .1, max_depth = 4)\n", + " yhat.r[indk,8] = ktest$logghomr - predict(byk, newdata = xgb_test.y,\n", + " iterationrange = c(1,(best.iter+1)))\n", + "\n", + " bdk = xgb.cv(data = xgb_train.d,\n", + " nrounds = 1000, verbose = 0, eta = .1, max_depth = 4, nfold = 5)\n", + " best.iter = which.min(as.matrix(bdk$evaluation_log[,4]))\n", + " bdk = xgboost(data = xgb_train.d,\n", + " nrounds = 1000, verbose = 0, eta = .1, max_depth = 4)\n", + " dhat.r[indk,8] = ktest$logfssl - predict(bdk, newdata = xgb_test.d,\n", + " iterationrange = c(1,(best.iter+1)))\n", + "\n", + " #### Neural Networks ####\n", + "\n", + " # normalize the covariate data\n", + " mean <- apply(xtrain, 2, mean)\n", + " std <- apply(xtrain, 2, sd)\n", + " xtrainNN <- scale(xtrain, center = mean, scale = std)\n", + " xtestNN <- scale(xtest, center = mean, scale = std)\n", + "\n", + " xtestNN <- xtestNN[,which(!is.nan(colMeans(xtrainNN)))]\n", + " xtrainNN <- xtrainNN[,which(!is.nan(colMeans(xtrainNN)))]\n", + "\n", + " # DNN 50/50/50/50, .5 dropout\n", + " 
NNmodely <- keras_model_sequential()\n", + " NNmodely %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>%\n", + " layer_dropout(rate = .5) %>%\n", + " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dropout(rate = .5) %>%\n", + " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dropout(rate = .5) %>%\n", + " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dropout(rate = .5) %>%\n", + " layer_dense(units = 1)\n", + "\n", + " NNmodely %>% compile(\n", + " loss = \"mse\",\n", + " optimizer = optimizer_rmsprop())\n", + "\n", + " fit.NNmodely <- NNmodely %>% fit(\n", + " xtrainNN, ytrain,\n", + " epochs = 200, batch_size = 200,\n", + " validation_split = .2, verbose = 0\n", + " )\n", + " yhat.r[indk,9] <- ktest$logghomr - predict(NNmodely, xtestNN)\n", + "\n", + " NNmodeld <- keras_model_sequential()\n", + " NNmodeld %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>%\n", + " layer_dropout(rate = .5) %>%\n", + " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dropout(rate = .5) %>%\n", + " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dropout(rate = .5) %>%\n", + " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dropout(rate = .5) %>%\n", + " layer_dense(units = 1)\n", + "\n", + " NNmodeld %>% compile(\n", + " loss = \"mse\",\n", + " optimizer = optimizer_rmsprop())\n", + "\n", + " fit.NNmodeld <- NNmodeld %>% fit(\n", + " xtrainNN, dtrain,\n", + " epochs = 200, batch_size = 200,\n", + " validation_split = .2, verbose = 0\n", + " )\n", + " dhat.r[indk,9] <- ktest$logfssl - predict(NNmodeld, xtestNN)\n", + "\n", + " # DNN 50/50/50/50, early stopping\n", + " NNmodely <- keras_model_sequential()\n", + " NNmodely %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>%\n", + " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dense(units = 1)\n", + "\n", + " NNmodely %>% compile(\n", + " loss = \"mse\",\n", + " optimizer = optimizer_rmsprop())\n", + "\n", + " early.stop <- callback_early_stopping(monitor = \"val_loss\", patience = 25,\n", + " restore_best_weights = TRUE)\n", + "\n", + " fit.NNmodely <- NNmodely %>% fit(\n", + " xtrainNN, ytrain,\n", + " epochs = 200, batch_size = 200,\n", + " validation_split = .2, verbose = 0,\n", + " callbacks = list(early.stop)\n", + " )\n", + " yhat.r[indk,10] <- ktest$logghomr - predict(NNmodely, xtestNN)\n", + "\n", + " NNmodeld <- keras_model_sequential()\n", + " NNmodeld %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>%\n", + " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dense(units = 1)\n", + "\n", + " NNmodeld %>% compile(\n", + " loss = \"mse\",\n", + " optimizer = optimizer_rmsprop())\n", + "\n", + " early.stop <- callback_early_stopping(monitor = \"val_loss\", patience = 25,\n", + " restore_best_weights = TRUE)\n", + "\n", + " fit.NNmodeld <- NNmodeld %>% fit(\n", + " xtrainNN, dtrain,\n", + " epochs = 200, batch_size = 200,\n", + " validation_split = .2, verbose = 0,\n", + " callbacks = list(early.stop)\n", + " )\n", + " dhat.r[indk,10] <- ktest$logfssl - predict(NNmodeld, xtestNN)\n", + "\n", + "}\n", + "\n", + 
"################################################################################\n", + "# Predictions done, now DML\n", + "\n", + "RMSE.y <- sqrt(colMeans(yhat.r^2))\n", + "RMSE.d <- sqrt(colMeans(dhat.r^2))\n", + "\n", + "# dml coefficient estimates\n", + "b.dml <- rep(NA,10)\n", + "s.dml <- rep(NA,10)\n", + "for(k in 1:10){\n", + " lm.k <- lm(yhat.r[,k] ~ dhat.r[,k]-1)\n", + " v.k <- vcovHC(lm.k)\n", + " b.dml[k] <- lm.k$coefficients\n", + " s.dml[k] <- sqrt(v.k)\n", + "}\n", + "\n", + "# \"best\" coefficient estimate\n", + "lm.k <- lm(yhat.r[,which.min(RMSE.y)] ~ dhat.r[,which.min(RMSE.d)]-1)\n", + "v.k <- vcovHC(lm.k)\n", + "b.dml[11] <- lm.k$coefficients\n", + "s.dml[11] <- sqrt(v.k)\n", + "\n", + "# ls model average\n", + "yhat <- usedata$logghomr - yhat.r\n", + "dhat <- usedata$logfssl - dhat.r\n", + "\n", + "ma.y <- lm(usedata$logghomr ~ yhat-1)\n", + "ma.d <- lm(usedata$logfssl ~ dhat-1)\n", + "weights.y <- ma.y$coefficients\n", + "weights.d <- ma.d$coefficients\n", + "lm.k <- lm(ma.y$residuals ~ ma.d$residuals-1)\n", + "v.k <- vcovHC(lm.k)\n", + "b.dml[12] <- lm.k$coefficients\n", + "s.dml[12] <- sqrt(v.k)\n", + "\n", + "## Display results\n", + "table1 <- matrix(0, 10, 2)\n", + "table1[,1] <- RMSE.y\n", + "table1[,2] <- RMSE.d\n", + "colnames(table1)<- c(\"RMSE Y\",\"RMSE D\")\n", + "rownames(table1)<- c(\"OLS - No Controls\", \"OLS - Basic\", \"OLS - All\",\n", + " \"Lasso (CV)\", \"Ridge (CV)\", \"Elastic Net (.5,CV)\",\n", + " \"Random Forest\",\"Boosted trees - depth 4\",\n", + " \"DNN - 50/50/50/50, dropout\", \"DNN - 50/50/50/50, early stopping\")\n", + "tab1<- xtable(table1, digits =c(0,4,4))\n", + "tab1\n", + "\n", + "table2 <- matrix(0, 12, 2)\n", + "table2[,1] <- b.dml\n", + "table2[,2] <- s.dml\n", + "colnames(table2)<- c(\"Point Estimate\",\"Std. 
Error\")\n", + "rownames(table2)<- c(\"OLS - No Controls\", \"OLS - Basic\", \"OLS - All\",\n", + " \"Lasso (CV)\", \"Ridge (CV)\", \"Elastic Net (.5,CV)\",\n", + " \"Random Forest\",\"Boosted trees - depth 4\",\n", + " \"DNN - 50/50/50/50, dropout\", \"DNN - 50/50/50/50, early stopping\",\n", + " \"Best\",\"Least Squares Model Average\")\n", + "tab2<- xtable(table2, digits =c(0,4,4))\n", + "tab2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FjJjD8gRURmc" + }, + "outputs": [], + "source": [ + "print(xtable(table1,type=\"latex\"))\n", + "print(xtable(table2,type=\"latex\"))" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "name": "ir" + }, + "language_info": { + "name": "R" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 31afef16c6deade2f68f6f6c3a6db7167939252a Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 08:49:15 -0700 Subject: [PATCH 079/261] linting fixes --- .github/workflows/transform-R-to-Rmd.yml | 2 +- .lintr | 2 +- PM2/r_linear_penalized_regs.irnb | 1305 ++++++++++++---------- PM2/r_ml_for_wage_prediction.irnb | 805 ++++++------- 4 files changed, 1169 insertions(+), 945 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 390ed380..f7d32594 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -62,7 +62,7 @@ jobs: R -e ' library(lintr) linters <- with_defaults(line_length_linter = line_length_linter(120), - object_name_linter = object_name_linter(styles = c("snake_case", "CamelCase"))) + object_name_linter = object_name_linter(styles = c("snake_case", "CamelCase", "camelCase"))) rmd_files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.Rmd$", full.names = TRUE) results <- lapply(rmd_files, function(file) { lints <- lint(file, linters) diff --git a/.lintr b/.lintr index c015a66d..9e79e4da 100644 --- a/.lintr +++ b/.lintr @@ -1,4 +1,4 @@ linters: linters_with_defaults( line_length_linter(120), - object_name_linter(styles = c("snake_case", "CamelCase")) + object_name_linter(styles = c("snake_case", "CamelCase", "camelCase")) ) diff --git a/PM2/r_linear_penalized_regs.irnb b/PM2/r_linear_penalized_regs.irnb index 1662c145..f170aa4b 100644 --- a/PM2/r_linear_penalized_regs.irnb +++ b/PM2/r_linear_penalized_regs.irnb @@ -5,6 +5,7 @@ "metadata": { "_execution_state": "idle", "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", + "id": "EaMt_4G0ONZ7", "papermill": { "duration": 0.010774, "end_time": "2021-02-15T11:01:41.782833", @@ -12,8 +13,7 @@ "start_time": "2021-02-15T11:01:41.772059", "status": "completed" }, - "tags": [], - "id": "EaMt_4G0ONZ7" + "tags": [] }, "source": [ "# Penalized Linear Regressions: A Simulation Experiment" @@ -21,6 +21,14 @@ }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Fw3Ya0m6vboO", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "install.packages(\"xtable\")\n", "install.packages(\"hdm\")\n", @@ -30,74 +38,76 @@ "library(xtable)\n", "library(glmnet)\n", "library(ggplot2)" - ], - "metadata": { - "id": "Fw3Ya0m6vboO" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "## Data Generating Process" - ], "metadata": { "id": "GNTVs-CtE-U9" - } + }, + "source": [ + "## Data Generating Process" + ] }, { "cell_type": "markdown", - "source": [ - "We define a simple data generating process that allows for sparse, 
dense, and sparse+dense coefficients" - ], "metadata": { "id": "UXGpnWeeFAHV" - } + }, + "source": [ + "We define a simple data generating process that allows for sparse, dense, and sparse+dense coefficients" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "N1TPWyBtBrqB", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "gen_data <- function(n, p, regime = \"sparse\") {\n", " # constants chosen to get R^2 of approximately .80\n", " if (regime == \"sparse\") {\n", - " beta <- (1/seq(1:(p))^2)*7;\n", + " beta <- (1 / seq(1:p)^2) * 7\n", " } else if (regime == \"dense\") {\n", - " beta <- (rnorm(p))*0.5;\n", + " beta <- rnorm(p) * 0.5\n", " } else if (regime == \"sparsedense\") {\n", - " beta_1 <- (1/seq(1:(p))^2)*6.5\n", - " beta_2 <- (rnorm(p,0,0.5))*0.7\n", - " beta <- beta_1+beta_2\n", + " beta_1 <- (1 / seq(1:p)^2) * 6.5\n", + " beta_2 <- rnorm(p, 0, 0.5) * 0.7\n", + " beta <- beta_1 + beta_2\n", " }\n", "\n", " true_fn <- function(x) {\n", - " (x[,1:dim(x)[2]]%*%beta)\n", + " x[, seq_len(dim(x)[2])] %*% beta\n", " }\n", "\n", - " X <- matrix(runif(n*p, min = -0.5, max = 0.5), n, p);\n", + " X <- matrix(runif(n * p, min = -0.5, max = 0.5), n, p)\n", " gX <- true_fn(X)\n", " y <- gX + rnorm(n)\n", "\n", - " Xtest <- matrix(runif(n*p, min = -0.5, max = 0.5), n, p)\n", + " Xtest <- matrix(runif(n * p, min = -0.5, max = 0.5), n, p)\n", " gXtest <- true_fn(Xtest)\n", " ytest <- gXtest + rnorm(n)\n", "\n", - " Xpop <- matrix(runif(100000*p, min = -0.5, max = 0.5), 100000,p)\n", + " Xpop <- matrix(runif(100000 * p, min = -0.5, max = 0.5), 100000, p)\n", " gXpop <- true_fn(Xpop)\n", " ypop <- gXpop + rnorm(100000)\n", "\n", - " return(list(X = X, y = y, gX = gX, Xtest = Xtest, ytest = ytest, gXtest = gXtest, Xpop = Xpop, ypop = ypop, gXpop = gXpop, beta=beta))\n", + " return(list(\n", + " X = X, y = y, gX = gX, Xtest = Xtest, ytest = ytest, gXtest = gXtest,\n", + " Xpop = Xpop, ypop = ypop, gXpop = gXpop, beta = beta\n", + " ))\n", "}" - ], - "metadata": { - "id": "N1TPWyBtBrqB" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", "metadata": { + "id": "5UedfBJpONZ7", "papermill": { "duration": 0.010616, "end_time": "2021-02-15T11:01:41.804126", @@ -105,8 +115,7 @@ "start_time": "2021-02-15T11:01:41.793510", "status": "completed" }, - "tags": [], - "id": "5UedfBJpONZ7" + "tags": [] }, "source": [ "## Data Generating Process: Approximately Sparse" @@ -114,20 +123,31 @@ }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LV521EPdA05z", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "set.seed(1)\n", "n <- 100\n", "p <- 400\n", - "res <- gen_data(n,p,regime=\"sparse\")" - ], - "metadata": { - "id": "LV521EPdA05z" - }, - "execution_count": null, - "outputs": [] + "res <- gen_data(n, p, regime = \"sparse\")" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "REt70Qs_zBPl", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "X <- res$X\n", "y <- res$y\n", @@ -139,79 +159,87 @@ "ypop <- res$ypop\n", "gXpop <- res$gXpop\n", "betas <- res$beta" - ], - "metadata": { - "id": "REt70Qs_zBPl" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "plot(gX,y, xlab=\"g(X)\", ylab=\"y\") #plot V vs g(X)\n", - "print(c(\"theoretical R2:\", var(gX)/var(y))) # theoretical R-square in the simulation example" - ], + "execution_count": null, "metadata": { - "id": "3lvcbHdqv11D" + "id": 
"3lvcbHdqv11D", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "plot(gX, y, xlab = \"g(X)\", ylab = \"y\") # plot V vs g(X)\n", + "print(c(\"theoretical R2:\", var(gX) / var(y))) # theoretical R-square in the simulation example" + ] }, { "cell_type": "code", - "source": [ - "# Plot betas\n", - "plot(1:length(betas), abs(betas), log = \"y\", pch = 20, col = \"blue\",\n", - " xlab = expression(beta), ylab = \"Magnitude (log scale)\",\n", - " main = expression(paste(\"Beta Magnitude\")))" - ], + "execution_count": null, "metadata": { - "id": "Ry_b39bLDIDT" + "id": "Ry_b39bLDIDT", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "# Plot betas\n", + "plot(seq_along(betas), abs(betas),\n", + " log = \"y\", pch = 20, col = \"blue\",\n", + " xlab = expression(beta), ylab = \"Magnitude (log scale)\",\n", + " main = expression(paste(\"Beta Magnitude\"))\n", + ")" + ] }, { "cell_type": "markdown", - "source": [ - "## Lasso, Ridge, ElasticNet" - ], "metadata": { "id": "g6jcTnhwUkhl" - } + }, + "source": [ + "## Lasso, Ridge, ElasticNet" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "aRWiO93SUw1G" + }, "source": [ "We use glmnet's penalized estimators, which choose the penalty parameter via cross-validation (by default 10-fold cross-validation). These methods search over an adaptively chosen grid of hyperparameters. The parameter `alpha` controls what penalty (or allows for a convex combination of `l1` and `l2` penalty). Set `alpha=0.5` for elastic net.\n", "\n", "Features will be standardized (by glmnet) so that penalization does not favor different features asymmetrically." - ], - "metadata": { - "id": "aRWiO93SUw1G" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Dy1XNF6JXPpe", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ - "r2_score <- function(preds, actual, ytrain = y){\n", - " rss <- sum((preds - actual) ^ 2) ## residual sum of squares\n", - " tss <- sum((actual - mean(ytrain)) ^ 2) ## total sum of squares, we take mean(ytrain) as mean(actual) is an out-of-sample object\n", - " rsq <- 1 - rss/tss\n", + "r2_score <- function(preds, actual, ytrain = y) {\n", + " rss <- sum((preds - actual)^2) # residual sum of squares\n", + " # total sum of squares, we take mean(ytrain) as mean(actual) is an out-of-sample object\n", + " tss <- sum((actual - mean(ytrain))^2)\n", + " rsq <- 1 - rss / tss\n", " return(rsq)\n", "}" - ], - "metadata": { - "id": "Dy1XNF6JXPpe" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", "execution_count": null, "metadata": { + "id": "Cy7dThUhONZ_", "papermill": { "duration": 2.898022, "end_time": "2021-02-15T11:01:45.358083", @@ -220,91 +248,110 @@ "status": "completed" }, "tags": [], - "id": "Cy7dThUhONZ_" + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "fit.lasso.cv <- cv.glmnet(X, y, family=\"gaussian\", alpha=1, nfolds=5) # family gaussian means that we'll be using square loss\n", - "fit.ridge <- cv.glmnet(X, y, family=\"gaussian\", alpha=0, nfolds=5) # family gaussian means that we'll be using square loss\n", - "fit.elnet <- cv.glmnet(X, y, family=\"gaussian\", alpha=.5, nfolds=5) # family gaussian means that we'll be using square loss" + "# family gaussian means that we'll be using square loss\n", + "fit_lasso_cv <- cv.glmnet(X, y, family = \"gaussian\", alpha = 1, nfolds = 5)\n", + "# family gaussian means that 
we'll be using square loss\n", + "fit_ridge <- cv.glmnet(X, y, family = \"gaussian\", alpha = 0, nfolds = 5)\n", + "# family gaussian means that we'll be using square loss\n", + "fit_elnet <- cv.glmnet(X, y, family = \"gaussian\", alpha = .5, nfolds = 5)" ] }, { "cell_type": "markdown", - "source": [ - "We calculate the R-squared on the small test set that we have" - ], "metadata": { "id": "a7WQJRJ6l0n4" - } + }, + "source": [ + "We calculate the R-squared on the small test set that we have" + ] }, { "cell_type": "code", - "source": [ - "cat('lassocv R2 (Test): ', r2_score(predict(fit.lasso.cv, newx = Xtest, s=\"lambda.min\"), ytest),\n", - " '\\nridge R2 (Test): ', r2_score(predict(fit.ridge, newx = Xtest, s=\"lambda.min\"), ytest),\n", - " '\\nelnet R2 (Test): ', r2_score(predict(fit.elnet, newx = Xtest, s=\"lambda.min\"), ytest)\n", - ")" - ], + "execution_count": null, "metadata": { - "id": "SMuo4MlvXtxH" + "id": "SMuo4MlvXtxH", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "cat(\n", + " \"lassocv R2 (Test): \", r2_score(predict(fit_lasso_cv, newx = Xtest, s = \"lambda.min\"), ytest),\n", + " \"\\nridge R2 (Test): \", r2_score(predict(fit_ridge, newx = Xtest, s = \"lambda.min\"), ytest),\n", + " \"\\nelnet R2 (Test): \", r2_score(predict(fit_elnet, newx = Xtest, s = \"lambda.min\"), ytest)\n", + ")" + ] }, { "cell_type": "markdown", - "source": [ - "We also calculate what the R-squared would be in the population limit (in our case for practical purposes when we have a very very large test sample)" - ], "metadata": { "id": "Fw7a-6_-Yhbb" - } + }, + "source": [ + "We also calculate what the R-squared would be in the population limit (in our case for practical purposes when we have a very very large test sample)" + ] }, { "cell_type": "code", - "source": [ - "R2.lasso.cv <- r2_score(predict(fit.lasso.cv, newx = Xpop, s=\"lambda.min\"), ypop)\n", - "R2.ridge <- r2_score(predict(fit.ridge, newx = Xpop, s=\"lambda.min\"), ypop)\n", - "R2.elnet <- r2_score(predict(fit.elnet, newx = Xpop, s=\"lambda.min\"), ypop)\n", - "\n", - "cat('lassocv R2 (Pop): ', R2.lasso.cv,\n", - " '\\nridge R2 (Pop): ', R2.ridge,\n", - " '\\nelnet R2 (Pop): ', R2.elnet\n", - ")" - ], + "execution_count": null, "metadata": { - "id": "UKmjj0fdYiL1" + "id": "UKmjj0fdYiL1", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "r2_lasso_cv <- r2_score(predict(fit_lasso_cv, newx = Xpop, s = \"lambda.min\"), ypop)\n", + "r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = \"lambda.min\"), ypop)\n", + "r2_elnet <- r2_score(predict(fit_elnet, newx = Xpop, s = \"lambda.min\"), ypop)\n", + "\n", + "cat(\n", + " \"lassocv R2 (Pop): \", r2_lasso_cv,\n", + " \"\\nridge R2 (Pop): \", r2_ridge,\n", + " \"\\nelnet R2 (Pop): \", r2_elnet\n", + ")" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "QECIRikt3j5y" + }, "source": [ "#### glmnet failure in Ridge\n", "\n", "**Note**: Ridge regression performs worse relatively to the Ridge in the correponding [Python notebook](https://colab.research.google.com/github/CausalAIBook/MetricsMLNotebooks/blob/main/PM2/python_linear_penalized_regs.ipynb). Even if one were to control for the randomness in the data and use the same data for both, R's glmnet fails.\n", "\n", "To understand why, look at the cross-validated MSE curve with different $\\lambda$ ()." 
- ], - "metadata": { - "id": "QECIRikt3j5y" - } + ] }, { "cell_type": "code", - "source": [ - "plot(fit.ridge)" - ], + "execution_count": null, "metadata": { - "id": "kUvo6YbaHaSN" + "id": "kUvo6YbaHaSN", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "plot(fit_ridge)" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "mVRvqs8fnRaA" + }, "source": [ "From the [glmnet documentation](https://glmnet.stanford.edu/articles/glmnet.html):\n", "\n", @@ -313,177 +360,201 @@ "> This plots the cross-validation curve (red dotted line) along with upper and lower standard deviation curves along the $\\lambda$ sequence (error bars). Two special values along the $\\lambda$ sequence are indicated by the vertical dotted lines. ```lambda.min``` is the value of $\\lambda$ that gives minimum mean cross-validated error, while ```lambda.1se``` is the value of $\\lambda$ that gives the most regularized model such that the cross-validated error is within one standard error of the minimum.\n", "\n", "Notice that the chosen ```lambda.min``` is at the boundary of the sequence. An easy way to check this instead of plotting is to extract the $\\lambda$ sequence and the minimum chosen $\\lambda_{min}$ from the fitted object." - ], - "metadata": { - "id": "mVRvqs8fnRaA" - } + ] }, { "cell_type": "code", - "source": [ - "cat(\"lambda sequence: \", fit.ridge$lambda)\n", - "cat(\"\\nChosen minimum lambda: \", fit.ridge$lambda.min)" - ], + "execution_count": null, "metadata": { - "id": "ZsjlfgrynSLx" + "id": "ZsjlfgrynSLx", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "cat(\"lambda sequence: \", fit_ridge$lambda)\n", + "cat(\"\\nChosen minimum lambda: \", fit_ridge$lambda.min)" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "9VKXxWilod6N" + }, "source": [ "In general, it is good practice to examine the lambda sequence that R produces and searches over in cross-validation. When the penalty is chosen at the boundary like we see here, this indicates the generated penalty sequence is likely misspecified. Thus, we choose to supply our own sequence. 
In particular, we choose values that match up with those in Python's ```sklearn``` Ridge implementation.\n", "\n", "\n", "```glmnet``` minimizes the elastic net loss function as follows:\n", - "$$\\min_{\\beta} \\frac{1}{N} \\| X\\beta - y\\|_2^2 + \\lambda_{R} \\left( \\frac{1}{2} (1-\\alpha) \\|\\beta\\|_2^2 + \\alpha \\|\\beta\\|_1 \\right) $$ For ridge, $\\alpha=0$, so $$\\min_{\\beta} \\frac{1}{N} \\| X\\beta - y\\|_2^2 + \\frac{\\lambda_{R}}{2} \\|\\beta\\|_2^2 $$\n", + "$$\\min_{\\beta} \\frac{1}{N} \\| X\\beta - y\\|_2^2 + \\lambda_{R} \\left( \\frac{1}{2} (1-\\alpha) \\|\\beta\\|_2^2 + \\alpha \\|\\beta\\|_1 \\right) $$ \n", + "\n", + "For ridge, $\\alpha=0$, so $$\\min_{\\beta} \\frac{1}{N} \\| X\\beta - y\\|_2^2 + \\frac{\\lambda_{R}}{2} \\|\\beta\\|_2^2 $$\n", "\n", "Meanwhile, ```sklearn``` minimizes $$\\min_{\\beta} \\frac{1}{N} \\|X\\beta-y\\|_2^2 + \\frac{\\lambda_{python}}{N} \\|\\beta\\|_2^2$$ where $\\lambda_{python}$ is chosen from the grid $(0.1,1,10)$.\n", "\n", "To translate this into R, we must set in glmnet $$\\lambda_{R} :=\\frac{2}{N} \\lambda_{python}$$" - ], - "metadata": { - "id": "9VKXxWilod6N" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "o-k2e0zMI65-", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# sklearn lambdas (penalty)\n", - "lambdas_sklearn = c(0.1,1,10) # defaults\n", - "l_seq = 2 / nrow(X) * lambdas_sklearn\n", + "lambdas_sklearn <- c(0.1, 1, 10) # defaults\n", + "l_seq <- 2 / nrow(X) * lambdas_sklearn\n", "l_seq # note how different these are to the actual lambdas generated by glmnet" - ], - "metadata": { - "id": "o-k2e0zMI65-" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "fit.ridge <- cv.glmnet(X, y, family=\"gaussian\", alpha=0, nfolds=5, lambda = l_seq)\n", - "R2.ridge <- r2_score(predict(fit.ridge, newx = Xpop, s=\"lambda.min\"), ypop)" - ], + "execution_count": null, "metadata": { - "id": "gLH-u5we8QaY" + "id": "gLH-u5we8QaY", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "fit_ridge <- cv.glmnet(X, y, family = \"gaussian\", alpha = 0, nfolds = 5, lambda = l_seq)\n", + "r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = \"lambda.min\"), ypop)" + ] }, { "cell_type": "code", - "source": [ - "cat('lassocv R2 (Pop): ', R2.lasso.cv,\n", - " '\\nridge R2 (Pop): ', R2.ridge,\n", - " '\\nelnet R2 (Pop): ', R2.elnet\n", - ")" - ], + "execution_count": null, "metadata": { - "id": "snYw1Gg0phee" + "id": "snYw1Gg0phee", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "cat(\n", + " \"lassocv R2 (Pop): \", r2_lasso_cv,\n", + " \"\\nridge R2 (Pop): \", r2_ridge,\n", + " \"\\nelnet R2 (Pop): \", r2_elnet\n", + ")" + ] }, { "cell_type": "markdown", - "source": [ - "## Plug-in Hyperparameter Lasso and Post-Lasso OLS" - ], "metadata": { "id": "-GuaTiprcCqq" - } + }, + "source": [ + "## Plug-in Hyperparameter Lasso and Post-Lasso OLS" + ] }, { "cell_type": "markdown", - "source": [ - "Here we compute the lasso and ols post lasso using plug-in choices for penalty levels." - ], "metadata": { "id": "T2te6CvUcEa5" - } + }, + "source": [ + "Here we compute the lasso and ols post lasso using plug-in choices for penalty levels." 
+ ] }, { "cell_type": "markdown", + "metadata": { + "id": "NQGL2JsocEjC" + }, "source": [ "\\We use \"plug-in\" tuning with a theoretically valid choice of penalty $\\lambda = 2 \\cdot c \\hat{\\sigma} \\sqrt{n} \\Phi^{-1}(1-\\alpha/2p)$, where $c>1$ and $1-\\alpha$ is a confidence level, and $\\Phi^{-1}$ denotes the quantile function. Under homoskedasticity, this choice ensures that the Lasso predictor is well behaved, delivering good predictive performance under approximate sparsity. In practice, this formula will work well even in the absence of homoskedasticity, especially when the random variables $\\epsilon$ and $X$ in the regression equation decay quickly at the tails.\n", "\n", "In practice, many people choose to use cross-validation, which is perfectly fine for predictive tasks. However, when conducting inference, to make our analysis valid we will require cross-fitting in addition to cross-validation. As we have not yet discussed cross-fitting, we rely on this theoretically-driven penalty in order to allow for accurate inference in the upcoming notebooks." - ], - "metadata": { - "id": "NQGL2JsocEjC" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "G7yKoP1IcI5y" + }, "source": [ "We pull an anaue of R's rlasso. Rlasso functionality: it is searching the right set of regressors. This function was made for the case of ***p*** regressors and ***n*** observations where ***p >>>> n***. It assumes that the error is i.i.d. The errors may be non-Gaussian or heteroscedastic.\\\n", "The post lasso function makes OLS with the selected ***T*** regressors.\n", "To select those parameters, they use $\\lambda$ as variable to penalize\\\n", "**Funny thing: the function rlasso was named like that because it is the \"rigorous\" Lasso.**" - ], - "metadata": { - "id": "G7yKoP1IcI5y" - } + ] }, { "cell_type": "code", - "source": [ - "fit.rlasso <- rlasso(y~X, post=FALSE) # lasso with plug-in penalty level\n", - "fit.rlasso.post <- rlasso(y~X, post=TRUE) # post-lasso with plug-in penalty level" - ], + "execution_count": null, "metadata": { - "id": "fHDKDGlVcXBh" + "id": "fHDKDGlVcXBh", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "fit_rlasso <- rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level\n", + "fit_rlasso_post <- rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YMpfjDycchEp", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ - "R2.lasso <- r2_score(predict(fit.rlasso, newdata = Xtest), ytest)\n", - "R2.lasso.post <- r2_score(predict(fit.rlasso.post, newdata = Xtest), ytest)\n", + "r2_lasso <- r2_score(predict(fit_rlasso, newdata = Xtest), ytest)\n", + "r2_lasso_post <- r2_score(predict(fit_rlasso_post, newdata = Xtest), ytest)\n", "\n", - "cat('rlasso R2 (Test): ', R2.lasso,\n", - " '\\nrlasso-post R2 (Test): ', R2.lasso.post\n", + "cat(\n", + " \"rlasso R2 (Test): \", r2_lasso,\n", + " \"\\nrlasso-post R2 (Test): \", r2_lasso_post\n", ")" - ], - "metadata": { - "id": "YMpfjDycchEp" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7CLOwOKKIgB5", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ - "R2.lasso <- r2_score(predict(fit.rlasso, newdata = (Xpop)), (ypop))\n", - "R2.lasso.post <- r2_score(predict(fit.rlasso.post, newdata = (Xpop)), (ypop))\n", + "r2_lasso <- 
r2_score(predict(fit_rlasso, newdata = (Xpop)), (ypop))\n", + "r2_lasso_post <- r2_score(predict(fit_rlasso_post, newdata = (Xpop)), (ypop))\n", "\n", - "cat('rlasso R2 (Pop): ', R2.lasso,\n", - " '\\nrlasso-post R2 (Pop): ', R2.lasso.post\n", + "cat(\n", + " \"rlasso R2 (Pop): \", r2_lasso,\n", + " \"\\nrlasso-post R2 (Pop): \", r2_lasso_post\n", ")" - ], - "metadata": { - "id": "7CLOwOKKIgB5" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "## LAVA: Dense + Sparse Coefficients" - ], "metadata": { "id": "WUaAe00Uc5-r" - } + }, + "source": [ + "## LAVA: Dense + Sparse Coefficients" + ] }, { "cell_type": "markdown", "metadata": { + "id": "YBN4j8FMONaA", "papermill": { "duration": 0.02899, "end_time": "2021-02-15T11:01:56.880825", @@ -491,8 +562,7 @@ "start_time": "2021-02-15T11:01:56.851835", "status": "completed" }, - "tags": [], - "id": "YBN4j8FMONaA" + "tags": [] }, "source": [ "Next we code up lava, which alternates the fitting of lasso and ridge" @@ -500,275 +570,312 @@ }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jUqZjZJ-mIaG", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# Define function to compute lava estimator. Doing an iterative scheme with fixed\n", "# number of iteration. Could iterate until a convergence criterion is met.\n", - "lava.predict <- function(X, Y, newX, lambda1, lambda2, iter=5){\n", + "lava_predict <- function(X, Y, newX, lambda1, lambda2, iter = 5) {\n", + " require(glmnet)\n", "\n", " # Need to demean internally\n", - " dy = Y - mean(Y)\n", - " dx = scale(X, scale = FALSE)\n", + " dy <- Y - mean(Y)\n", + " dx <- scale(X, scale = FALSE)\n", "\n", - " sp1 = glmnet(dx, dy, lambda = lambda1) #lasso step fits \"sparse part\"\n", - " de1 = glmnet(dx, dy-predict(sp1, newx = dx), alpha = 0, lambda = lambda2)\n", + " sp1 <- glmnet(dx, dy, lambda = lambda1) # lasso step fits \"sparse part\"\n", + " de1 <- glmnet(dx, dy - predict(sp1, newx = dx), alpha = 0, lambda = lambda2)\n", "\n", - " i=1\n", - " while(i<= iter) {\n", - " sp1 = glmnet(dx, dy-predict(de1, newx = dx, s=\"lambda.min\"), lambda = lambda1)\n", - " de1 = glmnet(dx, dy-predict(sp1, newx = dx, s=\"lambda.min\"), alpha = 0, lambda = lambda2)\n", - " i = i+1 }\n", + " i <- 1\n", + " while (i <= iter) {\n", + " sp1 <- glmnet(dx, dy - predict(de1, newx = dx, s = \"lambda.min\"), lambda = lambda1)\n", + " de1 <- glmnet(dx, dy - predict(sp1, newx = dx, s = \"lambda.min\"), alpha = 0, lambda = lambda2)\n", + " i <- i + 1\n", + " }\n", "\n", - " bhat = sp1$beta+de1$beta\n", - " a0 = mean(Y)-sum(colMeans(X)*bhat)\n", + " bhat <- sp1$beta + de1$beta\n", + " a0 <- mean(Y) - sum(colMeans(X) * bhat)\n", "\n", " # Need to add intercept to output\n", "\n", - " yhat = newX%*%bhat + a0\n", + " yhat <- newX %*% bhat + a0\n", "\n", " return(yhat)\n", - "}" - ], - "metadata": { - "id": "jUqZjZJ-mIaG" - }, - "execution_count": null, - "outputs": [] + "}\n" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tr_KBCwwovp6", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# define function to get predictions and r2 scores for lava estimator\n", "\n", - "lava_yhat_r2 <- function(xtr.mod, xte.mod, ytr, yte, num_folds = 5){\n", - "\n", - " # 5-fold CV. glmnet does cross-validation internally and\n", - " # relatively efficiently. 
We're going to write out all the steps to make sure\n", - " # we're using the same CV folds across all procedures in a transparent way and\n", - " # to keep the overall structure clear as well.\n", + "lava_yhat_r2 <- function(xtr_mod, xte_mod, ytr, yte, num_folds = 5) {\n", + " # 5-fold CV. glmnet does cross-validation internally and\n", + " # relatively efficiently. We're going to write out all the steps to make sure\n", + " # we're using the same CV folds across all procedures in a transparent way and\n", + " # to keep the overall structure clear as well.\n", "\n", - " # Setup for brute force K-Fold CV\n", - " n = length(ytr)\n", - " Kf = num_folds # Number of folds\n", - " sampleframe <- rep(1:Kf, ceiling( n/Kf ) )\n", - " cvgroup <- sample( sampleframe , size=n , replace=FALSE ) # CV groups\n", + " # Setup for brute force K-Fold CV\n", + " n <- length(ytr)\n", + " Kf <- num_folds # Number of folds\n", + " sampleframe <- rep(1:Kf, ceiling(n / Kf))\n", + " cvgroup <- sample(sampleframe, size = n, replace = FALSE) # CV groups\n", "\n", "\n", - " ##------------------------------------------------------------\n", - " # We're going to take a shortcut and use the range of lambda values that come out\n", - " # of the default implementation in glmnet for everything. Could do better here - maybe\n", + " ## ------------------------------------------------------------\n", + " # We're going to take a shortcut and use the range of lambda values that come out\n", + " # of the default implementation in glmnet for everything. Could do better here - maybe\n", "\n", - " ## Fit ridge on grid of lambda values (chosen by default using glmnet) using basic model.\n", - " ridge.mod = glmnet(xtr.mod,ytr,alpha=0) # alpha = 0 gives ridge\n", - " ridge.lambda = ridge.mod$lambda # values of penalty parameter\n", + " ## Fit ridge on grid of lambda values (chosen by default using glmnet) using basic model.\n", + " ridge_mod <- glmnet(xtr_mod, ytr, alpha = 0) # alpha = 0 gives ridge\n", + " ridge_lambda <- ridge_mod$lambda # values of penalty parameter\n", "\n", - " ## Fit lasso on grid of lambda values (chosen by default using glmnet) using basic model.\n", - " lasso.mod = glmnet(xtr.mod,ytr) # default is lasso (equivalent to alpha = 1)\n", - " lasso.lambda = lasso.mod$lambda # values of penalty parameter\n", + " ## Fit lasso on grid of lambda values (chosen by default using glmnet) using basic model.\n", + " lasso_mod <- glmnet(xtr_mod, ytr) # default is lasso (equivalent to alpha = 1)\n", + " lasso_lambda <- lasso_mod$lambda # values of penalty parameter\n", "\n", - " ##------------------------------------------------------------\n", + " ## ------------------------------------------------------------\n", "\n", "\n", - " # Lava - Using a double loop over candidate penalty parameter values.\n", + " # Lava - Using a double loop over candidate penalty parameter values.\n", "\n", - " lambda1.lava.mod = lasso.mod$lambda[seq(5,length(lasso.mod$lambda),10)]\n", - " lambda2.lava.mod = ridge.mod$lambda[seq(5,length(ridge.mod$lambda),10)]\n", + " lambda1_lava_mod <- lasso_mod$lambda[seq(5, length(lasso_lambda), 10)]\n", + " lambda2_lava_mod <- ridge_mod$lambda[seq(5, length(ridge_lambda), 10)]\n", "\n", - " CV.mod.lava = matrix(0,length(lambda1.lava.mod),length(lambda2.lava.mod))\n", + " cv_mod_lava <- matrix(0, length(lambda1_lava_mod), length(lambda2_lava_mod))\n", "\n", - " for(k in 1:Kf) {\n", - " indk = cvgroup == k\n", + " for (k in 1:Kf) {\n", + " indk <- cvgroup == k\n", "\n", - " K.xtr.mod = xtr.mod[!indk,]\n", - " 
K.ytr = ytr[!indk]\n", - " K.xte.mod = xtr.mod[indk,]\n", - " K.yte = ytr[indk]\n", + " k_xtr_mod <- xtr_mod[!indk, ]\n", + " k_ytr <- ytr[!indk]\n", + " k_xte_mod <- xtr_mod[indk, ]\n", + " k_yte <- ytr[indk]\n", "\n", - " for(ii in 1:length(lambda1.lava.mod)) {\n", - " for(jj in 1:length(lambda2.lava.mod)) {\n", - " CV.mod.lava[ii,jj] = CV.mod.lava[ii,jj] +\n", - " sum((K.yte - lava.predict(K.xtr.mod, K.ytr, newX = K.xte.mod ,\n", - " lambda1 = lambda1.lava.mod[ii],\n", - " lambda2 = lambda2.lava.mod[jj]))^2)\n", - " }\n", + " for (ii in seq_along(lambda1_lava_mod)) {\n", + " for (jj in seq_along(lambda2_lava_mod)) {\n", + " cv_mod_lava[ii, jj] <- cv_mod_lava[ii, jj] +\n", + " sum((k_yte - lava_predict(k_xtr_mod, k_ytr,\n", + " newX = k_xte_mod,\n", + " lambda1 = lambda1_lava_mod[ii],\n", + " lambda2 = lambda2_lava_mod[jj]))^2)\n", " }\n", - "\n", " }\n", + " }\n", "\n", - " # Get CV min values of tuning parameters\n", - " cvmin.lava.mod = which(CV.mod.lava == min(CV.mod.lava) , arr.ind = TRUE)\n", - " cvlambda1.lava.mod = lambda1.lava.mod[cvmin.lava.mod[1]]\n", - " cvlambda2.lava.mod = lambda2.lava.mod[cvmin.lava.mod[2]]\n", + " # Get CV min values of tuning parameters\n", + " cvmin_lava_mod <- which(cv_mod_lava == min(cv_mod_lava), arr.ind = TRUE)\n", + " cvlambda1_lava_mod <- lambda1_lava_mod[cvmin_lava_mod[1]]\n", + " cvlambda2_lava_mod <- lambda2_lava_mod[cvmin_lava_mod[2]]\n", "\n", - " cat(\"Min Lava Lasso CV Penalty: \", cvlambda1.lava.mod)\n", - " cat(\"\\nMin Lava Ridge CV Penalty: \", cvlambda2.lava.mod)\n", + " cat(\"Min Lava Lasso CV Penalty: \", cvlambda1_lava_mod)\n", + " cat(\"\\nMin Lava Ridge CV Penalty: \", cvlambda2_lava_mod)\n", "\n", "\n", - " #### Look at performance on test sample\n", + " #### Look at performance on test sample\n", "\n", - " # Calculate R^2 in training data and in validation data as measures\n", - " # Refit on entire training sample\n", + " # Calculate R^2 in training data and in validation data as measures\n", + " # Refit on entire training sample\n", "\n", "\n", - " #### CV-min model\n", + " #### CV-min model\n", "\n", - " # In sample fit\n", - " cvmin.yhat.lava.tr <- lava.predict(xtr.mod, ytr, newX = xtr.mod,\n", - " lambda1 = cvlambda1.lava.mod,\n", - " lambda2 = cvlambda2.lava.mod)\n", - " r2.lava.mod = 1-sum((ytr-cvmin.yhat.lava.tr)^2)/sum((ytr-mean(ytr))^2)\n", + " # In sample fit\n", + " cvmin_yhat_lava_tr <- lava_predict(xtr_mod, ytr,\n", + " newX = xtr_mod,\n", + " lambda1 = cvlambda1_lava_mod,\n", + " lambda2 = cvlambda2_lava_mod\n", + " )\n", + " r2_lava_mod <- 1 - sum((ytr - cvmin_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2)\n", "\n", - " # Out of sample fit\n", - " cvmin.yhat.lava.test <- lava.predict(xtr.mod, ytr, newX = xte.mod,\n", - " lambda1 = cvlambda1.lava.mod,\n", - " lambda2 = cvlambda2.lava.mod)\n", - " r2V.lava.mod = 1-sum((yte-cvmin.yhat.lava.test)^2)/sum((yte-mean(ytr))^2)\n", + " # Out of sample fit\n", + " cvmin_yhat_lava_test <- lava_predict(xtr_mod, ytr,\n", + " newX = xte_mod,\n", + " lambda1 = cvlambda1_lava_mod,\n", + " lambda2 = cvlambda2_lava_mod\n", + " )\n", + " r2v_lava_mod <- 1 - sum((yte - cvmin_yhat_lava_test)^2) / sum((yte - mean(ytr))^2)\n", "\n", "\n", - " cat(\"\\nIn sample R2 (CV-min): \", r2.lava.mod)\n", - " cat(\"\\nOut of Sample R2 (CV-min): \", r2V.lava.mod)\n", + " cat(\"\\nIn sample R2 (CV-min): \", r2_lava_mod)\n", + " cat(\"\\nOut of Sample R2 (CV-min): \", r2v_lava_mod)\n", "\n", "\n", - " #### Use average model across cv-folds and refit model using all training data\n", - " ###### we won't 
report these results.\n", - " ###### Averaging is theoretically more solid, but cv-min is more practical.\n", - " n.tr = length(ytr)\n", - " n.te = length(yte)\n", - " yhat.tr.lava.mod = matrix(0,n.tr,Kf)\n", - " yhat.te.lava.mod = matrix(0,n.te,Kf)\n", + " #### Use average model across cv-folds and refit model using all training data\n", + " ###### we won't report these results.\n", + " ###### Averaging is theoretically more solid, but cv-min is more practical.\n", + " n_tr <- length(ytr)\n", + " n_te <- length(yte)\n", + " yhat_tr_lava_mod <- matrix(0, n_tr, Kf)\n", + " yhat_te_lava_mod <- matrix(0, n_te, Kf)\n", "\n", "\n", - " for(k in 1:Kf) {\n", - " indk = cvgroup == k\n", + " for (k in 1:Kf) {\n", + " indk <- cvgroup == k\n", "\n", - " K.xtr.mod = xtr.mod[!indk,]\n", - " K.ytr = ytr[!indk]\n", + " k_xtr_mod <- xtr_mod[!indk, ]\n", + " k_ytr <- ytr[!indk]\n", "\n", - " # Lava\n", - " yhat.tr.lava.mod[,k] = as.vector(lava.predict(K.xtr.mod, K.ytr, newX = xtr.mod ,\n", - " lambda1 = cvlambda1.lava.mod,\n", - " lambda2 = cvlambda2.lava.mod))\n", - " yhat.te.lava.mod[,k] = as.vector(lava.predict(K.xtr.mod, K.ytr, newX = xte.mod ,\n", - " lambda1 = cvlambda1.lava.mod,\n", - " lambda2 = cvlambda2.lava.mod))\n", - " }\n", + " # Lava\n", + " yhat_tr_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr,\n", + " newX = xtr_mod,\n", + " lambda1 = cvlambda1_lava_mod,\n", + " lambda2 = cvlambda2_lava_mod\n", + " ))\n", + " yhat_te_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr,\n", + " newX = xte_mod,\n", + " lambda1 = cvlambda1_lava_mod,\n", + " lambda2 = cvlambda2_lava_mod\n", + " ))\n", + " }\n", "\n", - " avg.yhat.lava.tr <- rowMeans(yhat.tr.lava.mod)\n", - " avg.yhat.lava.test <- rowMeans(yhat.te.lava.mod)\n", - "\n", - " r2.CVave.lava.mod = 1-sum((ytr-avg.yhat.lava.tr)^2)/sum((ytr-mean(ytr))^2)\n", - " r2V.CVave.lava.mod = 1-sum((yte-avg.yhat.lava.test)^2)/sum((yte-mean(ytr))^2)\n", - "\n", - " cat(\"\\nIn sample R2 (Average Across Folds): \", r2.CVave.lava.mod)\n", - " cat(\"\\nOut of Sample R2 (Average Across Folds): \", r2V.CVave.lava.mod)\n", - "\n", - " return(c(cvlambda1.lava.mod,\n", - " cvlambda2.lava.mod,\n", - " cvmin.yhat.lava.tr, # CV_min\n", - " cvmin.yhat.lava.test, # CV_min\n", - " r2.lava.mod, # CV_min\n", - " r2V.lava.mod, # CV_min\n", - " avg.yhat.lava.tr, # Average across Folds\n", - " avg.yhat.lava.test, # Average across Folds\n", - " r2.CVave.lava.mod, # Average across Folds\n", - " r2V.CVave.lava.mod # Average across Folds\n", - " )\n", - " )\n", + " avg_yhat_lava_tr <- rowMeans(yhat_tr_lava_mod)\n", + " avg_yhat_lava_test <- rowMeans(yhat_te_lava_mod)\n", + "\n", + " r2_cv_ave_lava_mod <- 1 - sum((ytr - avg_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2)\n", + " r2v_cv_ave_lava_mod <- 1 - sum((yte - avg_yhat_lava_test)^2) / sum((yte - mean(ytr))^2)\n", + "\n", + " cat(\"\\nIn sample R2 (Average Across Folds): \", r2_cv_ave_lava_mod)\n", + " cat(\"\\nOut of Sample R2 (Average Across Folds): \", r2v_cv_ave_lava_mod)\n", + "\n", + " return(c(\n", + " cvlambda1_lava_mod,\n", + " cvlambda2_lava_mod,\n", + " cvmin_yhat_lava_tr, # CV_min\n", + " cvmin_yhat_lava_test, # CV_min\n", + " r2_lava_mod, # CV_min\n", + " r2v_lava_mod, # CV_min\n", + " avg_yhat_lava_tr, # Average across Folds\n", + " avg_yhat_lava_test, # Average across Folds\n", + " r2_cv_ave_lava_mod, # Average across Folds\n", + " r2v_cv_ave_lava_mod # Average across Folds\n", + " ))\n", "}" - ], - "metadata": { - "id": "tr_KBCwwovp6" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": 
"code", + "execution_count": null, + "metadata": { + "id": "5dEsONeRF51R", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# Results for Test\n", "cat(\"Test Results ...\\n\")\n", - "R2.lava.traintest <- lava_yhat_r2(X, Xtest, y, ytest)" - ], - "metadata": { - "id": "5dEsONeRF51R" - }, - "execution_count": null, - "outputs": [] + "r2_lava_traintest <- lava_yhat_r2(X, Xtest, y, ytest)" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kdAQN0yq_ISV", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# Results for Pop\n", "## note we don't have to re-train the entire model\n", "## this is just due to the way the function is defined above\n", "cat(\"Population Results ...\\n\")\n", - "R2.lava.pop <- lava_yhat_r2(X, Xpop, y, ypop)" - ], - "metadata": { - "id": "kdAQN0yq_ISV" - }, - "execution_count": null, - "outputs": [] + "r2_lava_pop <- lava_yhat_r2(X, Xpop, y, ypop)" + ] }, { "cell_type": "code", - "source": [ - "# report R2 using CV min\n", - "cat('LAVA R2 (Test): ', R2.lava.traintest[[6]])\n", - "cat('\\nLAVA R2 (Pop) ', R2.lava.pop[[6]])" - ], + "execution_count": null, "metadata": { - "id": "GaTBT7NkhRmH" + "id": "GaTBT7NkhRmH", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "# report R2 using CV min\n", + "cat(\"LAVA R2 (Test): \", r2_lava_traintest[[6]])\n", + "cat(\"\\nLAVA R2 (Pop) \", r2_lava_pop[[6]])" + ] }, { "cell_type": "markdown", - "source": [ - "## Summarizing Results" - ], "metadata": { "id": "Gv0bAoZZiLnH" - } + }, + "source": [ + "## Summarizing Results" + ] }, { "cell_type": "code", - "source": [ - "table<- matrix(0, 6, 1)\n", - "table[1,1] <- R2.lasso.cv\n", - "table[2,1] <- R2.ridge\n", - "table[3,1] <- R2.elnet\n", - "table[4,1] <- R2.lasso\n", - "table[5,1] <- R2.lasso.post\n", - "table[6,1] <- R2.lava.pop[[6]]\n", - "\n", - "colnames(table)<- c(\"R2 (Population)\")\n", - "rownames(table)<- c(\"Cross-Validated Lasso\", \"Cross-Validated ridge\",\"Cross-Validated elnet\",\n", - " \"Lasso\",\"Post-Lasso\",\"Lava\")\n", - "tab <- xtable(table, digits =3)\n", - "print(tab,type=\"latex\") # set type=\"latex\" for printing table in LaTeX\n", - "tab\n" - ], + "execution_count": null, "metadata": { - "id": "VtzIoSdyS9To" + "id": "VtzIoSdyS9To", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "table <- matrix(0, 6, 1)\n", + "table[1, 1] <- r2_lasso_cv\n", + "table[2, 1] <- r2_ridge\n", + "table[3, 1] <- r2_elnet\n", + "table[4, 1] <- r2_lasso\n", + "table[5, 1] <- r2_lasso_post\n", + "table[6, 1] <- r2_lava_pop[[6]]\n", + "\n", + "colnames(table) <- c(\"R2 (Population)\")\n", + "rownames(table) <- c(\n", + " \"Cross-Validated Lasso\", \"Cross-Validated ridge\", \"Cross-Validated elnet\",\n", + " \"Lasso\", \"Post-Lasso\", \"Lava\"\n", + ")\n", + "tab <- xtable(table, digits = 3)\n", + "print(tab, type = \"latex\") # set type=\"latex\" for printing table in LaTeX\n", + "tab" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "npU6rAHRUs_s", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# Creating a data frame with the predicted values for test\n", "data <- data.frame(\n", " gXtest = gXtest,\n", - " Ridge = predict(fit.ridge, newx = Xtest, s=\"lambda.min\"),\n", - " ENet = predict(fit.elnet, newx = Xtest, s=\"lambda.min\"),\n", - " RLasso = predict(fit.rlasso, newdata = Xtest),\n", - " RLassoPost = 
predict(fit.rlasso.post, newdata = Xtest),\n", - " LassoCV = predict(fit.lasso.cv, newx = Xtest, s=\"lambda.min\"),\n", - " Lava = as.vector(R2.lava.traintest[[4]])\n", + " Ridge = predict(fit_ridge, newx = Xtest, s = \"lambda.min\"),\n", + " ENet = predict(fit_elnet, newx = Xtest, s = \"lambda.min\"),\n", + " RLasso = predict(fit_rlasso, newdata = Xtest),\n", + " RLassoPost = predict(fit_rlasso_post, newdata = Xtest),\n", + " LassoCV = predict(fit_lasso_cv, newx = Xtest, s = \"lambda.min\"),\n", + " Lava = as.vector(r2_lava_traintest[[4]])\n", ")\n", - "colnames(data) = c(\"gXtest\",\"Ridge\",\"ENet\",\"RLasso\",\"RlassoPost\",\"LassoCV\",\"Lava\")\n", + "colnames(data) <- c(\"gXtest\", \"Ridge\", \"ENet\", \"RLasso\", \"RlassoPost\", \"LassoCV\", \"Lava\")\n", "\n", "# Reshaping data into longer format for ggplot\n", "data_long <- tidyr::gather(data, Model, Predicted, -gXtest)\n", @@ -776,36 +883,42 @@ "# Plotting\n", "ggplot(data_long, aes(x = gXtest, y = Predicted, color = Model)) +\n", " geom_point(aes(shape = Model)) +\n", - " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"black\") + # gX by gX\n", - " scale_color_manual(values = c('brown', 'yellow', 'red', \"green\", 'blue', 'magenta'), guide = guide_legend(title = \"Model\")) +\n", + " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"black\") + # gX by gX\n", + " scale_color_manual(values = c(\"brown\", \"yellow\", \"red\", \"green\", \"blue\", \"magenta\"),\n", + " guide = guide_legend(title = \"Model\")) +\n", " theme_minimal() +\n", - " labs(title = \"Comparison of Methods on Predicting gX\",\n", - " x = \"gXtest\",\n", - " y = \"Predictions\") +\n", - " guides(shape = \"none\") # Remove the shape legend" - ], - "metadata": { - "id": "npU6rAHRUs_s" - }, - "execution_count": null, - "outputs": [] + " labs(\n", + " title = \"Comparison of Methods on Predicting gX\",\n", + " x = \"gXtest\",\n", + " y = \"Predictions\"\n", + " ) +\n", + " guides(shape = \"none\") # Remove the shape legend" + ] }, { "cell_type": "markdown", - "source": [ - "## Data Generating Process: Dense Coefficients" - ], "metadata": { "id": "fc8S-gruBnFD" - } + }, + "source": [ + "## Data Generating Process: Dense Coefficients" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BiEL0vydBowk", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "set.seed(1)\n", "n <- 100\n", "p <- 400\n", - "res <- gen_data(n,p, regime=\"dense\")\n", + "res <- gen_data(n, p, regime = \"dense\")\n", "\n", "X <- res$X\n", "y <- res$y\n", @@ -816,112 +929,137 @@ "Xpop <- res$Xpop\n", "ypop <- res$ypop\n", "gXpop <- res$gXpop\n", - "betas <- res$beta" - ], - "metadata": { - "id": "BiEL0vydBowk" - }, - "execution_count": null, - "outputs": [] + "betas <- res$beta\n" + ] }, { "cell_type": "code", - "source": [ - "plot(gX,y, xlab=\"g(X)\", ylab=\"y\") #plot V vs g(X)\n", - "print(c(\"theoretical R2:\", var(gX)/var(y))) # theoretical R-square in the simulation example" - ], + "execution_count": null, "metadata": { - "id": "BoHnfTmcDgvw" + "id": "BoHnfTmcDgvw", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "plot(gX, y, xlab = \"g(X)\", ylab = \"y\") # plot V vs g(X)\n", + "print(c(\"theoretical R2:\", var(gX) / var(y))) # theoretical R-square in the simulation example\n" + ] }, { "cell_type": "code", - "source": [ - "# plot betas\n", - "plot(1:length(betas), abs(betas), log = \"y\", pch = 20, col = \"blue\",\n", - " xlab = 
expression(beta), ylab = \"Magnitude (log scale)\",\n", - " main = expression(paste(\"Beta Magnitude\")))" - ], + "execution_count": null, "metadata": { - "id": "qU2g-tf6DjsN" + "id": "qU2g-tf6DjsN", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "# plot betas\n", + "plot(seq_along(betas), abs(betas),\n", + " log = \"y\", pch = 20, col = \"blue\",\n", + " xlab = expression(beta), ylab = \"Magnitude (log scale)\",\n", + " main = expression(paste(\"Beta Magnitude\"))\n", + ")\n" + ] }, { "cell_type": "code", - "source": [ - "fit.lasso.cv <- cv.glmnet(X, y, family=\"gaussian\", alpha=1, nfolds=5) # family gaussian means that we'll be using square loss\n", - "fit.ridge <- cv.glmnet(X, y, family=\"gaussian\", alpha=0, nfolds=5) # family gaussian means that we'll be using square loss\n", - "fit.elnet <- cv.glmnet(X, y, family=\"gaussian\", alpha=.5, nfolds=5) # family gaussian means that we'll be using square loss\n", - "fit.rlasso <- rlasso(y~X, post=FALSE) # lasso with plug-in penalty level\n", - "fit.rlasso.post <- rlasso(y~X, post=TRUE) # post-lasso with plug-in penalty level\n", - "\n", - "R2.lasso.cv <- r2_score(predict(fit.lasso.cv,newx=Xpop, s=\"lambda.min\"), ypop)\n", - "R2.ridge <- r2_score(predict(fit.ridge,newx=Xpop, s=\"lambda.min\"), ypop)\n", - "R2.elnet <- r2_score(predict(fit.elnet,newx=Xpop, s=\"lambda.min\"), ypop)\n", - "R2.rlasso <- r2_score(predict(fit.rlasso,newdata=Xpop), ypop)\n", - "R2.rlasso.post <- r2_score(predict(fit.rlasso.post,newdata=Xpop), ypop)\n", - "R2.lava <- lava_yhat_r2(X, Xpop, y, ypop)[[6]]" - ], + "execution_count": null, "metadata": { - "id": "kGKVHss9BpDr" + "id": "kGKVHss9BpDr", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "# family gaussian means that we'll be using square loss\n", + "fit_lasso_cv <- cv.glmnet(X, y, family = \"gaussian\", alpha = 1, nfolds = 5)\n", + "# family gaussian means that we'll be using square loss\n", + "fit_ridge <- cv.glmnet(X, y, family = \"gaussian\", alpha = 0, nfolds = 5)\n", + "# family gaussian means that we'll be using square loss\n", + "fit_elnet <- cv.glmnet(X, y, family = \"gaussian\", alpha = .5, nfolds = 5)\n", + "fit_rlasso <- rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level\n", + "fit_rlasso_post <- rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level\n", + "\n", + "r2_lasso_cv <- r2_score(predict(fit_lasso_cv, newx = Xpop, s = \"lambda.min\"), ypop)\n", + "r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = \"lambda.min\"), ypop)\n", + "r2_elnet <- r2_score(predict(fit_elnet, newx = Xpop, s = \"lambda.min\"), ypop)\n", + "r2_rlasso <- r2_score(predict(fit_rlasso, newdata = Xpop), ypop)\n", + "r2_rlasso_post <- r2_score(predict(fit_rlasso_post, newdata = Xpop), ypop)\n", + "r2_lava <- lava_yhat_r2(X, Xpop, y, ypop)[[6]]" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "e93xdkcECQN_", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "table <- matrix(0, 6, 1)\n", - "table[1,1] <- R2.lasso.cv\n", - "table[2,1] <- R2.ridge\n", - "table[3,1] <- R2.elnet\n", - "table[4,1] <- R2.rlasso\n", - "table[5,1] <- R2.rlasso.post\n", - "table[6,1] <- R2.lava\n", - "\n", - "colnames(table)<- c(\"R2\")\n", - "rownames(table)<- c(\"Cross-Validated Lasso\", \"Cross-Validated ridge\",\"Cross-Validated elnet\",\n", - " \"Lasso\",\"Post-Lasso\",\"Lava\")\n", - "tab <- xtable(table, digits =3)\n", - 
"print(tab,type=\"latex\") # set type=\"latex\" for printing table in LaTeX\n", + "table[1, 1] <- r2_lasso_cv\n", + "table[2, 1] <- r2_ridge\n", + "table[3, 1] <- r2_elnet\n", + "table[4, 1] <- r2_rlasso\n", + "table[5, 1] <- r2_rlasso_post\n", + "table[6, 1] <- r2_lava\n", + "\n", + "colnames(table) <- c(\"R2\")\n", + "rownames(table) <- c(\n", + " \"Cross-Validated Lasso\", \"Cross-Validated ridge\", \"Cross-Validated elnet\",\n", + " \"Lasso\", \"Post-Lasso\", \"Lava\"\n", + ")\n", + "tab <- xtable(table, digits = 3)\n", + "print(tab, type = \"latex\") # set type=\"latex\" for printing table in LaTeX\n", "tab" - ], - "metadata": { - "id": "e93xdkcECQN_" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "# get lava prediction on test set for plot below\n", - "lava.yhat <- lava_yhat_r2(X, Xtest, y, ytest)[[4]]" - ], + "execution_count": null, "metadata": { - "id": "ZdSCN8zeCQSR" + "id": "ZdSCN8zeCQSR", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "# get lava prediction on test set for plot below\n", + "lava_yhat <- lava_yhat_r2(X, Xtest, y, ytest)[[4]]" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uiDd9oxhVcnc", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# Creating a data frame with the predicted values for test\n", "data <- data.frame(\n", " gXtest = gXtest,\n", - " Ridge = predict(fit.ridge, newx = Xtest, s=\"lambda.min\"),\n", - " ENet = predict(fit.elnet, newx = Xtest, s=\"lambda.min\"),\n", - " RLasso = predict(fit.rlasso, newdata = Xtest),\n", - " RLassoPost = predict(fit.rlasso.post, newdata = Xtest),\n", - " LassoCV = predict(fit.lasso.cv, newx = Xtest, s=\"lambda.min\"),\n", - " Lava = as.vector(lava.yhat)\n", + " Ridge = predict(fit_ridge, newx = Xtest, s = \"lambda.min\"),\n", + " ENet = predict(fit_elnet, newx = Xtest, s = \"lambda.min\"),\n", + " RLasso = predict(fit_rlasso, newdata = Xtest),\n", + " RLassoPost = predict(fit_rlasso_post, newdata = Xtest),\n", + " LassoCV = predict(fit_lasso_cv, newx = Xtest, s = \"lambda.min\"),\n", + " Lava = as.vector(lava_yhat)\n", ")\n", - "colnames(data) = c(\"gXtest\",\"Ridge\",\"ENet\",\"RLasso\",\"RlassoPost\",\"LassoCV\",\"Lava\")\n", + "colnames(data) <- c(\"gXtest\", \"Ridge\", \"ENet\", \"RLasso\", \"RlassoPost\", \"LassoCV\", \"Lava\")\n", "\n", "# Reshaping data into longer format for ggplot\n", "data_long <- tidyr::gather(data, Model, Predicted, -gXtest)\n", @@ -929,23 +1067,22 @@ "# Plotting\n", "ggplot(data_long, aes(x = gXtest, y = Predicted, color = Model)) +\n", " geom_point(aes(shape = Model)) +\n", - " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"black\") + # gX by gX\n", - " scale_color_manual(values = c('brown', 'yellow', 'red', \"green\", 'blue', 'magenta'), guide = guide_legend(title = \"Model\")) +\n", + " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"black\") + # gX by gX\n", + " scale_color_manual(values = c(\"brown\", \"yellow\", \"red\", \"green\", \"blue\", \"magenta\"),\n", + " guide = guide_legend(title = \"Model\")) +\n", " theme_minimal() +\n", - " labs(title = \"Comparison of Methods on Predicting gX\",\n", - " x = \"gXtest\",\n", - " y = \"Predictions\") +\n", - " guides(shape = \"none\") # Remove the shape legend" - ], - "metadata": { - "id": "uiDd9oxhVcnc" - }, - "execution_count": null, - "outputs": [] + " labs(\n", + " title = \"Comparison of Methods on Predicting 
gX\",\n", + " x = \"gXtest\",\n", + " y = \"Predictions\"\n", + " ) +\n", + " guides(shape = \"none\") # Remove the shape legend" + ] }, { "cell_type": "markdown", "metadata": { + "id": "sxZFIhYuONaB", "papermill": { "duration": 0.018842, "end_time": "2021-02-15T11:02:51.941852", @@ -953,8 +1090,7 @@ "start_time": "2021-02-15T11:02:51.923010", "status": "completed" }, - "tags": [], - "id": "sxZFIhYuONaB" + "tags": [] }, "source": [ "## Data Generating Process: Approximately Sparse + Small Dense Part" @@ -964,6 +1100,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "id": "nQcWgf3KONaC", "papermill": { "duration": 0.207598, "end_time": "2021-02-15T11:02:52.168536", @@ -972,14 +1109,16 @@ "status": "completed" }, "tags": [], - "id": "nQcWgf3KONaC" + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "set.seed(1)\n", "n <- 100\n", "p <- 400\n", - "res <- gen_data(n,p, regime=\"sparsedense\")\n", + "res <- gen_data(n, p, regime = \"sparsedense\")\n", "\n", "X <- res$X\n", "y <- res$y\n", @@ -995,34 +1134,43 @@ }, { "cell_type": "code", - "source": [ - "plot(gX,y, xlab=\"g(X)\", ylab=\"y\") #plot V vs g(X)\n", - "print(c(\"theoretical R2:\", var(gX)/var(y))) # theoretical R-square in the simulation example" - ], + "execution_count": null, "metadata": { - "id": "yiIrU6SQDkjK" + "id": "yiIrU6SQDkjK", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "plot(gX, y, xlab = \"g(X)\", ylab = \"y\") # plot V vs g(X)\n", + "print(c(\"theoretical R2:\", var(gX) / var(y))) # theoretical R-square in the simulation example" + ] }, { "cell_type": "code", - "source": [ - "# plot betas\n", - "plot(1:length(betas), abs(betas), log = \"y\", pch = 20, col = \"blue\",\n", - " xlab = expression(beta), ylab = \"Magnitude (log scale)\",\n", - " main = expression(paste(\"Beta Magnitude\")))" - ], + "execution_count": null, "metadata": { - "id": "X2N8JfHDDkmk" + "id": "X2N8JfHDDkmk", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "# plot betas\n", + "plot(seq_along(betas), abs(betas),\n", + " log = \"y\", pch = 20, col = \"blue\",\n", + " xlab = expression(beta), ylab = \"Magnitude (log scale)\",\n", + " main = expression(paste(\"Beta Magnitude\"))\n", + ")\n" + ] }, { "cell_type": "code", "execution_count": null, "metadata": { + "id": "obWejQaJONaC", "papermill": { "duration": 1.432822, "end_time": "2021-02-15T11:02:53.626802", @@ -1031,28 +1179,34 @@ "status": "completed" }, "tags": [], - "id": "obWejQaJONaC" + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "fit.lasso.cv <- cv.glmnet(X, y, family=\"gaussian\", alpha=1, nfolds=5) # family gaussian means that we'll be using square loss\n", - "fit.ridge <- cv.glmnet(X, y, family=\"gaussian\", alpha=0, nfolds=5) # family gaussian means that we'll be using square loss\n", - "fit.elnet <- cv.glmnet(X, y, family=\"gaussian\", alpha=.5, nfolds=5) # family gaussian means that we'll be using square loss\n", - "fit.rlasso <- rlasso(y~X, post=FALSE) # lasso with plug-in penalty level\n", - "fit.rlasso.post <- rlasso(y~X, post=TRUE) # post-lasso with plug-in penalty level\n", - "\n", - "R2.lasso.cv <- r2_score(predict(fit.lasso.cv,newx=Xpop, s=\"lambda.min\"), ypop)\n", - "R2.ridge <- r2_score(predict(fit.ridge,newx=Xpop, s=\"lambda.min\"), ypop)\n", - "R2.elnet <- r2_score(predict(fit.elnet,newx=Xpop, s=\"lambda.min\"), ypop)\n", - "R2.rlasso <- r2_score(predict(fit.rlasso,newdata=Xpop), 
ypop)\n", - "R2.rlasso.post <- r2_score(predict(fit.rlasso.post,newdata=Xpop), ypop)\n", - "R2.lava <- lava_yhat_r2(X, Xpop, y, ypop)[[6]]" + "# family gaussian means that we'll be using square loss\n", + "fit_lasso_cv <- cv.glmnet(X, y, family = \"gaussian\", alpha = 1, nfolds = 5)\n", + "# family gaussian means that we'll be using square loss\n", + "fit_ridge <- cv.glmnet(X, y, family = \"gaussian\", alpha = 0, nfolds = 5)\n", + "# family gaussian means that we'll be using square loss\n", + "fit_elnet <- cv.glmnet(X, y, family = \"gaussian\", alpha = .5, nfolds = 5)\n", + "fit_rlasso <- rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level\n", + "fit_rlasso_post <- rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level\n", + "\n", + "r2_lasso_cv <- r2_score(predict(fit_lasso_cv, newx = Xpop, s = \"lambda.min\"), ypop)\n", + "r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = \"lambda.min\"), ypop)\n", + "r2_elnet <- r2_score(predict(fit_elnet, newx = Xpop, s = \"lambda.min\"), ypop)\n", + "r2_rlasso <- r2_score(predict(fit_rlasso, newdata = Xpop), ypop)\n", + "r2_rlasso_post <- r2_score(predict(fit_rlasso_post, newdata = Xpop), ypop)\n", + "r2_lava <- lava_yhat_r2(X, Xpop, y, ypop)[[6]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { + "id": "38KYAe5MONaC", "papermill": { "duration": 13.756606, "end_time": "2021-02-15T11:03:07.405363", @@ -1061,52 +1215,67 @@ "status": "completed" }, "tags": [], - "id": "38KYAe5MONaC" + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "table <- matrix(0, 6, 1)\n", - "table[1,1] <- R2.lasso.cv\n", - "table[2,1] <- R2.ridge\n", - "table[3,1] <- R2.elnet\n", - "table[4,1] <- R2.rlasso\n", - "table[5,1] <- R2.rlasso.post\n", - "table[6,1] <- R2.lava\n", - "\n", - "colnames(table)<- c(\"R2\")\n", - "rownames(table)<- c(\"Cross-Validated Lasso\", \"Cross-Validated ridge\",\"Cross-Validated elnet\",\n", - " \"Lasso\",\"Post-Lasso\",\"Lava\")\n", - "tab <- xtable(table, digits =3)\n", - "print(tab,type=\"latex\") # set type=\"latex\" for printing table in LaTeX\n", + "table[1, 1] <- r2_lasso_cv\n", + "table[2, 1] <- r2_ridge\n", + "table[3, 1] <- r2_elnet\n", + "table[4, 1] <- r2_rlasso\n", + "table[5, 1] <- r2_rlasso_post\n", + "table[6, 1] <- r2_lava\n", + "\n", + "colnames(table) <- c(\"R2\")\n", + "rownames(table) <- c(\n", + " \"Cross-Validated Lasso\", \"Cross-Validated ridge\", \"Cross-Validated elnet\",\n", + " \"Lasso\", \"Post-Lasso\", \"Lava\"\n", + ")\n", + "tab <- xtable(table, digits = 3)\n", + "print(tab, type = \"latex\") # set type=\"latex\" for printing table in LaTeX\n", "tab" ] }, { "cell_type": "code", - "source": [ - "# get lava prediction on test set for plot below\n", - "lava.yhat <- lava_yhat_r2(X, Xtest, y, ytest)[[4]]" - ], + "execution_count": null, "metadata": { - "id": "oW3kq2xNOone" + "id": "oW3kq2xNOone", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "# get lava prediction on test set for plot below\n", + "lava_yhat <- lava_yhat_r2(X, Xtest, y, ytest)[[4]]" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1sYLd-O0V2IC", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# Creating a data frame with the predicted values for test\n", "data <- data.frame(\n", " gXtest = gXtest,\n", - " Ridge = predict(fit.ridge, newx = Xtest, s=\"lambda.min\"),\n", - " ENet = predict(fit.elnet, newx = Xtest, s=\"lambda.min\"),\n", - " RLasso = predict(fit.rlasso, 
newdata = Xtest),\n", - " RLassoPost = predict(fit.rlasso.post, newdata = Xtest),\n", - " LassoCV = predict(fit.lasso.cv, newx = Xtest, s=\"lambda.min\"),\n", - " Lava = as.vector(lava.yhat)\n", + " Ridge = predict(fit_ridge, newx = Xtest, s = \"lambda.min\"),\n", + " ENet = predict(fit_elnet, newx = Xtest, s = \"lambda.min\"),\n", + " RLasso = predict(fit_rlasso, newdata = Xtest),\n", + " RLassoPost = predict(fit_rlasso_post, newdata = Xtest),\n", + " LassoCV = predict(fit_lasso_cv, newx = Xtest, s = \"lambda.min\"),\n", + " Lava = as.vector(lava_yhat)\n", ")\n", - "colnames(data) = c(\"gXtest\",\"Ridge\",\"ENet\",\"RLasso\",\"RlassoPost\",\"LassoCV\",\"Lava\")\n", + "colnames(data) <- c(\"gXtest\", \"Ridge\", \"ENet\", \"RLasso\", \"RlassoPost\", \"LassoCV\", \"Lava\")\n", "\n", "# Reshaping data into longer format for ggplot\n", "data_long <- tidyr::gather(data, Model, Predicted, -gXtest)\n", @@ -1114,22 +1283,23 @@ "# Plotting\n", "ggplot(data_long, aes(x = gXtest, y = Predicted, color = Model)) +\n", " geom_point(aes(shape = Model)) +\n", - " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"black\") + # gX by gX\n", - " scale_color_manual(values = c('brown', 'yellow', 'red', \"green\", 'blue', 'magenta'), guide = guide_legend(title = \"Model\")) +\n", + " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"black\") + # gX by gX\n", + " scale_color_manual(values = c(\"brown\", \"yellow\", \"red\", \"green\", \"blue\", \"magenta\"),\n", + " guide = guide_legend(title = \"Model\")) +\n", " theme_minimal() +\n", - " labs(title = \"Comparison of Methods on Predicting gX\",\n", - " x = \"gXtest\",\n", - " y = \"Predictions\") +\n", - " guides(shape = \"none\") # Remove the shape legend" - ], - "metadata": { - "id": "1sYLd-O0V2IC" - }, - "execution_count": null, - "outputs": [] + " labs(\n", + " title = \"Comparison of Methods on Predicting gX\",\n", + " x = \"gXtest\",\n", + " y = \"Predictions\"\n", + " ) +\n", + " guides(shape = \"none\") # Remove the shape legend" + ] } ], "metadata": { + "colab": { + "provenance": [] + }, "kernelspec": { "display_name": "R", "language": "R", @@ -1154,9 +1324,6 @@ "parameters": {}, "start_time": "2021-02-15T11:01:38.329045", "version": "2.2.2" - }, - "colab": { - "provenance": [] } }, "nbformat": 4, diff --git a/PM2/r_ml_for_wage_prediction.irnb b/PM2/r_ml_for_wage_prediction.irnb index 4d47279c..768a40ae 100644 --- a/PM2/r_ml_for_wage_prediction.irnb +++ b/PM2/r_ml_for_wage_prediction.irnb @@ -5,6 +5,7 @@ "metadata": { "_execution_state": "idle", "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", + "id": "dtt9U13qNGOn", "papermill": { "duration": 0.036479, "end_time": "2021-02-13T18:19:43.396666", @@ -12,8 +13,7 @@ "start_time": "2021-02-13T18:19:43.360187", "status": "completed" }, - "tags": [], - "id": "dtt9U13qNGOn" + "tags": [] }, "source": [ "# A Simple Case Study using Wage Data from 2015" @@ -22,6 +22,7 @@ { "cell_type": "markdown", "metadata": { + "id": "LCKYnHWrNGOn", "papermill": { "duration": 0.036639, "end_time": "2021-02-13T18:19:43.468425", @@ -29,8 +30,7 @@ "start_time": "2021-02-13T18:19:43.431786", "status": "completed" }, - "tags": [], - "id": "LCKYnHWrNGOn" + "tags": [] }, "source": [ "We illustrate how to predict an outcome variable $Y$ in a high-dimensional setting, where the number of covariates $p$ is large in relation to the sample size $n$. We use linear prediction rules for estimation, including OLS and the penalized linear methods we've studied. 
Later, we will also consider nonlinear prediction rules including tree-based methods and neural nets." @@ -38,6 +38,14 @@ }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VPwV7nNDS_nz", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "install.packages(\"xtable\")\n", "install.packages(\"hdm\")\n", @@ -48,16 +56,12 @@ "library(xtable)\n", "library(glmnet)\n", "library(MLmetrics)" - ], - "metadata": { - "id": "VPwV7nNDS_nz" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", "metadata": { + "id": "gRYGynhrNGOo", "papermill": { "duration": 0.034705, "end_time": "2021-02-13T18:19:43.537814", @@ -65,8 +69,7 @@ "start_time": "2021-02-13T18:19:43.503109", "status": "completed" }, - "tags": [], - "id": "gRYGynhrNGOo" + "tags": [] }, "source": [ "## Data" @@ -75,6 +78,7 @@ { "cell_type": "markdown", "metadata": { + "id": "S_YMURKqNGOo", "papermill": { "duration": 0.036082, "end_time": "2021-02-13T18:19:43.609347", @@ -82,8 +86,7 @@ "start_time": "2021-02-13T18:19:43.573265", "status": "completed" }, - "tags": [], - "id": "S_YMURKqNGOo" + "tags": [] }, "source": [ "Again, we consider data from the U.S. March Supplement of the Current Population Survey (CPS) in 2015.\n", @@ -94,6 +97,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "id": "B09_5wrUNGOo", "papermill": { "duration": 0.279387, "end_time": "2021-02-13T18:19:43.923823", @@ -102,18 +106,21 @@ "status": "completed" }, "tags": [], - "id": "B09_5wrUNGOo" + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "file = \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv\"\n", + "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv\"\n", "data <- read.csv(file)\n", - "dim(data)\n" + "dim(data)" ] }, { "cell_type": "markdown", "metadata": { + "id": "Ol9dToMQNGOq", "papermill": { "duration": 0.034902, "end_time": "2021-02-13T18:19:43.994834", @@ -121,8 +128,7 @@ "start_time": "2021-02-13T18:19:43.959932", "status": "completed" }, - "tags": [], - "id": "Ol9dToMQNGOq" + "tags": [] }, "source": [ "The outcomes $Y_i$'s are hourly (log) wages of never-married workers living in the U.S. The raw regressors $Z_i$'s consist of a variety of characteristics, including experience, education and industry and occupation indicators." @@ -132,6 +138,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "id": "bsqnI6d0NGOq", "papermill": { "duration": 0.091723, "end_time": "2021-02-13T18:19:44.123394", @@ -140,17 +147,20 @@ "status": "completed" }, "tags": [], - "id": "bsqnI6d0NGOq" + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "Z <- subset(data,select=-c(lwage,wage)) # regressors\n", + "Z <- subset(data, select = -c(lwage, wage)) # regressors\n", "colnames(Z)" ] }, { "cell_type": "markdown", "metadata": { + "id": "VeU2XMYENGOr", "papermill": { "duration": 0.037074, "end_time": "2021-02-13T18:19:44.196749", @@ -158,8 +168,7 @@ "start_time": "2021-02-13T18:19:44.159675", "status": "completed" }, - "tags": [], - "id": "VeU2XMYENGOr" + "tags": [] }, "source": [ "The following figure shows the weekly wage distribution from the US survey data." 
@@ -169,6 +178,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "id": "d3sbTfpRNGOr", "papermill": { "duration": 0.443391, "end_time": "2021-02-13T18:19:44.677379", @@ -177,16 +187,19 @@ "status": "completed" }, "tags": [], - "id": "d3sbTfpRNGOr" + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "hist(data$wage, xlab= \"hourly wage\", main=\"Empirical wage distribution from the US survey data\", breaks= 35)\n" + "hist(data$wage, xlab = \"hourly wage\", main = \"Empirical wage distribution from the US survey data\", breaks = 35)" ] }, { "cell_type": "markdown", "metadata": { + "id": "BmGfrWFNNGOs", "papermill": { "duration": 0.036602, "end_time": "2021-02-13T18:19:44.752465", @@ -194,8 +207,7 @@ "start_time": "2021-02-13T18:19:44.715863", "status": "completed" }, - "tags": [], - "id": "BmGfrWFNNGOs" + "tags": [] }, "source": [ "Wages show a high degree of skewness. Hence, wages are transformed in almost all studies by\n", @@ -205,6 +217,7 @@ { "cell_type": "markdown", "metadata": { + "id": "msBSjtuKNGOs", "papermill": { "duration": 0.036009, "end_time": "2021-02-13T18:19:44.826260", @@ -212,8 +225,7 @@ "start_time": "2021-02-13T18:19:44.790251", "status": "completed" }, - "tags": [], - "id": "msBSjtuKNGOs" + "tags": [] }, "source": [ "## Analysis" @@ -222,6 +234,7 @@ { "cell_type": "markdown", "metadata": { + "id": "B-XZMCogNGOs", "papermill": { "duration": 0.036925, "end_time": "2021-02-13T18:19:44.899159", @@ -229,8 +242,7 @@ "start_time": "2021-02-13T18:19:44.862234", "status": "completed" }, - "tags": [], - "id": "B-XZMCogNGOs" + "tags": [] }, "source": [ "Due to the skewness of the data, we are considering log wages which leads to the following regression model\n", @@ -241,6 +253,7 @@ { "cell_type": "markdown", "metadata": { + "id": "pNLS-C_7NGOt", "papermill": { "duration": 0.036183, "end_time": "2021-02-13T18:19:44.971528", @@ -248,8 +261,7 @@ "start_time": "2021-02-13T18:19:44.935345", "status": "completed" }, - "tags": [], - "id": "pNLS-C_7NGOt" + "tags": [] }, "source": [ "In this notebook, we will evaluate *linear* prediction rules. In later notebooks, we will also utilize nonlinear prediction methods. In linear models, we estimate the prediction rule of the form\n", @@ -269,6 +281,7 @@ { "cell_type": "markdown", "metadata": { + "id": "-kGLyGXvNGOt", "papermill": { "duration": 0.037318, "end_time": "2021-02-13T18:19:45.044959", @@ -276,8 +289,7 @@ "start_time": "2021-02-13T18:19:45.007641", "status": "completed" }, - "tags": [], - "id": "-kGLyGXvNGOt" + "tags": [] }, "source": [ "To evaluate the out-of-sample performance, we split the data first." 
@@ -287,6 +299,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "id": "oAqJIgmlNGOt", "papermill": { "duration": 0.062188, "end_time": "2021-02-13T18:19:45.143118", @@ -295,21 +308,24 @@ "status": "completed" }, "tags": [], - "id": "oAqJIgmlNGOt" + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "set.seed(1234)\n", - "training <- sample(nrow(data), nrow(data)*(3/4), replace=FALSE)\n", + "training <- sample(nrow(data), nrow(data) * (3 / 4), replace = FALSE)\n", "\n", - "data_train <- data[training,]\n", - "data_test <- data[-training,]" + "data_train <- data[training, ]\n", + "data_test <- data[-training, ]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { + "id": "C6qC_wyjNGOu", "papermill": { "duration": 0.060969, "end_time": "2021-02-13T18:19:45.445389", @@ -318,17 +334,20 @@ "status": "completed" }, "tags": [], - "id": "C6qC_wyjNGOu" + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "Y_train <- data_train$lwage\n", - "Y_test <- data_test$lwage" + "y_train <- data_train$lwage\n", + "y_test <- data_test$lwage" ] }, { "cell_type": "markdown", "metadata": { + "id": "5-CCjJnbNGOt", "papermill": { "duration": 0.038774, "end_time": "2021-02-13T18:19:45.217757", @@ -336,8 +355,7 @@ "start_time": "2021-02-13T18:19:45.178983", "status": "completed" }, - "tags": [], - "id": "5-CCjJnbNGOt" + "tags": [] }, "source": [ "We are starting by running a simple OLS regression. We fit the basic and flexible model to our training data by running an ols regression and compute the R-squared on the test sample" @@ -346,6 +364,7 @@ { "cell_type": "markdown", "metadata": { + "id": "qbummAR-NGOu", "papermill": { "duration": 0.037704, "end_time": "2021-02-13T18:19:45.622370", @@ -353,8 +372,7 @@ "start_time": "2021-02-13T18:19:45.584666", "status": "completed" }, - "tags": [], - "id": "qbummAR-NGOu" + "tags": [] }, "source": [ "As known from our first lab, the basic model consists of $51$ regressors and the flexible model of $246$ regressors. Let us fit our models to the training sample using the two different model specifications. We are starting by running a simple ols regression and computing the mean squared error and $R^2$ on the test sample." 
@@ -362,17 +380,18 @@ }, { "cell_type": "markdown", - "source": [ - "### Low dimensional specification (basic)" - ], "metadata": { "id": "4LNs__OcfmFV" - } + }, + "source": [ + "### Low dimensional specification (basic)" + ] }, { "cell_type": "code", "execution_count": null, "metadata": { + "id": "WD7tshOlNGOt", "papermill": { "duration": 0.094135, "end_time": "2021-02-13T18:19:45.347955", @@ -381,15 +400,17 @@ "status": "completed" }, "tags": [], - "id": "WD7tshOlNGOt" + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "X_basic <- \"sex + exp1 + shs + hsg+ scl + clg + mw + so + we + C(occ2)+ C(ind2)\"\n", - "formula_basic <- as.formula(paste(\"lwage\", \"~\", X_basic))\n", - "model_X_basic_train <- model.matrix(formula_basic,data_train)\n", - "model_X_basic_test <- model.matrix(formula_basic,data_test)\n", - "p_basic <- dim(model_X_basic_train)[2]\n", + "x_basic <- \"sex + exp1 + shs + hsg+ scl + clg + mw + so + we + C(occ2)+ C(ind2)\"\n", + "formula_basic <- as.formula(paste(\"lwage\", \"~\", x_basic))\n", + "model_x_basic_train <- model.matrix(formula_basic, data_train)\n", + "model_x_basic_test <- model.matrix(formula_basic, data_test)\n", + "p_basic <- dim(model_x_basic_train)[2]\n", "p_basic" ] }, @@ -397,6 +418,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "id": "kx1xoWHFNGOv", "papermill": { "duration": 0.069537, "end_time": "2021-02-13T18:19:45.887169", @@ -405,20 +427,23 @@ "status": "completed" }, "tags": [], - "id": "kx1xoWHFNGOv" + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# ols (basic model)\n", - "fit.lm.basic <- lm(formula_basic, data_train)\n", + "fit_lm_basic <- lm(formula_basic, data_train)\n", "# Compute the Out-Of-Sample Performance\n", - "yhat.lm.basic <- predict(fit.lm.basic, newdata=data_test)\n", - "cat(\"Basic model MSE (OLS): \", mean((Y_test-yhat.lm.basic)^2)) # MSE OLS (basic model)" + "yhat_lm_basic <- predict(fit_lm_basic, newdata = data_test)\n", + "cat(\"Basic model MSE (OLS): \", mean((y_test - yhat_lm_basic)^2)) # MSE OLS (basic model)" ] }, { "cell_type": "markdown", "metadata": { + "id": "hDlMXF0ANGOw", "papermill": { "duration": 0.052764, "end_time": "2021-02-13T18:19:46.122829", @@ -426,8 +451,7 @@ "start_time": "2021-02-13T18:19:46.070065", "status": "completed" }, - "tags": [], - "id": "hDlMXF0ANGOw" + "tags": [] }, "source": [ "To determine the out-of-sample $MSE$ and the standard error in one step, we can use the function *lm*:" @@ -437,6 +461,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "id": "ERCs22oDNGOw", "papermill": { "duration": 0.076484, "end_time": "2021-02-13T18:19:46.239015", @@ -445,17 +470,20 @@ "status": "completed" }, "tags": [], - "id": "ERCs22oDNGOw" + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "MSE.lm.basic <- summary(lm((Y_test-yhat.lm.basic)^2~1))$coef[1:2]\n", - "MSE.lm.basic" + "mse_lm_basic <- summary(lm((y_test - yhat_lm_basic)^2 ~ 1))$coef[1:2]\n", + "mse_lm_basic" ] }, { "cell_type": "markdown", "metadata": { + "id": "3PQ-f_waNGOw", "papermill": { "duration": 0.039088, "end_time": "2021-02-13T18:19:46.317915", @@ -463,8 +491,7 @@ "start_time": "2021-02-13T18:19:46.278827", "status": "completed" }, - "tags": [], - "id": "3PQ-f_waNGOw" + "tags": [] }, "source": [ "We also compute the out-of-sample $R^2$:" @@ -474,6 +501,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "id": "YLgvLE2BNGOw", "papermill": { "duration": 0.057098, "end_time": "2021-02-13T18:19:46.413754", @@ -482,42 +510,49 @@ 
"status": "completed" }, "tags": [], - "id": "YLgvLE2BNGOw" + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "R2.lm.basic <- 1-MSE.lm.basic[1]/var(Y_test)\n", - "cat(\"Basic model R^2 (OLS): \",R2.lm.basic) # MSE OLS (basic model)" + "r2_lm_basic <- 1 - mse_lm_basic[1] / var(y_test)\n", + "cat(\"Basic model R^2 (OLS): \", r2_lm_basic) # MSE OLS (basic model)" ] }, { "cell_type": "markdown", - "source": [ - "### High-dimensional specification (flexible)" - ], "metadata": { "id": "tTZyELgyf51J" - } + }, + "source": [ + "### High-dimensional specification (flexible)" + ] }, { "cell_type": "code", - "source": [ - "X_flex <- \"sex + exp1 + shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)\"\n", - "formula_flex <- as.formula(paste(\"lwage\", \"~\", X_flex))\n", - "model_X_flex_train <- model.matrix(formula_flex,data_train)\n", - "model_X_flex_test <- model.matrix(formula_flex,data_test)\n", - "p_flex <- dim(model_X_flex_train)[2]\n", - "p_flex" - ], + "execution_count": null, "metadata": { - "id": "J8Rffx0ef3nM" + "id": "J8Rffx0ef3nM", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "x_flex <- \"sex + exp1 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we \" +\n", + " \"+ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\"\n", + "formula_flex <- as.formula(paste(\"lwage\", \"~\", x_flex))\n", + "model_x_flex_train <- model.matrix(formula_flex, data_train)\n", + "model_x_flex_test <- model.matrix(formula_flex, data_test)\n", + "p_flex <- dim(model_x_flex_train)[2]\n", + "p_flex" + ] }, { "cell_type": "markdown", "metadata": { + "id": "77G7YfbnNGOw", "papermill": { "duration": 0.039585, "end_time": "2021-02-13T18:19:46.492903", @@ -525,8 +560,7 @@ "start_time": "2021-02-13T18:19:46.453318", "status": "completed" }, - "tags": [], - "id": "77G7YfbnNGOw" + "tags": [] }, "source": [ "We repeat the same procedure for the flexible model." 
@@ -536,6 +570,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "id": "KpRtjaAlNGOw", "papermill": { "duration": 0.198636, "end_time": "2021-02-13T18:19:46.730717", @@ -544,23 +579,26 @@ "status": "completed" }, "tags": [], - "id": "KpRtjaAlNGOw" + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# ols (flexible model)\n", - "fit.lm.flex <- lm(formula_flex, data_train)\n", + "fit_lm_flex <- lm(formula_flex, data_train)\n", "# Compute the Out-Of-Sample Performance\n", - "options(warn=-1)\n", - "yhat.lm.flex <- predict(fit.lm.flex, newdata=data_test)\n", - "MSE.lm.flex <- summary(lm((Y_test-yhat.lm.flex)^2~1))$coef[1:2]\n", - "R2.lm.flex <- 1-MSE.lm.flex[1]/var(Y_test)\n", - "cat(\"Flexible model R^2 (OLS): \",R2.lm.flex) # MSE OLS (flexible model)" + "options(warn = -1)\n", + "yhat_lm_flex <- predict(fit_lm_flex, newdata = data_test)\n", + "mse_lm_flex <- summary(lm((y_test - yhat_lm_flex)^2 ~ 1))$coef[1:2]\n", + "r2_lm_flex <- 1 - mse_lm_flex[1] / var(y_test)\n", + "cat(\"Flexible model R^2 (OLS): \", r2_lm_flex) # MSE OLS (flexible model)" ] }, { "cell_type": "markdown", "metadata": { + "id": "sShiB-i9NGOx", "papermill": { "duration": 0.042521, "end_time": "2021-02-13T18:19:46.935859", @@ -568,8 +606,7 @@ "start_time": "2021-02-13T18:19:46.893338", "status": "completed" }, - "tags": [], - "id": "sShiB-i9NGOx" + "tags": [] }, "source": [ "### Penalized regressions (flexible model)\n", @@ -581,6 +618,7 @@ { "cell_type": "markdown", "metadata": { + "id": "5wgFVRkkNGOx", "papermill": { "duration": 0.051953, "end_time": "2021-02-13T18:19:46.853182", @@ -588,8 +626,7 @@ "start_time": "2021-02-13T18:19:46.801229", "status": "completed" }, - "tags": [], - "id": "5wgFVRkkNGOx" + "tags": [] }, "source": [ "We observe that ols regression works better for the basic model with smaller $p/n$ ratio. We proceed by running penalized regressions first for the flexible model, tuned via cross-validation." 
@@ -597,37 +634,41 @@ }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "n3jvO5HQmzbf", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ - "fit.lasso.cv.flex <- cv.glmnet(model_X_flex_train, Y_train, family=\"gaussian\", alpha=1)\n", - "fit.ridge.flex <- cv.glmnet(model_X_flex_train, Y_train, family=\"gaussian\", alpha=0)\n", - "fit.elnet.flex <- cv.glmnet(model_X_flex_train, Y_train, family=\"gaussian\", alpha=.5)\n", + "fit_lasso_cv_flex <- cv.glmnet(model_x_flex_train, y_train, family = \"gaussian\", alpha = 1)\n", + "fit_ridge_flex <- cv.glmnet(model_x_flex_train, y_train, family = \"gaussian\", alpha = 0)\n", + "fit_elnet_flex <- cv.glmnet(model_x_flex_train, y_train, family = \"gaussian\", alpha = .5)\n", "\n", - "yhat.lasso.cv.flex <- predict(fit.lasso.cv.flex , newx = model_X_flex_test)\n", - "yhat.ridge.flex <- predict(fit.ridge.flex , newx = model_X_flex_test)\n", - "yhat.elnet.flex <- predict(fit.elnet.flex , newx = model_X_flex_test)\n", + "yhat_lasso_cv_flex <- predict(fit_lasso_cv_flex, newx = model_x_flex_test)\n", + "yhat_ridge_flex <- predict(fit_ridge_flex, newx = model_x_flex_test)\n", + "yhat_elnet_flex <- predict(fit_elnet_flex, newx = model_x_flex_test)\n", "\n", - "MSE.lasso.cv.flex <- summary(lm((Y_test-yhat.lasso.cv.flex )^2~1))$coef[1:2]\n", - "MSE.ridge.flex <- summary(lm((Y_test-yhat.ridge.flex )^2~1))$coef[1:2]\n", - "MSE.elnet.flex <- summary(lm((Y_test-yhat.elnet.flex )^2~1))$coef[1:2]\n", + "mse_lasso_cv_flex <- summary(lm((y_test - yhat_lasso_cv_flex)^2 ~ 1))$coef[1:2]\n", + "mse_ridge_flex <- summary(lm((y_test - yhat_ridge_flex)^2 ~ 1))$coef[1:2]\n", + "mse_elnet_flex <- summary(lm((y_test - yhat_elnet_flex)^2 ~ 1))$coef[1:2]\n", "\n", - "R2.lasso.cv.flex <- 1-MSE.lasso.cv.flex [1]/var(Y_test)\n", - "R2.ridge.flex <- 1-MSE.ridge.flex [1]/var(Y_test)\n", - "R2.elnet.flex <- 1-MSE.elnet.flex [1]/var(Y_test)\n", + "r2_lasso_cv_flex <- 1 - mse_lasso_cv_flex[1] / var(y_test)\n", + "r2_ridge_flex <- 1 - mse_ridge_flex[1] / var(y_test)\n", + "r2_elnet_flex <- 1 - mse_elnet_flex[1] / var(y_test)\n", "\n", "# R^2 using cross-validation (flexible model)\n", - "cat('Flexible model R^2 (Lasso): ', R2.lasso.cv.flex)\n", - "cat('\\nFlexible model R^2 (Ridge): ', R2.ridge.flex)\n", - "cat('\\nFlexible model R^2 (Elastic Net): ', R2.elnet.flex)" - ], - "metadata": { - "id": "n3jvO5HQmzbf" - }, - "execution_count": null, - "outputs": [] + "cat(\"Flexible model R^2 (Lasso): \", r2_lasso_cv_flex)\n", + "cat(\"\\nFlexible model R^2 (Ridge): \", r2_ridge_flex)\n", + "cat(\"\\nFlexible model R^2 (Elastic Net): \", r2_elnet_flex)" + ] }, { "cell_type": "markdown", "metadata": { + "id": "ZytMS-aCNGOx", "papermill": { "duration": 0.040161, "end_time": "2021-02-13T18:19:47.015626", @@ -635,8 +676,7 @@ "start_time": "2021-02-13T18:19:46.975465", "status": "completed" }, - "tags": [], - "id": "ZytMS-aCNGOx" + "tags": [] }, "source": [ "We can also try a variant of the `l1` penalty, where the weight is chosen based on theoretical derivations. We use package *hdm* and the function *rlasso*, relying on a theoretical based choice of the penalty level $\\lambda$ in the lasso regression." 
@@ -644,18 +684,19 @@ }, { "cell_type": "markdown", + "metadata": { + "id": "n-D_fRJBnkEH" + }, "source": [ "Specifically, we use \"plug-in\" tuning with a theoretically valid choice of penalty $\\lambda = 2 \\cdot c \\hat{\\sigma} \\sqrt{n} \\Phi^{-1}(1-\\alpha/2p)$, where $c>1$ and $1-\\alpha$ is a confidence level, $\\Phi^{-1}$ denotes the quantile function, and $\\hat{\\sigma}$ is estimated in an iterative manner (see corresponding notes in book). Under homoskedasticity, this choice ensures that the Lasso predictor is well behaved, delivering good predictive performance under approximate sparsity. In practice, this formula will work well even in the absence of homoskedasticity, especially when the random variables $\\epsilon$ and $X$ in the regression equation decay quickly at the tails.\n", "\n", "In practice, many people choose to use cross-validation, which is perfectly fine for predictive tasks. However, when conducting inference, to make our analysis valid we will require cross-fitting in addition to cross-validation. As we have not yet discussed cross-fitting, we rely on this theoretically-driven penalty in order to allow for accurate inference in the upcoming notebooks." - ], - "metadata": { - "id": "n-D_fRJBnkEH" - } + ] }, { "cell_type": "markdown", "metadata": { + "id": "yBiZ3q3INGOy", "papermill": { "duration": 0.049543, "end_time": "2021-02-13T18:19:47.757271", @@ -663,8 +704,7 @@ "start_time": "2021-02-13T18:19:47.707728", "status": "completed" }, - "tags": [], - "id": "yBiZ3q3INGOy" + "tags": [] }, "source": [ "Now, we repeat the same procedure for the flexible model." @@ -674,6 +714,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "id": "PlTdJh5PNGOy", "papermill": { "duration": 3.430649, "end_time": "2021-02-13T18:19:51.229007", @@ -682,241 +723,251 @@ "status": "completed" }, "tags": [], - "id": "PlTdJh5PNGOy" + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "fit.rlasso.flex <- rlasso(formula_flex, data_train, post=FALSE)\n", - "fit.rlasso.post.flex <- rlasso(formula_flex, data_train, post=TRUE)\n", - "yhat.rlasso.flex <- predict(fit.rlasso.flex, newdata=data_test)\n", - "yhat.rlasso.post.flex <- predict(fit.rlasso.post.flex, newdata=data_test)\n", + "fit_rlasso_flex <- rlasso(formula_flex, data_train, post = FALSE)\n", + "fit_rlasso_post_flex <- rlasso(formula_flex, data_train, post = TRUE)\n", + "yhat_rlasso_flex <- predict(fit_rlasso_flex, newdata = data_test)\n", + "yhat_rlasso_post_flex <- predict(fit_rlasso_post_flex, newdata = data_test)\n", "\n", - "MSE.lasso.flex <- summary(lm((Y_test-yhat.rlasso.flex)^2~1))$coef[1:2]\n", - "MSE.lasso.post.flex <- summary(lm((Y_test-yhat.rlasso.post.flex)^2~1))$coef[1:2]\n", + "mse_lasso_flex <- summary(lm((y_test - yhat_rlasso_flex)^2 ~ 1))$coef[1:2]\n", + "mse_lasso_post_flex <- summary(lm((y_test - yhat_rlasso_post_flex)^2 ~ 1))$coef[1:2]\n", "\n", - "R2.lasso.flex <- 1-MSE.lasso.flex[1]/var(Y_test)\n", - "R2.lasso.post.flex <- 1-MSE.lasso.post.flex[1]/var(Y_test)\n", + "r2_lasso_flex <- 1 - mse_lasso_flex[1] / var(y_test)\n", + "r2_lasso_post_flex <- 1 - mse_lasso_post_flex[1] / var(y_test)\n", "# R^2 theoretically chosen penalty (flexible model)\n", - "cat('Flexible model R^2 (RLasso): ', R2.lasso.flex)\n", - "cat('\\nFlexible model R^2 (RLasso post): ', R2.lasso.post.flex)" + "cat(\"Flexible model R^2 (RLasso): \", r2_lasso_flex)\n", + "cat(\"\\nFlexible model R^2 (RLasso post): \", r2_lasso_post_flex)" ] }, { "cell_type": "markdown", - "source": [ - "Finally, we try the combination 
of a sparse and a dense coefficient using the LAVA method" - ], "metadata": { "id": "aude922IfxBG" - } + }, + "source": [ + "Finally, we try the combination of a sparse and a dense coefficient using the LAVA method" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WgBPFQ72ftBz", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# Define function to compute lava estimator. Doing an iterative scheme with fixed\n", "# number of iteration. Could iterate until a convergence criterion is met.\n", - "lava.predict <- function(X, Y, newX, lambda1, lambda2, iter=5){\n", + "lava_predict <- function(X, Y, newX, lambda1, lambda2, iter = 5) {\n", + " require(glmnet)\n", "\n", " # Need to demean internally\n", - " dy = Y - mean(Y)\n", - " dx = scale(X, scale = FALSE)\n", + " dy <- Y - mean(Y)\n", + " dx <- scale(X, scale = FALSE)\n", "\n", - " sp1 = glmnet(dx, dy, lambda = lambda1) #lasso step fits \"sparse part\"\n", - " de1 = glmnet(dx, dy-predict(sp1, newx = dx), alpha = 0, lambda = lambda2)\n", + " sp1 <- glmnet(dx, dy, lambda = lambda1) # lasso step fits \"sparse part\"\n", + " de1 <- glmnet(dx, dy - predict(sp1, newx = dx), alpha = 0, lambda = lambda2)\n", "\n", - " i=1\n", - " while(i<= iter) {\n", - " sp1 = glmnet(dx, dy-predict(de1, newx = dx, s=\"lambda.min\"), lambda = lambda1)\n", - " de1 = glmnet(dx, dy-predict(sp1, newx = dx, s=\"lambda.min\"), alpha = 0, lambda = lambda2)\n", - " i = i+1 }\n", + " i <- 1\n", + " while (i <= iter) {\n", + " sp1 <- glmnet(dx, dy - predict(de1, newx = dx, s = \"lambda.min\"), lambda = lambda1)\n", + " de1 <- glmnet(dx, dy - predict(sp1, newx = dx, s = \"lambda.min\"), alpha = 0, lambda = lambda2)\n", + " i <- i + 1\n", + " }\n", "\n", - " bhat = sp1$beta+de1$beta\n", - " a0 = mean(Y)-sum(colMeans(X)*bhat)\n", + " bhat <- sp1$beta + de1$beta\n", + " a0 <- mean(Y) - sum(colMeans(X) * bhat)\n", "\n", " # Need to add intercept to output\n", "\n", - " yhat = newX%*%bhat + a0\n", + " yhat <- newX %*% bhat + a0\n", "\n", " return(yhat)\n", "}" - ], - "metadata": { - "id": "WgBPFQ72ftBz" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2HFE2EbdkMjj", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# define function to get predictions and r2 scores for lava estimator\n", - "lava_yhat_r2 <- function(xtr.mod, xte.mod, ytr, yte, num_folds = 5){\n", - "\n", - " # 5-fold CV. glmnet does cross-validation internally and\n", - " # relatively efficiently. We're going to write out all the steps to make sure\n", - " # we're using the same CV folds across all procedures in a transparent way and\n", - " # to keep the overall structure clear as well.\n", + "lava_yhat_r2 <- function(xtr_mod, xte_mod, ytr, yte, num_folds = 5) {\n", + " # 5-fold CV. glmnet does cross-validation internally and\n", + " # relatively efficiently. 
We're going to write out all the steps to make sure\n", + " # we're using the same CV folds across all procedures in a transparent way and\n", + " # to keep the overall structure clear as well.\n", "\n", - " # Setup for brute force K-Fold CV\n", - " n = length(ytr)\n", - " Kf = num_folds # Number of folds\n", - " sampleframe <- rep(1:Kf, ceiling( n/Kf ) )\n", - " cvgroup <- sample( sampleframe , size=n , replace=FALSE ) # CV groups\n", + " # Setup for brute force K-Fold CV\n", + " n <- length(ytr)\n", + " Kf <- num_folds # Number of folds\n", + " sampleframe <- rep(1:Kf, ceiling(n / Kf))\n", + " cvgroup <- sample(sampleframe, size = n, replace = FALSE) # CV groups\n", "\n", "\n", - " ##------------------------------------------------------------\n", - " # We're going to take a shortcut and use the range of lambda values that come out\n", - " # of the default implementation in glmnet for everything. Could do better here - maybe\n", + " ## ------------------------------------------------------------\n", + " # We're going to take a shortcut and use the range of lambda values that come out\n", + " # of the default implementation in glmnet for everything. Could do better here - maybe\n", "\n", - " ## Fit ridge on grid of lambda values (chosen by default using glmnet) using basic model.\n", - " ridge.mod = glmnet(xtr.mod,ytr,alpha=0) # alpha = 0 gives ridge\n", - " ridge.lambda = ridge.mod$lambda # values of penalty parameter\n", + " ## Fit ridge on grid of lambda values (chosen by default using glmnet) using basic model.\n", + " ridge_mod <- glmnet(xtr_mod, ytr, alpha = 0) # alpha = 0 gives ridge\n", + " ridge_lambda <- ridge_mod$lambda # values of penalty parameter\n", "\n", - " ## Fit lasso on grid of lambda values (chosen by default using glmnet) using basic model.\n", - " lasso.mod = glmnet(xtr.mod,ytr) # default is lasso (equivalent to alpha = 1)\n", - " lasso.lambda = lasso.mod$lambda # values of penalty parameter\n", + " ## Fit lasso on grid of lambda values (chosen by default using glmnet) using basic model.\n", + " lasso_mod <- glmnet(xtr_mod, ytr) # default is lasso (equivalent to alpha = 1)\n", + " lasso_lambda <- lasso_mod$lambda # values of penalty parameter\n", "\n", - " ##------------------------------------------------------------\n", + " ## ------------------------------------------------------------\n", "\n", "\n", - " # Lava - Using a double loop over candidate penalty parameter values.\n", + " # Lava - Using a double loop over candidate penalty parameter values.\n", "\n", - " lambda1.lava.mod = lasso.mod$lambda[seq(5,length(lasso.mod$lambda),10)]\n", - " lambda2.lava.mod = ridge.mod$lambda[seq(5,length(ridge.mod$lambda),10)]\n", + " lambda1_lava_mod <- lasso_mod$lambda[seq(5, length(lasso_lambda), 10)]\n", + " lambda2_lava_mod <- ridge_mod$lambda[seq(5, length(ridge_lambda), 10)]\n", "\n", - " CV.mod.lava = matrix(0,length(lambda1.lava.mod),length(lambda2.lava.mod))\n", + " cv_mod_lava <- matrix(0, length(lambda1_lava_mod), length(lambda2_lava_mod))\n", "\n", - " for(k in 1:Kf) {\n", - " indk = cvgroup == k\n", + " for (k in 1:Kf) {\n", + " indk <- cvgroup == k\n", "\n", - " K.xtr.mod = xtr.mod[!indk,]\n", - " K.ytr = ytr[!indk]\n", - " K.xte.mod = xtr.mod[indk,]\n", - " K.yte = ytr[indk]\n", + " k_xtr_mod <- xtr_mod[!indk, ]\n", + " k_ytr <- ytr[!indk]\n", + " k_xte_mod <- xtr_mod[indk, ]\n", + " k_yte <- ytr[indk]\n", "\n", - " for(ii in 1:length(lambda1.lava.mod)) {\n", - " for(jj in 1:length(lambda2.lava.mod)) {\n", - " CV.mod.lava[ii,jj] = CV.mod.lava[ii,jj] +\n", - " 
sum((K.yte - lava.predict(K.xtr.mod, K.ytr, newX = K.xte.mod ,\n", - " lambda1 = lambda1.lava.mod[ii],\n", - " lambda2 = lambda2.lava.mod[jj]))^2)\n", - " }\n", + " for (ii in seq_along(lambda1_lava_mod)) {\n", + " for (jj in seq_along(lambda2_lava_mod)) {\n", + " cv_mod_lava[ii, jj] <- cv_mod_lava[ii, jj] +\n", + " sum((k_yte - lava_predict(k_xtr_mod, k_ytr,\n", + " newX = k_xte_mod,\n", + " lambda1 = lambda1_lava_mod[ii],\n", + " lambda2 = lambda2_lava_mod[jj]))^2)\n", " }\n", - "\n", " }\n", - "\n", - " # Get CV min values of tuning parameters\n", - " cvmin.lava.mod = which(CV.mod.lava == min(CV.mod.lava) , arr.ind = TRUE)\n", - " cvlambda1.lava.mod = lambda1.lava.mod[cvmin.lava.mod[1]]\n", - " cvlambda2.lava.mod = lambda2.lava.mod[cvmin.lava.mod[2]]\n", - "\n", - " # cat(\"Min Lava Lasso CV Penalty: \", cvlambda1.lava.mod)\n", - " # cat(\"\\nMin Lava Ridge CV Penalty: \", cvlambda2.lava.mod)\n", - "\n", - "\n", - " #### Look at performance on test sample\n", - "\n", - " # Calculate R^2 in training data and in validation data as measures\n", - " # Refit on entire training sample\n", - "\n", - "\n", - " #### CV-min model\n", - "\n", - " # In sample fit\n", - " cvmin.yhat.lava.tr <- lava.predict(xtr.mod, ytr, newX = xtr.mod,\n", - " lambda1 = cvlambda1.lava.mod,\n", - " lambda2 = cvlambda2.lava.mod)\n", - " r2.lava.mod = 1-sum((ytr-cvmin.yhat.lava.tr)^2)/sum((ytr-mean(ytr))^2)\n", - "\n", - " # Out of sample fit\n", - " cvmin.yhat.lava.test <- lava.predict(xtr.mod, ytr, newX = xte.mod,\n", - " lambda1 = cvlambda1.lava.mod,\n", - " lambda2 = cvlambda2.lava.mod)\n", - " r2V.lava.mod = 1-sum((yte-cvmin.yhat.lava.test)^2)/sum((yte-mean(ytr))^2)\n", - "\n", - "\n", - " # cat(\"\\nIn sample R2 (CV-min): \", r2.lava.mod)\n", - " # cat(\"\\nOut of Sample R2 (CV-min): \", r2V.lava.mod)\n", - "\n", - "\n", - " #### Use average model across cv-folds and refit model using all training data\n", - " ###### we won't report these results.\n", - " ###### Averaging is theoretically more solid, but cv-min is more practical.\n", - " n.tr = length(ytr)\n", - " n.te = length(yte)\n", - " yhat.tr.lava.mod = matrix(0,n.tr,Kf)\n", - " yhat.te.lava.mod = matrix(0,n.te,Kf)\n", - "\n", - "\n", - " for(k in 1:Kf) {\n", - " indk = cvgroup == k\n", - "\n", - " K.xtr.mod = xtr.mod[!indk,]\n", - " K.ytr = ytr[!indk]\n", - "\n", - " # Lava\n", - " yhat.tr.lava.mod[,k] = as.vector(lava.predict(K.xtr.mod, K.ytr, newX = xtr.mod ,\n", - " lambda1 = cvlambda1.lava.mod,\n", - " lambda2 = cvlambda2.lava.mod))\n", - " yhat.te.lava.mod[,k] = as.vector(lava.predict(K.xtr.mod, K.ytr, newX = xte.mod ,\n", - " lambda1 = cvlambda1.lava.mod,\n", - " lambda2 = cvlambda2.lava.mod))\n", - " }\n", - "\n", - " avg.yhat.lava.tr <- rowMeans(yhat.tr.lava.mod)\n", - " avg.yhat.lava.test <- rowMeans(yhat.te.lava.mod)\n", - "\n", - " r2.CVave.lava.mod = 1-sum((ytr-avg.yhat.lava.tr)^2)/sum((ytr-mean(ytr))^2)\n", - " r2V.CVave.lava.mod = 1-sum((yte-avg.yhat.lava.test)^2)/sum((yte-mean(ytr))^2)\n", - "\n", - " # cat(\"\\nIn sample R2 (Average Across Folds): \", r2.CVave.lava.mod)\n", - " # cat(\"\\nOut of Sample R2 (Average Across Folds): \", r2V.CVave.lava.mod)\n", - "\n", - " return(c(cvlambda1.lava.mod,\n", - " cvlambda2.lava.mod,\n", - " cvmin.yhat.lava.tr, # CV_min\n", - " cvmin.yhat.lava.test, # CV_min\n", - " r2.lava.mod, # CV_min\n", - " r2V.lava.mod, # CV_min\n", - " avg.yhat.lava.tr, # Average across Folds\n", - " avg.yhat.lava.test, # Average across Folds\n", - " r2.CVave.lava.mod, # Average across Folds\n", - " r2V.CVave.lava.mod # 
Average across Folds\n", - " )\n", - " )\n", + " }\n", + "\n", + " # Get CV min values of tuning parameters\n", + " cvmin_lava_mod <- which(cv_mod_lava == min(cv_mod_lava), arr.ind = TRUE)\n", + " cvlambda1_lava_mod <- lambda1_lava_mod[cvmin_lava_mod[1]]\n", + " cvlambda2_lava_mod <- lambda2_lava_mod[cvmin_lava_mod[2]]\n", + "\n", + " #### Look at performance on test sample\n", + "\n", + " # Calculate R^2 in training data and in validation data as measures\n", + " # Refit on entire training sample\n", + "\n", + " #### CV-min model\n", + "\n", + " # In sample fit\n", + " cvmin_yhat_lava_tr <- lava_predict(xtr_mod, ytr,\n", + " newX = xtr_mod,\n", + " lambda1 = cvlambda1_lava_mod,\n", + " lambda2 = cvlambda2_lava_mod\n", + " )\n", + " r2_lava_mod <- 1 - sum((ytr - cvmin_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2)\n", + "\n", + " # Out of sample fit\n", + " cvmin_yhat_lava_test <- lava_predict(xtr_mod, ytr,\n", + " newX = xte_mod,\n", + " lambda1 = cvlambda1_lava_mod,\n", + " lambda2 = cvlambda2_lava_mod\n", + " )\n", + " r2v_lava_mod <- 1 - sum((yte - cvmin_yhat_lava_test)^2) / sum((yte - mean(ytr))^2)\n", + "\n", + " #### Use average model across cv-folds and refit model using all training data\n", + " ###### we won't report these results.\n", + " ###### Averaging is theoretically more solid, but cv-min is more practical.\n", + " n_tr <- length(ytr)\n", + " n_te <- length(yte)\n", + " yhat_tr_lava_mod <- matrix(0, n_tr, Kf)\n", + " yhat_te_lava_mod <- matrix(0, n_te, Kf)\n", + "\n", + "\n", + " for (k in 1:Kf) {\n", + " indk <- cvgroup == k\n", + "\n", + " k_xtr_mod <- xtr_mod[!indk, ]\n", + " k_ytr <- ytr[!indk]\n", + "\n", + " # Lava\n", + " yhat_tr_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr,\n", + " newX = xtr_mod,\n", + " lambda1 = cvlambda1_lava_mod,\n", + " lambda2 = cvlambda2_lava_mod\n", + " ))\n", + " yhat_te_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr,\n", + " newX = xte_mod,\n", + " lambda1 = cvlambda1_lava_mod,\n", + " lambda2 = cvlambda2_lava_mod\n", + " ))\n", + " }\n", + "\n", + " avg_yhat_lava_tr <- rowMeans(yhat_tr_lava_mod)\n", + " avg_yhat_lava_test <- rowMeans(yhat_te_lava_mod)\n", + "\n", + " r2_cv_ave_lava_mod <- 1 - sum((ytr - avg_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2)\n", + " r2v_cv_ave_lava_mod <- 1 - sum((yte - avg_yhat_lava_test)^2) / sum((yte - mean(ytr))^2)\n", + "\n", + " return(c(\n", + " cvlambda1_lava_mod,\n", + " cvlambda2_lava_mod,\n", + " cvmin_yhat_lava_tr, # CV_min\n", + " cvmin_yhat_lava_test, # CV_min\n", + " r2_lava_mod, # CV_min\n", + " r2v_lava_mod, # CV_min\n", + " avg_yhat_lava_tr, # Average across Folds\n", + " avg_yhat_lava_test, # Average across Folds\n", + " r2_cv_ave_lava_mod, # Average across Folds\n", + " r2v_cv_ave_lava_mod # Average across Folds\n", + " ))\n", "}" - ], - "metadata": { - "id": "2HFE2EbdkMjj" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "fit.lava.flex <- lava_yhat_r2(model_X_flex_train, model_X_flex_test, Y_train, Y_test)\n", - "cat('Flexible model R^2 (LAVA): ', fit.lava.flex[[6]]) # using CV_min" - ], + "execution_count": null, "metadata": { - "id": "Uw3LMCiskJzV" + "id": "Uw3LMCiskJzV", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "fit_lava_flex <- lava_yhat_r2(model_x_flex_train, model_x_flex_test, y_train, y_test)\n", + "cat(\"Flexible model R^2 (LAVA): \", fit_lava_flex[[6]]) # using CV_min" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "f8lYhGaWfpYR" + 
}, "source": [ "\n", "\n", "We find that for this dataset the low dimensional OLS is sufficient. The high-dimensional approaches did not manage to substantively increase predictive power." - ], - "metadata": { - "id": "f8lYhGaWfpYR" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "bxuPZI4Zx0Vm" + }, "source": [ "### Extra high-dimensional specification (extra flexible)\n", "\n", @@ -926,112 +977,121 @@ "\n", "\n", "\n" - ], - "metadata": { - "id": "bxuPZI4Zx0Vm" - } + ] }, { "cell_type": "code", - "source": [ - "X_extra <- \" sex + (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)^2\"\n", - "formula_extra <- as.formula(paste(\"lwage\", \"~\", X_extra))\n", - "model_X_extra_train <- model.matrix(formula_extra,data_train)\n", - "model_X_extra_test <- model.matrix(formula_extra,data_test)\n", - "p_extra <- dim(model_X_extra_train)[2]\n", - "p_extra" - ], + "execution_count": null, "metadata": { - "id": "JsFhSsM_rGjN" + "id": "JsFhSsM_rGjN", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "x_extra <- \" sex + (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)^2\"\n", + "formula_extra <- as.formula(paste(\"lwage\", \"~\", x_extra))\n", + "model_x_extra_train <- model.matrix(formula_extra, data_train)\n", + "model_x_extra_test <- model.matrix(formula_extra, data_test)\n", + "p_extra <- dim(model_x_extra_train)[2]\n", + "p_extra" + ] }, { "cell_type": "code", - "source": [ - "# ols (extra flexible model)\n", - "fit.lm.extra <- lm(formula_extra, data_train)\n", - "options(warn=-1)\n", - "yhat.lm.extra <- predict(fit.lm.extra, newdata=data_test)\n", - "MSE.lm.extra <- summary(lm((Y_test-yhat.lm.extra)^2~1))$coef[1:2]\n", - "R2.lm.extra <- 1-MSE.lm.extra[1]/var(Y_test)\n", - "cat(\"Extra flexible model R^2 (OLS): \",R2.lm.extra)" - ], + "execution_count": null, "metadata": { - "id": "eheA1UPBsHfL" + "id": "eheA1UPBsHfL", + "vscode": { + "languageId": "r" + } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "# ols (extra flexible model)\n", + "fit_lm_extra <- lm(formula_extra, data_train)\n", + "options(warn = -1)\n", + "yhat_lm_extra <- predict(fit_lm_extra, newdata = data_test)\n", + "mse_lm_extra <- summary(lm((y_test - yhat_lm_extra)^2 ~ 1))$coef[1:2]\n", + "r2_lm_extra <- 1 - mse_lm_extra[1] / var(y_test)\n", + "cat(\"Extra flexible model R^2 (OLS): \", r2_lm_extra)" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "Scm5monOrJu5" + }, "source": [ "#### Penalized regressions (extra flexible model)\n", "\n", "Now let's repeat our penalized regression analysis for the extra flexible model. Note this block takes a while ~ 1 hour 15 minutes. To reduce time substantially, reduce the number of folds in LAVA." 
- ], - "metadata": { - "id": "Scm5monOrJu5" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tOKoNLKFovrI", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], "source": [ "# penalized regressions\n", - "fit.lasso.cv.extra <- cv.glmnet(model_X_extra_train, Y_train, family=\"gaussian\", alpha=1)\n", - "fit.ridge.extra <- cv.glmnet(model_X_extra_train, Y_train, family=\"gaussian\", alpha=0)\n", - "fit.elnet.extra <- cv.glmnet(model_X_extra_train, Y_train, family=\"gaussian\", alpha=.5)\n", - "fit.rlasso.extra <- rlasso(formula_extra, data_train, post=FALSE)\n", - "fit.rlasso.post.extra <- rlasso(formula_extra, data_train, post=TRUE)\n", - "fit.lava.extra <- lava_yhat_r2(model_X_extra_train, model_X_extra_test, Y_train, Y_test)\n", - "\n", - "yhat.lasso.cv.extra <- predict(fit.lasso.cv.extra , newx = model_X_extra_test)\n", - "yhat.ridge.extra <- predict(fit.ridge.extra , newx = model_X_extra_test)\n", - "yhat.elnet.extra <- predict(fit.elnet.extra , newx = model_X_extra_test)\n", - "yhat.rlasso.extra <- predict(fit.rlasso.extra, newdata=data_test)\n", - "yhat.rlasso.post.extra <- predict(fit.rlasso.post.extra, newdata=data_test)\n", - "yhat.lava.extra <- fit.lava.extra[[4]]\n", - "\n", - "MSE.lasso.cv.extra <- summary(lm((Y_test-yhat.lasso.cv.extra )^2~1))$coef[1:2]\n", - "MSE.ridge.extra <- summary(lm((Y_test-yhat.ridge.extra )^2~1))$coef[1:2]\n", - "MSE.elnet.extra <- summary(lm((Y_test-yhat.elnet.extra )^2~1))$coef[1:2]\n", - "MSE.lasso.extra <- summary(lm((Y_test-yhat.rlasso.extra)^2~1))$coef[1:2]\n", - "MSE.lasso.post.extra <- summary(lm((Y_test-yhat.rlasso.post.extra)^2~1))$coef[1:2]\n", - "MSE.lava.extra <- summary(lm(as.vector(Y_test-yhat.lava.extra)^2~1))$coef[1:2]\n", - "\n", - "R2.lasso.cv.extra <- 1-MSE.lasso.cv.extra [1]/var(Y_test)\n", - "R2.ridge.extra <- 1-MSE.ridge.extra [1]/var(Y_test)\n", - "R2.elnet.extra <- 1-MSE.elnet.extra [1]/var(Y_test)\n", - "R2.lasso.extra <- 1-MSE.lasso.extra[1]/var(Y_test)\n", - "R2.lasso.post.extra <- 1-MSE.lasso.post.extra[1]/var(Y_test)\n", - "R2.lava.extra <- 1-MSE.lava.extra[1]/var(Y_test) # R2.lava.extra <- fit.lava.extra[[6]]\n", + "fit_lasso_cv_extra <- cv.glmnet(model_x_extra_train, y_train, family = \"gaussian\", alpha = 1)\n", + "fit_ridge_extra <- cv.glmnet(model_x_extra_train, y_train, family = \"gaussian\", alpha = 0)\n", + "fit_elnet_extra <- cv.glmnet(model_x_extra_train, y_train, family = \"gaussian\", alpha = .5)\n", + "fit_rlasso_extra <- rlasso(formula_extra, data_train, post = FALSE)\n", + "fit_rlasso_post_extra <- rlasso(formula_extra, data_train, post = TRUE)\n", + "fit_lava_extra <- lava_yhat_r2(model_x_extra_train, model_x_extra_test, y_train, y_test)\n", + "\n", + "yhat_lasso_cv_extra <- predict(fit_lasso_cv_extra, newx = model_x_extra_test)\n", + "yhat_ridge_extra <- predict(fit_ridge_extra, newx = model_x_extra_test)\n", + "yhat_elnet_extra <- predict(fit_elnet_extra, newx = model_x_extra_test)\n", + "yhat_rlasso_extra <- predict(fit_rlasso_extra, newdata = data_test)\n", + "yhat_rlasso_post_extra <- predict(fit_rlasso_post_extra, newdata = data_test)\n", + "yhat_lava_extra <- fit_lava_extra[[4]]\n", + "\n", + "mse_lasso_cv_extra <- summary(lm((y_test - yhat_lasso_cv_extra)^2 ~ 1))$coef[1:2]\n", + "mse_ridge_extra <- summary(lm((y_test - yhat_ridge_extra)^2 ~ 1))$coef[1:2]\n", + "mse_elnet_extra <- summary(lm((y_test - yhat_elnet_extra)^2 ~ 1))$coef[1:2]\n", + "mse_lasso_extra <- summary(lm((y_test - yhat_rlasso_extra)^2 ~ 1))$coef[1:2]\n", + "mse_lasso_post_extra <- 
summary(lm((y_test - yhat_rlasso_post_extra)^2 ~ 1))$coef[1:2]\n", + "mse_lava_extra <- summary(lm(as.vector(y_test - yhat_lava_extra)^2 ~ 1))$coef[1:2]\n", + "\n", + "r2_lasso_cv_extra <- 1 - mse_lasso_cv_extra[1] / var(y_test)\n", + "r2_ridge_extra <- 1 - mse_ridge_extra[1] / var(y_test)\n", + "r2_elnet_extra <- 1 - mse_elnet_extra[1] / var(y_test)\n", + "r2_lasso_extra <- 1 - mse_lasso_extra[1] / var(y_test)\n", + "r2_lasso_post_extra <- 1 - mse_lasso_post_extra[1] / var(y_test)\n", + "r2_lava_extra <- 1 - mse_lava_extra[1] / var(y_test)\n", "\n", "# R^2 (extra flexible)\n", - "cat('\\nExtra flexible model R^2 (Lasso): ', R2.lasso.cv.extra)\n", - "cat('\\nExtra flexible model R^2 (Ridge): ', R2.ridge.extra)\n", - "cat('\\nExtra flexible model R^2 (Elastic Net): ', R2.elnet.extra)\n", - "cat('\\nExtra flexible model R^2 (RLasso): ', R2.lasso.extra)\n", - "cat('\\nExtra flexible model R^2 (RLasso post): ', R2.lasso.post.extra)\n", - "cat('\\nExtra flexible model R^2 (LAVA): ', R2.lava.extra) #using CV_min" - ], - "metadata": { - "id": "tOKoNLKFovrI" - }, - "execution_count": null, - "outputs": [] + "cat(\"\\nExtra flexible model R^2 (Lasso): \", r2_lasso_cv_extra)\n", + "cat(\"\\nExtra flexible model R^2 (Ridge): \", r2_ridge_extra)\n", + "cat(\"\\nExtra flexible model R^2 (Elastic Net): \", r2_elnet_extra)\n", + "cat(\"\\nExtra flexible model R^2 (RLasso): \", r2_lasso_extra)\n", + "cat(\"\\nExtra flexible model R^2 (RLasso post): \", r2_lasso_post_extra)\n", + "cat(\"\\nExtra flexible model R^2 (LAVA): \", r2_lava_extra) # using CV_min" + ] }, { "cell_type": "markdown", - "source": [ - "" - ], "metadata": { "id": "Btez-AI8yE7S" - } + }, + "source": [ + "" + ] } ], "metadata": { + "colab": { + "provenance": [] + }, "kernelspec": { "display_name": "R", "language": "R", @@ -1056,9 +1116,6 @@ "parameters": {}, "start_time": "2021-02-13T18:19:39.889520", "version": "2.2.2" - }, - "colab": { - "provenance": [] } }, "nbformat": 4, From 6c891124e496adbd215e2071fc4354c9e389d965 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 15 Jul 2024 16:51:36 +0000 Subject: [PATCH 080/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in PM2 --- PM2/r_convergence_hypothesis_double_lasso.Rmd | 266 ++ ...r_convergence_hypothesis_double_lasso.irnb | 1631 ++++------ PM2/r_experiment_non_orthogonal.Rmd | 515 ++++ PM2/r_experiment_non_orthogonal.irnb | 2076 ++++++------- PM2/r_heterogenous_wage_effects.Rmd | 99 + PM2/r_heterogenous_wage_effects.irnb | 1314 ++------ PM2/r_linear_penalized_regs.Rmd | 704 +++++ PM2/r_linear_penalized_regs.irnb | 2636 ++++++++--------- PM2/r_ml_for_wage_prediction.Rmd | 446 +++ PM2/r_ml_for_wage_prediction.irnb | 2174 +++++++------- PM2/r_orthogonal_orig.Rmd | 113 + PM2/r_orthogonal_orig.irnb | 378 ++- 12 files changed, 6621 insertions(+), 5731 deletions(-) create mode 100644 PM2/r_convergence_hypothesis_double_lasso.Rmd create mode 100644 PM2/r_experiment_non_orthogonal.Rmd create mode 100644 PM2/r_heterogenous_wage_effects.Rmd create mode 100644 PM2/r_linear_penalized_regs.Rmd create mode 100644 PM2/r_ml_for_wage_prediction.Rmd create mode 100644 PM2/r_orthogonal_orig.Rmd diff --git a/PM2/r_convergence_hypothesis_double_lasso.Rmd b/PM2/r_convergence_hypothesis_double_lasso.Rmd new file mode 100644 index 00000000..b5b7f9e7 --- /dev/null +++ b/PM2/r_convergence_hypothesis_double_lasso.Rmd @@ -0,0 +1,266 @@ +--- +title: An R Markdown document converted from "PM2/r_convergence_hypothesis_double_lasso.irnb" +output: html_document +--- + 
+# Testing the Convergence Hypothesis + +```{r} +install.packages("hdm") +install.packages("xtable") +install.packages("lmtest") +install.packages("sandwich") +install.packages("glmnet") +install.packages("ggplot2") +library(hdm) +library(xtable) +library(lmtest) +library(sandwich) +library(glmnet) # For LassoCV +library(ggplot2) +``` + +## Introduction + +We provide an additional empirical example of partialling-out with Lasso to estimate the regression coefficient $\beta_1$ in the high-dimensional linear regression model: + $$ + Y = \beta_1 D + \beta_2'W + \epsilon. + $$ + +Specifically, we are interested in how the rates at which economies of different countries grow ($Y$) are related to the initial wealth levels in each country ($D$) controlling for country's institutional, educational, and other similar characteristics ($W$). + +The relationship is captured by $\beta_1$, the *speed of convergence/divergence*, which measures the speed at which poor countries catch up $(\beta_1< 0)$ or fall behind $(\beta_1> 0)$ rich countries, after controlling for $W$. Our inference question here is: do poor countries grow faster than rich countries, controlling for educational and other characteristics? In other words, is the speed of convergence negative: $ \beta_1 <0?$ This is the Convergence Hypothesis predicted by the Solow Growth Model. This is a structural economic model. Under some strong assumptions, that we won't state here, the predictive exercise we are doing here can be given causal interpretation. + +## Data Analysis + +We consider the data set GrowthData which is included in the package *hdm*. First, let us load the data set to get familiar with the data. + +```{r} +getdata <- function(...) { + e <- new.env() + name <- data(..., envir = e)[1] + e[[name]] +} + +# now load your data calling getdata() +growth <- getdata(GrowthData) +``` + +The sample contains $90$ countries and $63$ controls. + +```{r} +growth +``` + +Thus $p \approx 60$, $n=90$ and $p/n$ is not small. We expect the least squares method to provide a poor estimate of $\beta_1$. We expect the method based on partialling-out with Lasso to provide a high quality estimate of $\beta_1$. +To check this hypothesis, we analyze the relation between the output variable $Y$ and the other country's characteristics by running a linear regression in the first step. + +```{r} +## Create the outcome variable y and covariates x +y <- growth$Outcome +X <- growth[-which(colnames(growth) %in% c("intercept"))] +``` + +```{r} +fit <- lm(Outcome ~ ., data = X) +est <- summary(fit)$coef["gdpsh465", 1] + +hcv_coefs <- vcovHC(fit, type = "HC1") # HC - "heteroskedasticity cosistent" +se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors + +# print unconditional effect of gdpsh465 and the corresponding standard error +cat("The estimated coefficient on gdpsh465 is", est, + " and the corresponding robust standard error is", se) + +# Calculate the 95% confidence interval for 'gdpsh465' +lower_ci <- est - 1.96 * se +upper_ci <- est + 1.96 * se + +cat("95% Confidence Interval: [", lower_ci, ",", upper_ci, "]") +``` + +## Summarize OLS results + +```{r} +# Create an empty data frame with column names +table <- data.frame( + Method = character(0), + Estimate = character(0), + `Std. 
Error` = numeric(0),
  `Lower Bound CI` = numeric(0),
  `Upper Bound CI` = numeric(0)
)

# Add OLS results to the table
table <- rbind(table, c("OLS", est, se, lower_ci, upper_ci))

# Rename the columns to match the Python table
colnames(table) <- c("Method", "Estimate", "Std. Error", "lower bound CI", "upper bound CI")

# Print the table
print(table)
```

Least squares provides a rather noisy estimate (high standard error) of the
speed of convergence, and does not allow us to answer the question
about the convergence hypothesis since the confidence interval includes zero.

In contrast, we can use the partialling-out approach based on lasso regression ("Double Lasso").

```{r}
y <- growth$Outcome
W <- growth[-which(colnames(growth) %in% c("Outcome", "intercept", "gdpsh465"))]
D <- growth$gdpsh465
```

## Method 1: Lasso with Theoretical Penalty using HDM

While cross validation is commonly employed for choosing penalty parameters in Lasso, it can be very noisy and tends to choose a relatively small penalty, leading to some overfitting. For this reason, one should not use cross validation to choose tuning parameters unless sample splitting is employed. We illustrate the use of sample splitting combined with cross validation in later chapters in the book. Since we are using the full sample here, it is much better (and theoretically valid) to use penalties that provably control overfitting, which is what we do here.

We report the results using cross validation at the end of this notebook for comparison. There, we observe overfitting for the prediction of the outcome.

```{r}
double_lasso <- function(y, D, W) {
  require(hdm)

  # residualize outcome with Lasso
  yfit_rlasso <- rlasso(W, y, post = FALSE)
  yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))
  yres <- y - as.numeric(yhat_rlasso)


  # residualize treatment with Lasso
  dfit_rlasso <- rlasso(W, D, post = FALSE)
  dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W))
  dres <- D - as.numeric(dhat_rlasso)

  # rest is the same as in the OLS case
  hat <- mean(yres * dres) / mean(dres^2)
  epsilon <- yres - hat * dres
  V <- mean(epsilon^2 * dres^2) / mean(dres^2)^2
  stderr <- sqrt(V / length(y))

  return(list(hat = hat, stderr = stderr))
}
```

```{r}
results <- double_lasso(y, D, W)
hat <- results$hat
stderr <- results$stderr
# Calculate the 95% confidence interval
ci_lower <- hat - 1.96 * stderr
ci_upper <- hat + 1.96 * stderr
```

The least squares method provides a rather noisy estimate of the speed of convergence, so we cannot answer the question of whether poor countries grow faster than rich countries. The least squares method does not work well when the ratio $p/n$ is large.

In sharp contrast, partialling-out via Lasso provides a more precise estimate. The Lasso-based point estimate is $-5\%$ and the $95\%$ confidence interval for the (annual) rate of convergence $[-7.8\%,-2.2\%]$ only includes negative numbers. This empirical evidence does support the convergence hypothesis.

```{r}
# Add Double Lasso results to the table
table <- rbind(table, c("Double Lasso", hat, stderr, ci_lower, ci_upper))

# Print the table
print(table)
```

## Method 2: Lasso with Cross-Validation

This section is for illustration purposes only. Given that we are using the full sample, cross validation *should not* be used for choosing tuning parameters here. Cross validation tends to (mildly) overfit, and this overfitting can lead to substantial problems when inference about parameters is the goal.
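
Before turning to the full-sample cross-validation exercise below, here is a minimal sketch (not part of the original notebook) of the sample-splitting variant alluded to under Method 1: the penalties are tuned and the Lassos fit on one half of the sample, and the residual-on-residual regression uses only the held-out half. The 50/50 split, the 10-fold CV, and the seed are illustrative choices.

```{r}
# Sketch of CV combined with sample splitting (illustrative, not in the original analysis):
# tune and fit the Lassos on one half, residualize and estimate on the other half.
set.seed(123)
half <- sample(seq_along(y), size = floor(length(y) / 2))
cv_y_half <- cv.glmnet(as.matrix(W[half, ]), y[half], alpha = 1, nfolds = 10)
cv_d_half <- cv.glmnet(as.matrix(W[half, ]), D[half], alpha = 1, nfolds = 10)
res_y_split <- as.numeric(y[-half] - predict(cv_y_half, newx = as.matrix(W[-half, ]), s = "lambda.min"))
res_d_split <- as.numeric(D[-half] - predict(cv_d_half, newx = as.matrix(W[-half, ]), s = "lambda.min"))
fit_split <- lm(res_y_split ~ res_d_split)
se_split <- sqrt(diag(vcovHC(fit_split, type = "HC1")))[2]
cat("Split-sample estimate: ", coef(fit_split)[2], "\nRobust standard error: ", se_split)
```
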
+ +```{r} +# Choose penalty based on KFold cross validation +set.seed(123) +# Given small sample size, we use an aggressive number of 20 folds +n_folds <- 20 + + +# Define LassoCV models for y and D +model_y <- cv.glmnet( + x = as.matrix(W), + y = y, + alpha = 1, # Lasso penalty + nfolds = n_folds, + family = "gaussian" +) + +model_d <- cv.glmnet( + x = as.matrix(W), + y = D, + alpha = 1, # Lasso penalty + nfolds = n_folds, + family = "gaussian" +) + +# Get the best lambda values for y and D +best_lambda_y <- model_y$lambda.min +best_lambda_d <- model_d$lambda.min + +# Fit Lasso models with the best lambda values +lasso_model_y <- glmnet(as.matrix(W), y, alpha = 1, lambda = best_lambda_y) +lasso_model_d <- glmnet(as.matrix(W), D, alpha = 1, lambda = best_lambda_d) + +# Calculate the residuals +res_y <- y - predict(lasso_model_y, s = best_lambda_y, newx = as.matrix(W)) +res_d <- D - predict(lasso_model_d, s = best_lambda_d, newx = as.matrix(W)) +``` + +```{r} +tmp_df <- as.data.frame(cbind(res_y, res_d)) +colnames(tmp_df) = c("res_y", "res_D") +``` + +```{r} +fit_cv <- lm(res_y ~ res_d, data = tmp_df) +est_cv <- summary(fit_cv)$coef["res_D", 1] + +hcv_cv_coefs <- vcovHC(fit_cv, type = "HC1") # HC - "heteroskedasticity cosistent" +se_cv <- sqrt(diag(hcv_cv_coefs))[2] # Estimated std errors + +# Calculate the 95% confidence interval for 'gdpsh465' +lower_ci_cv <- est_cv - 1.96 * se_cv +upper_ci_cv <- est_cv + 1.96 * se_cv +``` + +```{r} +# Add LassoCV results to the table +table <- rbind(table, c("Double Lasso CV", est_cv, se_cv, lower_ci_cv, upper_ci_cv)) + +# Print the table +print(table) +``` + +We find that the outcome model chooses too small of a penalty based on cross-validation, leading to overfitting of the outcome and tiny outcome residuals. This leads to artificially small standard errors and a zero treatment effect. Theoretically driven penalty should be preferred for such small sample sizes. 
+ +```{r} +# Create a data frame to store the results +results_y <- data.frame( + Alphas = model_y$lambda, + OutOfSampleR2 = 1 - model_y$cvm / var(y) +) + +results_d <- data.frame( + Alphas = model_d$lambda, + OutOfSampleR2 = 1 - model_d$cvm / var(D) +) + +# Plot Outcome Lasso-CV Model +ggplot(data = results_y, aes(x = Alphas, y = OutOfSampleR2)) + + geom_line() + + labs( + title = "Outcome Lasso-CV Model: Out-of-sample R-squared as function of penalty level", + x = "Penalty Level", + y = "Out-of-sample R-squared" + ) + +# Plot Treatment Lasso-CV Model +ggplot(data = results_d, aes(x = (Alphas), y = OutOfSampleR2)) + + geom_line() + + labs( + title = "Treatment Lasso-CV Model: Out-of-sample R-squared as function of penalty level", + x = "Penalty Level", + y = "Out-of-sample R-squared" + ) +``` + diff --git a/PM2/r_convergence_hypothesis_double_lasso.irnb b/PM2/r_convergence_hypothesis_double_lasso.irnb index 4f56cbe9..6c48fccc 100644 --- a/PM2/r_convergence_hypothesis_double_lasso.irnb +++ b/PM2/r_convergence_hypothesis_double_lasso.irnb @@ -1,1093 +1,564 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "79U65py1grzb" - }, - "source": [ - "# Testing the Convergence Hypothesis" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "GK-MMvLseA2Q", - "outputId": "f429014a-9f26-4030-cdb8-6d925704172d", - "vscode": { - "languageId": "r" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "also installing the dependencies ‘iterators’, ‘foreach’, ‘shape’, ‘Rcpp’, ‘RcppEigen’, ‘glmnet’, ‘checkmate’, ‘Formula’\n", - "\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "also installing the dependency ‘zoo’\n", - "\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Loading required package: zoo\n", - "\n", - "\n", - "Attaching package: ‘zoo’\n", - "\n", - "\n", - "The following objects are masked from ‘package:base’:\n", - "\n", - " as.Date, as.Date.numeric\n", - "\n", - "\n", - "Loading required package: Matrix\n", - "\n", - "Loaded glmnet 4.1-8\n", - "\n" - ] - } - ], - "source": [ - "install.packages(\"hdm\")\n", - "install.packages(\"xtable\")\n", - "install.packages(\"lmtest\")\n", - "install.packages(\"sandwich\")\n", - "install.packages(\"glmnet\")\n", - "install.packages(\"ggplot2\")\n", - "library(hdm)\n", - "library(xtable)\n", - "library(lmtest)\n", - "library(sandwich)\n", - "library(glmnet) # For LassoCV\n", - "library(ggplot2)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nlpSLLV6g1pc" - }, - "source": [ - "## Introduction" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xXkzGJWag02O" - }, - "source": [ - "We provide an additional empirical example of partialling-out with Lasso to estimate the regression coefficient $\\beta_1$ in the high-dimensional linear regression model:\n", - " $$\n", - " Y = \\beta_1 D + \\beta_2'W + \\epsilon.\n", - " $$\n", - " \n", - 
"Specifically, we are interested in how the rates at which economies of different countries grow ($Y$) are related to the initial wealth levels in each country ($D$) controlling for country's institutional, educational, and other similar characteristics ($W$).\n", - " \n", - "The relationship is captured by $\\beta_1$, the *speed of convergence/divergence*, which measures the speed at which poor countries catch up $(\\beta_1< 0)$ or fall behind $(\\beta_1> 0)$ rich countries, after controlling for $W$. Our inference question here is: do poor countries grow faster than rich countries, controlling for educational and other characteristics? In other words, is the speed of convergence negative: $ \\beta_1 <0?$ This is the Convergence Hypothesis predicted by the Solow Growth Model. This is a structural economic model. Under some strong assumptions, that we won't state here, the predictive exercise we are doing here can be given causal interpretation.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a5Ul2ppLfUBQ" - }, - "source": [ - "## Data Analysis" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9GgPNICafYuK" - }, - "source": [ - "We consider the data set GrowthData which is included in the package *hdm*. First, let us load the data set to get familiar with the data." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "_B9DWuS6fcVW", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "getdata <- function(...) {\n", - " e <- new.env()\n", - " name <- data(..., envir = e)[1]\n", - " e[[name]]\n", - "}\n", - "\n", - "# now load your data calling getdata()\n", - "growth <- getdata(GrowthData)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "smYhqwpbffVh" - }, - "source": [ - "The sample contains $90$ countries and $63$ controls." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "1dsF7_R4j-Qv", - "outputId": "c77d3a1a-35e5-482f-d414-75304fc218c3", - "vscode": { - "languageId": "r" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\n", - "
A data.frame: 90 × 63\n",
-        [data.frame output: 90 rows × 63 columns (Outcome, intercept, gdpsh465, bmp1l, freeop, freetar, h65, hm65, hf65, p65, ..., seccf65, syr65, syrm65, syrf65, teapri65, teasec65, ex1, im1, xr65, tot1); the HTML, LaTeX, Markdown, and plain-text renderings of the table are omitted here]
-       ]
-      },
-      "metadata": {},
-      "output_type": "display_data"
-     }
-    ],
-    "source": [
-     "growth"
-    ]
-   },
-   {
-    "cell_type": "markdown",
-    "metadata": {
-     "id": "-AMcbsgefhTg"
-    },
-    "source": [
-     "Thus $p \\approx 60$, $n=90$ and $p/n$ is not small. We expect the least squares method to provide a poor estimate of $\\beta_1$. We expect the method based on partialling-out with Lasso to provide a high quality estimate of $\\beta_1$.\n",
-     "To check this hypothesis, we analyze the relation between the output variable $Y$ and the other country's characteristics by running a linear regression in the first step."
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "DncWsRS9mgAp", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "## Create the outcome variable y and covariates x\n", - "y <- growth$Outcome\n", - "X <- growth[-which(colnames(growth) %in% c(\"intercept\"))]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "vPO08MjomqfZ", - "outputId": "afa89548-e9ef-4060-d5db-eb8e632e8e95", - "vscode": { - "languageId": "r" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The estimated coefficient on gdpsh465 is -0.009377989 and the corresponding robust standard error is 0.032421195% Confidence Interval: [ -0.07292335 , 0.05416737 ]" - ] - } - ], - "source": [ - "fit <- lm(Outcome ~ ., data = X)\n", - "est <- summary(fit)$coef[\"gdpsh465\", 1]\n", - "\n", - "hcv_coefs <- vcovHC(fit, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", - "se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors\n", - "\n", - "# print unconditional effect of gdpsh465 and the corresponding standard error\n", - "cat(\"The estimated coefficient on gdpsh465 is\", est,\n", - " \" and the corresponding robust standard error is\", se)\n", - "\n", - "# Calculate the 95% confidence interval for 'gdpsh465'\n", - "lower_ci <- est - 1.96 * se\n", - "upper_ci <- est + 1.96 * se\n", - "\n", - "cat(\"95% Confidence Interval: [\", lower_ci, \",\", upper_ci, \"]\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "D7nJZzhGfjQT" - }, - "source": [ - "## Summarize OLS results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "EwGVcIVAfRe5", - "outputId": "87f41279-8907-415b-f8eb-589f736089b2", - "vscode": { - "languageId": "r" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Method Estimate Std. Error lower bound CI\n", - "1 OLS -0.00937798878257774 0.0324211014253974 -0.0729233475763566\n", - " upper bound CI\n", - "1 0.0541673700112012\n" - ] - } - ], - "source": [ - "# Create an empty data frame with column names\n", - "table <- data.frame(\n", - " Method = character(0),\n", - " Estimate = character(0),\n", - " `Std. Error` = numeric(0),\n", - " `Lower Bound CI` = numeric(0),\n", - " `Upper Bound CI` = numeric(0)\n", - ")\n", - "\n", - "# Add OLS results to the table\n", - "table <- rbind(table, c(\"OLS\", est, se, lower_ci, upper_ci))\n", - "\n", - "# Rename the columns to match the Python table\n", - "colnames(table) <- c(\"Method\", \"Estimate\", \"Std. Error\", \"lower bound CI\", \"upper bound CI\")\n", - "\n", - "# Print the table\n", - "print(table)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KfrhJqKhfwKB" - }, - "source": [ - "Least squares provides a rather noisy estimate (high standard error) of the\n", - "speed of convergence, and does not allow us to answer the question\n", - "about the convergence hypothesis since the confidence interval includes zero.\n", - "\n", - "In contrast, we can use the partialling-out approach based on lasso regression (\"Double Lasso\")." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "D9Y2U1Ldf1eB", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "y <- growth$Outcome\n", - "W <- growth[-which(colnames(growth) %in% c(\"Outcome\", \"intercept\", \"gdpsh465\"))]\n", - "D <- growth$gdpsh465" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8yNU2UgefzCZ" - }, - "source": [ - "## Method 1: Lasso with Theoretical Penalty using HDM" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tQPxdzQ2f84M" - }, - "source": [ - "While cross validation is commonly employed for choosing penalty parameters in Lasso, it can be very noisy and tends to choose relatively small penalty leading to some overfitting. For this reason, one should not use cross validation to choose tuning parameters unless sample splitting is employed. We illustrate the use of sample combined with cross validation in later chapters in the book. Since we are using the full sample here, it is much better (and theoretically valid) to use penalties that provably control overfitting, which is what we do here.\n", - "\n", - "We report the results using cross validation at the end of this notebook for comparison. There, we observe overfitting for the prediction of the outcome." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "DIzy51tZsoWp", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "double_lasso <- function(y, D, W) {\n", - " require(hdm)\n", - "\n", - " # residualize outcome with Lasso\n", - " yfit_rlasso <- rlasso(W, y, post = FALSE)\n", - " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))\n", - " yres <- y - as.numeric(yhat_rlasso)\n", - "\n", - "\n", - " # residualize treatment with Lasso\n", - " dfit_rlasso <- rlasso(W, D, post = FALSE)\n", - " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W))\n", - " dres <- D - as.numeric(dhat_rlasso)\n", - "\n", - " # rest is the same as in the OLS case\n", - " hat <- mean(yres * dres) / mean(dres^2)\n", - " epsilon <- yres - hat * dres\n", - " V <- mean(epsilon^2 * dres^2) / mean(dres^2)^2\n", - " stderr <- sqrt(V / length(y))\n", - "\n", - " return(list(hat = hat, stderr = stderr))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "Ncz7Uqn5sqqU", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "results <- double_lasso(y, D, W)\n", - "hat <- results$hat\n", - "stderr <- results$stderr\n", - "# Calculate the 95% confidence interval\n", - "ci_lower <- hat - 1.96 * stderr\n", - "ci_upper <- hat + 1.96 * stderr" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "P5PEjKw9gLvC" - }, - "source": [ - "The least square method provides a rather noisy estimate of the speed of convergence. We can not answer the question if poor countries grow faster than rich countries. The least square method does not work when the ratio $p/n$ is large.\n", - "\n", - "In sharp contrast, partialling-out via Lasso provides a more precise estimate. The Lasso based point estimate is $-5\\%$ and the $95\\%$ confidence interval for the (annual) rate of convergence $[-7.8\\%,-2.2\\%]$ only includes negative numbers. This empirical evidence does support the convergence hypothesis." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "tNLVM4WEgL9v", - "outputId": "1f2683b7-630a-43c5-e110-74c527603850", - "vscode": { - "languageId": "r" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Method Estimate Std. Error lower bound CI\n", - "1 OLS -0.00937798878257774 0.0324211014253974 -0.0729233475763566\n", - "2 Double Lasso -0.0446926781072429 0.0178230525741694 -0.0796258611526148\n", - " upper bound CI\n", - "1 0.0541673700112012\n", - "2 -0.00975949506187093\n" - ] - } - ], - "source": [ - "# Add Double Lasso results to the table\n", - "table <- rbind(table, c(\"Double Lasso\", hat, stderr, ci_lower, ci_upper))\n", - "\n", - "# Print the table\n", - "print(table)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "smPkxqCpgMR8" - }, - "source": [ - "## Method 2: Lasso with Cross-Validation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MH-eUye8liRq" - }, - "source": [ - "This section is for illustration purposes only. Given that we are using the full sample, cross validation *should not* be used for choosing tuning parameters here. Cross validation tends to (mildly) overfit, and this overfitting can lead to substantial problems when inference about parameters is the goal." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "YhpTUkE_wQz9", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Choose penalty based on KFold cross validation\n", - "set.seed(123)\n", - "# Given small sample size, we use an aggressive number of 20 folds\n", - "n_folds <- 20\n", - "\n", - "\n", - "# Define LassoCV models for y and D\n", - "model_y <- cv.glmnet(\n", - " x = as.matrix(W),\n", - " y = y,\n", - " alpha = 1, # Lasso penalty\n", - " nfolds = n_folds,\n", - " family = \"gaussian\"\n", - ")\n", - "\n", - "model_d <- cv.glmnet(\n", - " x = as.matrix(W),\n", - " y = D,\n", - " alpha = 1, # Lasso penalty\n", - " nfolds = n_folds,\n", - " family = \"gaussian\"\n", - ")\n", - "\n", - "# Get the best lambda values for y and D\n", - "best_lambda_y <- model_y$lambda.min\n", - "best_lambda_d <- model_d$lambda.min\n", - "\n", - "# Fit Lasso models with the best lambda values\n", - "lasso_model_y <- glmnet(as.matrix(W), y, alpha = 1, lambda = best_lambda_y)\n", - "lasso_model_d <- glmnet(as.matrix(W), D, alpha = 1, lambda = best_lambda_d)\n", - "\n", - "# Calculate the residuals\n", - "res_y <- y - predict(lasso_model_y, s = best_lambda_y, newx = as.matrix(W))\n", - "res_d <- D - predict(lasso_model_d, s = best_lambda_d, newx = as.matrix(W))" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "79U65py1grzb" + }, + "source": [ + "# Testing the Convergence Hypothesis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "cbVsr86tyqTY", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "tmp_df <- as.data.frame(cbind(res_y, res_d))\n", - "colnames(tmp_df) = c(\"res_y\", \"res_D\")" - ] + "id": "GK-MMvLseA2Q", + "outputId": "f429014a-9f26-4030-cdb8-6d925704172d", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "install.packages(\"hdm\")\n", + "install.packages(\"xtable\")\n", + "install.packages(\"lmtest\")\n", + "install.packages(\"sandwich\")\n", + 
"install.packages(\"glmnet\")\n", + "install.packages(\"ggplot2\")\n", + "library(hdm)\n", + "library(xtable)\n", + "library(lmtest)\n", + "library(sandwich)\n", + "library(glmnet) # For LassoCV\n", + "library(ggplot2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nlpSLLV6g1pc" + }, + "source": [ + "## Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xXkzGJWag02O" + }, + "source": [ + "We provide an additional empirical example of partialling-out with Lasso to estimate the regression coefficient $\\beta_1$ in the high-dimensional linear regression model:\n", + " $$\n", + " Y = \\beta_1 D + \\beta_2'W + \\epsilon.\n", + " $$\n", + " \n", + "Specifically, we are interested in how the rates at which economies of different countries grow ($Y$) are related to the initial wealth levels in each country ($D$) controlling for country's institutional, educational, and other similar characteristics ($W$).\n", + " \n", + "The relationship is captured by $\\beta_1$, the *speed of convergence/divergence*, which measures the speed at which poor countries catch up $(\\beta_1< 0)$ or fall behind $(\\beta_1> 0)$ rich countries, after controlling for $W$. Our inference question here is: do poor countries grow faster than rich countries, controlling for educational and other characteristics? In other words, is the speed of convergence negative: $ \\beta_1 <0?$ This is the Convergence Hypothesis predicted by the Solow Growth Model. This is a structural economic model. Under some strong assumptions, that we won't state here, the predictive exercise we are doing here can be given causal interpretation.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a5Ul2ppLfUBQ" + }, + "source": [ + "## Data Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9GgPNICafYuK" + }, + "source": [ + "We consider the data set GrowthData which is included in the package *hdm*. First, let us load the data set to get familiar with the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_B9DWuS6fcVW", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "getdata <- function(...) {\n", + " e <- new.env()\n", + " name <- data(..., envir = e)[1]\n", + " e[[name]]\n", + "}\n", + "\n", + "# now load your data calling getdata()\n", + "growth <- getdata(GrowthData)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "smYhqwpbffVh" + }, + "source": [ + "The sample contains $90$ countries and $63$ controls." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "D7SzuZ2P0P0X", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "fit_cv <- lm(res_y ~ res_d, data = tmp_df)\n", - "est_cv <- summary(fit_cv)$coef[\"res_D\", 1]\n", - "\n", - "hcv_cv_coefs <- vcovHC(fit_cv, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", - "se_cv <- sqrt(diag(hcv_cv_coefs))[2] # Estimated std errors\n", - "\n", - "# Calculate the 95% confidence interval for 'gdpsh465'\n", - "lower_ci_cv <- est_cv - 1.96 * se_cv\n", - "upper_ci_cv <- est_cv + 1.96 * se_cv" - ] + "id": "1dsF7_R4j-Qv", + "outputId": "c77d3a1a-35e5-482f-d414-75304fc218c3", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "growth" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-AMcbsgefhTg" + }, + "source": [ + "Thus $p \\approx 60$, $n=90$ and $p/n$ is not small. We expect the least squares method to provide a poor estimate of $\\beta_1$. We expect the method based on partialling-out with Lasso to provide a high quality estimate of $\\beta_1$.\n", + "To check this hypothesis, we analyze the relation between the output variable $Y$ and the other country's characteristics by running a linear regression in the first step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DncWsRS9mgAp", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "## Create the outcome variable y and covariates x\n", + "y <- growth$Outcome\n", + "X <- growth[-which(colnames(growth) %in% c(\"intercept\"))]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Ctl5T5vUygRk", - "outputId": "1fc3990f-10c2-4e94-b1e9-a13b7a08cbab", - "vscode": { - "languageId": "r" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Method Estimate Std. 
Error lower bound CI\n", - "1 OLS -0.00937798878257774 0.0324211014253974 -0.0729233475763566\n", - "2 Double Lasso -0.0446926781072429 0.0178230525741694 -0.0796258611526148\n", - "3 Double Lasso CV -0.00210480949226998 0.00822866735729585 -0.0182329975125698\n", - " upper bound CI\n", - "1 0.0541673700112012\n", - "2 -0.00975949506187093\n", - "3 0.0140233785280299\n" - ] - } - ], - "source": [ - "# Add LassoCV results to the table\n", - "table <- rbind(table, c(\"Double Lasso CV\", est_cv, se_cv, lower_ci_cv, upper_ci_cv))\n", - "\n", - "# Print the table\n", - "print(table)" - ] + "id": "vPO08MjomqfZ", + "outputId": "afa89548-e9ef-4060-d5db-eb8e632e8e95", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "fit <- lm(Outcome ~ ., data = X)\n", + "est <- summary(fit)$coef[\"gdpsh465\", 1]\n", + "\n", + "hcv_coefs <- vcovHC(fit, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", + "se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors\n", + "\n", + "# print unconditional effect of gdpsh465 and the corresponding standard error\n", + "cat(\"The estimated coefficient on gdpsh465 is\", est,\n", + " \" and the corresponding robust standard error is\", se)\n", + "\n", + "# Calculate the 95% confidence interval for 'gdpsh465'\n", + "lower_ci <- est - 1.96 * se\n", + "upper_ci <- est + 1.96 * se\n", + "\n", + "cat(\"95% Confidence Interval: [\", lower_ci, \",\", upper_ci, \"]\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D7nJZzhGfjQT" + }, + "source": [ + "## Summarize OLS results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "markdown", - "metadata": { - "id": "0LzDsUi8gmQM" - }, - "source": [ - "We find that the outcome model chooses too small of a penalty based on cross-validation, leading to overfitting of the outcome and tiny outcome residuals. This leads to artificially small standard errors and a zero treatment effect. Theoretically driven penalty should be preferred for such small sample sizes." - ] + "id": "EwGVcIVAfRe5", + "outputId": "87f41279-8907-415b-f8eb-589f736089b2", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Create an empty data frame with column names\n", + "table <- data.frame(\n", + " Method = character(0),\n", + " Estimate = character(0),\n", + " `Std. Error` = numeric(0),\n", + " `Lower Bound CI` = numeric(0),\n", + " `Upper Bound CI` = numeric(0)\n", + ")\n", + "\n", + "# Add OLS results to the table\n", + "table <- rbind(table, c(\"OLS\", est, se, lower_ci, upper_ci))\n", + "\n", + "# Rename the columns to match the Python table\n", + "colnames(table) <- c(\"Method\", \"Estimate\", \"Std. Error\", \"lower bound CI\", \"upper bound CI\")\n", + "\n", + "# Print the table\n", + "print(table)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KfrhJqKhfwKB" + }, + "source": [ + "Least squares provides a rather noisy estimate (high standard error) of the\n", + "speed of convergence, and does not allow us to answer the question\n", + "about the convergence hypothesis since the confidence interval includes zero.\n", + "\n", + "In contrast, we can use the partialling-out approach based on lasso regression (\"Double Lasso\")." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "D9Y2U1Ldf1eB", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "y <- growth$Outcome\n", + "W <- growth[-which(colnames(growth) %in% c(\"Outcome\", \"intercept\", \"gdpsh465\"))]\n", + "D <- growth$gdpsh465" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8yNU2UgefzCZ" + }, + "source": [ + "## Method 1: Lasso with Theoretical Penalty using HDM" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tQPxdzQ2f84M" + }, + "source": [ + "While cross validation is commonly employed for choosing penalty parameters in Lasso, it can be very noisy and tends to choose relatively small penalty leading to some overfitting. For this reason, one should not use cross validation to choose tuning parameters unless sample splitting is employed. We illustrate the use of sample combined with cross validation in later chapters in the book. Since we are using the full sample here, it is much better (and theoretically valid) to use penalties that provably control overfitting, which is what we do here.\n", + "\n", + "We report the results using cross validation at the end of this notebook for comparison. There, we observe overfitting for the prediction of the outcome." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DIzy51tZsoWp", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "double_lasso <- function(y, D, W) {\n", + " require(hdm)\n", + "\n", + " # residualize outcome with Lasso\n", + " yfit_rlasso <- rlasso(W, y, post = FALSE)\n", + " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))\n", + " yres <- y - as.numeric(yhat_rlasso)\n", + "\n", + "\n", + " # residualize treatment with Lasso\n", + " dfit_rlasso <- rlasso(W, D, post = FALSE)\n", + " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W))\n", + " dres <- D - as.numeric(dhat_rlasso)\n", + "\n", + " # rest is the same as in the OLS case\n", + " hat <- mean(yres * dres) / mean(dres^2)\n", + " epsilon <- yres - hat * dres\n", + " V <- mean(epsilon^2 * dres^2) / mean(dres^2)^2\n", + " stderr <- sqrt(V / length(y))\n", + "\n", + " return(list(hat = hat, stderr = stderr))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ncz7Uqn5sqqU", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "results <- double_lasso(y, D, W)\n", + "hat <- results$hat\n", + "stderr <- results$stderr\n", + "# Calculate the 95% confidence interval\n", + "ci_lower <- hat - 1.96 * stderr\n", + "ci_upper <- hat + 1.96 * stderr" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "P5PEjKw9gLvC" + }, + "source": [ + "The least square method provides a rather noisy estimate of the speed of convergence. We can not answer the question if poor countries grow faster than rich countries. The least square method does not work when the ratio $p/n$ is large.\n", + "\n", + "In sharp contrast, partialling-out via Lasso provides a more precise estimate. The Lasso based point estimate is $-5\\%$ and the $95\\%$ confidence interval for the (annual) rate of convergence $[-7.8\\%,-2.2\\%]$ only includes negative numbers. This empirical evidence does support the convergence hypothesis." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 857 - }, - "id": "7uzcIGhVgmei", - "outputId": "ecff8a3f-60da-4b92-c6f7-cb40a116ec82", - "vscode": { - "languageId": "r" - } - }, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAA0gAAANICAMAAADKOT/pAAADAFBMVEUAAAABAQECAgIDAwMEBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUWFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJycoKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6Ojo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tMTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1eXl5fX19gYGBhYWFiYmJjY2NkZGRlZWVmZmZnZ2doaGhpaWlqampra2tsbGxtbW1ubm5vb29wcHBxcXFycnJzc3N0dHR1dXV2dnZ3d3d4eHh5eXl6enp7e3t8fHx9fX1+fn5/f3+AgICBgYGCgoKDg4OEhISFhYWGhoaHh4eIiIiJiYmKioqLi4uMjIyNjY2Ojo6Pj4+QkJCRkZGSkpKTk5OUlJSVlZWWlpaXl5eYmJiZmZmampqbm5ucnJydnZ2enp6fn5+goKChoaGioqKjo6OkpKSlpaWmpqanp6eoqKipqamqqqqrq6usrKytra2urq6vr6+wsLCxsbGysrKzs7O0tLS1tbW2tra3t7e4uLi5ubm6urq7u7u8vLy9vb2+vr6/v7/AwMDBwcHCwsLDw8PExMTFxcXGxsbHx8fIyMjJycnKysrLy8vMzMzNzc3Ozs7Pz8/Q0NDR0dHS0tLT09PU1NTV1dXW1tbX19fY2NjZ2dna2trb29vc3Nzd3d3e3t7f39/g4ODh4eHi4uLj4+Pk5OTl5eXm5ubn5+fo6Ojp6enq6urr6+vs7Ozt7e3u7u7v7+/w8PDx8fHy8vLz8/P09PT19fX29vb39/f4+Pj5+fn6+vr7+/v8/Pz9/f3+/v7////isF19AAAACXBIWXMAABJ0AAASdAHeZh94AAAgAElEQVR4nO3deaBM9f/H8fddrddyUdakFLIVkaVIi+qXaEUpUimlpEUqRAqlpH0laVPas5WkolIpVL7JEsp6nTZEcnF+Z5mZ+5l7z3vOLO85M4fX8497586d+7nvc8zDnTt35gzpCKGEo1QPgNCBECAhJBAgISQQICEkECAhJBAgISQQICEkECBF0RLqGH7GUHpMZuFWOeXWJbKA1CAxVmKHsBeMaQOH0FOxjRH/7ou046LdqWGXSxDSlpHtquXkHz9kbclPvfheYksvoSMTW8CoYGSbqrm12j6w1Th9No0Pnd+GpoS+C3ULnjudaCgzSsfwM5x2dTy7ogl1umkrP757zL+5sVVWZRpctyqR5bmihhT1Blq76L66L8c0RgK7z95xzv8wKYD0TBnKbdn5hNKUPabE56pfmdDSEpCmlKWc408/Jpcqf6jrM6lh8PxllP9v6LtkZBcETl+YkQCkeHbFbqpQ6LoREWMhle9h1L1jZSrzUWLfwbFoIUW/gfFcWxLZffaOc/6u3kN6lrLv3W683zWhNI0q9rlfKOWQplLm0G3G+4KBlLVY31ePPgt84ka6pei7HBv8SfVnqSbxQ4prV/xFdd23ImIspMC+23ktHbYvwe/huH7HqC4X9QbGdW1JZPdZO475rp5D2lKG3gic/DAz82ddH0CTzQ8W0tn6BeYti/a6vu+J48uVP8W6Bu95+PjypY4csNE4OYze+7Jj+ap9tu+f0KBMozH7jbP2P9emfOmGw/4JrR4GadvtDUvn1r/VZKG/0alyTo0zZxU7qaweaHul0E3uIXS1ccOBLrU/2p2fEbrBs4SurNbEPvkk3WRBUlda16NKmeaT7OuNMqG9C7NofSy7IlDR8t2sm1+hUZSNUTa35K66id6a16FC+ZPmhgaJsO/2VaClRbtE+RZru+cbW7aFWoTNGvat76T3nqhZMXx5dYdE/KcxC2ygur6xOcvOrVaq+avWcMErR2AX2b8jhV9Pii4dcfepO0UdOGwFZVRzx9nftT3NsFb4JPRvVHyndqT3rfOn0ykO14JAiUAaZe15u0toQNi/yIzLqc2EacbNJTrmul7l6UVjr51FDW8cfhbVWGd+5R2VegyoTZcNqdO/d675Wf1SqnHLnW3o2O3BFVVIe06ilrfe2IBa7TX/7692zfAr8jNeDDuprh7oSTo+ePLfX403WqnSf1ofvUKd9aLvckV/WmSdbH3oNBOSutKfdajDsGuq97WuN8qEJSBFsyvslOXfH0OVJ0z4O/AJZWPUzS25q4bQ9WW63tItI+uT4CCR9l19+jo0mfIt/qhNne7pX/1a89qjXtHVbz2Sbi57cb+w5cN2SKR/GqvABqrrj6KRFTvfdDbRx7py5QjsIgtS+PVEuXTE3afuFHVgdQV1VHPH2d91Il1grXAtPR34HsV36jPUxzr/MnM7SlwLAiUCqSO9FDo9k44O/xd5w/qxOZXOMqb+uWy5HcZebrtbN/+L6K7rY6mUscG/ZuU0/EPXJ1IXXX+dWhqj7b+ebg+uqEJ6i9oYy/zX0PyvoSmtNs5Zn9cm7KS6eqALlHsXrHrRw9b7k+md0HlLqM83dJ15ajnd/IYJSV3pLuphnNxc3bzeqBPau/Djj3bHsCsCqcuH3TZRNkbd3JK7aihlTjcu+QC1CQwSad+tyMz52+lb3EUXmlt2qLll6qzqtx5DFY3fLcOWV3dIxH8aO3sD1fXHUq65p241r5vKlcPeRRak8OtJ0aUj7z51p6gDqyuoo1o7zvqu28vm/m5cYu8hpf4KrRW+U//MrbzHOHt3xTLbHa4FgRKBVIeWhE5vpOz9DteezrTAPGfCLb/o7cm+1ZKbu8vYvDPM08fSE8bbLdRI10+jOdaez6kRXFGFtPZt64fGELrX+K4Zm83T/+lhJ9XVAx1nL1nUF9TYfLcyo/be0HkGJL1pZfPf5nZaZkFSV2pOC83Td5vXG3XCEreio9gVgdTlwyApG6Nub
sldNdS+DbK7bMYf9iD8vtvydn3zx6PDt2hOn5snRxSHFP6tLRHq8uoOsWP+aeycIFnDf01tw64cCqTw60nRpSPvPnWnqAOrK6ijFkEyftA8arz9yPqfJbBWsZ16Dn1gnHyXeka4FiQCqVLR7Xt9B9FOh2tPOQper/eXJvu/xqbGLamxNMQ82cHak7vocF0vT/Yt/OPo18AXFL+zYfvmzXebV/TrqOGkzfZZRSfDVg9U3/5HV2pOXxhvb1PvDTAhPUxTjdsMtVvrJiR1pX259vgfmNcbdcISkKLYFQ67IQySsl3q5pbcVUPpNusSzeh7exCnfRfqyv+cvoWxZTvN97OLQwr/1jeZHyrLh+2QkrMW2wRnSNb9PCvpWPXKoUAqdj0punTk3afuFHV/FF8hOKoCaZ75e6Lej94rWqvYTp1KV+nmTfaZEa4FiUCqW3Sl1ddTrsPtmX+odPAC280LmJ1Cs43Ne8A82ZGWG2//NfbHrqJ/+S8DXxAG6Z32pa1PGrtgzzU5RMfctkZXT6qr7+1o9rHegmbp4T1t/oDfc2j2pqKzTEi/lzJ+Z/rQ+Ec0IakrbQuMv8i43oRNWAKS+64ITBW2G6xrQnBcZbvUzS2xq4xvbt9k7Wj852gO4rjvrLu/T6SGP5kflvwW26hUaMvCIYV9a/OefHV5dYdE/qexc4Jkbc4qaq5eORRITtcT89J2JXdfIGWnhO2PsBWUURVI++vRD3phlap7itYqtlN3lq+6V/8375DCCNeCRCCdQc+ETk+nZg7Xnl2UtT9wgR2UY5842fg5WeLa8S9ljAgU/GdQIT1DeYNenTn7Gvvu6U1Pd8uj3NfVk+rqhdZ2TtUvLnE/9D8Vyvytv0kXKWeZkPTumb/plxifMiGpK/0duLp9ZVxvwiYsAcl9VwSmCtsN1jUhOK6yXermOkGaYK3QwfgF2hyE33e7jjCubrrTtwhu2cLikEp+a3V5dYe4/dOYRYSkXDkUSE7XkyJIJXdfIGWnhO0PdQV1VAWScfv2ZuNH8w3KWsV3ai9jV79FA/UI14JEII0vuulq/NwzfrReT8+bp98uuj2TR1rwEmXJ/mWuMX3ncO2oSMX/RK1CqmX/Ceiu0N95/n0qu9Ju9aS6eqBJdGTw73X7Ry233g8wftM4k+aFfZc+5k+j0dvL9NItSOpKe7PsGx/vmtcbdcISkKLYFQ67oeQfQuyNUTfXCdIw67LNaJk9CL/vZtNh24t9yv4We7PtLXvb3DJ1VodvrSwftkPsIv3TBDZQXV+9YitXDuV3JIfrSREkdvepO0XdH+oK6qgqpLUZtfZfptykKLlTZ9J1ek/rEuy1IBFIf1UI3WM4LzP3N/O+Eevn6x1F155O1v+D+phTvzD+r7Dur/8ju8xuh2tHZ3rTWuiP0OoKpN1U3ny3v7W5C9bZN8w60E/qSXX1QLsOCdxu1vV7qZ31/n90UkFW6AEO9nfpY/zGcFirF837Ry1I6koN6Svz9K3m9UadsASkKHZFIHX5MEhFGxO2uU6QrHvvt+dmbbMHibDvLqJrlSmV/dXIvhl4k7llyqxO31pdXt0hVtw/TWC3WBuo7gv1iq1cORRIDtcTBRK3+9Sdog6srBA2qgrJGGRW+Ua6slbxnVpYtc6ucg2L7wy5RzaYDx0w/8fb/WRZ606lJ6mj8cP6p6rmHptO5xrnvEAtjV/O1lYu+6c+hdqZv/feYv7iVvLa8To1ManPzw7dd6L+RMon47q5f2R1GqAvpVPMZbbXzvpdORm2erDpGdTX/N1383VUIfBP2zHjDnpE3QQLkj4i89R6+wOQ1JVutu5NX1PZvvu7aEJ7F376SQhtFLsikLq8ek1QN0bZXEdIWea9KI9Sp9A9tey+21gh4zPHb3ErnW/Mt7qSuWXqrA7fWl1e3SGR/mkCn7Q3UF1fpaFcOexdZEFyuJ4okLjdp+4UdWB1BXXbrB0X/IeZQvVIeWCXw069lkab9/U5XQsCJfZYu5fLU+4J/9e+HJW2HkNQUIHa3tSz/AN0lvFvlJFzRX99Xxeq2/+yPHrO2IJu1GTwnafS0b87XTuM32hq3Tzigpy80J8Pl1DZs+1eMf79jrr33jYNPqQq962/hI4YcNd1delG80ZU6KS6eqg3K1Bm89Ob5FKdHwLnvEbZZf9SLhGAtC6T7tYDkNSVNlal42/oWdH+D1iZsMQfZKPYFYHU5cN+Iikbo26uE6Q+eX3u6Z2VszA4iMO+C/4n9AjV3+X0LTZVoxNu7VVxgLll6qwO31pdPmyHWDH/NHb2Bqrrq1ds5cph7yILksP1RIHE7T51p6gDqyuoo1o7LvgPszOPjN+SlbVK7NQFlJextvjOkHz099a721bNyW89LHA/2I+nlC1/wrsanWycvq9qqRbGD8XxzcqU62D9VlL4cIuypRreYV6PHSDte65tXnbt3stDSxfdhTtC/3fokaXqXPe7fnm56j/se6Jd1ayKJz2/33yISeikunpRv49qUzW7coeJwYeo6nuqF3t4lQ1JPz3TvOfYghS20vJulUo3fe4POiF8QgdI7rsikLJ8GCRlY9TNdYL05NyO5ct3nB8axGHfhR4i1LLocYVh++tnc8smLrVEKLM6fOuw5dUdYsX809gFNlBZP4yGcuWwdpH9EKGS1xMFErf71J2iDqyuoI5q77jgP8yV5oN/lLVK7NT9h9NJevGdIQkJeZ/kc5Cifj5EupfYThmjPDAlzgDJdwGSQwntlD11qu52v1TkAMl3AZJDCe2UQSX+4hh7gOS7AMmh+HfK8iEnUvNd7pdzCZB8FyA5FP9O+Tiz/MUJPd3fDpAQEgiQEBIIkBASCJAQEgiQEBIIkBASCJAQEgiQEBIIkBASKAFI//zt1r+F210vE3//7Eri4n8X/pfM1XcndccU7kzi6tuSu2MKtyVx9Z3SO6boifwJQPpbc2u3/qfrZeJvx84kLq7pe5K5+p5k7pjt+o4krv5HYRIX1/bovydx9X+kd8yfgOQWIDEBkhIguQZITICkBEiuARITICkBkmuAxARISoDkGiAxAZISILkGSEyApARIrgESEyApAZJrgMQESEqA5BogMQGSEiC5BkhMgKQESK4BEhMgKQGSa4DEBEhKgOQaIDEBkhIguQZITICkBEiuARITICkBkmuAxARISoDkGiAxAZISILkGSEyApARIrgESEyApAZJrgMQESEqA5BogMQGSEiC5BkhMgKQESK4BEhMgKQGSa4DEBEhKgOQaIDEBkhIguQZITICkBEiuARITICkBkmuAxARISoDkGiAxAZISILkGSEyApARIrgESEyApeQbp3YnCkysBEhMgcfkXUrOsn4VHLwqQmACJy7+QjqYvhEcvCpCYAInLv5COpE+FRy8KkJgAicu/kOrSHOHRiwIkJkDi8i+k2jRTePSiAIkJkLj8C6k6vSM8elGAxARIXP6FVI2mCY9eFCAxARKXfyHl0wPCoxcFSEyAxOVfSBWor/DoRQESEyBx+RdSWeomPHpRgMQESFz+hVSKThIevShAYgIkLv9CyqLjhEcv
CpCYAInLt5AKiZoIj14UIDEBEpdvIe0kaig8elGAxARIXL6F9BfREcKjFwVITIDE5VtIBUR1hEcvCpCYAInLt5DWE1UXHr0oQGICJC7fQlpDlC88elGAxARIXL6FtIIoT3j0ogCJCZC4fAtpGVFp4dGLAiQmQOLyLaQlRFnCoxcFSEyAxOVbSIuIqEB49lCAxARIXL6FtNCAtEF49lCAxARIXL6FNN+AtFZ49lCAxARIXL6F9LEBaaXw7KEAiQmQuHwL6QMD0nLh2UMBEhMgcfkW0vsGpB+EZw8FSEyAxOVbSG8ZkBYLzx4KkJgAicu3kF4zIH0tPHsoQGICJC7fQnrRgJS0g38DEhMgcfkW0iTKTN7BvwGJCZC40hPSjr/d2vM0laVPXS8WZ7t2J2tlM31vMlcv3J7ExXfpu5K4+vbk7hh9WxJX/1d6x2yXgLTbtb2PURWa7365+Crcm6yVzfR9yVx9339JXLxQL0zi6v8ld8dEcbWKv0LxHSMBKYqbdg9RLZou/NM0FG7aMeGmHVd63rSLAtL9VD95R9EHJCZA4vItpNHUNHlH0QckJkDi8i2kkdSaXhGePRQgMQESl28hDacONEV49lCAxARIXL6FdAedTs8Lzx4KkJgAicu3kG6js+k54dlDARITIHH5FtLNdB49LTx7KEBiAiQu30K6kXrQE8KzhwIkJkDi8i2kAXQZPSY8eyhAYgIkLt9C6k9X0sPCs4cCJCZA4vItpH7Un8YLzx4KkJgAicu3kPrSjTROePZQgMQESFy+hXQZDaaxwrOHAiQmQOLyLaRLaCiNFp49FCAxARKXbyF1p5E0Snj2UIDEBEhcvoV0AY2lEcKzhwIkJkDi8i2kc+lBGiY8eyhAYgIkLt9COocepTuFZw8FSEyAxOVbSGfTkzREePZQgMQESFy+hXQWPUe3Cs8eCpCYAInLt5A602S6SXj2UIDEBEhcvoV0Gr1CNwrPHgqQmACJy7eQTqFpdL3w7KEAiQmQuHwL6WR6m64Vnj0UIDEBEpdvIXWg6XS18OyhAIkJkLh8C6k9zaarhGcPBUhMgMTlW0jtaC71FZ49FCAxARKXbyGdQJ9Rb+HZQwESEyBx+RZS68wF1Et49lCAxARIXL6FdHz2l9RTePZQgMQESFy+hdQi52u6SHj2UIDEBEhcvoV0bKlv6QLh2UMBEhMgcfkWUvPSS6ib8OyhAIkJkLh8C6lp2R+oi/DsoQCJCZC4fAupcbmf6Czh2UMBEhMgcfkWUqO8FdRZePZQgMQESFy+hdSw4mo6RXj2UIDEBEhcvoV0dOW1dLLw7KEAiQmQuHwL6aj89XSi8OyhAIkJkLh8C+mIqhuprfDsoQCJCZC4fAvp8GoF1Ep49lCAxARIXL6FdNihWkZL4dlDARITIHH5FlLtGlp2c+HZQwESEyBx+RZSzVpaqSbCs4cCJCZA4vItpOp1tLKNhGcPBUhMgMTlW0iH1tUqHC08eyhAYgIkLt9CqlZPq3yE8OyhAIkJkLh8C6nKkVrVw4RnDwVITIDE5VtIletrh9YWnj0UIDEBEpdvIVU8WqtVQ3j2UIDEBEhcvoWU11CrU0149lCAxARIXL6FVO4YrV6+8OyhAIkJkLh8C6lME61+ReHZQwESEyBx+RZSqWZaw/LCs4cCJCZA4vItpJxjtWNKC88eCpCYAInLt5CyjtOa5QrPHgqQmACJy7eQMlpqx2YJzx4KkJgAicu3kOh4rWWG8OyhAIkJkLj8CmkXtdZa0Vbh4YMBEhMgcfkV0k46QTuBNgsPHwyQmACJy6+QdlBbrR1tFB4+tDogOQdIXH6FtJ3aaSfSeuHhgwESEyBx+RXS33Si1pHWCQ8fDJCYAInLr5D+opO0TvSL8PDBAIkJkLj8CukP6qidSquEhw8GSEyAxOVXSL8bkDrTz8LDBwMkJkDi8iskjU7WzqTlwsMHAyQmQOLyK6QC6qSdTcuEhw8GSEyAxOVXSFvoVO0c+l54+GCAxARIXH6FtJlO07rREuHhgwESEyBx+RXSJjpdO5++Ex4+GCAxARKXfyF11i6ib4SHDwZITIDE5VdIG+hMrQd9JTx8MEBiAiQuv0JaT2dpF9MXwsMHAyQmQOLyK6Tf6P+0XjRfePhggMQESFx+hfQrna31ps+Ehw8GSEyAxOVXSOuoi9aX5gkPHwyQmACJy6+Q1tA52pU0V3j4YIDEBEhcfoX0C3XV+tGHwsMHAyQmQOLyK6TV1E3rT7OEhw8GSEyAxOVXSCvpPO06mik8fDBAYgIkLr9CWkHnazfQ+8LDBwMkJkDi8iukn+kCbRC9Izx8MEBiAiQuv0L6iS7SbqK3hIcPBkhMgMTlV0j/o+7arTRNePhggMQESFx+hbSMemi30WvCwwcDJCZA4vIrpB+pp3YHvSI8fDBAYgIkLr9C+p4u0YbSS8LDBwMkJkDi8iukpdRLG05ThIcPBkhMgMTlV0hLDEgjabLw8MEAiQmQuPwKaTFdqo2iicLDBwMkJkDi8iuk7+gybQw9Izx8MEBiAiQuv0L6lnprY+lJ4eGDARITIHH5FdIi6qM9QI8JDx8MkJgAicuvkL6hy7WH6BHh4YMBEhMgcfkV0tfUV3uEHhIePhggMQESl18hfUVXaI/ROOHhgwESEyBx+RXSQrpSe4rGCg8fDJCYAInLr5C+pKu0Z2m08PDBAIkJkLj8CukL6qdNoruFhw8GSEyAxOVXSJ/T1doLdJfw8MEAiQmQuPwM6SUaKjx8MEBiAiQuv0JaQNdor9IdwsMHAyQmQOLyL6T+2us0WHj4YIDEBEhcfoU0n67V3qSbhYcPBkhMgMTlZ0jv0CDh4YMBEhMgcfkV0mcGpPfpBuHhgwESEyBx+RXSp3SdNsvAlJwAiQmQuPwK6RMD0gfUT3j4YIDEBEhcfoU0j67XPqKrhIcPBkhMgMTlZ0jzqK/w8MEAiQmQuPwK6WO6QZtPlwkPHwyQmACJy6+Q5tJA7XO6RHj4YIDEBEhcfoa0kHoKDx8MkJgAicuvkD6iG7VFdKHw8MEAiQmQuFIIacf4PpfcXWCf/uOBS7vfviIGSHNokLaYzhMePhggMQESVwoh3TNkzcYHBuyzTt805JdND/b6NzZIS6mr8PDBAIkJkLhSB0nr+ovxU+ncpebp7WN+0/Wt56yMHtKHdJP2I3URHj4YIDEBElfqIH15wX7j7fWvh85Y3s382l0bjLQ/3fqQbvlzBZ3lern42rkrSQtb6YXJXL3w7yQu/o++M4mr/7U3iYv/uUf/K4mr75TeMX9HC+mDy823Q58Nfrz9usnmu3ktjb6O/KVGc2mo/jud7Xo5hPzZvtApN0h9zbchSOuvftL8CaUvG2K0fLdbc+j23Vuos+vl4qtwb5IWttL3JXP1ff8lcfFCvTCJq/+3P4mL796nJ3P1QvEdEy2kr+ybdm/YHy29ZLryOfffkWbRrdo66ih8uzQYfkdiwu9IXKn7HemPrqt0fVu3ZdYH/7v4W/Vz7pBm0mBtPZ0oPHwwQGICJK4U3v09dtCaDSNv3q/PeV//r99
U82tjuPvbhLSR2ggPHwyQmACJK4WQdk7o3WuMcfFxw/Sl51jNiB7SDLpN20qthIcPBkhMgMTl14cImZC07ObCwwcDJCZA4vIrpOk0RNNKHyM8fDBAYgIkLr9Cet+EVOFo4eGDARITIHH5GlL+4cLDBwMkJkDi8iuk90xIh9YWHj4YIDEBEpd/Id2uaXUOER4+GCAxARKXXyG9ax5Av16+8PDBAIkJkLj8CukdE1KDPOHhgwESEyBx+RfSnZrWuJTw8MEAiQmQuPwK6W0T0nFZwsMHAyQmQOLyL6ShmtaKCoSnDwRITIDE5VdIb5mQ2tMG4ekDARITIHH5FdKbNEzTTqa1wtMHAiQmQOLyL6ThmnYarRSePhAgMQESl18hvWFC+j/6n/D0gQCJCZC4/AvpLk3rSkuFpw8ESEyAxOVXSNNMSBfQIuHpAwESEyBx+RfSCE3rSV8KTx8IkJgAicuvkF43IV1G84WnDwRITIDE5WtIV9Jc4ekDARITIHH5FdJrdLemXUOzhacPBEhMgMTlV0hTTUgDaIbw9IEAiQmQuPwLaZSm3UjvCk8fCJCYAInLr5BeNSHdQm8ITx8IkJgAicu/kO7RtNvoNeHpAwESEyBx+RXSKyakofSy8PSBAIkJkLh8DWkETRaePhAgMQESl18hvUyjNW0UTRSePhAgMQESl18hvWRCGktPC08fCJCYAInL15DG0WPC0wcCJCZA4vIrpBdpjKY9RA8LTx8IkJgAicuvkKaYkB6jB4WnDwRITIDE5V9IYzXtSfNNMgIkJkDi8iukF+g+TXuW7hWePhAgMQESl18hTTYhTaaRwtMHAiQmQOLyL6T7zXschgpPHwiQmACJy9eQXjVf2yUZARITIHH5F9I488ANtwpPHwiQmACJy6+QnqcHzAOADxKePhAgMQESl18hTTL/hPQ+3SA8fSBAYgIkLr9CmkjjNW0W9ReePhAgMQESl18hPWdCmkNXCU8fCJCYAInLv5Ae0rR51Fd4+kCAxARIXH6F9CxN0LT5dKnw9IEAiQmQuPwK6Rnzgd9fUk/h6QMBEhMgcfkV0tMmpG/oIuHpAwESEyBx+RfSI5q2mM4Vnj4QIDEBEpdfIT1Fj2raD9RFePpAgMQESFx+hfSk+Szzn+gs4ekDARITIHH5GtJKOk14+kCAxARIXH6F9IQJ6RfqJDx9IEBiAiQuv0J6nB7XtF+pg/D0gQCJCZC4/ArpMRPSRmonPH0gQGICJC7/QnpC07ZQa+HpAwESEyBx+RXSo/Sk8TajpfD0gQCJCZC4/ArpEXrKeJt9rPD0gQCJCZC4/ArpYeuw36WaCE8fCJCYAInLr5AmWJDKNRKePhAgMQESl38hPWO8rXiU8PSBAIkJkLj8CukhetZ4m19PePpAgMQESFx+hTTegnRIHeHpAwESEyBx+RXSg9aL9dWoJTx9IEBiAiQuv0J6wIJU51Dh6QMBEhMgcfkX0iTj7eFVhKcPBEhMgMTlV0jj6Hnjbf1KwtMHAiQmQOLyK6T7abLxtkF54ekDARITIHH5FdJ9FqRjSgtPHwiQmACJy9+QmuUITx8IkJgAicuvkMbSFONti0zh6QMBEhMgcfkV0hgLUisqEB7fDpCYAInL35Da0kbh8e0AiQmQuPwKaTS9aLw9kX4THt8OkJgAicuvkO61IJ1Ma4XHtwMkJkDi8jekU2ml8Ph2gMQESFx+hXQPvWy8PYN+Fh7fDpCYAInLr5BGWZDOpmXC49sBEhMgcfkX0ivG2660VHh8O0BiAiQuv0K6m1413p5Hi4XHtwMkJkDi8jek7vS18Ph2gMQESFx+hSNkk4kAACAASURBVDSCphpve9KXwuPbARITIHH5G9KlNF94fDtAYgIkLv9Ces14ezl9Ijy+HSAxARKXJ5DKKeXKQLrLgnQVfSQ8vh0gMQESlyeQehg1yGl7wbnHZrS8XgrS68bbq+kD4fHtAIkJkLi8umn3RpNN5rufG74vA2m4Bek6mik8vh0gMQESl1eQmkyz3z/VXBLSQHpXeHw7QGICJC6vIOXODfxkKiUDaRhNM97eRG8Jj28HSEyAxOUVpJq9rHf7e9SQhDTYeisfIDEBEpdXkEZQ04H33DOgEd0uBelN4+3t1uMb5AMkJkDi8grS/vtrkFHV4XslIQ21npUkHyAxARKXd3+Q3f/r11/9si8aRtFAGmr9djTCOiiXfIDEBEhc3kH695u3Nb1QFtK99Jzw+HaAxARIXJ5BejCPaKF+5+VRUXKHdAe9o5lHt3tKeHw7QGICJC6vID1LXZ82IE3JHicD6XYL0jh6THh8O0BiAiQuryA166//a0DS7zhaBtIQC9JD9LDw+HaAxARIXF5BKv2RDenDHBlIu3Xz+vIoPSg8vh0gMQESl1eQDpluQ5pWQRLSE3Sf8Ph2gMQESFxeQTqt4y4T0h9NOktCeppGC49vB0hMgMTlFaRPsurfSFf0qZDzuSSkiTRKeHw7QGICJC7P7v6ee5z5yIbWn0bjKGpIk+ku4fHtAIkJkLg8fKp5wZIlf+rRFS2kl2io8Ph2gMQESFxeQWo7M0pDMUGaSkOEx7cDJCZA4vIKUu3xyYA0jW4VHt8OkJgAicsrSO81emePPKS3aZDw+HaAxARIXF5BOqkp5dasayYJ6X26QXh8O0BiAiQuryC1P+XUQJKQZtG1wuPbARITIHF5fYDIHSslIX1I/YTHtwMkJkDi8hrS3HxJSB/TFcLj2wESEyBxeQZpRq+T2rdv3yavqiSkz6i38Ph2gMQESFxeQZpK2bWpZmnqFNXfk6KF9DldIjy+HSAxARKXV5Banrldz/qx8NGTt0tC+op6CI9vB0hMgMTlFaS8Gbqe9YOuDxogCelbukB4fDtAYgIkLs+e2Ddb1yvM1/UFNSUhLaFuwuPbARITIHF5Bem4C//TGw/V9ffKSUL6gboIj28HSEyAxOUVpJfoVH14Vr+7a7WThPQTnSk8vh0gMQESl2d3f08dq+88najOomggbf/Trf/0v423q6mz6yXj6Z9/k7JsIL0wmasXbkvi4v/oO5O4+t9J3TF79L+SuPou6R3zNwPJatVP0T1ydfcet/bphcZbjTq7XjKe9u5LyrKB9P3JXH1/YRIX36vvTeLqhcndMXoyV98rvWP+iwQp2qK9afcrdRD+gWqHm3ZMuGnH5dVNuyrB8iQhbaR2wuPbARITIHF5BambVesyTUT/jrSFWguPbwdITIDE5fGDVjd3mCEJSctoKTy+HSAxARKX14/+XtRSFFJOc+Hx7QCJCZC4vIa0uYwopNKNhce3AyQmQOLyGNL+0bVFIZVvKDy+HSAxARKXV5CaWzWpSreKQqpUX3h8O0BiAiQubyEdd8oj/5VQkwik/MOFx7cDJCZA4vL6d6ToihrSIXWEx7cDJCZA4vI5pJo1hce3AyQmQOLyClJ22XJKYpDqHCI8vh0gMQESl1eQrjsmu8355x6bcWzPHkZikA6vIjy+HSAxARKXV5DeaLrRfLe8wXQ3RDFBql9JeHw7QGICJC6vIDV+w37/VHNRSA
3KC49vB0hMgMTlFaTcj+z300qJQjqmjPD4doDEBEhcXkGqecl+893ec2qIQmqWKzy+HSAxARKXV5DuovrXjhgx4Bi6QxTScZnC49sBEhMgcXkFad+YGuZryFYbsVcUUivaKjy/FSAxARKXd3+Q3f/r11/9si8aRjFAakObhOe3AiQmQOLyDNLOTbq+a/KDv8hCak/rhee3AiQmQOLyCtLyQ8bqhccTVVwsCqkjrROe3wqQmACJyytI5zddrb9ET65ud6EopFNotfD8VoDEBEhcXkE65BVdP6+Jrr9SRxRSZ1ohPL8VIDEBEpdnf5Cdp++tfJuuz8kVhXQW/SQ8vxUgMQESl1eQ6kzU59A8XZ8k+wfZLvSD8PxWgMQESFxeQbqy+u11j9yrFzST/R3pXFosPL8VIDEBEpdXkDa1oaoLdb1Hxe9FIV1A3wrPbwVITIDE5d0fZLeZx89ftCUaR9FD6kFfCc9vBUhMgMTl4VPNd80uiEpRLJAuoc+F57cCJCZA4vIQ0lp6RxzSZTRfeH4rQGICJC6fQ+pL84TntwIkJkDi8jmkq2iO8PxWgMQESFw+h3QNzRKe3wqQmACJy0NI/y35W4+yqCENoBnC81sBEhMgcXl+gMi1opBupHeF57cCJCZA4vIG0med63eeZZ7Yfa/sy7rcTG8Kz28FSEyAxOUJpIU5GYflZEzT9Q+PogaikAbTNOH5rQCJCZC4PIHUreJSveD4RusvpEoT9ohCup1eFZ7fCpCYAInLE0iH32i8mU2ls67VomEUA6Sh9KLw/FaAxARIXJ5Ayn7CeLOOOv4YHaMYII2gycLzWwESEyBxeQKJnjPebKbZ0TqKHtIomig8vxUgMQESl88hjaGnhee3AiQmQOLyOaT76Qnh+a0AiQmQuLyBdMfChQtn0oSFZqKQHqRHhee3AiQmQOLyBpKaKKSH6SHh+a0AiQmQuDyBNEJNFNJjNE54fitAYgIkLp+/GPOTNFZ4fitAYgIkLp9DepbuFZ7fCpCYAInL55Cep5HC81sBEhMgcfkc0gs0XHh+K0BiAiQun0N6me4Unt8KkJgAicvnkF6n24TntwIkJkDi8g7Sv9+8remFwpDepJuF57cCJCZA4vIM0oN5RAv1Oy+PilLUkN6lgcLzWwESEyBxeQXpWer6tAFpSvY4UUgz6Trh+a0AiQmQuLyC1Ky//q8BSb/jaFFIH1I/4fmtAIkJkLi8glT6IxvShzmikOZRX+H5rQCJCZC4PHvpy+k2pGkVRCHNp8uE57cCJCZA4vIK0mkdd5mQ/mjSWRTSl9RTeH4rQGICJC6vIH2SVf9GuqJPhZzPRSEtoguF57cCJCZA4vLs7u+5x5lPRmr9aTSOooe0mM4Vnt8KkJgAicvDRzYULFnypx5dUUP6kboIz28FSEyAxOXzhwj9TGcIz28FSEyAxOUJpAZqopBW0anC81sBEhMgcXkCqb2aKKR11FF4fitAYgIkLp/ftNtI7YTntwIkJkDi8g7SllmTp3ywRRjSFmolPL8VIDEBEpdXkP66KNu8+zuj1z+ikLTMFsLzWwESEyBxeQXp8pwrp8x495lu1F8WUm5T4fmtAIkJkLi8glR5iv1+SBVZSGUbCc9vBUhMgMTlFaRSm+3388rKQqpwlPD8VoDEBEhcXkFq8YX9/skOspDyDxee3wqQmACJyytIc49fsF/X985s/J0spENrC89vBUhMgMTlFaQ21ajcEUeUoToNo3l0Q/SQalcXnt8KkJgAicuzm3ZtY3l0Q/SQ6lYRnt8KkJgAicvnj2zQ6lcUnt8KkJgAictDSNv/spKF1Kic8PxWgMQESFxeQfrl7HLJeKExrUmu8PxWgMQESFxeQTq5Yq9bh1jJQjouU3h+K0BiAiQuryCV+yIaQLFDakUFwhtgBkhMgMTl2eG4NiYHUlvaKLwBZoDEBEhcXkG65Z7kQOpAvwpvgBkgMQESl1eQ/jut/a1jrWQhnUKrhTfADJCYAInLK0hjiZJyr11n+ll4A8wAiQmQuLyCVOOCz1evtZKFdDYtE94AM0BiAiQuz55GkaQ7G7rREuENMAMkJkDi8grScUuTA+kCWiS8AWaAxARIXF5B+uyU75MCqSctFN4AM0BiAiQuryC1r03l61rJQrqU5gtvgBkgMQESl1eQTjo1mCyky2me8AaYARITIHF5/TSKHStlIfWjD4U3wAyQmACJy2tIc/NlId1A04U3wAyQmACJyzNIM3qd1L59+zZ5VWUhDaZpwhtgBkhMgMTlFaSplF2bapamTjNlId1FLwpvgBkgMQESl1eQWp65Xc/6sfDRk7fLQhpLzwhvgBkgMQESl1eQ8mboetYPuj5ogCykCfSI8AaYARITIHF5Ban0bF2vMF/XF9SUhfQU3Se8AWaAxARIXJ49ROjC//TGQ3X9vXKykCbTSOENMAMkJkDi8grSS3SqPjyr39212slCep2GCG+AGSAxARKXZ3d/Tx2r7zydqM4iWUjv0Y3CG2AGSEyAxOXtH2RX/bQnGkcxQPqQrhbeADNAYgIkLs8g7dyk67smP/iLMKT51Ft4A8wAiQmQuLyCtPyQsXrh8UQVF8tCWkTdhTfADJCYAInLK0jnN12tv0RPrm53oSykH+kc4Q0wAyQmQOLy7Lh2r+j6eU10/ZU6spBW0WnCG2AGSEyAxOUVpNx5+t7Kt+n6nFxZSBvoROENMAMkJkDi8gpSnYn6HJqn65NqyELSMo8X3gAzQGICJC6vIF1Z/fa6R+7VC5oJ/46klWksvAFmgMQESFxeQdrUhqou1PUeFaM6BkoMkPKPEN4AM0BiAiQu7/4gu838W+yiLdE4igVSrZrCG2AGSEyAxOXhIxt2zS6ISlFskI7MF94AM0BiAiQuDyGtpXeSAKlxGeENMAMkJkDi8j2kVhmbhLdAAyQ2QOLyPaSu9J3wFmiAxAZIXL6HdAO9I7wFGiCxARKXJ5DW79TX/qf/t+TvMC07xve55O7g/Q8bbukWH6RxyThoAyAxARKXJ5BKT9ep5BP67hmyZuMDA/ZZp+f3nhAnpNfpZuEt0ACJDZC4PIFU5rL5NHFBoOCZWtdfjJ9K59ov9/Lx1oVxQlpIFwlvgQZIbIDE5QmkS0gpeOaXF+w33l7/euDDIKTCbUZ//u7Wbv0v+8TGzBNcLxxz/+ySX7MovTCZq+/5K4mL79D/SeLqfyZ3x+h/JHH1ndI75i8HSIUzXqARkwMFz/zgcvPt0GeLQZrX0ujrErcD+Y4s/08Ml0bIH+0LnQq71+7UFcUv+EFf820JSEuvNfpxj1v79MLAqcH0guulY23vXvEllfT9yVw9qYvv1ZO6Z5K7Y/Rkri5+lfnPGZKu/z7j2YkfKAcs/sq+afdGMUhWMfyOpC2gDsK3TvE7Eht+R+Ly6u9I+27JMX9BKjcudM4fXVfp+rZuyxKFpLXK+ER4GwCJC5C4vII0js6bNGvGM2fQlNBZYwet2TDy5v36nPd1Q8Wcbpr2b1yQXqQTpR/cAEhMgMTlFaRGN9vvr24ROmvnhN69xhgXHzdM1688x
+y9uCAVtKT8LbIbAUhMgMTlFaRSH9vvZ5bRoygmSNqmk2mx7EYAEhMgcXkFqdx0+/275eUhaddLP94OkJgAicsrSCd2su7O+7fzyUmA9CBNkN0IQGICJC6vIM3MOKz/PaP61cz8KAmQ3pQ+kj4gMQESl2dPo3inoXn3d9OoXkI2VkjfUVfZjQAkJkDi8vD5SBu/ifLQJzFD2pLbTHYjAIkJkLh8f/ATq3p5shsBSEyAxOX7Z8hadaIVohsBSEyAxHVgQLqSPhDdCEBiAiSuAwPSUHpRdCMAiQmQuA4MSA8J/yEJkJgAictDSMUPfiIIaQoNE90IQGICJC6vILX8yX7/ZqNkQJpJ14puBCAxARKXV5ACRxEqvFv4hcbsvhJ+JVlAYgIkLm8gKcc+aeHgJmFIq+hU0Y0AJCZA4vIG0tJHqNuVZlfdtT4ZkLZmHyu6EYDEBEhcXt20O2NlNIDihaRVqyO6EYDEBEhcHt5rl0xIjWRf3AWQmACJyytIVYLlJQXSifSb5EYAEhMgcXkFqZtV6zJNBiQFUjdaIrkRgMQESFwe37Tb3GFGUiBdQXMlNwKQmACJy+vfkRa1TAqkwfS65EYAEhMgcXkNaXMSjiJkdB89IbkRgMQESFweQ9o/unZSID1HoyQ3ApCYAInLK0jNrZpUpVuTAultGiS5EYDEBEhc3kI67pRH/iuhRgLSfLpUciMAiQmQuA6QP8guo/+T3AhAYgIkLs8grX7/lRkbkgZpU0YbyY0AJCZA4vII0ntNrId+t/00SZC0vKMlNwKQmACJyxtI46lsr4cnT7i4bObzSYJUt4rkRgASEyBxeQJpaWb7TdaJje1ySrwEpgykFlkFghsBSEyAxOUJpMsr/x449Xvla5ID6TRaKbgRgMQESFyeQDq8X+jk1fWTA6k7fSW4EYDEBEhcnkAq9UDo5EPJeYiQ1p9mCW4EIDEBEpcnkMqPDZ28LznPR9LupJcENwKQmACJyxNITS8KnTyneXIgPUiPCG4EIDEBEpcnkG7LWRY49WXmsORAmkx3CW4EIDEBEpcnkDZVrDXbfL9van6V30uykYD0Ht0guBGAxARIXN78QXZuBTr8/D5da1DVL6NxFAekBdRLcCMAiQmQuDx6iNC662oRUb1bN0flKA5I/6OzBDcCkJgAicu7R39v27AjOkVxQdqU0VpwIwCJCZC4DpCnUWhaxfqCGwFITIDEdcBAqpcvuBGAxARIXAcMpJaZW+Q2ImZI3z557RMbor0wIDEBklKqIJ0u+XrMsUDaOGtUl0PMJ1vlX78ouq8AJCZAUkoVpIvpc7mNiBbS8hdvOKGUYajKmXdNu7YyZZ7yYjQ/FgGJCZCUUgXpVslDREYFafMNRxqGMhte9pj9wPP1j7Ygqn3n/1y/EJCYAEkpVZAm0ENyGxENpI1dqOyJN09drZ43t1cZyun2rstXAhITICmlCtI0ukVuI6KAtLEznbCm5NmrRx9FdPTYXyJ9KSAxAZJSqiB9Tj3lNsId0vpOdOKvjp/Z+vY5OVR2bISvBSQmQFJKFaR1dJLcRrhC+vVE6rSe/eyPQ/IzJ/NfDEhMgKSUKkhapXpyG+EGad2JdGrEPxvNK1v6I/aTgMQESEopg9Q4d6vYRrhAWn08dd4YeYXJGbV/4j4HSEyApJQySJ1pudhGRIa0qgV12+S2xK3UirMGSEyApJQySFcQf2Mq1iJCWn4Mnb/ZdYmt51J35lOAxARISimDNIxeENuISJB+akSXRnMwyvXH0QjnzwASEyAppQzSUzRabCMiQFp6BPWJ7qCuP9bIfNnxE4DEBEhKKYM0na4T2wge0uLDqH+0d2rMLpW3wOl8QGICJKWUQVpM3cQ2goW0pTndGP0yT1Bdp4ekAxITICmlDNKmzFZiG8FCuj82rQOpncPde4DEBEhKKYOk5cv9RZaDtCK//A+xrFNwJl1c8lxAYgIkpdRBOrKS2EZwkC6mkbEttO4YGlPiTEBiAiSl1EFqleH+150oYyDNyWzg+ofYYi2umjW1+HmAxARISqmD1FnuyebOkLY0pbdjXmpmbom77gCJCZCUUgepBy2U2ghnSGPpgjjWeqzEXXeAxARISqmDJPgSSY6QlleM7Z6GYNdQx/DbnIDEBEhKqYN0Jzk/kCCOHCH1pFFxLbalM10VdgYgMQGSUuogPUCPSW2EE6SZGQ1jvach0NqG9ID6MSAxAZJS6iBNorulNsIB0ubGGe/Hu9w3lXM+UT4EJCZAUkodpLdpkNRGOEAaTRfFv95L1EJ5oCsgMQGSUuogfUq9pTaiJKSfKub9mMCCXei+og8AiQmQlFIH6Xs6R2ojSkK6KLEnaSyrkFd0jx8gMQGSUuogracTpTaiBKQZGY3ivKch0BjqGjoNSEyApJQ6SFrpY6Q2ojikzY0z3ktsxYLji+6cByQmQFJKIaSaNaQ2ojikUYkfffKznNrrAicBiQmQlFIIqXEpqY0oBmlZXkX22FpR15+uD5wCJCZAUkohpJPI+SDCsVcM0mUOz4WIuXW1sz+1TwESEyAppRBSV1oqtBHhkL7KPszlcJBRNZWOs189CZCYAEkphZAup08cLhlP4ZC60HMiq55N91vvAYkJkJRSCGkQvSW0EWGQ5mQ0ju74W24F/5gESEyApJRCSKNootBGhEE6id4QWvZeOtd8B0hMgKSUQkiP0TihjVAhvU7thFbVClrSKxogsQGSUgohvUJ3CG2EAmlrs4zZQqtq2qfZdX4FJDZAUkohpFl0jdBGKJCeVh7bk3hX00BAYgMkpRRCWkg9hDaiCNKmw7O/ElrUbG3NnPmAxAVISimE9DOdIbQRRZDGUh+hNe1eoBYFgMQESEophLQp4wShjQhB+vWQ0t8LrRnoLHoAkJgASSmFkLRyDYU2IgTpdrln3QZaWi5vIyA5B0hKqYRUq7rQRgQhrcirtFJoyVCj6GJAcg6QlFIJ6ZjSQhsRhHRNrMf6jqItzehd8UWVAIkJkJxyhNSONshsRADS4twav8ksqPZR1mFSj1J3CpCYAMkpR0j/R/+T2YgApB70iMx64Q0U/8VLDZCYAMkpR0gX0xcyG2FDmp9VX+z1LdS21wo+MykZARITIDnlCOlaqaN/25A60xSZ5Yqlv04tZR5P7hQgMQGSU46Q7qBXZTbCgvQdtYj2ZZdjS99zBo1PyspmgMQESE45QrqPnpTZCAvSWBors1rx9D1LylX6OTlrAxIbIDnlCOlpqau+Belk+k5mteLpe7S76IrkrA1IbIDklCOk12iIzEaYkNblNpJZrEQGpI31soq/jp9UgMQESE45Qpot9TwKE9LzSbuT2nys3UTqnKTVAYkJkJxyhLQw8eM42pmQesq9AGCxrAetnkDTkrM6IDEBklOOkJbTmTIbYUAqqFZli8xiJbIgfZBxTHLWByQmQHLKEdJGaiOzEQakWVI/3UpmP43iPHo4KasDEhMgOeUISSsrdP+AAWkQPS+zVslsSEvLVFuTjNUBiQmQnHKGJHUYfQNSo5xfZNYqWeCJfTfSLclYHZCYAMkpZ0iN
yshsxI6dS6ijzFIOBSCtPaT0kiSsDkhMgOSUM6S2JHGQbhPS2MReoS9iwaeajxM7WIsaIDEBklPOkM6ixF9+xWzHzk70jchKTgUhbWmU+ZH86oDEBEhOOUPqSV+KbMSOraUaiCzkWOjgJ1PlDuJaFCAxAZJTzpD6C/0RdcdUukFkIceKjiLUKQlP1AAkJkByyhnS7TRVZCN29KbpIgs5VgTp8+zDZX6pUwIkJkByyhnSWHpKZCO2HVo5Kc+NtVOOa3eZ/H0agMQESE45Q3pK6HkUn9BFIus4p0BanldphfDqgMQESE45Q5pKt4tsxG1Cr9HnnHqk1Tuov/DqgMQESE45Q5oldLVslr1aZB3nVEjr6+R8Lbs6IDEBklPOkL6UeaTp9xknSSzDFXbs7yepi+zqgMR0MELa9rtbu/W/HM5dQae7fmUUPUD3SSzDpRcqH2jH0XTR1fc47Ripduj/JHH1PwvdLxN/e/Q/krj6Tukd85cEpD173dqvO527J6uV61dG0Vn0s8QyXPp+9aPPM45139wY2u9+kfjbp+9L5vJJnd35KiPVPukdUygBKd6bdlqVwwR+qK4vc9RO90vFX7GXdelCT0iujpt2TAfjTbu4ITUoL7AJL9FALyF9K3uEcUBiAiSnGEhtJQ6jfxnN8hKS1l/sRaTNAIkJkJxiIHWhxF9gb2uNCn96CmlVfrllcqsDEhMgOcVAupzmJbwFH9H5OzyFpI2my+RWByQmQHKKgXQLvZHwFgympz2GtOnIzI/FVgckJkByioE0hp5OeAuOzV7pMSRtCp0stjogMQGSUwykZxJ/NPWarOM1ryFp7YSe/6EBEhsgOcVAeoNuTnQDptEA7yF9knmU1PM2AIkJkJxiIH1CfRLdgFtpsveQtJ40Tmh1QGICJKcYSD8k/hDQjrQsBZB+KFtF6Dh6gMQESE4xkBI/aPGWvLpaCiBpt9BAmdUBiQmQnGIgaXlHJzj/p+aTY1MAaV31Ut+KrA5ITIDkFAepbn6C899HD6QEkva40BOTAIkJkJziILXMTPDFUi6gz1IDaWsLelNidUBiAiSnOEidKcFXOa6TtyU1kLTZGY0lXjEJkJgAySkO0sX0RULj/0idtBRBMn4YPiSwOiAxAZJTHKTr6f2Exn+ebtNSBemHslUF7gIHJCZAcoqDNIImJzR+f+sXldRA0gbTgMRXByQmQHKKg/SoeadbArXMMl9GL0WQ1tfJ/Srh1QGJCZCc4iC9ktizTdfnNjHfpQiS9rTAq0kDEhMgOcVB+oD6JTL9+3Sl+S5VkLaeQNMSXR2QmADJKQ7St3R+ItMPs4/CnypI2seZDRJ9FDggMQGSUxykNYk9Re4MWmy+SxkkrQfdn+DqgMQESE5xkDT7l5w425p/qPU+dZB+LJe/KrHVAYkJkJxiIdWsnsDwC+kc633qIGl30tWJrQ5ITIDkFAupWc7W+Id/hO6x3qcQ0oY62QsSWh2QmADJKRZSJ0rgplEv+sB6n0JI2iTqmNDqgMQESE6xkLpTAn/TPLq0/ZKuqYSktaPXElkdkJgAySkW0nU0I+7ZV2W2tU+kFNInWUdtSmB1QGICJKdYSMMTeLDdq3SjfSKlkIwbmIkcUwyQmADJKRbSI/Rg3LMPopftE6mFtLxCIq/QDEhMgOQUC+mVBF6PuV1G4FmBqYWk3UVXxL86IDEBklMspA/pqnhH31SmfuBUiiFtPCJrftyrAxITIDnFQvqOzo139Dl0SeBUiiFpUyj+V4MGJCZAcoqF9Fv818HRNCFwKtWQtE70UryrAxITIDnFQtLKNIp39G6h4z2kHNLn2YdvjHN1QGICJKd4SHWqxTt6zUoFgVMph6RdTiPjXB2QmADJKR5Si6w4D2q1mE4Pnkw9pJWVy/8vvtUBiQmQnOIhnU5x/hHmGRoaPJl6SNo91Du+1QGJCZCc4iFdTJ/HN/lV9G7wZBpA2lQ/c25cqwMSEyA5xUO6ochDbDXP/jV4Mg0gaa9Q+7hW9x7ShqVzXn1seP8BsxJ4AosVICmlHNLdNCmuwddmtwidTgdIWqf4HjXoEaRVX06ffN/gvmefUD+Pgh12U2JPpQIkpZRDboLdGAAAGDBJREFUepzui2vw1+i60Om0gPR5dt0NcayeREibfvxi9rOjru95WrMauSE9lY5qc86Vt42bMnNKt9JEjUcsjX99QFJKOaTXaXBcgw+kV0Kn0wKS8UvbsDhWTwakn4df1a3t0fkhPDnVG5/S/dqRj7827wf1z11rH++URZntxq+M89sAklLKIX1MfeMavGXW6tDp9IC0snL5ZbGvLg9py/2VTD3l6rU664q7Hpz4/hcR7hb9aUxLotyzJq6P5xsBklLKIS0NHMAkxtZlNy/6ID0gaWPp4thXF4c0uxmVG/7BEotGNPfaLbr9KKLyPabF/uc8QFJKOaQN1DaeuV+na4s+SBNImxtkfhDz6sKQfr4kg877IfhRlHd/f3xtDaJq/WIdHpCUUg5JyzsqnrlvDD6pzyxNIGlvUeOYD7wqCsm8Vdfg7aKPo/47UsHbvSoSHX7Ll7F8N0BSSj2kenG9jOzxmUW/IqUNJO0iujvW1SUhzW1JZQardyfE8gfZjS93L0vUYPB3UX8FICmlHlKrjDiOHbIup5nyUdpAWpFfJvrroZ0cpJX9MqnzkrCzYnxkwy+Pd86mzNajo3zQFiAppR7SWRTHwz2nqb8ipQ8k7SE6NcbVpSAVPJ5PRxZ/bYzYHyK0YnzrDMrt/Piv7hcFJLXUQ7rMfF3yWBsU9ky69IG09YRYH98gBGlWUyo/ssSP9rgea/fV4COIKlz8puvdeICklHpIg+M5wGKrTPUArekDSVuQc2hsLywrAsnhVp1VvA9aXTDwUKL83jMiPxoPkJRSD+nhOA7I9VtuU/XDNIJk/KyM7aj6ApAcb9VZxf/o74IZvfOI6gyMdBxcQFJKPaQ36KaYp55G16gfphOk9XWzYno+ReKQnG/VWSX0NIr1z/9fLlHLd9gLAJJS6iEtpO4xTz2IXlQ/TCdI2uvUPJYHCSQKafnFGXT+j8wnE30+0qoJ7TMzevzMfBaQlFIPaX1Gu5inbpUZdg9tWkHSzqUxMVw6MUj8rTorgSf2fXwsVRxd4PgpQFJKPSQt/7BYh/6t2Ov8pRek/1Us/330l04I0kctqOzgCMcvkniG7ObR5am14wEwAUkpDSA1y3H+D4+v2K9IaQZJu5+6RH/hBCBx99UVJfNU8x/Ooex+60qeD0hKaQDp/4i7ic81iKaEfZxmkApaxXC8yLghudyqs5I6ZsPLdahGyb+PAZJSGkDqR7NjHLp1+K9I6QZJm59T2+E/cOfiheR2q85K7OAnvw3OKfnDD5CU0gDSSJoY28y/5TYOPyPdIGkDaEC0F40PkvutOivBowh9ejyVHR5+dyQgKaUBpImxHqX0jeJ/80w7SL8dlj0vyovGAymaW3VWkofj2mp80yZhT1kCJKU0gDSb+sU28830QvgZaQdJe4VaRnkPShyQfmxJ5e+O6iHzsse1W3YuZV21puhjQFJ
KA0g/0v/FNvMJxX5FSkNIWhe6P7oLxg5pTnXqEuXdM9IHiJx2OFV/PvQRICmlAaSC3Gbs55z6LfeYYuekIaRlFfJ+cL+UFgekiWUyBkd7aEfxI62uH5xLJwUffwdISmkASTsstufIvlnipmAaQtLGRPkKajFC2jo8o9wL7hcLlIRDFn/dkUoH7i0EJKV0gNSOfotl5JtLPOUnHSEVtFQOvBeh2CD9ejbV/Dj6iyfj2N9bH69CR75lngIkpXSA1J0WxjJym4ziT4VOR0jap9l1onmaaUyQvm9OrX6K4fLJOYj+qn6ZGd1/BqSw0gHSTfRGDBOvzy3xGn9pCUm7mgZFcalYIM2qRhfGdFjkZL0axYyGVHn8VkBSSgdID9LDMUz8VsnXQU9PSGtr5kTxYucxQHo0N+ue2EZI2su6bBxamk78CpCKSgdIr8V0+O9r1CPa2aUnJO0Fau3+x6SoIW25liq+HuMESXx9pG9PoVJD43nVgGgDJKciQloQ06F+65UpcddEmkLSzqSHXC8TLaS1Z9DhMb8kW1JfaOzlWnR4NA+viDNAcioipHXUIfqB5zv8+TZdIS0pV2m522WihLSoAbXlnqnKl9xX7Ft7fRZ1juGpV7EFSE5FhKRVrhv9wEPpkRLnpSskbRRd6HaR6CC9VZmuivloyMl/6cs5TaniA7E+myzKAMmpyJBaZ0b9rAOtZWbJ40mmLaTNTelNl4tEBWl8TvbYeL5/0l9DtmB8HjX7KCmrA5JTkSFdTh9GO+9Pma1Knpm2kLQ5WfVcfiGPAtLmflT5bddLOeXBizEv607Z/dYmYXVAcioypPsdbq4xPUTDS56ZvpC0K+mWyBdwh7SyAx0Z01+si/LkVc1frUPV43sd4IgBklORIb1f7BgMETqDvih5ZhpDWlMj12FgJVdIX9WnU2I7eGtRnkCyHsnaebH06oDkVGRIq6K+2259mcMdzk1jSNpEahvxsdpukF6vSP1ifzG9QN5A0rT5rYq9noxAgORUZEhajSpRjvui8lrmRaUzJK0zPRbp0y6QRmflRvzyyHkFyXr67DGzRFcHJKdcIJ1KUT4Usxe953BuWkP6rkx+pNcbighpY086JJGrp2eQNO2nCynzeskfSoDklAukG+itqKYtqFbZ6a8paQ1JG06tI/xZNhKkn1pR06WJfGsPIWnaW3WoScyPveADJKdcID1B90Y17Sy6yOns9Ia06Wyq8wn72QiQPqlN50TzTAw+TyFpa7pT6bHRPnnXNUByygXSPOoV1bQ30vNOZ6c3JPNZrWXZlx/jIb2alzEwwUcNeAtJ0ybnU8fonmHvHiA55QJpY3aLqKZtkOv4p780h2QdZ4EzwUHaOjyz1JOJfl+vIWn/O5XypzhcNI4AySkXSNpRZaP5v3cRneJ4ftpD0j6pw91KYyBtuIiqz0n423oOSds6Ope6R/+ArwgBklNukLrRN1EMew9zjKv0h6QtO56aOB4Z1RnSoubUclni39V7SJr2aSOq94HTJ2IMkJxyg3Q7RXODoH2G831YPoCkbehB1WY6nO8E6eercukiiSfNpQKStuHqjOwhcTxUvViA5JQbpCl0u/usK7ObOn/CD5A0bWRWrsNDCktCWjekPNV5QuRbpgSSpk2rTq0WJbo6IDnlBumbaF5S6EnuKen+gKRNrUDXlHi0T3FIm+6rRvn3CP1lM0WQtBVdqHzUj0NmAiSn3CBtrVPK/VeCrsQc0s0nkLQvjqBOq4udFw5p67OHU5lB8T5GtUSpgqRpj5SnLpEe0OEeIDnlBkkb6/ggurBWl6/F/LXPL5C0VR1LPCMiDNIbzSm7j8CdDMFSB0lb1IqqJ3REB0ByyhXS+mrl3P4DG8T+HuUbSOZz9IodCkiBNLcjZZzzVfEvSaQUQtI235adcXUC95gAklOukLS73J4Bt6xsFe6JmP6BpGkTcrPCnpoYgrS4dya1miH5rVILSdM+PJIaRPsyUSUDJKfcIf2aX6H4rw/hXU7sYQv8BEmbWY0uWF/0YQDSioG51FD8eaaphaSt70c5g+N9lBMgOeUOSbuNhkb69Dc5h7F3ZfkKkra4sXoAbwvS2lvLUZ0n5Q/Hk2JImvZCPnWI84BdgORUFJBWls+9NcJN6vOIf+SZvyBp67pQrdBNHgPSxjFVqMpo4SeYWqUckrasE1WK8SWCAwGSU1FA0l46hOqx9/N8nHEM/x+2zyBpW2/JKBO8Gbfn96frUtlb1kT8gnhLPSRt69jS1COerQMkp6KBpK3um0l3MJ/rRK/yX+g3SJo2qUzGLfZ9+TOaUk7fkofqkykNIGnagiZ0WBx3ogCSU1FB0rQPazK/KL1DbSJ8mf8gafNqUZd1mjbnJMro9nUS1rdLC0jaxuszs26K6sWj1QDJqSghad/UcDpunba1JTk94DOYDyGZTyNv/HbXDDolhhfgi7n0gGT8P1iLjo310HyA5FS0kLSvq9OIkudOpjMjfZEfIWkbLyaiptNif1XzGEoXSNrq86jsg7GtDkhORQ1JW3govVb8vC+PzIr4kl2+hKRpo497piD2VzWPpbSBpGlPVqAzXF+aQw2QnIoekvZR1pHh9wQv7plFvSN+iU8hWR0skLTFbahahDuMSgRITsUASetDdykf/TywFB02PvKxRgGJKZ0g2c9Cj/6oSIDkVCyQVlQu/2Po9PVlqM4jbk+3BCSmtIKkaXOPoqPmRnthQHIqFkjafcGj160dkkeHjHX/kz8gMaUZJO23vhk5Q6M8kjkgORUTpC3HZJh3dv86qgpVGhbNbQFAYko3SJr2ajVqG90LVwCSUzFB0t6jZpP7t8imcjdFfkB4MEBiSj9I2vIzqEJUx+sDJKdig6SdS0RZzQZFe3cpIDGlISRNe7AMnRfFf5CA5FSMkJZddOu0GF5OEZCY0hKStvBYqvWO66UAyakYIcUYIDGlJyRt8+CsjH5u9yEBklOAxHUwQtK02fWo0aeRLwJITgES18EJSVvTg0r3nhbpIeEHLKQd4/tccndBydOAlFAHKSRNm1iVqGL3F37jPn/AQrpnyJqNDwzYV+I0ICXUQQtJ2zKjX3Wi0p0fd74L70CFpHX9xfhJdO7S4qcBKbEOXkhGBXMH1yfKaj3a4aCYByqkLy/Yb7y9/vXipwEpsQ5qSGYLBjcnymw9vPjzhA9USB9cbr4d+myx0ws6GX273y1dd71I2pbc2ZO7Y/yx+i8T2mcQHXPXorDF/XWV2Rs1pL4KpKLTiy41+r7Qrf36XtfLxN++fUlcvFDfn8zV9ydzx+zVk7pnBHfMmqfOziGqd/2ne0KL63Krl0z8KrMnWkhf2Tfn3ih+2gw37eLvoL9pF2rl451ziWr3ftl+1syBetPuj66rdH1bt2XFTwNSYgGS0m8vd88jyu/+8sYDF5I+dtCaDSNv3q/Peb/oNCAlHCCFt+Hl3tWIynZ+/I8DFdLOCb17jTEuPm5Y0WlASjhAKtGWGf1qEJXu8lBMB0uJLTxEyClAYvInJLMFg4+2/sD0o/tF4wqQnAIkJv9CMnbMsrtaE2U2Hyz60m
vBAMkpQGLyNSTjd6TFo1tnEDUYHPURU6IOkJwCJCa/QzL6+fHOOUR1+81gXj84zgDJKUBiOgAgaeYfmLqWJarV++WYD8TPB0hOARLTgQHJaO1z3coRVb3sNbdDHEYbIDkFSEwHDCTN/ANTz3zD0tUyr94BSE4BEtOBBMlo81u9KxE1vOuHxFcHJKcAiekAg2S08eWuOZTZevy6BFcHJKcAienAg2S0anxrotJdX07o1yVAcgqQmA5ISEZfDD6MqEY/l+MQRQqQnAIkpgMVkqYVzOhdjqjB8HgfjgdITgES04ELyWj9pM7ZlNXxcfY4RJECJKcAiemAhmT0w+gm5jG93oz9UQ+A5BQgMR3okIwWDKxKVGfgohhXBySnAInpIICkaVve7F6GqPnoFbGsDkhOARLTQQHJaPXjHTOoVOdJ0T8YD5CcAiSmgwWS0ZLhhxMd2i/aJ1wAklOAxHQQQTKa2y/fvEf8f9FcFpCcAiSmgwuS+cBW8wFEHR93f+FhQHIKkJgONkhGK80HEOW53iMOSE4BEtNBCEkz7xE/lKjmwIjHegAkpwCJ6eCEpGkFb3Yva94j/jN7CUByCpCYDlZIRmvMe8RzO09iXqoWkJwCJKaDGJLR98OPJKrUe4bT5wDJKUBiOrghaeY94lWIjhr8XYlPAJJTgMR00EPStI2Tz8qlzA4Pzw1/Ti0gOQVITIBktmLscWRU++Srxr0dPAQyIDkFSEyAFOiLe3q3rWJqogoteg574ctNgOQUIDEBktrquZMGd22QZXLKrtep3/g3JV/tApBcAyQmv0Gy2/Tty8N7ty5v/Xiq1Lz78Je/LZBYFpBcAyQmf0Ky+2f9jPEDO9fNMDnlNug68PG5cT1jvShAcg2QmHwNyd4xv5i39pqXtn48Hdqx9+g3l8W7ICC5BkhMBwAku83fvjm6d8dqoVt7kxZsiXlBQHINkJgOGEiBVs193Li1l2lyyqnbeeD4GbEczhWQXAMkpgMNkt3GBZOGd29e1v7x1Nq4tfdtVMcoAiTXAInpwIRkt2XRqyMvPSHf4lSx5Q3uCwKSa4DEdCBDCmTe2uvaIOsU90sCkmuAxHQQQLLbGMV9eYDkGiAxHTSQogmQXAMkJkBSAiTXAIkJkJQAyTVAYgIkJUByDZCYAEkJkFwDJCZAUgIk1wCJCZCUAMk1QGICJCVAcg2QmABJCZBcAyQmQFICJNcAiQmQlADJNUBiAiQlQHINkJgASQmQXAMkJkBSAiTXAIkJkJQAyTVAYgIkJUByDZCYAEkJkFwDJCZAUgIk1wCJCZCUAMk1QGICJCVAcg2QmABJCZBcAyQmQFICJNcAiQmQlADJNUBiAiQlQHINkJgASQmQXAMkJkBSAiTXAIkJkJQAyTVAYgIkJUByDZCYAEkJkFwDJCZAUgIk1wCJCZCUAMk1QGICJCURSO69O/pP9wulZ/tHT0n1CHH3w+jvUj1C3L0y+r9UjxBfSYU0rOX6ZC6fzPa17JvqEeJuRstpqR4h7q5ruTPVI8QXIDkHSKkJkJwCpJQESCkIkJwDpNQESAgdzAESQgIBEkICARJCAolC2jG+zyV3F4SfLv4+XYs0+w3nGF2U2vki5TS7vuGWbsU/l4ZFGj3dd3tYopDuGbJm4wMD9oWdLv4+XYs0e9/pmqb9keIBI+Q0+/zeE7oV/1waFmn0dN/tYUlC0rr+Yvy3cu5S9XTx94LfTrRIs+sXLkr1eBFzml3/eOvCbsU+l4ZFGj3dd3t4kpC+vGC/8fb619XTxd8LfjvRIs2+55xHb7xizIaUzhcpp9mNN9a1Mc33e6TR0323hycJ6YPLzbdDn1VPF38v+O1EizT735c9tGLFyMv+Sd10kXOaXQ9cG9N8v0caPd13e3iikKwHAwT2SuB08feC3060SLNbF9h10ZwUjeaa0+x6EFJ67/dIo1ul8W4PTxLSV/bP5jfU08XfC3470SLNbl/iuldTNpxLTrPrgWtjmu/3SKPbpe9uD08S0h9dV+n6tm7L1NPF3wt+O9Eizb7usUJd//eieamekctpdj1wbUzz/R5p9HTf7eGJ3v09dtCaDSNv3q/Peb/odPH36VqE2bdfMmHzhjF9d6d6RDan2f/U5nTTtH/Tfb9HGD3td3tYopB2Tujda8yfuj5uWNHp4u/TtUiz/zKsx6X3bEn1hHxOs19p/jXznPfSfb9HGj3dd3tYeIgQQgIBEkICARJCAgESQgIBEkICARJCAgESQgIBEkICAVKaNILM8jq85XrJ9g2YBRZG832Yr0YJBkhp0gi647nnnhl2GD3sdkmTwpKS/26AlNIAKU0KONheN+9fl0uaFB4FpDQLkNKkoIOb6Wtd//S0vDLHTTI+OunExafkVetpHhFkaqsyeS2n6haFM4xbgS3bV7FeuKFj1T1hC+hFX96+SqH54Qk19hatCEjJCZDSpKCDYfS5Pjerw/Q5/elBXT+1TquPCt7M6qPrr9F5M2acSTMsCiu70aKfJtGbxuU3Zw4MX0Av+vInyHxW3K8ZNysrAlJyAqQ0KejgxOy/9ePqmwfA7mrcyDvVYGVwqqnrY04xfv5sy+5lU7jS+HfbUf4c43OP0XfhC+hFX65lX22ceICWKCsCUnICpDRpBM3cvHnTN1fQtXoB3fiv0dP0jX5qWfNzfTKDl6p9UhEkvW+2cZPvpCahBQKQlC8/65B9ut6qsXoWICUnQEqT7Lu/Kfu63foSCvS2fmpd83Mmm23Dm1TIyqL2CqQFNF7fmDEutEAAkvLlL9En+lq6Tz0LkJITIKVJI2jC7NkfLPhLNyVcsdBKUyB1yLpz/g8/1lQh6Uc30x/O2hRaIAQp9OU7yg7Q78/4TT0LkJITIKVJyp1uf1Cf4MkQpFXUzzhRWDoM0lha1vrMEgsoX673qKkff3LYWYCUnAApTVL/DNS6ovmDacrQwiJIP9HduvnnozY2havIvGN7U9YlNLXkAkVfrr9H79CksLMAKTkBUpqkQvo0p9mUD4flXK78RNpTp9Z7n99y8sl58/4xKdxFd5v3fZ9NFXYVLXDLY2afKV+u78k/ovS2sBUBKTkBUpoU9sCEBafn5Rw9rlCBpC9qW/bQa7ZNr1p5hUlh/XE5Joi36CplAbsBypfr+tV0UfiKgJScAMnPvW8+DAKlQ4Dk4/Yc3ybVI6BAgOTbfnvvzKxvUj0ECgRIvm1SRr2ZqZ4BBQMkhAQCJIQEAiSEBAIkhAQCJIQEAiSEBAIkhAQCJIQE+n/r20q2hiJ7owAAAABJRU5ErkJggg==", - "text/plain": [ - "plot without title" - ] - }, - "metadata": { - "image/png": { - "height": 420, - "width": 420 - } - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA0gAAANICAMAAADKOT/pAAADAFBMVEUAAAABAQECAgIDAwMEBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUWFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJycoKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6Ojo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tMTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1eXl5fX19gYGBhYWFiYmJjY2NkZGRlZWVmZmZnZ2doaGhpaWlqampra2tsbGxtbW1ubm5vb29wcHBxcXFycnJzc3N0dHR1dXV2dnZ3d3d4eHh5eXl6enp7e3t8fHx9fX1+fn5/f3+AgICBgYGCgoKDg4OEhISFhYWGhoaHh4eIiIiJiYmKioqLi4uMjIyNjY2Ojo6Pj4+QkJCRkZGSkpKTk5OUlJSVlZWWlpaXl5eYmJiZmZmampqbm5ucnJydnZ2enp6fn5+goKChoaGioqKjo6OkpKSlpaWmpqanp6eoqKipqamqqqqrq6usrKytra2urq6vr6+wsLCxsbGysrKzs7O0tLS1tbW2tra3t7e4uLi5ubm6urq7u7u8vLy9vb2+vr6/v7/AwMDBwcHCwsLDw8PExMTFxcXGxsbHx8fIyMjJycnKysrLy8vMzMzNzc3Ozs7Pz8/Q0NDR0dHS0tLT09PU1NTV1dXW1tbX19fY2NjZ2dna2trb29vc3Nzd3d3e3t7f39/g4ODh4eHi4uLj4+Pk5OTl5eXm5ubn5+fo6Ojp6enq6urr6+vs7Ozt7e3u7u7v7+/w8PDx8fHy8vLz8/P09PT19fX29vb39/f4+Pj5+fn6+vr7+/v8/Pz9/f3+/v7////isF19AAAACXBIWXMAABJ0AAASdAHeZh94AAAgAElEQVR4nO3de4CMdf//8fee7bJECNFRRSkVioSi033fodItUXSgRHdHpaJbErrJV+V3d5eSziqdnA9JRZEUKpGzsNbux3FZpz1cv7lm5vp8rtmdw/W55jPNzHW9nn/szs5c3vu5rp2Hnd2dA2kIoaijeC8AIScESAgpCJAQUhAgIaQgQEJIQYCEkIIACSEFAZJUK6lD4BlDaIKawa0yqm6NZoCqhUhW6YCE3FBqBwfT/+SWYf/whTtwVg+qZ7soIA0jUY2IW78zTW66afuVdKbs2ipV8Ezr2pkntxlb6Dn9DxrHz29Nb/PPQl2Nc2cQDQk6xhKkXc9cViejVsvBWyr/+5BHoRld+XBh6OVHLsTXfKX/C5R9zoAN0YwPlWVIlnfQe4ieP/U9qWVEcfh8By74F+avgTSlq6dLqI7+rmfErevdLTfdtL0CSG/nUEbLq8/NpJrzNG0WNTHOX021jvDPkpJe4D99c0oUkF7LpswW11xahdJHVfr3oY7CUapeEnEnwhYSUrVbPHXvUJOyv4zuMwTNKiTrOyh7RZGbXjnfgQv+Wf8aSN4+p06WtttEcsfHvH30kKZQ6pADnvcFD1DaCq3sdPrWf8GD9Kj4LBca36n2ZjWzD2kipT9X5Hl/eHwVerbCZSGPwj46NfJehC0kJP+xK76PTimL8nMEnd/B0naWd1D2iiI3PUjeAxfis8YD0lM07b8NPLfwyl9vXa1Kk6GH9PMOPNGkSmbjQZ6rcDf95kVbbShNW9KhWu0+ReXjz8luOqpcC9jec+nqG+pkNf+Ab+8rAJKYqWlTr6yZUf+62RVOHn+xZbWsMwfmiX9TdAK/yT2Y7vHccKDbfB8drZXCb/CspLvrNPOdfIUe9kIyT9p6y4nZzSf5rjemNfsOdRptN8bsyqap/pPzUlP/0LSBNFn/YCn9I3CvzOO7em9+8aWYdsa0u5UP3sP06cL21au1W8AXYj76FY5dWXVaJQ6J6VNs6V7Ls2e76OKAtQZ86mBfXPMBCful0fPvoHm+6avtWdx/W1at1vFb/oX3/YxkOv4BW4c9fOaDEuLaFbBU/cD5Pmtbmumd8DX/GlU8qB1ouvf8GdSx4rVAGaRn6JGcW/tp2m1U/9GnWtOFnv+Uj7ejFoMePIdalWoz76DW4z/WnqUnT7hlYEO6fXCj/r0z6R0tYPtn6Zka1zz8D6KvjO19mSGZZnr+769z79N31Up5J+Bk2d+oyYNP/43qix8+X6GWxskjf3resKwqe70fvU/X8I1W0l39abn35CUnfaxDMk/a24jaD7233p3e641pzZUgPeu9EvrqSQMDrpwBe2UeP30U1Rw/fr//AtPOmHe38sEbTPdnd3m0a0ra18ZCTCurdOwa0zK+MtOn2NOQrhzRv959+rXHfEU3f+ogX9yAAxLuS+PNv4Pm+aavtue2NJ07oFc1zz75D5EXkvn4B2wd9vCZD0qIa1fAUvUD5/usb1A374T76FX/56h4UF+jPt7zb9f3I/BaoAzSKKrh+fFD+4haeAaX309PaNqn1Nqz0GNNdMZTvd87R1OWZ/f+TMtoskfT3qDrA7cfTZnvejYapK92aoibduaZ59NGzznbc1sHnJxIbY5q+n9A3fk/6mb67YK3XvSi9/0V9Lnps/T5kQbop9bSI1N1SOZJ/6ZbPCfz6+nXG/OafYf6qy+PGmM60Lt85Cw6O/DKad6rgPEBt01MO2Pe3coHbwilzvBsOZZa+xdiXlnFY7cuNWN/sE/xb7pZ37OT9D0zr9X8qYN8cc0HJOyXxpdvB83zzV/tKfQ3zz/9I6fqQf8h8kIyHyDz1uEPn/mghLp2mZfqPXDez1qUk7nbs0Vp3ax9fFbgQd2bWfO45+yjNbKLKl4LlEEaTd6DdhXN9x63jPqe2wyfef+DH0zPCUjX6udcSP/1vN1FTQO3H+37lrqM2oSGZJ7ZKCVfP31MCzjZlny3iTIzDxv/6CLfJxF9T+fp79anNCw1fZY+2vk19a/NE7TaC8k8qTkt1U8P16835jVXuhXdiFby03mUXh4aknl8ACTTzph3t/LBG+I7YEdzUvb4FmJeWeCx2/VZY/3bY5BP0Zy+008Oqwgp8FNX+uKaD4ivEF8aX8Egia/2NbRYPz3+0U1mSOYDZN46/OEzH5RQ1y7zUgUkzzealz1vv/T+z+KfVeGgdqa5npNfUI9K1wKFkB7W31Uj3+3zi+hP7/ui/Pzh+pXSgDRYP7O997gdptMCtx/t+8l/PV0YGpJ55gBqMinfd5Y4WV6FfP/xnu+/nabpN2qWVlh2c/re8/Zx828DdEgv0hTPbYaGl2g6JPOkskzyspyrX2/Ma64E6QTxo452kKg4JKSAhQZAMu2XeXcrH7wh9Lh3iwvoF99CKh598etvT3cfC/YpPHtWrL+fUxFS4Keu+MUNOCCV11phF4
JDEl/tqsT/0xOQAg6Qeevwh898UEJdu8xLNUFaqP+cqPWjaWJWhYM6hfpq+k32WZWuBQoh6b/sPSy+bks8F7at4j1pgjRW37YDrfW8PeLZ+4Dt/ZduoOZhIJlmHr83g+jcxzdr5pNFlOnbsCPNKe2g95V2Mc3WAntV/wZ//KT0neIsHdLuLM/PTPM8X0QdknnSAariPbncc70JWHMlSKcKv9p2fUIlSP5Vmcf7rgnGck37Zd7dSgfP88l9N1k7eP5z1BcSePT9e+X99ffl1GSN/mHlT3GAsvieBUIK+NQVv7jmAxL+S+MrGCT+1T7kH6YnIAUcIPN1w1vlw+fPdFBCXrvMSzVBKj+dftVKTqx9XMyqcFCLq9Uu1Y7k1i2pdC1QCElf5xFKGeZvs+dHs9yHPpg1594wkAK2twLJPFPTdr7aNZcyPzKfPEgZvi2voLkl3v2cot1a6ffQh6pn79c+oX+aztIhad1Tt2k9PRfpkMyT9vuvbj94rjcBa64E6Vp6jZ+eQRcEgeRflXm875pgLNe0X+bdDQZpvHdCe88P0PpCAlYWcOwOn+G5umnBPoWxZ0srQqr8qc3jzQck0pdGLyykw5RWbmwoIAUcoEqQKh8+f6aDEvLaZV6qCZLn9u0jnm/N/zLNqnhQe3kO9af0gFbpWqAYklaDxB+YT/b9uebfYSAFbG8FknmmtyP/Sz/hqPlkDvl+VDyPfjb+0SQ60/h7Xfmza73vB3p+0riOFpr2xAtpHo0syu6leSGZJ5Wm+W58fKFfb8xrrgRpnLgV77kJ4LmVcT+9qZ/+rNLPSOaFVv5DiG9nzLsbDNJQ77YX0GrfQswrCzx2c+iUogoX+T5Fabpvzz7T98y81iCf2jQ+4ID4Cvel8e+geb75q51LzJhi+hnJfIAqQQp5+MwHJdS1y7xUM6QtKSeX3266SVH5oM6iAVoP7xYVrgWqIV1Dn3g/3KP/ubmafqr8knCQTNtbgRQwc6vvhll7WmM+2d7314A96dn8N2mH6/pvN2vac3SZ9/3v1K4gjd/BwfdZ+nh+Yjil1Tv670e9kMyTmtAP+ulB+vXGvOZKkPZV5788XZiauU3/J96bGk9WgmQeHwBJ7EzA7gaD5P3tfVFm2gHfQswrq3Ds/kn3mVZpOl5NfTcDH9b3zLTWYJ/aPN58QLyF+tL4D4t3B83HwvzVvtL7TVIb1el7MyTzAaoMKdThMx+UENeugKWaIXkWMrtaU800q+JBLand6HDVJhUPRgwgfUTNdKiL0m/WtFrkuR6VP1NP/3XRDLpBC3ZdMG9vPli+7X2ZvyOZZq6ijvqPz0UN03abTmpv02X66Ue9Pxb6m5FCd+o/++YPoOr+L22HlCfpJfOeeCFpw1I7nV7uh2Se9Ij3t+mba/p+/S3W7IP0zdccrfdeFPp//kdfyfH+fu0V6uAZuKa2fuUx71XAePM1wbwz5kMYDFKa/luUl+lK/ptasbIKxy6vesq3QT/FILrJs76NJ+h7Zl5rkE9tHm8+IOG+NP4LfTtonm/+ar9FLTw/uW+pmbPXf4i8kMwHqDKkUIfPfFBCXbvM++Y9cMYX5m06nUx37ApyUO+jkfrv+ipdC1RD8vw8cvIjw7pl5C7Tj/VZzz3X+px5dOLz2zemZNzVP8h1wby9eVd92/taSTn/8PV+wMyedMbAfw84lR7Ub0Txk+VdqdljT3Wis3eblvlJdUptfnWzTGr0q/+cDyk9Z59pCz+krak0XPNDMk/Kq00t/9Wjhu8/YNOaK/1BVtPeq0aZl/69bVWq4r07RUF1avNwj2pj6W+BexUwPuA7kmlnzLsbDFKf3D4jeqdlLDUWYlqZsVfGf0IvUePDwT7Fzjp06aBeNQbqe2Zea5BPbR4fcEC8hfjS+PLtoHm++atddj2d2v/2XHrdOEReSOYDVBlSqMNnPiihrl3mpXoPnPGFKc4lz0/JplmVDupiyk3ZUvFgxAJS2ettctMb9vZ+sYecmdVowG7tjqr1ftWer511cTBIpu0DDpZ3e1/iV7jDAmaW/fey2mk12r1Zrt/FhJ/USl68OCeryZMBSrTdz7aunV6z/RvGXVS14/Uq3L3KB0m7OlX/zbEXUsCktV1PqHL+63vo0sA1B4GkFQ5vUzuj1iVD/b8S/K1jTrVLv2B0ReBeBYwPgGTaGfPuBoP0yoIO1ap1WMQXYlqZsVf8LkItxP0KA47XH/qevbHKK8K01iCfOmC8+YB4C/Gl8eXfQdP8gK92ybgLsqu29/7I6j1EvrsImQ5QZUihDp/5oIS6dpmX6jtwxhfmbv3OP6ZZlQ5q+WnUTqt4MBRAQnFM5WOQLD8eItGL7qCMMt0xRSpASuIAKUhRHZTjjWofjbxVsAApiQOkIEV1UB6q9BdHqwFSEgdIQbJ/UNYOvpyaH468XdAAKYkDpCDZPyhfpVa71fbD/QEJIQUBEkIKAiSEFARICCkIkBBSECAhpCBAQkhBgISQggAJIQVFAenQ/kgdLimKuI2tDhyNzVzPig/GZvCBY7GZu784VivefzxGc4tLIl9z7BWrFR8KuWLx8P0oIO1nkTqi7Yu4ja32HI/NXHbYwl7ZandJbOayYu1AjCaXxmjuIa0oRpPLYjT3oHYwxCV7ASl4gCQCJCNAkg6QRIBkBEjSAZIIkIwASTpAEgGSESBJB0giQDICJOkASQRIRoAkHSCJAMkIkKQDJBEgGQGSdIAkAiQjQJIOkESAZARI0gGSCJCMAEk6QBIBkhEgSQdIIkAyAiTpAEkESEaAJB0giQDJCJCkAyQRIBkBknSAJAIkI0CSDpBEgGQESNIBkgiQjABJOkASAZIRIEkHSCJAMgIk6QBJBEhGgCQdIIkAyQiQpAMkESAZAZJ0gCQCJCNAkg6QRIBkBEjSAZIIkIwASTpAEgGSESBJB0giQDICJOkASQRIRoAkHSCJAMkoMSEtenXwUyNfm7Mumj0DJB4giVwEaecbl5C/c/p+kGd3zwCJB0gi10D64O56RO2fe2/K68/2vTyLKLfbh4W29gyQeIAkcgmkjdcT5fRZZHy4/dN+DYjOemGbjT0DJB4gidwBqaATtfx8R8DlhbNvzKDaz/wpvWeAxAMkkTsgPUqX51feZNX91aj2szsqXxA2QOIBksgFkBa/eQM1WB10o3UPVqVT3pLbM0DiAZLI+ZC6E1HTFaE2+6NfOl3xvcyeARIPkESOhzSZzhnywa4wGy5qRxmDg9zwCxUg8QBJ5HRIBeekLY606aST6ELr35QAiQdIIqdDeoVuirzYdTdQlREFFvcMkHiAJHI2pKUfnZT5o5Xlvl6LrrR4xyFA4gGSyNGQ3iaiIdbWu7oDNZhtaUtA4gGSyNGQbqRe71q9H9CuQakZI6xsDEg8QBI5GlLdE2XuTvdxLbp+a+TNAIkHSCInQ/qe/i615lWt6PxfIm4FSDxAEjkZ0nM0Tm7RebdQvQWRNgIkHiCJnAzpRloiu+whKTmR7jEESDxAEjkZUsu0ndLrnlQl9d/htwAkHiCJnAypQT0bC59Xl24I+ysHQOIBksjBk
PZmNLez8lUt6OzvwlwOSDxAEjkY0la60tbS8x+gqhNDXwxIPEASORjSCrrZ5uJfzaHeIZ8cBZB4gCRyMKSvqK/d1S9qTBf9EOIyQOIBksjBkD6lR20vf/MNlPNC8LtFABIPkEQOhjSJnotiByadQFf+FuwCQOIBksjBkP6PJkSzByvaUO33gpwPSDxAEjkY0jB6O6pdKHg6k3r8XulsQOIBksjBkB6mz6PciYVNqOqQik/XBUg8QBI5GNJdtDDavdg5siY1ej3wlw6AxAMkkYMh3UQhn4TLehseyKSLAx46C0g8QBI5GFIn2qhiT5ZcRSkdJ4u7vwISD5BEDobUIs3qEwNF6OOLier8a5n/I0DiAZLIwZDOqKVsbxbeWZ1S2r7q/b0DIPEASeRgSCecqXB/tk24hKhm368ByRQgiZwLqTS1pdpd+q7/iUQXPL8JkIwASeRcSHvoKrW7xFjem53SKKvHp4p+9qoQIIkAySj+kDZRN7W75O2XJ08jqt/nPTuv9RchQBIBkpEKSAfH9ek5vMB3evuzvW554ncZSD/TXWp3yd/u+f+sTlTl6rGrFA8GJBEgGamANGLw5ryxA8v0k+X9JhQffe+fRRKQvqYH1e6Svz3H2c7P7juTiJo9PDvc68XIBkgiQDJSAIl12eT5rnTDKi+czms1bW/ndRKQpll92m/J/L+1W/ZsuwyimjdOWKNqMCCJAMlIAaQl3co9b+//yPvBY+OLjnzQ95gEpHfpebW75E/8+nvTGz3qEqU0f3iWkm9MgCQCJCMFkObeob8dMtH7wZ6BnTv33qifWtjC07Lw/1TvFXon8kbRVr5iZLt0zzem7h8Vx/6TIWSqjJ+KBOlO/a0PUsmDE/YXT+2lI1x+m6dfSiJV9h/6JOJGtiqv8DGbckd9oqq3fHIwurllWml0A0JWccWqwopFWozmlmllIS45bhXSD76bdlP10yu6HPG8vWu6cZmFm3bD6GO132T9BbtnQ+GCfzUiyu3+gfxTu4pw006Em3ZGCm7a7emyQdMOdF2tn/65s37TqbcMpMdoltpd8hfiLkKFc+71fF+q2Wuq7Z+XAEkESEYqfv09+qHNO555pFybP10r7j3h4LFPu+2UgDSQvlK7S/5C39euYMbddYjqPmzzD0yAJAIkIxWQisf37jXKs/mYoZq2dXivHo//yi+yAOkusv5K5TKFvdPqrk9vy6W0v38i8wJnRoAkAiSj+N9F6Fb6We0u+Yt07++tY5sSNR4p/6BCQBIBklH8Id1Iq9Xukj8LD6NY0D2Dsrp/LTkYkESAZBR/SH+jDWp3yZ+lxyOteboRUfNx22UGA5IIkIziD6kjxeAe2szyA/vyJ7dLoXrDLby6sxEgiQDJKP6QLiOV9ygVWX+E7Pd3Z1PNQeusbg5IIkAyij+kFplq98hI5qHmfzxSg3LusfjrcEASAZJR/CGdW13tHhnJPWfD1pH1KaO7pV/EA5IIkIziD+mMumr3yEj2yU/yXjiNUq//MvKGgCQCJKP4Q6rfUO0eGck/i1DBexcSXRLslS0CAiQRIBnFH1Ktxmr3yMjO03EVfnApUevZ4TcCJBEgGcUfUtWmavfIyObz2s28kugfS8JtAUgiQDKKP6T0C9XukZHtJ4icfSmldgnzvP6AJAIko7hDKqZWavfIyP4zrRZOOoOyHwp5JzxAEgGSUdwh7aXL1e6RUTRPWbxzTF2q9Wxe8AsBSQRIRnGHlE+d1O6RUXTP/b3t6erUYFzQ+1wAkgiQjOIOaTtdp3aPjKJ9Ev0/+mVS0ylBLgAkESAZxR3SRuqido+Mon81ip+6pdJl8yqdDUgiQDKKO6Q1dLPaPTJS8bIuC6+k1EovmA5IIkAyijuk7+h2tXtkpOb1kT4+m3KfCfytAyCJAMko7pCG0r1q98hI0QuN7XyuBjX+0HwOIIkAySjukN6gx9TukZGyV+xbe1sqXbNMfAxIIkAyijukgnuXRdzGVgpf+nJBK8p8YIvxESCJAMko7pCOaPvU7pGRyteQLZzUkE4a538BQEASAZIRIFlr22NZdKHvbuGAJAIkI0Cy2vIulNJd/1U4IIkAyQiQrPfRWVR9RD4gmQIkI0CSaOfw6nTePEASAZIRIEm15mZK7X8AkIwAyQiQJJt2FtV/NSaTAUkESI6HxHYMyaKrwzyA1n6AxAMk50Nih9e3p+zHonnNvxABEg+Q3ABJ2/f/atF5c5UPBiQeILkC0n62pjul9t6seDAg8QDJJZAY+6IxnfT/1A4GJB4guQYS2/5YJl2j9JcOgMQDJPdAYmxRK8p+WuFr0gASD5DcBIkVjK1Bzb9VNhiQeIDkKkiMre5KGYNCPP+ddIDEAySXQWJsSgNqUvmZhmwFSDxAch0ktql3Snq/P1UMBiQeILkPEmMfN6TTvlAwGJB4gORGSGzbA6kpvSVeFz1EgMQDJFdCYmzmmXTKp9EOBiQeILkUEtv+QFpKtPcZAiQeILkVEmOzz6Z670Y1GJB4gOReSGzHYxnUZV0UgwGJB0guhsTYN+dT3cn2BwMSD5BcDYntfDqTuvxhdzAg8QDJ3ZC835RqT7I5GJB4gOR2SCx/SCb9fbWtwYDEAyTXQ2Lsu1ZU46VCG4MBiQdIgMRY4biq1MbGa2wAEg+QAElvxRVURf4hf4DEAyRA8jWpFrX6TnIwIPEACZD8/X49ZTwg95A/QOIBEiDx3qtHTefLDAYkHiABkmhjb5J6yB8g8QAJkMx91JBO+8zyYEDiARIgBeR9yJ/VR1cAEg+QAKlCs86iem9b2xSQeIAESBWz/ugKQOIBEiBVbsF5VMfKHVkBiQdIgBSknU9mUrdNETcDJB4gAVLQvmtOjaZH2giQeIAESMHLfywtpV+EOzoAEg+QAClUs0+jpovCbgFIPEACpJBt7k1VRoZ7nBIg8QAJkMI0qSZd+VvoiwGJB0iAFK6VbenE0M99B0g8QAKksBWOzKTuoe7HCkg8QAKkCC1qSmctCH4RIPEACZAitaNfSvpjQR+GDkg8QAKkyH1cj1otD3I+IPEACZAs9Md1lDuu8tmAxAOkCh3YHakj2v6I29hqb0ls5u4+bGGvIvXfHOq6oeKZe0qjnhu8Yq0oRpPLYjT3UNKt2AMpxCX7VEA6VhKpMq004jb2Ko/RXCUrXt+GTllQ8cyEXnHQkm/FWozmlmllIS45rgISbtqFKsid73DTjoebdoBkuRmN6IIl5jMAiQdIgGS9TTdTNfMD/gCJB0iAJNOEKikD8vlHgMQDJECS6pvTqDV/DRhA4gESIMm1+XqqbTzzHSDxAAmQJCt8Oi39ad9JQOIBEiBJ90Ud+rv3mVEAiQdIgCTfLy2p8WIGSKYACZBslNfP+3twQOIBEiDZ6pXslH47AYkHSIBkr69PpTZrAMkIkADJZpv/QfXDP1uX/QBJBEiyJRkkVvh0qvF7cNUBkgiQZEs2SIx9UZdukniBP+sBkgiQZEs+SLs3t6CzZF8K3UqAJAIk2ZIQUkleb6r2pvrBgCQCJNmSERJj/0//PbjqwYAkAiTZkhMSW3gq
Xfa74sGAJAIk2ZIUElvfkerPVjsYkESAJFuyQmIFg1Iz/6N0MCCJAEm2pIXE2Acn0N1Bn4vVZoAkAiTZkhgS++lsukbhX5QASQRIsiUzJLbhcjr3F2WDAUkESLIlNSSW153qf61qMCCJAEm25IbECh9LqfahosGAJAIk2ZIcEmMTMtJfUDMYkESAJFvSQ2Kf1qB+4V682XKAJAIk2ZIfElvciLruUDAYkESAJJsDILHVzemSddEPBiQRIMnmBEjsz2vpnBVRDwYkESDJ5ghIbFdfqjs/2sGAJAIk2ZwBibGRqTnvRDkYkESAJJtTILHJVdJGRzcYkESAJJtjILG5talfQTSDAUkESLI5BxL76Sz6+7YoBgOSCJBkcxAktr4NXbzG/mBAEgGSbE6CxPK60Snf2x4MSCJAks1RkFjhY3TCF3YHA5IIkGRzFiTGXsrIfMXmYEASAZJsToPEpuamPGZvMCCJAEk2x0Fii06mW2095x0giQBJNudBYr+dTx022RgMSCJAks2BkNjWq6npSvnBgCQCJNmcCInt7EX1F0sPBiQRIMnmSEiMPZVSa6HsYEASAZJsDoXEXkitLvuMxoAkAiTZnAqJvZqe84ncYEASAZJsjoXE3sjIfFdqMCCJAEk250JiH2RlTpYZDEgiQJLNwZDYF1XTJkgMBiQRIMnmZEhsZm7qi9YHA5IIkGRzNCS2oFbKc5Y3BiQRIMnmbEjsu5NosNVtAUkESLI5HBJb2oAesLgpIIkASTanQ2IrTqO7rT01OCCJAEk2x0Niq86g3paeXQiQRIAkm/MhsTXnUrd8C9sBkgiQZHMBJLbhYupq4aF+gCQCJNncAIltakVXR37hF0ASAZJsroDE/uxAbbdG2giQRIAkmzsgsbzr6NLNEbYBJBEgyeYSSCzvemoe4cXIAEkESLK5BRLb1YPOWR12C0ASAZJsroHECm6jxr+E2wCQRIAkm3sgscJ7qdHyMJcDkgiQZHMRJMYG0ck/hL4UkESAJJurILGnqc63IS8EJBEgyeYuSOz5lBPmhboMkESAJJvLILFxoZ+mC5BEgCSb2yCFeZouQBIBkmyug8Tezcx8J+gFgCQCJNncB4m9n5X5VrDzAUkESLK5EBL7JCcj2BPeAZIIkGRzIyQ2LagkQBIlMqSD4/r0HF7g/2BW3xvv/xGQ7KQAEptZNW1ipTMBSZTIkEYM3pw3dmCZ9/SC3ssLvuhXDEg2UgGJzaia8XbF8wBJlMCQWJdNnu9KN6zyftDvq4DLAEkiJZCCSQIkUQJDWtKt3PP2/o/007s7f/Wvmx9dq58sOeBp7+5IHdH2R9zGVnuPx2bu7sPagdgM3lOiZMyMnIx3A88p1oqUTK5caYzmHorZiscXYRQAACAASURBVMtiNNcDKcQl+6xCmnuH/nbIRP3tus5Pbi+a2GO/5+TCFp6Whf+nKBZ9WzXzi3ivAfHK+KlIkO7U3xqQPLfwSm9d4Dm56j5Pvx2PVJlWEnEbe5XHaG7ir3h2duZn5o9LE37FFSvVSmM0+a9f8TGrkH7w3bSbqp9mnTd43g6calyGn5EkUvMzkt7UKgEvRYafkUQJ/DPSni4ePAe6rvZ+G+s9Q9OOdV8ESDZSB4l9nJX5nvgIkEQJDEkb/dDmHc88Uq7Nn65pU3utZC/3PgJINlIIKVASIIkSGVLx+N69Rnk2HzPU8y3p7dtvfGIbvwiQJFIJKUASIIkSGVKYAEkipZB0Se/7TwKSCJBkczsk9lFW5ge+U4AkAiTZXA9JSAIkESDJBkjsw8wq3sfMApIIkGQDJC4JkESAJBsgMa+kTwHJHCDJBkh6UzKzPwMkU4AkGyB5+8AjCZBEgCQbIPmanJE9G5B4gCQbIPmbnJGzEJCMAEk2QDLySJoZm8mAxAMk6ZIOEns/PfuL2EwGJCNAki75IBVPTc+JjSRAMgIk6ZIQkvZOes60WEwGJCNAki4ZIR2YFBtJgGQESNIlJST2RnrOdPWTAckIkKRLTkgeSblzlE8GJCNAki5JIcVEEiAZAZJ0yQqJvZ5efa7iyYBkBEjSJS0kNjFNtSRAMgIk6ZIXEpuQWmOB0smAZARI0iUxJI+kE5eqnAxIRoAkXTJDYmOowQqFkwHJCJCkS2pI7DE6Y426yYBkBEjSJTck1p8u2qJsMiAZAZJ0SQ6psAddvkPVZEAyAiTpkhwS23k1/S1f0WRAMgIk6ZIdEtvemnoUqpkMSEaAJF3SQ2KbzqcH1UwGJCNAki75IbG1Z9IwJZMByQiQpHMAJPbTSSnjVUwGJCNAks4JkNiimmlvKpgMSEaAJJ0jILG5OZkfRz8ZkIzkIFU1lQlIivtLIbFPMqt9GfVkQDKSg3SLp3My2nS74cKUFvcDkuL+WkjstdRa30c7GZCMpG/aTW22U3/3R5PpgKS4vxiSijuwApKRNKRmH/ve/685ICnur4ak4A6sgGQkDSlzgf87UxYgKe4vhxT9HVgByUgaUoNe3nflt9QHJMX99ZCivgMrIBlJQxpG5z8wYsTApvQEICnur4cU9R1YAclIGlL5f+qTp9pPlwKS4uIASb8D6y1R3IEVkIxs/EG2/M9lP2wqs8IIkKSKB6Qo78AKSEY2IB358TOmlQCS8uICKbo7sAKSkTykF3KJlmpP3WGJEiBJFB9IUd2BFZCMpCFNpC6veiC9nT4GkBQXJ0jR3IEVkIykIV3QXzvigaQ9eTYgKS5ekKK4AysgGUlDqvKlD9K8DEBSXNwg2b8DKyAZSUOqO8MH6ePqgKS4+EGyfQdWQDKShnRVh8M6pD3NrgEkxcURkt07sAKSkTSkr9MaP0h39ame8R0gKS6ekNjjtu7ACkhG8r/+XnCRfs+GS76x4giQZIorJHt3YAUkIzsPNS9YuXKvZi1Akii+kGzdgRWQjKQhtZll0RAgyRZfSLbuwApIRtKQGo4DJGdCsnMHVkAykoY0rennxwEpJsUbkn4H1gfkJgOSkTSkdudTZoNT9QBJcXGHxNY2lrwDKyAZSUNq27GTP0BSXPwhSd+BFZCMbD9B5MH1gKS4BICk34F1ksRkQDKyDWlBLUBSXCJAYnOrytyBFZCM5CHN7NWubdu2rXNrA5LiEgIS+zCj2kLLGwOSkTSkKZTekBpUoSst/T0JkCRKDEjsfyknWb7bHSAZSUNqcV2RlvZbyctXFAGS4hIEEhtKZ6+3uCkgGUlDyp2paWm/atpDAwFJcYkCifWl1hbvLARIRvIP7JujadUXadriBoCkuISBVHA9dSmwtCUgGUlDuujmY9p5QzRtWlVAUlzCQGLbW9F9ljYEJCNpSO9SJ+3ptH7DT74MkBSXOJDYusb0nJXtAMlI/tffU0ZrxVcTNVoOSIpLIEjspzqpVp5ZCJCMbP5BdsMaa/dcBSSJEgkS+zIna2bkrQDJCK8hK507ILEP0msuibgRIBlJQzrRKBeQFJdYkNj/UaPfI20DSEbSkLp6uyS7Gf6OpLoEg8Qeogu2RtgEkIzs3rTLbz8TkBSXaJAKe1CnCI89ByQj2z8jLW8BSIpLNEhs5xX
UK/wWgGRkG1J+NiApLuEgsc3N6PGwGwCSkV1I5SMbWoFUtDdSR7UDEbex1f6S2Mzde8TCXtlqX2ls5u49rB20+S/XNEyZEO7yMptzI3VYOxSjybFacbFWHOKS/cEhNffWrDYNsgLpWEmkyrTSiNvYqzxGc1214l9qZswMc3ECrjhCWozmlmllIS4Rf3ENAumiji8dswIJN+0kSrybdp6mZVb7KvSluGlnhD/ISucuSGxiapjH+QGSESBJ5zJIYR/nB0hG0pDSc6qaAiSFJSikcI/zAyQjaUgDzk1vfdMNF6Zc2OMWT4CksESFFOZxfoBkJA1p6vl5+ru158yIhAiQJEtUSGEe5wdIRtKQzpvqe/+/5oCkuISFxNadGeJxfoBkJA0p80vf+4+zAElxiQsp5OP8AMlIGlKDnuX6u9LO9QFJcQkMKdTj/ADJSBrSv6nxfcOGDTyXngQkxSUypBCP8wMkI2lIZaPq668hW2dYKSApLqEhBX+cHyAZ2fiDbPmfy37YVGaFESBJldiQgj7OD5CM5CEV79S0w5Nf2ARIqktwSMEe5wdIRtKQ1tYdrZW0JKqxApAUl+CQgj3OD5CMpCHddP5G7V16ZeNlNwOS4hIdEtt8XsXH+QGSkTSkuu9r2o3NNO39RoCkuISHxH5rmPJSwBmAZCT/B9mFWmnNxzVtfiYgKS7xIbHFJ2R8ZP4YkIykITV6Q5tPCzVtEv4gq7okgFTxcX6AZCQN6e56T5x6ZqlWcAF+RlJdMkCq8Dg/QDKShrSzNdVeqmm31PgFkBSXFJACH+cHSEY2/iB7QH82h+W7rDgCJJmSA1LA4/wAycjOQ80PzymwpAiQ5EoSSObH+QGSkR1IW+hzQIpBSQLJ/Dg/QDICJOlcD8n0OD9AMgIk6QBJPM4PkIwASTpAEo/zAyQjO5COrdyvWQyQJEoiSMbj/ADJyP4TRG4BJMUlEyQ2hk75HZBEkpC+vabxNbP1E0efw8u6qC6pILEH6aJtgMSTg7Q0I+WUjJSPNW3eWXQOICkuuSAVdqerdwGSkRykrjVWaQUtm26/mU4YfzwYHECKouSCxPIup/6AZCQH6bQHPW/mUJW0+5gVRoAkVZJBYusb04uxmMucDyn9v543W6nDb9YYAZJUyQaJ/VQ79Z2YDHY8JHrd8yaf5lh1BEgyJR0kNi8ne15MBgMSINkv+SCxD1NPWhmLuYAESPZLQkilQ+mcjTGY63hITy5dunQWjV+qB0iKS0ZI7E5qm6d+ruMhmQMkxSUlpF3XUE/1c50OaZg5QFJcUkJiW86jocrnOh2SdIAkUXJCYr81SHlF9VxAAiT7JSkktrBq5ueK5wISINkvWSGxD9Nr/aB2LiABkv2SFhIbR6f+oXQuIAGS/ZIXEutPl+4Iu6FkgARI9ktiSAXXU9eCsFvK5QpIR378jGklgKS8JIbEtrekhxTOdQOkF3KJlmpP3WGJEiBJlMyQ2LrTaay6uS6ANJG6vOqB9Hb6GEBSXFJDYt+fkDFV2VwXQLqgv3bEA0l78mxAUlxyQ2LTM3O/VTXXBZCqfOmDNC8DkBSX5JDYayn1f1E01wWQ6s7wQfq4OiApLtkhsUF0wVY1c10A6aoOh3VIe5pdA0iKS3pIhbfQVbuUzHUBpK/TGj9Id/WpnvEdICku6SGxne2pj5K5LoCkLbhIfzDSJd9YcQRIMiU/JLapKY1UMdcNkDStYOXKvZq1AEkiB0BiP9dJfUvBXHdAkgiQJHICJPZldpU50c91OqRzzAGS4hwBiU1OPXF51HOdDqmtOUBSnDMgsWF09oZo5zodknSAJJFDILG76bJon1jIFZB2zZ789txdgKQ8p0DadR3dGuVcF0Da9890/dffKb0OAZLinAKJbWlGT0Y31wWQ7si4++2ZX7zWlfoDkuIcA4n9dnLKf6Oa6wJINd/2vR98IiApzjmQ2KLqmZ9FM9cFkLLyfe8X5gCS4hwEiX2UXnNpFHNdAOni733vX2kPSIpzEiQ2nk5Za3+uCyAtaLm4XNNKZ533MyApzlGQ2ED9xZrt5gJIretQ1TPOyKZGTazcuwGQJHIWpMKbqLPtJxZyAaSL28jcuwGQJHIWJLa9FT1gd64LIMkFSBI5DBJbdwb9x+Zcd0Aq2ucNkBTnNEhsea209+zNdQGkTf+oihcai81gx0FiMzKrfWNrrgsgXVGj16DB3gBJcc6DxCam1FtlZ64LIFX93gogQLKRAyGxwdR0k425LoBUNw+QAMkoIqTCHtQpX36uCyA9OgKQAMkoIiS2swPdLj/XBZCOXdV20Ghv/KyD4/r0HF7AP1zQeSkg2cmRkNjmpvSs9FwXQBpNVPG3diMGb84bO7DM/9G+27sBkq2cCYmtqJM6WXauCyDV7/bdxi3ejHNYl02e70o3rDKkTbodkGzlUEhsXnaVLyXnugBSVqVfNizpVu55e/9H/o/6HvFBKjngae/uSB3R9kfcxlZ7j8dm7u7D2oHYDN5TEpu5u4u1ohhNLrW01Vspp2yQm3soZisui9FcD6QQl4h7LgRAumiVVqG5d+hvh0z0fnCw90rNB2lhC0/LKm6M3NhTdFVpvNcQt8r4qQBI33b8pcKGc+/U3/ohvfii5oe0Wv+j7dqjkSrVjkXcxlbHymIz17Pi47EZfKw8NnOPlsRqxUctrvjwVTRUam7cVyxdiVYS6qLgkNo2pGqnejPO+cF3026qfnpl7yIDkjf8jCSRU39G8rS2QeqHMnNd8DNSu05Gxjl7umzQtANdV+unx3Tr2bNnl+6jAMlGDobE5mbW/FlirgsgGR1cz0+OfmjzjmceKdfmT/ft/m3zDwCSjZwMiY2iiySeNdJFkBbU4ieLx/fuNcqz+Zihvo9x085ejobEutOd1jd2A6SZvdq1bdu2dW7tYLwqBkgSORvSn01oguWNXQBpCqU3pAZV6MpZgKQ4Z0NiP+RmLbS6rQsgtbiuSEv7reTlK4oASXEOh8TeoNOtPqTCBZByZ2pa2q+a9tBAQFKc0yGxe+m6QmtbugBSlTmaVn2Rpi1uAEiKczyk/NY0zNqWLoB00c3HtPOGaNq0qoCkOMdDYr+flD7N0oYugPQuddKeTus3/OTLAElxzofEvkiv86uV7VwASZsyWiu+mqjRckBSnAsgsaHUysrfZd0AyduGNcetOAIkmdwAqfB6GmBhMzdAKt6paYcnv7AJkFTnBkhsy1kpkyJv5QJIa+uO1kpaEtVYAUiKcwUktii76vcRN3IBpJvO36i9S69svOxmQFKcOyCx/1HjLZG2cQGkuu9r2o3NNO39RoCkOJdAYr3pxkibuABS5kKttObjmjY/E5AU5xZIeRfS8xE2cQGkRm9o82mhpk2qD0iKcwsktrJWxqzwW7gA0t31njj1zFKt4AL8jKQ610Bin6Sd/EfYDVwAaWdrqr1U026pUfE5UAAp2twDiT1K7XeFu9wFkDTtgP632OW7rDgCJJlcBKmgIw0Kd7krIGmH5xRoFgMkiVwEia1vlPJOmIvdAWkLfQ5IMchNkNi8zBN+Cn0pIAGS/VwFiT
1P520PeSEgAZL93AWJ9aDeIS8DJECyn8sgbW9GL4e6zOmQthdrW45px1buB6QY5DJIbFn1rK9CXOR0SFVmaGTpAX2AZCO3QWLvpDRaH/wSp0PKvn0RvbHYHyApznWQ2AC6JvjTCjkdUk8yBUiKcx+k/DY0NOgFTodUMvMtGjbZHyApzn2Q2Jp6qVODne90SJ46rbMCCJBs5EJIbHZG7WBPK+QCSJq2e+bEN+ZaesJiQJLKjZDYMGoZ5GmFXACp7NEM/QekqmMASXWuhFTYme6tfK4LII2hGyfNnvnatfQ2ICnOlZDYlrPplUpnugBS00d87++5GJAU505IbFFO1e8qnucCSFlf+d7PygYkxbkUEnuNGm+ucJYLIFWd4Xv/RTVAUpxbIbE76YYK57gA0uVXHtPfHbnmCkBSnGsh7WxFowLPcQGkWSmn9B/xbL8GqV8CkuJcC4mtqpUxM+AMF0DSPm+i//r7fEsvIQtIMrkXEvs0re5q88dugKRpeT9afOoTQJLKxZDY43S5+WmF3AEJT34Sm8FuhlTQiR4yfegOSHiEbGwGuxkS23BKylviI0ACJPu5GhL7ukqN5fwDQAIk+7kbEnuBzt1mnAYkQLKfyyGxW+mfxkl3QMKTn8RmsNsh7biAxvtPugBSizW+9580BSTFuR0S+7FG1gLfKRdA8j+LUMlwvNCY6lwPib2X0mid94TjIZme+wQPo1AdILEH6OoC/b3jIa16ibrerdf339sBSXGAxHZdQU/q7x0PSdOuXW8FECDZCJAYW1s/9WPmCkhyAZJEgORpTmbNFa6AdKJRLiApDpD0RtDFeW6A1NXbJdnNBgKS4gDJW1fq6wZI/vLbzwQkxQGSty1n0UT3QNKWtwAkxQGSr8XZNda6B1I+nkVIdYDk7z90WalbIJWPbAhIigMkf4VX0wuOh9TcW7PaNAiQFAdIRmvrZC2JzeREg3RRx5eOAZLiAIk3hZruiM3khIEkFyBJBEi8Q33o/thMThxIG6e/P3MHIMUgQOIdOnhG6ucxmZwokKY18971u803gKQ8QOId0uanNdgQi8kJAmkc5fR6cfL4W3NS3wQk1QES75BW9AB1j8XkxIC0KrXtTu+JvMsyLL0EJiBJBEg8D6SdF9LrMZicGJDuqLnbf2p3zXsBSXGAxNPvIvRdlRqr1E9ODEin9eMn72kMSIoDJJ73vnYj6IpC5ZMTA1LWWH7y/3AXIdUBEs8LqbAjjVY+OTEgVRvNTz6PxyOpDpB4vnt//1oza5HqyYkB6fx/8pOdmwOS4gCJ538YxWQ6P0/x5MSA9HjGav+pJalDAUlxgMQzHo90Ez2seHJiQNpZ4+Q5+vuyKbVO3F2ZDSBFFSDxDEgbG6ZOVzs5MSBpC6rTaTf16VKfai+x4giQZAIkHn+E7PTUU7YonZwgkLStA04motMH5VtyBEgyARJPPNS8P/VSOjlRIHk6sOOgNUWAJBcg8QSkvHPpTZWTEwiSTEX7InXUwja2OlASm7meFR+MzeD9pbGZu++IdihGk8tiNPewWPHSrBPXKZwcqxUXa8UhLjmgAtLR45Eq00oibmOrkvLYzI3dio/HasWlSbjiUn56JF1zTN3kv2LFgYlHwOKBfQHhpp0o9jftGCtoS+PUTU7Sm3aAJBEg8QKe127VCTk/KJsMSLIBEi/JIbEJdOFOVZMBSTZA4iU7JNaVBquaDEiyARIv6SFtaJA+V9FkQJINkHhJD4lNTTltq5rJgCQbIPGSHxK7i+5SMxmQZAMkngMgbWuc8oGSyYAkGyDxHACJfZVRe62KyYAkGyDxnACJPUF/VzEZkGQDJJ4jIOW3ogkKJgOSbIDEcwQk9lO1qj9GPxmQZAMknjMgsXF0ya6oJwOSbIDEcwgkdi0NjXoyIMkGSDynQPqjTsb8aCcDkmyAxHMKJPYunb09ysmAJBsg8RwDid1G90Y5GZBkAySecyD9eUbKh9FNBiTZAInnHEhsdlr99VFNBiTZAInnIEjsEbo+qsmAJBsg8ZwEKf9iejWayYAkGyDxnASJLcmuvjKKyYAkGyDxHAWJjabWBfYnA5JsgMRzFqTCq2iE/cmAJBsg8ZwFif1WK/Nb25MBSTZA4jkMEnuLmuywOxmQZAMkntMgsVvoX3YnA5JsgMRzHKRNjVK/sDkZkGQDJJ7jILGZaQ022JsMSLIBEs95kNj91MPeZECSDZB4DoSUdx69YWsyIMkGSDwHQmLfVTlhlZ3JgCQbIPGcCIk9S1cU2pgMSLIBEs+RkAra0fM2JgOSbIDEcyQk9mvNrMXykwFJNkDiORMS+x9dkCc9GZBkAySeQyGxG+lR6cmAJBsg8ZwKaWPD1OmykwFJNkDiORUS+zTl1C2SkwFJNkDiORYSu4dul5wMSLIBEs+5kPKa0mS5yYAkGyDxnAuJLcw88XepyYAkGyDxHAyJDaGOUndwACTZAInnZEgFl9H/yUwGJNkAiedkSOzn3JwfJCYDkmyAxHM0JPYyXbjT+mRAkg2QeM6GxLrQk9Y3BiTZAInncEjrTkqfa3ljQJINkHgOh8Q+Tjl9q9VtAUk2QOI5HRK7k+62uikgyQZIPMdD2tY4ZYrFTQFJNkDiOR4Sm5d+0jprWwKSbIDEcz4k9jjdYm1DQJINkHgugJTfjKZa2hCQZAMkngsgsXlpjf60sh0gyQZIPDdAYn3pISubAZJsgMRzBaQtJ6cvtLAZIMkGSDxXQGJT6KJdkbcCJNkAiecOSOx6Gh15I0CSDZB4LoH0e41qkZ8OHJBkAySeSyCxMXRVxG0ASTZA4rkFUsGl9GakbQBJNkDiuQUSW5pZN9IL+QGSbIDEcw0k9ijdEWELQJINkHjugZR3durM8FsAkmyAxHMPJDY9pXH4V6gAJNkAieciSKwXDQ57OSDJBkg8N0HaVC/z+3CXA5JsgMRzEyQ2kdqEe+pVQJINkHiugsSupfFhLgUk2QCJ5y5IK6tW/y30pYAkGyDx3AWJPUs3hr4QkGQDJJ7LIO1qTu+GvBCQZAMknssgsW8zGoZ8wkhAkg2QeG6DxAZS/1AXJTKkg+P69Bxe4Du9Z+xt3Z9YB0h2AiRelJC2nZo6J8RFiQxpxODNeWMHlnlPPzx4084Xeh0BJBsBEi9KSOxjOjfES70kMCTWZZPnu9INq/TTRaO2aVph5/WAZCNA4kULiXWjYcEvSGBIS7qVe97e/xE/Y21X/d8e3uGJ7Y3UUe1AxG1sta8kNnP3HtGKYjN4X2ls5u49rB2M0eTYrfhQdAM21M7+OegFZdHNDVmxVhzikv1WIc29Q387ZKLxcdGAyfq7hS08LQv/TxGKUW/StfFegr8yfioSpDv1txzS9nte0b9DaasHe1p7NFKl2rGI29jqWFls5npWfDw2g4+Vx2bu0ZJYrfho4q74SEeaHOz82K24JNRFViH94LtpN9X30aqeM0yX4WckifAzEi/qn5EYW1al1h9Bzk7gn5H2dNmgaQe6rvZ+8PutP5kvAySJAImnABJ7im4Ncm4CQ9JGP7R5xzOPlGvzp2vH+nlf8Am//
rYTIPFUQMpvlvJJ5XMTGVLx+N69Rnk2HzNUW9XZ20xAshEg8VRAYvPSzthe6cxEhhQmQJIIkHhKILG7g7xABSDJBkg8t0La3CD964rnAZJsgMRzKyT2AV1c8QUqAEk2QOK5FhL7R6UXqAAk2QCJ515Iq2tU+yXwHECSDZB47oXE/kNXB54BSLIBEs/FkAouockBZwCSbIDEczEktiTzpI3mjwFJNkDiuRkSe4TuMn8ISLIBEs/VkPLOTp1l+hCQZAMknqshsWkpZ5leoAKQZAMknrshsZ70hPgAkGQDJJ7LIa2vY3qBCkCSDZB4LofEXjW9QAUgyQZIPLdDYtfQi8ZJQJINkHiuh7Sias01/pOAJBsg8VwPiQ2nm/ynAEk2QOIB0q7m9J7vFCDJBkg8QGLfGC9QAUiyARIPkBgbQAO87wFJNkDiAZL+AhVpC/T3gCQbIPEAydNHdJ7+AhWAJBsg8QBJ7yYazgBJPkDiAZLeulrZPwGSfIDEAyRvL9KVgCQfIPEAyVthe3oVkKQDJB4g+VqWVWsdIMkGSDxA8vcE9QQk2QCJB0j+8s9L+TIWcxkg2QiQREkGic1NPWtHTAYDknyAJEo2SOwueiQ2gwFJOkASJR2kzQ3Tv4nJYECSDpBESQeJfUItCmIxF5CkAyRR8kEq+zv9JxZzAUk6QBIlIaTVNXJ/ibyZdIAkHSCJkhASG13xBSqUBEjSAZIoGSEVtKK31M8FJOkASZSMkNiijMAXqFASIEkHSKKkhMQeoruVzwUk6QBJlJyQ8s4KeIEKJQGSdIAkSk5IbFpK07xIG0oGSNIBkihJIbFb6SnFcwFJOkASJSuk9bUzl6idC0jSAZIoWSGx/9FlheE3lAyQpAMkUdJCYtfQy0rnApJ0gCRKXkgrqtZcq3IuIEkHSKLkhcSGUTeVcwFJOkASJTGk/AvofYVzAUk6QBIlMST2TXqjrermApJ0gCRKZkisPw1UNxeQpAMkUVJD2naK7wUqlARI0gGSKKkhsQ/pvHxVcwFJOkASJTckdiM9q2ouIEkHSKIkh7TmBP0FKpQESNIBkijJIbHx1FHRXECSDpBEyQ6psB1NVDMXkKQDJFGyQ2JLM+usVzIXkKQDJFHSQ2JP0C1K5gKSdIAkSn5I+RfQOyrmApJ0gCRKfkjsm4x6Kp5TCJCkAySRAyCxh6iPgrmAJB0giZwAKa9JytTo5wKSdIAkcgIkNjdNwd3AAUk6QBI5AhLrT/dEPReQpAMkkTMgbTs9dUa0cwFJOkASOQMSm5Zy5vYo5wKSdIAkcggkdic9EOVcQJIOkEROgfTnqWnzo5sLSNIBksgpkNinKWdH92zggCQdIIkcA4n1oMeimgtI0gGSyDmQNtVPXxjNXECSDpBEzoHE3qdmO6OYC0jSAZLIQZDYjTQ0irmAJB0giZwEaV3tzO/sz40xpKJ9kTpqYRtbHSiJzVzPig/GZvD+0tjM3XdEOxSjyWUxmns4HiueRC13255brBWHuOSACkhHI1aqHYu8kZ2OlcVmrmfFx2M0uTxGc0uwYl64Fd9AY23PLdFKQl2kAhJu2kmEm3a8eNy0Y+z3mtk/2p2Ln5GkAySRsyCxCdTW7uv4AZJ0gCRyGCTWkV6wOReQpAMkkdMg/Vojd6W9uYAk0i8KqwAAC59JREFUHSCJnAaJjaEO9m7cAZJ0gCRyHKTCDjTB1lxAkg6QRI6DxH6uWv1XO3MBSTpAEjkPEhtB19qZC0jSAZLIgZAKLqXXbcwFJOkASeRASGxpVq218nMBSTpAEjkREhtC3eTnApJ0gCRyJKT8C+kt6bmAJB0giRwJiX2bedIG2bmAJB0giZwJiQ2inrJzAUk6QBI5FFJeU/pIci4gSQdIIodCYgszGm6RmwtI0gGSyKmQ2L/oLrm5gCQdIIkcCynvnJRPpOYCknSAJHIsJDY79ZQ/ZeYCknSAJHIuJNaXBsjMBSTpAEnkYEjbTkudJTEXkKQDJJGDIbHPU87aYX0uIEkHSCInQ2K308PWNwYk6QBJ5GhIm09OX2B5Y0CSDpBEjobEptC5lp9XH5CkAySRsyGx7vSE1U0BSTpAEjkc0vq6mYssbgpI0gGSyOGQ2Lt08S5rWwKSdIAkcjok1pmesbYhIEkHSCLHQ1p3Yub3ljYEJOkASeR4SOwValVgZTtAkg6QRM6HxP5Oo61sBkjSAZLIBZB+OyFnuYXNAEk6QBK5ABIbT+0sPK8+IEkHSCI3QCq8ksZH3gqQpAMkkRsgsRXVcn+JuBEgSQdIIldAYqPp6ojbAJJ0gCRyB6SCNvS/SNsAknSAJHIHJPZDlVprImwCSNIBksglkNgw6hJhC0CSDpBEboFU0Iomh98CkKQDJJFbILElWSf+EXYDQJIOkESugcSeoO5hLwck6QBJ5B5I+c3pnXCXA5J0gCRyDyT2TUa9jWEuBiTpAEnkIkjsIeod5lJAkg6QRG6ClNck5ePQlwKSdIAkchMkNjet0daQFwKSdIAkchUkdh/1C3kZIEkHSCJ3Qdp2RuqMUJcBknSAJHIXJDYt5cztIS4CJOkASeQySOwu+leISwBJOkASuQ3Sn6emzQ9+CSBJB0git0Fin6WcnRf0AkCSDpBEroPEbqXHgp4PSNIBksh9kDY1SP8q2PmAJB0gidwHib1PzYK9aBIgSQdIIhdCYjfRkCDnApJ0gCRyI6R1dTIXVz4XkKQDJJEbIbHXqUXlF00CJOkASeRKSOx6GlHpPECSDpBE7oT0e83sHyueB0jSAZLInZDYBGpb8Xn1AUk6QBK5FBK7jsZWOAeQpAMkkVsh/Vojd2XgOYAkHSCJ3AqJjaX2gTfuAEk6QBK5FlLhFTQh4AxAkg6QRK6FxFZUrf6r+WNAkg6QRO6FxJ6ja8wfApJ0gCRyMaSCS2mi6UNAkg6QRC6GxJZm1VorPgIk6QBJ5GZIbAjdJD4AJOkASeRqSPkX0lv8A0CSDpBErobEvs08aYNxGpCkAySRuyGxx+hW46QKSAfH9ek5vKDyaUCSC5B4SQIprylN859UAWnE4M15YweWVToNSHIBEi9JILH5T+X7TymAxLps8nwnumFVxdOAJBkg8ZIFkkgBpCXdyj1v7/+owumiNZ4K9kXqqFYUcRtbHSiJzVzPig/GZvD+0tjM3XdEOxSjyWUxmns46VZcrBWHuOSAVUhz79DfDplY4fTCFp6Whf+nCDk+/nNOREh36m/9kMTpdSM9rT8SqVLtWMRtbHW0LDZzj5TEasVHkm/F5TGaW6Idj9HkWK34eOgVW4X0g+/m3NSKp/XwM5JE+BmJ58qfkfZ02aBpB7qurngakCQDJJ4rIWmjH9q845lHyrX508VpQJIPkHjuhFQ8vnevUZ7NxwwVpwFJPkDiuRNSmABJIkDiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiARIg2Q+Q
eIAESPYDJB4gAZL9AIkHSIBkP0DiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiAZJ0c0buiuX4GPTVyD/jvQTJFo/cGO8lSLZs5O/xXoJkK0eujLhNTCGNbvFHLMfHoJda/BzvJUg2scV38V6CZO+3mBfvJUj2eYvPI24DSAEBUuwDJOkAKfYBUuwDJOkAKfYBEkIoRICEkIIACSEFARJCCooJpIPj+vQcXlD5dOJmXuWesbd1f2JdfNcTuQrHdUHnpXFcjKUCVjyr7433/xjX5VjJvOTtz/a65YnQf0qOCaQRgzfnjR1YVul04mZe5cODN+18odeROK8oUoHHdd/t3RIeknnFC3ovL/iiX3GcVxQx05LL+00oPvreP4tCbRoLSKzLJg/mG1ZVPJ24mVdZNGqbphV2Xh/vNYWvwnEdPen2RIcUsOJ+X8V5NZYyL3l/57WatrdzyFsqsYC0pFu55+39H1U8nbhVWuXarntDb50IBa54Sd8jCQ/JvOLdnb/6182Pro3ziiIWcJAfG1905IO+x0JtGwtIc+/Q3w6ZWPF04lZxlUUDJsdtLdYKWPHB3iu1hIdkXvG6zk9uL5rYY398VxSxgIO8Z2Dnzr1D30E4JpDuFJ/ffDpxq7DK7fe8Uh7H1VgpYMUvvqglASTTitd19txcKr11QXxXFDHzkksenLC/eGqvkDdUYgHpB993xKkVTydugatc1XNGXFdjJfOKV/YuSgJI5hWzzhs8bwcm+LUiYMkruui/frpreqhtYwFpTxfPYTrQdXXF04lbwCp/v/WnOC/HQuYVj+nWs2fPLt1HxXtN4TOvuKy357+qY90XxXtNETIv+efO+u8Ye/+lkLTRD23e8cwj5dr86eJ0Ymda8bF+U/THPib6r79NK/Y+4PS2+QfivaQIma8VU3utZC/3TvRjbF5yce8JB4992m1nqE1jAql4fO9eozy3JscMFacTO9OKV3X2NjPeS4qQ+RjrJfxNu4AVl719+41PbIv3iiJmXvLW4b16PP5ryE1xFyGEFARICCkIkBBSECAhpCBAQkhBgISQggAJIQUBEkIKAqQEaRjp5bb/NOKWbc8JMcDSn2RD/GsUZYCUIA2jJ19//bWhp9CLkbbUKays/HUDpLgGSAmS30HRqbmR7oGmU3gZkBIsQEqQDAeP0DJN++aq3OyLJnk+anf5io65dXroz78xpVV2bospmpfCtZ5bgS3anuh9uGaH2scDBmjin7c9sUT/8NL6pWIiIMUmQEqQDAdD6TttQVr7GfP70wua1qlRqy8LPknro2kf0o0zZ15HM70U1nel5Wsm0See7fNTHwgcoIl//l+a7/nwz5RHTBMBKTYBUoJkOLg8fb92UWP9sS9dPDfyOpH+zN6dGmjaqI6e7z8H0nv5KNzt+bodrNbZc9kE+jlwgCb+OUu/x3NiLK00TQSk2ARICdIwmpWfv/PHu+g+rYAePOLpVfpR65SjX9Yn1diqYTsBSbsz3XOTr10zPsAPyfTP/1a3TNNanWc+C5BiEyAlSL5ff1P6gKPaSvL3mdbpVP0ync2Bp5tVT0ujtiZIi2mclpcyhg/wQzL983fpa20LPW8+C5BiEyAlSMNo/Jw5cxfv03QJdy31xkyQ2qc9tejX3xqYIWlnX6C9mLaTD+CQ+D8/mDNQ+0/KNvNZgBSbAClBMv3SbQ/1MU5ySBuon+dESZUASKNp9SXXVRpg+ufaLQ20llcEnAVIsQmQEiTzn4EuqaF/Y3p7SImAtIaGa/qfj1r7KPQl/RfbO9N60pTKA8Q/16bR5zQp4CxAik2AlCCZIX2TccHb84Zm3GH6jnS80cnTvnv0iityFx7SKfybhuu/+/4HVT8sBjw6Qe9b0z/Xjtc6o8qBgImAFJsAKUEKuGPC4qtzM84eU2KCpC1vk3PSvQdm1K65Tqew/aIMHcSn1Nc0wNdA0z/XtHvon4ETASk2AVIyN12/GwRKhAApiTvesnW8l4D8AVLStm3adWmJ/1pdbgmQkrZJKafPivcakBEgIaQgQEJIQYCEkIIACSEFARJCCgIkhBQESAgpCJAQUtD/B4iXRsAuE6zrAAAAAElFTkSuQmCC", - "text/plain": [ - "plot without title" - ] - }, - "metadata": { - "image/png": { - "height": 420, - "width": 420 - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# Create a data frame to store the results\n", - "results_y <- data.frame(\n", - " Alphas = model_y$lambda,\n", - " OutOfSampleR2 = 1 - model_y$cvm / var(y)\n", - ")\n", - "\n", - "results_d <- data.frame(\n", - " Alphas = model_d$lambda,\n", - " OutOfSampleR2 = 1 - model_d$cvm / var(D)\n", - ")\n", - "\n", - "# Plot Outcome Lasso-CV Model\n", - "ggplot(data = results_y, aes(x = Alphas, y = OutOfSampleR2)) +\n", - " geom_line() +\n", - " labs(\n", - " title = \"Outcome Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n", - " x = \"Penalty Level\",\n", - " y = \"Out-of-sample R-squared\"\n", - " )\n", - "\n", - "# Plot Treatment Lasso-CV Model\n", - "ggplot(data = results_d, aes(x = (Alphas), y = OutOfSampleR2)) +\n", - " geom_line() +\n", - " labs(\n", - " title = \"Treatment Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n", - " x = \"Penalty Level\",\n", - " y = \"Out-of-sample R-squared\"\n", - " )\n" - ] + "id": "tNLVM4WEgL9v", + "outputId": "1f2683b7-630a-43c5-e110-74c527603850", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Add Double Lasso results to the table\n", + "table <- rbind(table, c(\"Double Lasso\", hat, stderr, ci_lower, ci_upper))\n", + "\n", + "# 
Print the table\n", + "print(table)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "smPkxqCpgMR8" + }, + "source": [ + "## Method 2: Lasso with Cross-Validation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MH-eUye8liRq" + }, + "source": [ + "This section is for illustration purposes only. Given that we are using the full sample, cross validation *should not* be used for choosing tuning parameters here. Cross validation tends to (mildly) overfit, and this overfitting can lead to substantial problems when inference about parameters is the goal." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YhpTUkE_wQz9", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Choose penalty based on KFold cross validation\n", + "set.seed(123)\n", + "# Given small sample size, we use an aggressive number of 20 folds\n", + "n_folds <- 20\n", + "\n", + "\n", + "# Define LassoCV models for y and D\n", + "model_y <- cv.glmnet(\n", + " x = as.matrix(W),\n", + " y = y,\n", + " alpha = 1, # Lasso penalty\n", + " nfolds = n_folds,\n", + " family = \"gaussian\"\n", + ")\n", + "\n", + "model_d <- cv.glmnet(\n", + " x = as.matrix(W),\n", + " y = D,\n", + " alpha = 1, # Lasso penalty\n", + " nfolds = n_folds,\n", + " family = \"gaussian\"\n", + ")\n", + "\n", + "# Get the best lambda values for y and D\n", + "best_lambda_y <- model_y$lambda.min\n", + "best_lambda_d <- model_d$lambda.min\n", + "\n", + "# Fit Lasso models with the best lambda values\n", + "lasso_model_y <- glmnet(as.matrix(W), y, alpha = 1, lambda = best_lambda_y)\n", + "lasso_model_d <- glmnet(as.matrix(W), D, alpha = 1, lambda = best_lambda_d)\n", + "\n", + "# Calculate the residuals\n", + "res_y <- y - predict(lasso_model_y, s = best_lambda_y, newx = as.matrix(W))\n", + "res_d <- D - predict(lasso_model_d, s = best_lambda_d, newx = as.matrix(W))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cbVsr86tyqTY", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "tmp_df <- as.data.frame(cbind(res_y, res_d))\n", + "colnames(tmp_df) = c(\"res_y\", \"res_D\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "D7SzuZ2P0P0X", + "vscode": { + "languageId": "r" } - ], - "metadata": { + }, + "outputs": [], + "source": [ + "fit_cv <- lm(res_y ~ res_d, data = tmp_df)\n", + "est_cv <- summary(fit_cv)$coef[\"res_D\", 1]\n", + "\n", + "hcv_cv_coefs <- vcovHC(fit_cv, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", + "se_cv <- sqrt(diag(hcv_cv_coefs))[2] # Estimated std errors\n", + "\n", + "# Calculate the 95% confidence interval for 'gdpsh465'\n", + "lower_ci_cv <- est_cv - 1.96 * se_cv\n", + "upper_ci_cv <- est_cv + 1.96 * se_cv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "provenance": [] + "base_uri": "https://localhost:8080/" }, - "kernelspec": { - "display_name": "R", - "name": "ir" + "id": "Ctl5T5vUygRk", + "outputId": "1fc3990f-10c2-4e94-b1e9-a13b7a08cbab", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Add LassoCV results to the table\n", + "table <- rbind(table, c(\"Double Lasso CV\", est_cv, se_cv, lower_ci_cv, upper_ci_cv))\n", + "\n", + "# Print the table\n", + "print(table)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0LzDsUi8gmQM" + }, + "source": [ + "We find that the outcome model chooses too small of a penalty based on cross-validation, 
leading to overfitting of the outcome and tiny outcome residuals. This leads to artificially small standard errors and a zero treatment effect. Theoretically driven penalty should be preferred for such small sample sizes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 857 }, - "language_info": { - "name": "R" + "id": "7uzcIGhVgmei", + "outputId": "ecff8a3f-60da-4b92-c6f7-cb40a116ec82", + "vscode": { + "languageId": "r" } + }, + "outputs": [], + "source": [ + "# Create a data frame to store the results\n", + "results_y <- data.frame(\n", + " Alphas = model_y$lambda,\n", + " OutOfSampleR2 = 1 - model_y$cvm / var(y)\n", + ")\n", + "\n", + "results_d <- data.frame(\n", + " Alphas = model_d$lambda,\n", + " OutOfSampleR2 = 1 - model_d$cvm / var(D)\n", + ")\n", + "\n", + "# Plot Outcome Lasso-CV Model\n", + "ggplot(data = results_y, aes(x = Alphas, y = OutOfSampleR2)) +\n", + " geom_line() +\n", + " labs(\n", + " title = \"Outcome Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n", + " x = \"Penalty Level\",\n", + " y = \"Out-of-sample R-squared\"\n", + " )\n", + "\n", + "# Plot Treatment Lasso-CV Model\n", + "ggplot(data = results_d, aes(x = (Alphas), y = OutOfSampleR2)) +\n", + " geom_line() +\n", + " labs(\n", + " title = \"Treatment Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n", + " x = \"Penalty Level\",\n", + " y = \"Out-of-sample R-squared\"\n", + " )\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "name": "ir" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "name": "R" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/PM2/r_experiment_non_orthogonal.Rmd b/PM2/r_experiment_non_orthogonal.Rmd new file mode 100644 index 00000000..ef14f649 --- /dev/null +++ b/PM2/r_experiment_non_orthogonal.Rmd @@ -0,0 +1,515 @@ +--- +title: An R Markdown document converted from "PM2/r_experiment_non_orthogonal.irnb" +output: html_document +--- + +# Simulation Design + +```{r} +install.packages("hdm") +library(hdm) +library(stats) +``` + +## Generating RCT data + +```{r} +gen_data <- function(n, d, p, delta, base) { + X <- matrix(rnorm(n * d), nrow = n, ncol = d) + D <- rbinom(n, 1, p) + y0 <- base - X[, 1] + rnorm(n, mean = 0, sd = 0.1) + y1 <- delta + base - X[, 1] + rnorm(n, mean = 0, sd = 0.1) + y <- y1 * D + y0 * (1 - D) + return(list(y=y, D=D, X=X)) +} +``` + +```{r} +n <- 100 # n samples +d <- 100 # n features +delta <- 1.0 # treatment effect +base <- 0.3 # baseline outcome +``` + +## Two Means Estimator + +```{r} +# Simple two means estimate and calculation of variance +twomeans <- function(y, D) { + hat0 <- mean(y[D == 0]) # mean of outcome of un-treated + hat1 <- mean(y[D == 1]) # mean of outcome of treated + V0 <- var(y[D == 0]) / mean(1 - D) # asymptotic variance of the mean of outcome of untreated + V1 <- var(y[D == 1]) / mean(D) # asymptotic variance of the mean of outcome of treated + hat <- hat1 - hat0 # estimate of the treatment effect + stderr <- sqrt((V0 + V1) / n) # standard error of the estimate of the treatment effect + return(list(hat = hat, stderr = stderr)) +} +``` + +```{r} +# Set the random seed for reproducibility +set.seed(125) + +# Generate RCT data +data <- gen_data(n, d, 0.2, delta, base) +y <- data$y +D <- data$D +X <- data$X + +# Calculate estimation quantities +twomeans(y, D) +``` + +## Partialling-Out Estimator + +```{r} +# We 
implement the partialling out version of OLS (for pedagogical purposes) +partialling_out <- function(y, D, W) { + # Residualize outcome with OLS + yfit <- lm(y ~ W) + yhat <- predict(yfit, as.data.frame(W)) + yres <- y - as.numeric(yhat) + + # Residualize treatment with OLS + Dfit <- lm(D ~ W) + Dhat <- predict(Dfit, as.data.frame(W)) + Dres <- D - as.numeric(Dhat) + + # Calculate final residual ~ residual OLS estimate + hat <- mean(yres * Dres) / mean(Dres^2) + + # Calculate residual of final regression (epsilon in the BLP decomposition) + epsilon <- yres - hat * Dres + + # Calculate variance of the treatment effect + V <- mean(epsilon^2 * Dres^2) / mean(Dres^2)^2 + stderr <- sqrt(V / length(y)) + + return(list(hat = hat, stderr = stderr)) +} +``` + +```{r} +partialling_out(y, D, cbind(D * X, X)) +``` + +## Double Lasso Partialling-Out Estimator + +```{r} +# Now we simply replace OLS with Lasso to implement the Double Lasso process + +double_lasso <- function(y, D, W) { + require(hdm) + + # residualize outcome with Lasso + yfit_rlasso <- rlasso(W, y, post = FALSE) + yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W)) + yres <- y - as.numeric(yhat_rlasso) + + + # residualize treatment with Lasso + dfit_rlasso <- rlasso(W, D, post = FALSE) + dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W)) + Dres <- D - as.numeric(dhat_rlasso) + + # rest is the same as in the OLS case + hat <- mean(yres * Dres) / mean(Dres^2) + epsilon <- yres - hat * Dres + V <- mean(epsilon^2 * Dres^2) / mean(Dres^2)^2 + stderr <- sqrt(V / length(y)) + + return(list(hat = hat, stderr = stderr)) +} +``` + +```{r} +double_lasso(y, D, cbind(D * X, X)) +``` + +# Simulation + +### Two-Means + +```{r} +# We now check the distributional properties of the different estimators across experiments +# First is the simple two means estimate + +n_experiments <- 100 +# we will keep track of coverage (truth is in CI) and of the point estimate and stderr +cov <- numeric(n_experiments) +hats <- numeric(n_experiments) +stderrs <- numeric(n_experiments) + +for (i in 1:n_experiments) { + # Generate data for each experiment + data <- gen_data(n, d, 0.2, delta, base) + y <- data$y + D <- data$D + X <- data$X + + # Calculate two-means estimate + results <- twomeans(y, D) + hat <- results$hat + stderr <- results$stderr + + # Calculate the 95% confidence interval + ci_lower <- hat - 1.96 * stderr + ci_upper <- hat + 1.96 * stderr + + # Check if the true parameter delta is within the confidence interval + cov[i] <- (ci_lower <= delta) & (delta <= ci_upper) + + hats[i] <- hat + stderrs[i] <- stderr +} +``` + +```{r} +# Calculate average coverage (should be .95 ideally) +coverage_rate <- mean(cov) + +cat("Coverage Rate (95% CI):", coverage_rate, "\n") +``` + +```{r} +hist(hats, main = "Distribution of Estimates", xlab = "Estimate", col = "skyblue") +``` + +```{r} +mean(hats) # mean of estimate; measures how biased the estimate is (should be =delta ideally) +``` + +```{r} +sd(hats)# standard deviation of estimates; should be close to the standard errors we calculated for the CIs +``` + +```{r} +mean(stderrs) +``` + +### Partialling Out + +```{r} +# Let's repeat this for the partialling out process (OLS), controlling for X + +n_experiments <- 100 +cov <- numeric(n_experiments) +hats <- numeric(n_experiments) +stderrs <- numeric(n_experiments) + +for (i in 1:n_experiments) { + # Generate data for each experiment + data <- gen_data(n, d, 0.2, delta, base) + y <- data$y + D <- data$D + X <- data$X + + # Calculate partialling out estimate with OLS + 
results <- partialling_out(y, D, X) + hat <- results$hat + stderr <- results$stderr + + # Calculate the 95% confidence interval + ci_lower <- hat - 1.96 * stderr + ci_upper <- hat + 1.96 * stderr + + # Check if the true parameter delta is within the confidence interval + cov[i] <- (ci_lower <= delta) & (delta <= ci_upper) + + hats[i] <- hat + stderrs[i] <- stderr +} +``` + +```{r} +mean(cov) +``` + +```{r} +hist(hats, main = "Distribution of Estimates", xlab = "Estimate", col = "skyblue") +``` + +```{r} +mean(hats) # ols is heavily biased... mean of estimates very far from delta=1 +``` + +```{r} +sd(hats) +``` + +```{r} +mean(stderrs) # standard error severely under estimates the variance of the estimate; all this is due to overfitting +``` + +### Double Lasso + +```{r} +# Now let's try the double Lasso. +n_experiments <- 100 +cov <- numeric(n_experiments) +hats <- numeric(n_experiments) +stderrs <- numeric(n_experiments) + +for (i in 1:n_experiments) { + # Generate data for each experiment + data <- gen_data(n, d, 0.2, delta, base) + y <- data$y + D <- data$D + X <- data$X + + # Calculate partialling out estimate with OLS + results <- double_lasso(y, D, X) + hat <- results$hat + stderr <- results$stderr + + # Calculate the 95% confidence interval + ci_lower <- hat - 1.96 * stderr + ci_upper <- hat + 1.96 * stderr + + # Check if the true parameter delta is within the confidence interval + cov[i] <- (ci_lower <= delta) & (delta <= ci_upper) + + hats[i] <- hat + stderrs[i] <- stderr +} +``` + +```{r} +mean(cov) +``` + +```{r} +hist(hats, main = "Distribution of Estimates", xlab = "Estimate", col = "skyblue") +``` + +```{r} +mean(hats) # much closer to 1... (almost the same as two-means) +sd(hats) # standard deviation much smaller than two means, which did not adjust for X +mean(stderrs) # and close to the calculate standard errors; we correctly estimated uncertainty +``` + +### Single Lasso + +```{r} +# Now we simply replace OLS with Lasso to implement the Double Lasso process + +double_lasso <- function(y, D, W) { + require(hdm) + + # residualize outcome with Lasso + yfit_rlasso <- rlasso(W, y, post = FALSE) + yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W)) + yres <- y - as.numeric(yhat_rlasso) + + + # residualize treatment with Lasso + dfit_rlasso <- rlasso(W, D, post=FALSE) + dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W)) + Dres <- D - as.numeric(dhat_rlasso) + + # rest is the same as in the OLS case + hat <- mean(yres * Dres) / mean(Dres^2) + epsilon <- yres - hat * Dres + V <- mean(epsilon^2 * Dres^2) / mean(Dres^2)^2 + stderr = sqrt(V / length(y)) + + return(list(hat = hat, stderr = stderr)) +} +``` + +```{r} +# Now let's try the double Lasso. + +n_experiments <- 100 +hats <- numeric(n_experiments) + +for (i in 1:n_experiments) { + # Generate data for each experiment + data <- gen_data(n, d, 0.2, delta, base) + y <- data$y + D <- data$D + X <- data$X + + # Calculate single lasso estimate + + + yfit_rlasso <- rlasso(cbind(D, X), y, post = FALSE) + hat <- yfit_rlasso$coefficients[2] + + hats[i] <- hat +} +``` + +```{r} +hist(hats, main = "Distribution of Estimates", xlab = "Estimate", col = "skyblue") +``` + +```{r} +# bias is comparable and larger than standard deviation. +# Even if we could estimate the standard deviation, confidence intervals would undercover +1 - mean(hats) +sd(hats) +``` + +### Post-Lasso OLS + +```{r} +# Now let's try the post-Lasso. 
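+# Descriptive sketch of the steps performed in the loop below:
+#   (1) fit a single Lasso of y on (D, X) and keep the X columns with non-zero coefficients,
+#   (2) re-run the partialling-out OLS of y on (demeaned) D using only those selected columns,
+#   (3) report the usual OLS standard errors, ignoring that a selection step took place.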
+n_experiments <- 100 +cov <- numeric(n_experiments) +hats <- numeric(n_experiments) +stderrs <- numeric(n_experiments) + +for (i in 1:n_experiments) { + # Generate data for each experiment + data <- gen_data(n, d, 0.2, delta, base) + y <- data$y + D <- data$D + X <- data$X + + + # run a big lasso y ~ D, X + DX = cbind(D,X) + yfit_rlasso <- rlasso(DX, y, post=FALSE) # could just use this functionality + coefs <- yfit_rlasso$coefficients[2:n] + selected_columns <- X[, abs(coefs) > 0.0] + # run OLS on y ~ D, X[chosen by lasso] + # calculate standard error as if lasso step never happened + results <- partialling_out(y, D - mean(D), selected_columns) + hat <- results$hat + stderr <- results$stderr + + # Calculate the 95% confidence interval + ci_lower <- hat - 1.96 * stderr + ci_upper <- hat + 1.96 * stderr + # Check if the true parameter delta is within the confidence interval + cov[i] <- (ci_lower <= delta) & (delta <= ci_upper) + hats[i] <- hat + stderrs[i] <- stderr +} +``` + +```{r} +mean(cov) +``` + +```{r} +hist(hats, main = "Distribution of Estimates", xlab = "Estimate", col = "skyblue") +``` + +```{r} +1 - mean(hats) # quite un-biased; bias < standard deviation +sd(hats) +``` + +```{r} +# we under-estimated a bit the uncertainty; smaller estimated stderr than true std. +# this is most prob a finite sample error, from ignoring the lasso variable selection step +# this is an RCT and so even post lasso ols is Neyman orthogonal. We should expect good behavior. +mean(stderrs) +``` + +### Not RCT Data + +```{r} +gen_data_nonRCT <- function(n, d, p, delta, base) { + X <- matrix(rnorm(n * d), nrow = n, ncol = d) + D <- X[, 1] + rnorm(n, mean = 0, sd = 1/4) + y <- delta * D + base - X[, 1] + rnorm(n, mean = 0, sd = 1) + return(list(y = y, D = D, X = X)) +} +``` + +```{r} +# post-lasso +n_experiments <- 100 +cov <- numeric(n_experiments) +hats <- numeric(n_experiments) +stderrs <- numeric(n_experiments) + +for (i in 1:n_experiments) { + # Generate data for each experiment + data <- gen_data_nonRCT(n, d, p, delta, base) + y <- data$y + D <- data$D + X <- data$X + + + # run a big lasso y ~ D, X + DX = cbind(D, X) + yfit_rlasso <- rlasso(DX, y, post=FALSE) # could just use this functionality + coefs <- yfit_rlasso$coefficients[2:n] + selected_columns <- X[, abs(coefs) > 0.0] + # run OLS on y ~ D, X[chosen by lasso] + # calculate standard error as if lasso step never happened + results <- partialling_out(y, D - mean(D), selected_columns) + hat <- results$hat + stderr <- results$stderr + + # Calculate the 95% confidence interval + ci_lower <- hat - 1.96 * stderr + ci_upper <- hat + 1.96 * stderr + # Check if the true parameter delta is within the confidence interval + cov[i] <- (ci_lower <= delta) & (delta <= ci_upper) + hats[i] <- hat + stderrs[i] <- stderr +} +``` + +```{r} +mean(cov) # Oops! Post Lasso OLS severely undercovers; It is not Neyman orthogonal when D is correlated with X +``` + +```{r} +hist(hats, main = "Distribution of Estimates", xlab = "Estimate", col = "skyblue") +``` + +```{r} +mean(hats) # very heavily biased +``` + +```{r} +# But now let's try the double Lasso. 
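+# Unlike post-Lasso OLS above, double Lasso residualizes both y and D with Lasso
+# before the final regression, so the estimate remains Neyman orthogonal even though
+# D is now correlated with X; we should therefore expect close-to-nominal coverage here.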
+n_experiments <- 100 +cov <- numeric(n_experiments) +hats <- numeric(n_experiments) +stderrs <- numeric(n_experiments) + +for (i in 1:n_experiments) { + # Generate data for each experiment + data <- gen_data_nonRCT(n, d, p, delta, base) + y <- data$y + D <- data$D + X <- data$X + + # Calculate partialling out estimate with OLS + results <- double_lasso(y, D, X) + hat <- results$hat + stderr <- results$stderr + + # Calculate the 95% confidence interval + ci_lower <- hat - 1.96 * stderr + ci_upper <- hat + 1.96 * stderr + + # Check if the true parameter delta is within the confidence interval + cov[i] <- (ci_lower <= delta) & (delta <= ci_upper) + + hats[i] <- hat + stderrs[i] <- stderr +} +``` + +```{r} +mean(cov) # great coverage +``` + +```{r} +hist(hats, main = "Distribution of Estimates", xlab = "Estimate", col = "skyblue") +``` + +```{r} +1 - mean(hats) +sd(hats) # very small bias compared to standard deviation +mean(stderrs) +``` + diff --git a/PM2/r_experiment_non_orthogonal.irnb b/PM2/r_experiment_non_orthogonal.irnb index e830beff..a8e7be1c 100644 --- a/PM2/r_experiment_non_orthogonal.irnb +++ b/PM2/r_experiment_non_orthogonal.irnb @@ -1,1043 +1,1043 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "YUCX_4AeCFrO" - }, - "source": [ - "# Simulation Design" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mS89_Re5ECjm", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "install.packages(\"hdm\")\n", - "library(hdm)\n", - "library(stats)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VczKl9DMouLw" - }, - "source": [ - "## Generating RCT data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zHRwbkqncrH_", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "gen_data <- function(n, d, p, delta, base) {\n", - " X <- matrix(rnorm(n * d), nrow = n, ncol = d)\n", - " D <- rbinom(n, 1, p)\n", - " y0 <- base - X[, 1] + rnorm(n, mean = 0, sd = 0.1)\n", - " y1 <- delta + base - X[, 1] + rnorm(n, mean = 0, sd = 0.1)\n", - " y <- y1 * D + y0 * (1 - D)\n", - " return(list(y=y, D=D, X=X))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "B3BX59QhcrK4", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "n <- 100 # n samples\n", - "d <- 100 # n features\n", - "delta <- 1.0 # treatment effect\n", - "base <- 0.3 # baseline outcome" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "A8lx8FOpoqzj" - }, - "source": [ - "## Two Means Estimator" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uKriPptNcrQo", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Simple two means estimate and calculation of variance\n", - "twomeans <- function(y, D) {\n", - " hat0 <- mean(y[D == 0]) # mean of outcome of un-treated\n", - " hat1 <- mean(y[D == 1]) # mean of outcome of treated\n", - " V0 <- var(y[D == 0]) / mean(1 - D) # asymptotic variance of the mean of outcome of untreated\n", - " V1 <- var(y[D == 1]) / mean(D) # asymptotic variance of the mean of outcome of treated\n", - " hat <- hat1 - hat0 # estimate of the treatment effect\n", - " stderr <- sqrt((V0 + V1) / n) # standard error of the estimate of the treatment effect\n", - " return(list(hat = hat, stderr = stderr))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kUAwErtycrTf", - "vscode": 
{ - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Set the random seed for reproducibility\n", - "set.seed(125)\n", - "\n", - "# Generate RCT data\n", - "data <- gen_data(n, d, 0.2, delta, base)\n", - "y <- data$y\n", - "D <- data$D\n", - "X <- data$X\n", - "\n", - "# Calculate estimation quantities\n", - "twomeans(y, D)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LxS_DAjWoyAk" - }, - "source": [ - "## Partialling-Out Estimator" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Bszks8xNcrWV", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# We implement the partialling out version of OLS (for pedagogical purposes)\n", - "partialling_out <- function(y, D, W) {\n", - " # Residualize outcome with OLS\n", - " yfit <- lm(y ~ W)\n", - " yhat <- predict(yfit, as.data.frame(W))\n", - " yres <- y - as.numeric(yhat)\n", - "\n", - " # Residualize treatment with OLS\n", - " Dfit <- lm(D ~ W)\n", - " Dhat <- predict(Dfit, as.data.frame(W))\n", - " Dres <- D - as.numeric(Dhat)\n", - "\n", - " # Calculate final residual ~ residual OLS estimate\n", - " hat <- mean(yres * Dres) / mean(Dres^2)\n", - "\n", - " # Calculate residual of final regression (epsilon in the BLP decomposition)\n", - " epsilon <- yres - hat * Dres\n", - "\n", - " # Calculate variance of the treatment effect\n", - " V <- mean(epsilon^2 * Dres^2) / mean(Dres^2)^2\n", - " stderr <- sqrt(V / length(y))\n", - "\n", - " return(list(hat = hat, stderr = stderr))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eCoa9F1gcrY_", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "partialling_out(y, D, cbind(D * X, X))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3C5agjr2o0wA" - }, - "source": [ - "## Double Lasso Partialling-Out Estimator" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "YcHiwiJ1jtFz", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Now we simply replace OLS with Lasso to implement the Double Lasso process\n", - "\n", - "double_lasso <- function(y, D, W) {\n", - " require(hdm)\n", - "\n", - " # residualize outcome with Lasso\n", - " yfit_rlasso <- rlasso(W, y, post = FALSE)\n", - " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))\n", - " yres <- y - as.numeric(yhat_rlasso)\n", - "\n", - "\n", - " # residualize treatment with Lasso\n", - " dfit_rlasso <- rlasso(W, D, post = FALSE)\n", - " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W))\n", - " Dres <- D - as.numeric(dhat_rlasso)\n", - "\n", - " # rest is the same as in the OLS case\n", - " hat <- mean(yres * Dres) / mean(Dres^2)\n", - " epsilon <- yres - hat * Dres\n", - " V <- mean(epsilon^2 * Dres^2) / mean(Dres^2)^2\n", - " stderr <- sqrt(V / length(y))\n", - "\n", - " return(list(hat = hat, stderr = stderr))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HBeGQAW9jtIa", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "double_lasso(y, D, cbind(D * X, X))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dnouR1CPo3tF" - }, - "source": [ - "# Simulation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qS3i_zabo8e1" - }, - "source": [ - "### Two-Means" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "boYM55VIlzch", - "vscode": { - "languageId": "r" - } - }, 
- "outputs": [], - "source": [ - "# We now check the distributional properties of the different estimators across experiments\n", - "# First is the simple two means estimate\n", - "\n", - "n_experiments <- 100\n", - "# we will keep track of coverage (truth is in CI) and of the point estimate and stderr\n", - "cov <- numeric(n_experiments)\n", - "hats <- numeric(n_experiments)\n", - "stderrs <- numeric(n_experiments)\n", - "\n", - "for (i in 1:n_experiments) {\n", - " # Generate data for each experiment\n", - " data <- gen_data(n, d, 0.2, delta, base)\n", - " y <- data$y\n", - " D <- data$D\n", - " X <- data$X\n", - "\n", - " # Calculate two-means estimate\n", - " results <- twomeans(y, D)\n", - " hat <- results$hat\n", - " stderr <- results$stderr\n", - "\n", - " # Calculate the 95% confidence interval\n", - " ci_lower <- hat - 1.96 * stderr\n", - " ci_upper <- hat + 1.96 * stderr\n", - "\n", - " # Check if the true parameter delta is within the confidence interval\n", - " cov[i] <- (ci_lower <= delta) & (delta <= ci_upper)\n", - "\n", - " hats[i] <- hat\n", - " stderrs[i] <- stderr\n", - "}\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "77Nr_nANngWz", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Calculate average coverage (should be .95 ideally)\n", - "coverage_rate <- mean(cov)\n", - "\n", - "cat(\"Coverage Rate (95% CI):\", coverage_rate, \"\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3EdakeDKmAv4", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "hNt5QmEKmCLo", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "mean(hats) # mean of estimate; measures how biased the estimate is (should be =delta ideally)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ku_EVTfemM_I", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "sd(hats)# standard deviation of estimates; should be close to the standard errors we calculated for the CIs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4r5MP3PYmODP", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "mean(stderrs)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gI1ph04ro9-7" - }, - "source": [ - "### Partialling Out" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GDcD1JEVmQ3A", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Let's repeat this for the partialling out process (OLS), controlling for X\n", - "\n", - "n_experiments <- 100\n", - "cov <- numeric(n_experiments)\n", - "hats <- numeric(n_experiments)\n", - "stderrs <- numeric(n_experiments)\n", - "\n", - "for (i in 1:n_experiments) {\n", - " # Generate data for each experiment\n", - " data <- gen_data(n, d, 0.2, delta, base)\n", - " y <- data$y\n", - " D <- data$D\n", - " X <- data$X\n", - "\n", - " # Calculate partialling out estimate with OLS\n", - " results <- partialling_out(y, D, X)\n", - " hat <- results$hat\n", - " stderr <- results$stderr\n", - "\n", - " # Calculate the 95% confidence interval\n", - " ci_lower <- hat - 1.96 * stderr\n", - " ci_upper <- hat + 1.96 * stderr\n", - "\n", - " # Check if the true 
parameter delta is within the confidence interval\n", - " cov[i] <- (ci_lower <= delta) & (delta <= ci_upper)\n", - "\n", - " hats[i] <- hat\n", - " stderrs[i] <- stderr\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fwgT8Nd3m1-S", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "mean(cov)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Y0yYO1xBmt6Z", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fd-9aNqImvLG", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "mean(hats) # ols is heavily biased... mean of estimates very far from delta=1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1kVDTlZunN-c", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "sd(hats)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ta8s0QlunOrP", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "mean(stderrs) # standard error severely under estimates the variance of the estimate; all this is due to overfitting" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cqiR8n54pAM3" - }, - "source": [ - "### Double Lasso" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "CApI-UF0n3yx", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Now let's try the double Lasso.\n", - "n_experiments <- 100\n", - "cov <- numeric(n_experiments)\n", - "hats <- numeric(n_experiments)\n", - "stderrs <- numeric(n_experiments)\n", - "\n", - "for (i in 1:n_experiments) {\n", - " # Generate data for each experiment\n", - " data <- gen_data(n, d, 0.2, delta, base)\n", - " y <- data$y\n", - " D <- data$D\n", - " X <- data$X\n", - "\n", - " # Calculate partialling out estimate with OLS\n", - " results <- double_lasso(y, D, X)\n", - " hat <- results$hat\n", - " stderr <- results$stderr\n", - "\n", - " # Calculate the 95% confidence interval\n", - " ci_lower <- hat - 1.96 * stderr\n", - " ci_upper <- hat + 1.96 * stderr\n", - "\n", - " # Check if the true parameter delta is within the confidence interval\n", - " cov[i] <- (ci_lower <= delta) & (delta <= ci_upper)\n", - "\n", - " hats[i] <- hat\n", - " stderrs[i] <- stderr\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ujw3sUicoOgK", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "mean(cov)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NveiO9xnoEgv", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "spy0Fd8goGt6", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "mean(hats) # much closer to 1... 
(almost the same as two-means)\n", - "sd(hats) # standard deviation much smaller than two means, which did not adjust for X\n", - "mean(stderrs) # and close to the calculate standard errors; we correctly estimated uncertainty" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PbroSXpNpCaj" - }, - "source": [ - "### Single Lasso" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Q7PkXPAdpEjh", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Now we simply replace OLS with Lasso to implement the Double Lasso process\n", - "\n", - "double_lasso <- function(y, D, W) {\n", - " require(hdm)\n", - "\n", - " # residualize outcome with Lasso\n", - " yfit_rlasso <- rlasso(W, y, post = FALSE)\n", - " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))\n", - " yres <- y - as.numeric(yhat_rlasso)\n", - "\n", - "\n", - " # residualize treatment with Lasso\n", - " dfit_rlasso <- rlasso(W, D, post=FALSE)\n", - " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W))\n", - " Dres <- D - as.numeric(dhat_rlasso)\n", - "\n", - " # rest is the same as in the OLS case\n", - " hat <- mean(yres * Dres) / mean(Dres^2)\n", - " epsilon <- yres - hat * Dres\n", - " V <- mean(epsilon^2 * Dres^2) / mean(Dres^2)^2\n", - " stderr = sqrt(V / length(y))\n", - "\n", - " return(list(hat = hat, stderr = stderr))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "z86ziKegpLf_", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Now let's try the double Lasso.\n", - "\n", - "n_experiments <- 100\n", - "hats <- numeric(n_experiments)\n", - "\n", - "for (i in 1:n_experiments) {\n", - " # Generate data for each experiment\n", - " data <- gen_data(n, d, 0.2, delta, base)\n", - " y <- data$y\n", - " D <- data$D\n", - " X <- data$X\n", - "\n", - " # Calculate single lasso estimate\n", - "\n", - "\n", - " yfit_rlasso <- rlasso(cbind(D, X), y, post = FALSE)\n", - " hat <- yfit_rlasso$coefficients[2]\n", - "\n", - " hats[i] <- hat\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JVHyQxSNrLFw", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5RK6CFCVrNVB", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# bias is comparable and larger than standard deviation.\n", - "# Even if we could estimate the standard deviation, confidence intervals would undercover\n", - "1 - mean(hats)\n", - "sd(hats)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-oeenku7sWC9" - }, - "source": [ - "### Post-Lasso OLS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VjjbagsFsYLe", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Now let's try the post-Lasso.\n", - "n_experiments <- 100\n", - "cov <- numeric(n_experiments)\n", - "hats <- numeric(n_experiments)\n", - "stderrs <- numeric(n_experiments)\n", - "\n", - "for (i in 1:n_experiments) {\n", - " # Generate data for each experiment\n", - " data <- gen_data(n, d, 0.2, delta, base)\n", - " y <- data$y\n", - " D <- data$D\n", - " X <- data$X\n", - "\n", - "\n", - " # run a big lasso y ~ D, X\n", - " DX = cbind(D,X)\n", - " yfit_rlasso <- rlasso(DX, y, post=FALSE) # could just use this 
functionality\n", - " coefs <- yfit_rlasso$coefficients[2:n]\n", - " selected_columns <- X[, abs(coefs) > 0.0]\n", - " # run OLS on y ~ D, X[chosen by lasso]\n", - " # calculate standard error as if lasso step never happened\n", - " results <- partialling_out(y, D - mean(D), selected_columns)\n", - " hat <- results$hat\n", - " stderr <- results$stderr\n", - "\n", - " # Calculate the 95% confidence interval\n", - " ci_lower <- hat - 1.96 * stderr\n", - " ci_upper <- hat + 1.96 * stderr\n", - " # Check if the true parameter delta is within the confidence interval\n", - " cov[i] <- (ci_lower <= delta) & (delta <= ci_upper)\n", - " hats[i] <- hat\n", - " stderrs[i] <- stderr\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZpFKqURXsdGg", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "mean(cov)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-XlZuHuFsw3E", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-_frAjzet5Oe", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "1 - mean(hats) # quite un-biased; bias < standard deviation\n", - "sd(hats)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "K0sBTl3FtFeV", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# we under-estimated a bit the uncertainty; smaller estimated stderr than true std.\n", - "# this is most prob a finite sample error, from ignoring the lasso variable selection step\n", - "# this is an RCT and so even post lasso ols is Neyman orthogonal. 
We should expect good behavior.\n", - "mean(stderrs)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EXqKbFLkuKZi" - }, - "source": [ - "### Not RCT Data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "C3ZJTnpLt9-0", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "gen_data_nonRCT <- function(n, d, p, delta, base) {\n", - " X <- matrix(rnorm(n * d), nrow = n, ncol = d)\n", - " D <- X[, 1] + rnorm(n, mean = 0, sd = 1/4)\n", - " y <- delta * D + base - X[, 1] + rnorm(n, mean = 0, sd = 1)\n", - " return(list(y = y, D = D, X = X))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4dnILAzPuTR7", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# post-lasso\n", - "n_experiments <- 100\n", - "cov <- numeric(n_experiments)\n", - "hats <- numeric(n_experiments)\n", - "stderrs <- numeric(n_experiments)\n", - "\n", - "for (i in 1:n_experiments) {\n", - " # Generate data for each experiment\n", - " data <- gen_data_nonRCT(n, d, p, delta, base)\n", - " y <- data$y\n", - " D <- data$D\n", - " X <- data$X\n", - "\n", - "\n", - " # run a big lasso y ~ D, X\n", - " DX = cbind(D, X)\n", - " yfit_rlasso <- rlasso(DX, y, post=FALSE) # could just use this functionality\n", - " coefs <- yfit_rlasso$coefficients[2:n]\n", - " selected_columns <- X[, abs(coefs) > 0.0]\n", - " # run OLS on y ~ D, X[chosen by lasso]\n", - " # calculate standard error as if lasso step never happened\n", - " results <- partialling_out(y, D - mean(D), selected_columns)\n", - " hat <- results$hat\n", - " stderr <- results$stderr\n", - "\n", - " # Calculate the 95% confidence interval\n", - " ci_lower <- hat - 1.96 * stderr\n", - " ci_upper <- hat + 1.96 * stderr\n", - " # Check if the true parameter delta is within the confidence interval\n", - " cov[i] <- (ci_lower <= delta) & (delta <= ci_upper)\n", - " hats[i] <- hat\n", - " stderrs[i] <- stderr\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uOo3L6W9uXZd", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "mean(cov) # Oops! 
Post Lasso OLS severely undercovers; It is not Neyman orthogonal when D is correlated with X" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oTTJJUyBux9u", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wrPu3ypku02M", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "mean(hats) # very heavily biased" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2ZU1ihfcu6z5", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# But now let's try the double Lasso.\n", - "n_experiments <- 100\n", - "cov <- numeric(n_experiments)\n", - "hats <- numeric(n_experiments)\n", - "stderrs <- numeric(n_experiments)\n", - "\n", - "for (i in 1:n_experiments) {\n", - " # Generate data for each experiment\n", - " data <- gen_data_nonRCT(n, d, p, delta, base)\n", - " y <- data$y\n", - " D <- data$D\n", - " X <- data$X\n", - "\n", - " # Calculate partialling out estimate with OLS\n", - " results <- double_lasso(y, D, X)\n", - " hat <- results$hat\n", - " stderr <- results$stderr\n", - "\n", - " # Calculate the 95% confidence interval\n", - " ci_lower <- hat - 1.96 * stderr\n", - " ci_upper <- hat + 1.96 * stderr\n", - "\n", - " # Check if the true parameter delta is within the confidence interval\n", - " cov[i] <- (ci_lower <= delta) & (delta <= ci_upper)\n", - "\n", - " hats[i] <- hat\n", - " stderrs[i] <- stderr\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "x1BfN7HEu_E9", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "mean(cov) # great coverage" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4mSy5U0CvEBs", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ElXwDxR-vEzT", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "1 - mean(hats)\n", - "sd(hats) # very small bias compared to standard deviation\n", - "mean(stderrs)" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "YUCX_4AeCFrO" + }, + "source": [ + "# Simulation Design" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mS89_Re5ECjm", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "install.packages(\"hdm\")\n", + "library(hdm)\n", + "library(stats)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VczKl9DMouLw" + }, + "source": [ + "## Generating RCT data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zHRwbkqncrH_", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "gen_data <- function(n, d, p, delta, base) {\n", + " X <- matrix(rnorm(n * d), nrow = n, ncol = d)\n", + " D <- rbinom(n, 1, p)\n", + " y0 <- base - X[, 1] + rnorm(n, mean = 0, sd = 0.1)\n", + " y1 <- delta + base - X[, 1] + rnorm(n, mean = 0, sd = 0.1)\n", + " y <- y1 * D + y0 * (1 - D)\n", + " return(list(y=y, D=D, X=X))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "B3BX59QhcrK4", + "vscode": { + 
"languageId": "r" + } + }, + "outputs": [], + "source": [ + "n <- 100 # n samples\n", + "d <- 100 # n features\n", + "delta <- 1.0 # treatment effect\n", + "base <- 0.3 # baseline outcome" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A8lx8FOpoqzj" + }, + "source": [ + "## Two Means Estimator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uKriPptNcrQo", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Simple two means estimate and calculation of variance\n", + "twomeans <- function(y, D) {\n", + " hat0 <- mean(y[D == 0]) # mean of outcome of un-treated\n", + " hat1 <- mean(y[D == 1]) # mean of outcome of treated\n", + " V0 <- var(y[D == 0]) / mean(1 - D) # asymptotic variance of the mean of outcome of untreated\n", + " V1 <- var(y[D == 1]) / mean(D) # asymptotic variance of the mean of outcome of treated\n", + " hat <- hat1 - hat0 # estimate of the treatment effect\n", + " stderr <- sqrt((V0 + V1) / n) # standard error of the estimate of the treatment effect\n", + " return(list(hat = hat, stderr = stderr))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kUAwErtycrTf", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Set the random seed for reproducibility\n", + "set.seed(125)\n", + "\n", + "# Generate RCT data\n", + "data <- gen_data(n, d, 0.2, delta, base)\n", + "y <- data$y\n", + "D <- data$D\n", + "X <- data$X\n", + "\n", + "# Calculate estimation quantities\n", + "twomeans(y, D)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LxS_DAjWoyAk" + }, + "source": [ + "## Partialling-Out Estimator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Bszks8xNcrWV", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# We implement the partialling out version of OLS (for pedagogical purposes)\n", + "partialling_out <- function(y, D, W) {\n", + " # Residualize outcome with OLS\n", + " yfit <- lm(y ~ W)\n", + " yhat <- predict(yfit, as.data.frame(W))\n", + " yres <- y - as.numeric(yhat)\n", + "\n", + " # Residualize treatment with OLS\n", + " Dfit <- lm(D ~ W)\n", + " Dhat <- predict(Dfit, as.data.frame(W))\n", + " Dres <- D - as.numeric(Dhat)\n", + "\n", + " # Calculate final residual ~ residual OLS estimate\n", + " hat <- mean(yres * Dres) / mean(Dres^2)\n", + "\n", + " # Calculate residual of final regression (epsilon in the BLP decomposition)\n", + " epsilon <- yres - hat * Dres\n", + "\n", + " # Calculate variance of the treatment effect\n", + " V <- mean(epsilon^2 * Dres^2) / mean(Dres^2)^2\n", + " stderr <- sqrt(V / length(y))\n", + "\n", + " return(list(hat = hat, stderr = stderr))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eCoa9F1gcrY_", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "partialling_out(y, D, cbind(D * X, X))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3C5agjr2o0wA" + }, + "source": [ + "## Double Lasso Partialling-Out Estimator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YcHiwiJ1jtFz", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Now we simply replace OLS with Lasso to implement the Double Lasso process\n", + "\n", + "double_lasso <- function(y, D, W) {\n", + " require(hdm)\n", + "\n", + " # residualize outcome with Lasso\n", + " yfit_rlasso <- 
rlasso(W, y, post = FALSE)\n", + " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))\n", + " yres <- y - as.numeric(yhat_rlasso)\n", + "\n", + "\n", + " # residualize treatment with Lasso\n", + " dfit_rlasso <- rlasso(W, D, post = FALSE)\n", + " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W))\n", + " Dres <- D - as.numeric(dhat_rlasso)\n", + "\n", + " # rest is the same as in the OLS case\n", + " hat <- mean(yres * Dres) / mean(Dres^2)\n", + " epsilon <- yres - hat * Dres\n", + " V <- mean(epsilon^2 * Dres^2) / mean(Dres^2)^2\n", + " stderr <- sqrt(V / length(y))\n", + "\n", + " return(list(hat = hat, stderr = stderr))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HBeGQAW9jtIa", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "double_lasso(y, D, cbind(D * X, X))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dnouR1CPo3tF" + }, + "source": [ + "# Simulation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qS3i_zabo8e1" + }, + "source": [ + "### Two-Means" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "boYM55VIlzch", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# We now check the distributional properties of the different estimators across experiments\n", + "# First is the simple two means estimate\n", + "\n", + "n_experiments <- 100\n", + "# we will keep track of coverage (truth is in CI) and of the point estimate and stderr\n", + "cov <- numeric(n_experiments)\n", + "hats <- numeric(n_experiments)\n", + "stderrs <- numeric(n_experiments)\n", + "\n", + "for (i in 1:n_experiments) {\n", + " # Generate data for each experiment\n", + " data <- gen_data(n, d, 0.2, delta, base)\n", + " y <- data$y\n", + " D <- data$D\n", + " X <- data$X\n", + "\n", + " # Calculate two-means estimate\n", + " results <- twomeans(y, D)\n", + " hat <- results$hat\n", + " stderr <- results$stderr\n", + "\n", + " # Calculate the 95% confidence interval\n", + " ci_lower <- hat - 1.96 * stderr\n", + " ci_upper <- hat + 1.96 * stderr\n", + "\n", + " # Check if the true parameter delta is within the confidence interval\n", + " cov[i] <- (ci_lower <= delta) & (delta <= ci_upper)\n", + "\n", + " hats[i] <- hat\n", + " stderrs[i] <- stderr\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "77Nr_nANngWz", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Calculate average coverage (should be .95 ideally)\n", + "coverage_rate <- mean(cov)\n", + "\n", + "cat(\"Coverage Rate (95% CI):\", coverage_rate, \"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3EdakeDKmAv4", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "hNt5QmEKmCLo", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "mean(hats) # mean of estimate; measures how biased the estimate is (should be =delta ideally)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ku_EVTfemM_I", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "sd(hats)# standard deviation of estimates; should be close to the standard errors we calculated for the CIs" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "id": "4r5MP3PYmODP", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "mean(stderrs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gI1ph04ro9-7" + }, + "source": [ + "### Partialling Out" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GDcD1JEVmQ3A", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Let's repeat this for the partialling out process (OLS), controlling for X\n", + "\n", + "n_experiments <- 100\n", + "cov <- numeric(n_experiments)\n", + "hats <- numeric(n_experiments)\n", + "stderrs <- numeric(n_experiments)\n", + "\n", + "for (i in 1:n_experiments) {\n", + " # Generate data for each experiment\n", + " data <- gen_data(n, d, 0.2, delta, base)\n", + " y <- data$y\n", + " D <- data$D\n", + " X <- data$X\n", + "\n", + " # Calculate partialling out estimate with OLS\n", + " results <- partialling_out(y, D, X)\n", + " hat <- results$hat\n", + " stderr <- results$stderr\n", + "\n", + " # Calculate the 95% confidence interval\n", + " ci_lower <- hat - 1.96 * stderr\n", + " ci_upper <- hat + 1.96 * stderr\n", + "\n", + " # Check if the true parameter delta is within the confidence interval\n", + " cov[i] <- (ci_lower <= delta) & (delta <= ci_upper)\n", + "\n", + " hats[i] <- hat\n", + " stderrs[i] <- stderr\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fwgT8Nd3m1-S", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "mean(cov)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Y0yYO1xBmt6Z", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fd-9aNqImvLG", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "mean(hats) # ols is heavily biased... 
mean of estimates very far from delta=1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1kVDTlZunN-c", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "sd(hats)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ta8s0QlunOrP", + "vscode": { + "languageId": "r" } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "hide_input": false, - "kernelspec": { - "display_name": "R", - "name": "ir" - }, - "language_info": { - "name": "R" + }, + "outputs": [], + "source": [ + "mean(stderrs) # standard error severely under estimates the variance of the estimate; all this is due to overfitting" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cqiR8n54pAM3" + }, + "source": [ + "### Double Lasso" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CApI-UF0n3yx", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Now let's try the double Lasso.\n", + "n_experiments <- 100\n", + "cov <- numeric(n_experiments)\n", + "hats <- numeric(n_experiments)\n", + "stderrs <- numeric(n_experiments)\n", + "\n", + "for (i in 1:n_experiments) {\n", + " # Generate data for each experiment\n", + " data <- gen_data(n, d, 0.2, delta, base)\n", + " y <- data$y\n", + " D <- data$D\n", + " X <- data$X\n", + "\n", + " # Calculate partialling out estimate with OLS\n", + " results <- double_lasso(y, D, X)\n", + " hat <- results$hat\n", + " stderr <- results$stderr\n", + "\n", + " # Calculate the 95% confidence interval\n", + " ci_lower <- hat - 1.96 * stderr\n", + " ci_upper <- hat + 1.96 * stderr\n", + "\n", + " # Check if the true parameter delta is within the confidence interval\n", + " cov[i] <- (ci_lower <= delta) & (delta <= ci_upper)\n", + "\n", + " hats[i] <- hat\n", + " stderrs[i] <- stderr\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ujw3sUicoOgK", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "mean(cov)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NveiO9xnoEgv", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "spy0Fd8goGt6", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "mean(hats) # much closer to 1... 
(almost the same as two-means)\n", + "sd(hats) # standard deviation much smaller than two means, which did not adjust for X\n", + "mean(stderrs) # and close to the calculate standard errors; we correctly estimated uncertainty" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PbroSXpNpCaj" + }, + "source": [ + "### Single Lasso" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Q7PkXPAdpEjh", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Now we simply replace OLS with Lasso to implement the Double Lasso process\n", + "\n", + "double_lasso <- function(y, D, W) {\n", + " require(hdm)\n", + "\n", + " # residualize outcome with Lasso\n", + " yfit_rlasso <- rlasso(W, y, post = FALSE)\n", + " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))\n", + " yres <- y - as.numeric(yhat_rlasso)\n", + "\n", + "\n", + " # residualize treatment with Lasso\n", + " dfit_rlasso <- rlasso(W, D, post=FALSE)\n", + " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W))\n", + " Dres <- D - as.numeric(dhat_rlasso)\n", + "\n", + " # rest is the same as in the OLS case\n", + " hat <- mean(yres * Dres) / mean(Dres^2)\n", + " epsilon <- yres - hat * Dres\n", + " V <- mean(epsilon^2 * Dres^2) / mean(Dres^2)^2\n", + " stderr = sqrt(V / length(y))\n", + "\n", + " return(list(hat = hat, stderr = stderr))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "z86ziKegpLf_", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Now let's try the double Lasso.\n", + "\n", + "n_experiments <- 100\n", + "hats <- numeric(n_experiments)\n", + "\n", + "for (i in 1:n_experiments) {\n", + " # Generate data for each experiment\n", + " data <- gen_data(n, d, 0.2, delta, base)\n", + " y <- data$y\n", + " D <- data$D\n", + " X <- data$X\n", + "\n", + " # Calculate single lasso estimate\n", + "\n", + "\n", + " yfit_rlasso <- rlasso(cbind(D, X), y, post = FALSE)\n", + " hat <- yfit_rlasso$coefficients[2]\n", + "\n", + " hats[i] <- hat\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JVHyQxSNrLFw", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5RK6CFCVrNVB", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# bias is comparable and larger than standard deviation.\n", + "# Even if we could estimate the standard deviation, confidence intervals would undercover\n", + "1 - mean(hats)\n", + "sd(hats)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-oeenku7sWC9" + }, + "source": [ + "### Post-Lasso OLS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VjjbagsFsYLe", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Now let's try the post-Lasso.\n", + "n_experiments <- 100\n", + "cov <- numeric(n_experiments)\n", + "hats <- numeric(n_experiments)\n", + "stderrs <- numeric(n_experiments)\n", + "\n", + "for (i in 1:n_experiments) {\n", + " # Generate data for each experiment\n", + " data <- gen_data(n, d, 0.2, delta, base)\n", + " y <- data$y\n", + " D <- data$D\n", + " X <- data$X\n", + "\n", + "\n", + " # run a big lasso y ~ D, X\n", + " DX = cbind(D,X)\n", + " yfit_rlasso <- rlasso(DX, y, post=FALSE) # could just use this 
functionality\n", + " coefs <- yfit_rlasso$coefficients[2:n]\n", + " selected_columns <- X[, abs(coefs) > 0.0]\n", + " # run OLS on y ~ D, X[chosen by lasso]\n", + " # calculate standard error as if lasso step never happened\n", + " results <- partialling_out(y, D - mean(D), selected_columns)\n", + " hat <- results$hat\n", + " stderr <- results$stderr\n", + "\n", + " # Calculate the 95% confidence interval\n", + " ci_lower <- hat - 1.96 * stderr\n", + " ci_upper <- hat + 1.96 * stderr\n", + " # Check if the true parameter delta is within the confidence interval\n", + " cov[i] <- (ci_lower <= delta) & (delta <= ci_upper)\n", + " hats[i] <- hat\n", + " stderrs[i] <- stderr\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZpFKqURXsdGg", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "mean(cov)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-XlZuHuFsw3E", + "vscode": { + "languageId": "r" } + }, + "outputs": [], + "source": [ + "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-_frAjzet5Oe", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "1 - mean(hats) # quite un-biased; bias < standard deviation\n", + "sd(hats)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "K0sBTl3FtFeV", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# we under-estimated a bit the uncertainty; smaller estimated stderr than true std.\n", + "# this is most prob a finite sample error, from ignoring the lasso variable selection step\n", + "# this is an RCT and so even post lasso ols is Neyman orthogonal. 
We should expect good behavior.\n", + "mean(stderrs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EXqKbFLkuKZi" + }, + "source": [ + "### Not RCT Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "C3ZJTnpLt9-0", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "gen_data_nonRCT <- function(n, d, p, delta, base) {\n", + " X <- matrix(rnorm(n * d), nrow = n, ncol = d)\n", + " D <- X[, 1] + rnorm(n, mean = 0, sd = 1/4)\n", + " y <- delta * D + base - X[, 1] + rnorm(n, mean = 0, sd = 1)\n", + " return(list(y = y, D = D, X = X))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4dnILAzPuTR7", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# post-lasso\n", + "n_experiments <- 100\n", + "cov <- numeric(n_experiments)\n", + "hats <- numeric(n_experiments)\n", + "stderrs <- numeric(n_experiments)\n", + "\n", + "for (i in 1:n_experiments) {\n", + " # Generate data for each experiment\n", + " data <- gen_data_nonRCT(n, d, p, delta, base)\n", + " y <- data$y\n", + " D <- data$D\n", + " X <- data$X\n", + "\n", + "\n", + " # run a big lasso y ~ D, X\n", + " DX = cbind(D, X)\n", + " yfit_rlasso <- rlasso(DX, y, post=FALSE) # could just use this functionality\n", + " coefs <- yfit_rlasso$coefficients[2:n]\n", + " selected_columns <- X[, abs(coefs) > 0.0]\n", + " # run OLS on y ~ D, X[chosen by lasso]\n", + " # calculate standard error as if lasso step never happened\n", + " results <- partialling_out(y, D - mean(D), selected_columns)\n", + " hat <- results$hat\n", + " stderr <- results$stderr\n", + "\n", + " # Calculate the 95% confidence interval\n", + " ci_lower <- hat - 1.96 * stderr\n", + " ci_upper <- hat + 1.96 * stderr\n", + " # Check if the true parameter delta is within the confidence interval\n", + " cov[i] <- (ci_lower <= delta) & (delta <= ci_upper)\n", + " hats[i] <- hat\n", + " stderrs[i] <- stderr\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uOo3L6W9uXZd", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "mean(cov) # Oops! 
Post Lasso OLS severely undercovers; It is not Neyman orthogonal when D is correlated with X" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oTTJJUyBux9u", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrPu3ypku02M", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "mean(hats) # very heavily biased" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2ZU1ihfcu6z5", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# But now let's try the double Lasso.\n", + "n_experiments <- 100\n", + "cov <- numeric(n_experiments)\n", + "hats <- numeric(n_experiments)\n", + "stderrs <- numeric(n_experiments)\n", + "\n", + "for (i in 1:n_experiments) {\n", + " # Generate data for each experiment\n", + " data <- gen_data_nonRCT(n, d, p, delta, base)\n", + " y <- data$y\n", + " D <- data$D\n", + " X <- data$X\n", + "\n", + " # Calculate partialling out estimate with OLS\n", + " results <- double_lasso(y, D, X)\n", + " hat <- results$hat\n", + " stderr <- results$stderr\n", + "\n", + " # Calculate the 95% confidence interval\n", + " ci_lower <- hat - 1.96 * stderr\n", + " ci_upper <- hat + 1.96 * stderr\n", + "\n", + " # Check if the true parameter delta is within the confidence interval\n", + " cov[i] <- (ci_lower <= delta) & (delta <= ci_upper)\n", + "\n", + " hats[i] <- hat\n", + " stderrs[i] <- stderr\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "x1BfN7HEu_E9", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "mean(cov) # great coverage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4mSy5U0CvEBs", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "hist(hats, main = \"Distribution of Estimates\", xlab = \"Estimate\", col = \"skyblue\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ElXwDxR-vEzT", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "1 - mean(hats)\n", + "sd(hats) # very small bias compared to standard deviation\n", + "mean(stderrs)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "hide_input": false, + "kernelspec": { + "display_name": "R", + "name": "ir" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "name": "R" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/PM2/r_heterogenous_wage_effects.Rmd b/PM2/r_heterogenous_wage_effects.Rmd new file mode 100644 index 00000000..f588c35a --- /dev/null +++ b/PM2/r_heterogenous_wage_effects.Rmd @@ -0,0 +1,99 @@ +--- +title: An R Markdown document converted from "PM2/r_heterogenous_wage_effects.irnb" +output: html_document +--- + +# Application: Heterogeneous Effect of Sex on Wage Using Double Lasso + + We use US census data from the year 2015 to analyse the effect of gender and interaction effects of other variables with gender on wage jointly. The dependent variable is the logarithm of the wage, the target variable is *female* (in combination with other variables). All other variables denote some other socio-economic characteristics, e.g. marital status, education, and experience. 
+ + + +This analysis allows a closer look how the gender wage gap is related to other socio-economic variables. + + +```{r} +install.packages("hdm") +install.packages("xtable") +library(hdm) +library(xtable) +``` + +```{r} +file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv" +data <- read.csv(file) +str(data) +data +``` + +```{r} +y <- data$lwage +Z <- subset(data, select = -c(lwage, wage)) +``` + +```{r} +center_colmeans <- function(x) { + xcenter <- colMeans(x) + x - rep(xcenter, rep.int(nrow(x), ncol(x))) +} +``` + +```{r} +# create the model matrix for the covariates +controls_formula <- '~ 0 + (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we + exp1 + exp2 + exp3 + exp4)**2' +Zcontrols <- model.matrix(as.formula(controls_formula), data = Z) +Zcontrols <- center_colmeans(Zcontrols) +``` + +Construct all the variables that we will use to model heterogeneity of effect in a linear manner + +```{r} +# create the model matrix for the linear heterogeneity +linear_het_formula <- '~ -1 + (shs + hsg + scl + clg + mw + so + we + exp1 + exp2 + exp3 + exp4)' +Zhet <- model.matrix(as.formula(linear_het_formula), data = Z) +Zhet <- center_colmeans(Zhet) +``` + +Construct all interaction variables between sex and heterogeneity variables + +```{r} +# create the model matrix for the higher order heterogeneity +Zhet <- as.data.frame(cbind(Zhet, "sex" = Z$sex)) +nonlin_het_formula <- '~ -1 + sex + sex * (shs + hsg + scl + clg + mw + so + we + exp1 + exp2 + exp3 + exp4)' +Zinteractions <- model.matrix(as.formula(nonlin_het_formula), data = Zhet) +interaction_cols <- Zinteractions[, grepl("sex", colnames(Zinteractions))] +``` + +Put variables all together + +```{r} +X <- cbind(Zinteractions, Zcontrols) +``` + +Get estimates and CIs + +```{r} +# this cell takes 30 minutes to run +index_gender <- grep("sex", colnames(Zinteractions)) +effects_female <- rlassoEffects(x = X, y = y, index = index_gender, post = FALSE) +result <- summary(effects_female) +result$coef +print(xtable(result$coef[, c(1, 2, 4)], type = "latex"), digits = 3) +``` + +Now, we estimate and plot confidence intervals, first "pointwise" and then the joint confidence intervals. + +```{r} +pointwise_ci <- confint(effects_female, level = 0.95) +pointwise_ci +print(xtable(pointwise_ci), type = "latex") +``` + +Finally, we compare the pointwise confidence intervals to joint confidence intervals. + +```{r} +joint_ci <- confint(effects_female, level = 0.95, joint = TRUE) +joint_ci +print(xtable(joint_ci), type = "latex") +``` + diff --git a/PM2/r_heterogenous_wage_effects.irnb b/PM2/r_heterogenous_wage_effects.irnb index 6524425f..0deee78b 100644 --- a/PM2/r_heterogenous_wage_effects.irnb +++ b/PM2/r_heterogenous_wage_effects.irnb @@ -1,1029 +1,307 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "6oWBhx2m_eQ8" - }, - "source": [ - "# Application: Heterogeneous Effect of Sex on Wage Using Double Lasso\n", - "\n", - " We use US census data from the year 2015 to analyse the effect of gender and interaction effects of other variables with gender on wage jointly. The dependent variable is the logarithm of the wage, the target variable is *female* (in combination with other variables). All other variables denote some other socio-economic characteristics, e.g. marital status, education, and experience. 
\n", - "\n", - "\n", - "\n", - "This analysis allows a closer look how the gender wage gap is related to other socio-economic variables.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3QN4EOYGQkmz", - "outputId": "a5a8f7f8-def7-4ca6-8c8f-973d00b7bd20", - "vscode": { - "languageId": "r" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "also installing the dependencies ‘iterators’, ‘foreach’, ‘shape’, ‘Rcpp’, ‘RcppEigen’, ‘glmnet’, ‘checkmate’, ‘Formula’\n", - "\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n" - ] - } - ], - "source": [ - "install.packages(\"hdm\")\n", - "install.packages(\"xtable\")\n", - "library(hdm)\n", - "library(xtable)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "fLiMuKqN_eQ-", - "outputId": "88233975-7a27-4614-d878-c718e6dcb072", - "vscode": { - "languageId": "r" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "'data.frame':\t5150 obs. of 20 variables:\n", - " $ wage : num 9.62 48.08 11.06 13.94 28.85 ...\n", - " $ lwage: num 2.26 3.87 2.4 2.63 3.36 ...\n", - " $ sex : int 1 0 0 1 1 1 1 0 1 1 ...\n", - " $ shs : int 0 0 0 0 0 0 0 0 0 0 ...\n", - " $ hsg : int 0 0 1 0 0 0 1 1 1 0 ...\n", - " $ scl : int 0 0 0 0 0 0 0 0 0 0 ...\n", - " $ clg : int 1 1 0 0 1 1 0 0 0 1 ...\n", - " $ ad : int 0 0 0 1 0 0 0 0 0 0 ...\n", - " $ mw : int 0 0 0 0 0 0 0 0 0 0 ...\n", - " $ so : int 0 0 0 0 0 0 0 0 0 0 ...\n", - " $ we : int 0 0 0 0 0 0 0 0 0 0 ...\n", - " $ ne : int 1 1 1 1 1 1 1 1 1 1 ...\n", - " $ exp1 : num 7 31 18 25 22 1 42 37 31 4 ...\n", - " $ exp2 : num 0.49 9.61 3.24 6.25 4.84 ...\n", - " $ exp3 : num 0.343 29.791 5.832 15.625 10.648 ...\n", - " $ exp4 : num 0.24 92.35 10.5 39.06 23.43 ...\n", - " $ occ : num 3600 3050 6260 420 2015 ...\n", - " $ occ2 : int 11 10 19 1 6 5 17 17 13 10 ...\n", - " $ ind : num 8370 5070 770 6990 9470 7460 7280 5680 8590 8190 ...\n", - " $ ind2 : int 18 9 4 12 22 14 14 9 19 18 ...\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\n", - "
A data.frame: 5150 × 20 — HTML preview of the wage data (columns: wage, lwage, sex, shs, hsg, scl, clg, ad, mw, so, we, ne, exp1, exp2, exp3, exp4, occ, occ2, ind, ind2); the same preview rows are repeated below in the text/latex, text/markdown, and text/plain renderings of this output.
\n" - ], - "text/latex": [ - "A data.frame: 5150 × 20\n", - "\\begin{tabular}{llllllllllllllllllll}\n", - " wage & lwage & sex & shs & hsg & scl & clg & ad & mw & so & we & ne & exp1 & exp2 & exp3 & exp4 & occ & occ2 & ind & ind2\\\\\n", - " & & & & & & & & & & & & & & & & & & & \\\\\n", - "\\hline\n", - "\t 9.615385 & 2.263364 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 7.0 & 0.4900 & 0.343000 & 0.24010000 & 3600 & 11 & 8370 & 18\\\\\n", - "\t 48.076923 & 3.872802 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 31.0 & 9.6100 & 29.791000 & 92.35210000 & 3050 & 10 & 5070 & 9\\\\\n", - "\t 11.057692 & 2.403126 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 18.0 & 3.2400 & 5.832000 & 10.49760000 & 6260 & 19 & 770 & 4\\\\\n", - "\t 13.942308 & 2.634928 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 25.0 & 6.2500 & 15.625000 & 39.06250000 & 420 & 1 & 6990 & 12\\\\\n", - "\t 28.846154 & 3.361977 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 22.0 & 4.8400 & 10.648000 & 23.42560000 & 2015 & 6 & 9470 & 22\\\\\n", - "\t 11.730769 & 2.462215 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 1.0 & 0.0100 & 0.001000 & 0.00010000 & 1650 & 5 & 7460 & 14\\\\\n", - "\t 19.230769 & 2.956512 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 42.0 & 17.6400 & 74.088000 & 311.16960000 & 5120 & 17 & 7280 & 14\\\\\n", - "\t 19.230769 & 2.956512 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 37.0 & 13.6900 & 50.653000 & 187.41610000 & 5240 & 17 & 5680 & 9\\\\\n", - "\t 12.000000 & 2.484907 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 31.0 & 9.6100 & 29.791000 & 92.35210000 & 4040 & 13 & 8590 & 19\\\\\n", - "\t 19.230769 & 2.956512 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 4.0 & 0.1600 & 0.064000 & 0.02560000 & 3255 & 10 & 8190 & 18\\\\\n", - "\t 17.307692 & 2.851151 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 7.0 & 0.4900 & 0.343000 & 0.24010000 & 4020 & 13 & 8270 & 18\\\\\n", - "\t 12.019231 & 2.486508 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 30.0 & 9.0000 & 27.000000 & 81.00000000 & 4220 & 14 & 8270 & 18\\\\\n", - "\t 12.019231 & 2.486508 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 5.5 & 0.3025 & 0.166375 & 0.09150625 & 3600 & 11 & 8270 & 18\\\\\n", - "\t 13.461538 & 2.599837 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 20.5 & 4.2025 & 8.615125 & 17.66100625 & 3645 & 11 & 8190 & 18\\\\\n", - "\t 16.346154 & 2.793993 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 25.0 & 6.2500 & 15.625000 & 39.06250000 & 110 & 1 & 7870 & 17\\\\\n", - "\t 27.884615 & 3.328075 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 16.0 & 2.5600 & 4.096000 & 6.55360000 & 6355 & 19 & 770 & 4\\\\\n", - "\t 21.600000 & 3.072693 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 27.0 & 7.2900 & 19.683000 & 53.14410000 & 6320 & 19 & 770 & 4\\\\\n", - "\t 8.653846 & 2.158004 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 3.5 & 0.1225 & 0.042875 & 0.01500625 & 7410 & 20 & 570 & 3\\\\\n", - "\t 19.230769 & 2.956512 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 6.0 & 0.3600 & 0.216000 & 0.12960000 & 7000 & 20 & 4690 & 9\\\\\n", - "\t 13.186813 & 2.579217 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 8.0 & 0.6400 & 0.512000 & 0.40960000 & 4000 & 13 & 8680 & 20\\\\\n", - "\t 10.683761 & 2.368725 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 6.5 & 0.4225 & 0.274625 & 0.17850625 & 4300 & 15 & 8590 & 19\\\\\n", - "\t 11.538462 & 2.445686 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 8.5 & 0.7225 & 0.614125 & 0.52200625 & 3600 & 11 & 8190 & 18\\\\\n", - "\t 17.788462 & 2.878550 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 11.0 & 1.2100 & 1.331000 & 1.46410000 & 2310 & 8 & 7860 & 17\\\\\n", - "\t 19.230769 & 2.956512 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 
& 0 & 1 & 11.0 & 1.2100 & 1.331000 & 1.46410000 & 6200 & 19 & 770 & 4\\\\\n", - "\t 16.304348 & 2.791432 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 24.5 & 6.0025 & 14.706125 & 36.03000625 & 20 & 1 & 5480 & 9\\\\\n", - "\t 19.230769 & 2.956512 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 8.0 & 0.6400 & 0.512000 & 0.40960000 & 4800 & 16 & 7390 & 14\\\\\n", - "\t 14.423077 & 2.668829 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 5.0 & 0.2500 & 0.125000 & 0.06250000 & 430 & 1 & 7390 & 14\\\\\n", - "\t 12.000000 & 2.484907 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 4.5 & 0.2025 & 0.091125 & 0.04100625 & 6330 & 19 & 770 & 4\\\\\n", - "\t 16.826923 & 2.822980 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 8.0 & 0.6400 & 0.512000 & 0.40960000 & 5400 & 17 & 4870 & 9\\\\\n", - "\t 19.670330 & 2.979111 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 26.0 & 6.7600 & 17.576000 & 45.69760000 & 6230 & 19 & 6570 & 11\\\\\n", - "\t ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮\\\\\n", - "\t 13.986014 & 2.638058 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 7.0 & 0.4900 & 0.343000 & 0.2401000 & 3820 & 12 & 9590 & 22\\\\\n", - "\t 15.865385 & 2.764140 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 2.0 & 0.0400 & 0.008000 & 0.0016000 & 2200 & 8 & 7870 & 17\\\\\n", - "\t 38.461538 & 3.649659 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 7.5 & 0.5625 & 0.421875 & 0.3164062 & 220 & 1 & 770 & 4\\\\\n", - "\t 28.846154 & 3.361977 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 5.0 & 0.2500 & 0.125000 & 0.0625000 & 1360 & 4 & 770 & 4\\\\\n", - "\t 24.475524 & 3.197674 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 24.0 & 5.7600 & 13.824000 & 33.1776000 & 2310 & 8 & 7860 & 17\\\\\n", - "\t 27.884615 & 3.328075 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 28.5 & 8.1225 & 23.149125 & 65.9750063 & 8140 & 21 & 770 & 4\\\\\n", - "\t 8.653846 & 2.158004 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 25.0 & 6.2500 & 15.625000 & 39.0625000 & 3930 & 12 & 7680 & 16\\\\\n", - "\t 12.019231 & 2.486508 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 19.0 & 3.6100 & 6.859000 & 13.0321000 & 2750 & 9 & 8560 & 19\\\\\n", - "\t 38.461538 & 3.649659 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 8.0 & 0.6400 & 0.512000 & 0.4096000 & 710 & 2 & 6870 & 12\\\\\n", - "\t 12.500000 & 2.525729 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 12.0 & 1.4400 & 1.728000 & 2.0736000 & 4010 & 13 & 8680 & 20\\\\\n", - "\t 35.256410 & 3.562647 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 6.0 & 0.3600 & 0.216000 & 0.1296000 & 2320 & 8 & 7860 & 17\\\\\n", - "\t 48.076923 & 3.872802 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 25.0 & 6.2500 & 15.625000 & 39.0625000 & 3255 & 10 & 8170 & 18\\\\\n", - "\t 9.615385 & 2.263364 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 20.0 & 4.0000 & 8.000000 & 16.0000000 & 4110 & 13 & 8680 & 20\\\\\n", - "\t 12.019231 & 2.486508 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 17.0 & 2.8900 & 4.913000 & 8.3521000 & 9620 & 22 & 6290 & 10\\\\\n", - "\t 12.019231 & 2.486508 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 31.5 & 9.9225 & 31.255875 & 98.4560063 & 5510 & 17 & 6380 & 10\\\\\n", - "\t 12.980769 & 2.563469 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 15.0 & 2.2500 & 3.375000 & 5.0625000 & 2010 & 6 & 9370 & 22\\\\\n", - "\t 26.442308 & 3.274965 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 34.0 & 11.5600 & 39.304000 & 133.6336000 & 2310 & 8 & 7860 & 17\\\\\n", - "\t 13.461538 & 2.599837 & 1 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 8.0 & 0.6400 & 0.512000 & 0.4096000 & 4720 & 16 & 8590 & 19\\\\\n", - "\t 19.711538 & 2.981204 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 8.0 & 0.6400 & 
0.512000 & 0.4096000 & 2000 & 6 & 8090 & 18\\\\\n", - "\t 21.153846 & 3.051822 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 28.0 & 7.8400 & 21.952000 & 61.4656000 & 40 & 1 & 9170 & 21\\\\\n", - "\t 45.546559 & 3.818735 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 5.0 & 0.2500 & 0.125000 & 0.0625000 & 3255 & 10 & 8190 & 18\\\\\n", - "\t 22.596154 & 3.117780 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 15.0 & 2.2500 & 3.375000 & 5.0625000 & 9620 & 22 & 5390 & 9\\\\\n", - "\t 16.826923 & 2.822980 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 11.0 & 1.2100 & 1.331000 & 1.4641000 & 7150 & 20 & 8770 & 21\\\\\n", - "\t 24.038462 & 3.179655 & 1 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 17.0 & 2.8900 & 4.913000 & 8.3521000 & 2550 & 8 & 9480 & 22\\\\\n", - "\t 13.846154 & 2.628007 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 10.0 & 1.0000 & 1.000000 & 1.0000000 & 800 & 2 & 770 & 4\\\\\n", - "\t 14.769231 & 2.692546 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 1 & 0 & 9.0 & 0.8100 & 0.729000 & 0.6561000 & 4700 & 16 & 4970 & 9\\\\\n", - "\t 23.076923 & 3.138833 & 1 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 1 & 0 & 12.0 & 1.4400 & 1.728000 & 2.0736000 & 4110 & 13 & 8680 & 20\\\\\n", - "\t 38.461538 & 3.649659 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 11.0 & 1.2100 & 1.331000 & 1.4641000 & 1550 & 4 & 3680 & 6\\\\\n", - "\t 32.967033 & 3.495508 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 10.0 & 1.0000 & 1.000000 & 1.0000000 & 2920 & 9 & 6570 & 11\\\\\n", - "\t 17.307692 & 2.851151 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 1 & 0 & 14.0 & 1.9600 & 2.744000 & 3.8416000 & 1610 & 5 & 7460 & 14\\\\\n", - "\\end{tabular}\n" - ], - "text/markdown": [ - "\n", - "A data.frame: 5150 × 20\n", - "\n", - "| wage <dbl> | lwage <dbl> | sex <int> | shs <int> | hsg <int> | scl <int> | clg <int> | ad <int> | mw <int> | so <int> | we <int> | ne <int> | exp1 <dbl> | exp2 <dbl> | exp3 <dbl> | exp4 <dbl> | occ <dbl> | occ2 <int> | ind <dbl> | ind2 <int> |\n", - "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", - "| 9.615385 | 2.263364 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 7.0 | 0.4900 | 0.343000 | 0.24010000 | 3600 | 11 | 8370 | 18 |\n", - "| 48.076923 | 3.872802 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 31.0 | 9.6100 | 29.791000 | 92.35210000 | 3050 | 10 | 5070 | 9 |\n", - "| 11.057692 | 2.403126 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 18.0 | 3.2400 | 5.832000 | 10.49760000 | 6260 | 19 | 770 | 4 |\n", - "| 13.942308 | 2.634928 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 25.0 | 6.2500 | 15.625000 | 39.06250000 | 420 | 1 | 6990 | 12 |\n", - "| 28.846154 | 3.361977 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 22.0 | 4.8400 | 10.648000 | 23.42560000 | 2015 | 6 | 9470 | 22 |\n", - "| 11.730769 | 2.462215 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1.0 | 0.0100 | 0.001000 | 0.00010000 | 1650 | 5 | 7460 | 14 |\n", - "| 19.230769 | 2.956512 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 42.0 | 17.6400 | 74.088000 | 311.16960000 | 5120 | 17 | 7280 | 14 |\n", - "| 19.230769 | 2.956512 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 37.0 | 13.6900 | 50.653000 | 187.41610000 | 5240 | 17 | 5680 | 9 |\n", - "| 12.000000 | 2.484907 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 31.0 | 9.6100 | 29.791000 | 92.35210000 | 4040 | 13 | 8590 | 19 |\n", - "| 19.230769 | 2.956512 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 4.0 | 0.1600 | 0.064000 | 0.02560000 | 3255 | 10 | 8190 | 18 |\n", - "| 17.307692 | 2.851151 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 7.0 | 0.4900 | 0.343000 | 0.24010000 | 4020 | 13 | 8270 | 18 |\n", - "| 12.019231 | 2.486508 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 
| 1 | 30.0 | 9.0000 | 27.000000 | 81.00000000 | 4220 | 14 | 8270 | 18 |\n", - "| 12.019231 | 2.486508 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 5.5 | 0.3025 | 0.166375 | 0.09150625 | 3600 | 11 | 8270 | 18 |\n", - "| 13.461538 | 2.599837 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 20.5 | 4.2025 | 8.615125 | 17.66100625 | 3645 | 11 | 8190 | 18 |\n", - "| 16.346154 | 2.793993 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 25.0 | 6.2500 | 15.625000 | 39.06250000 | 110 | 1 | 7870 | 17 |\n", - "| 27.884615 | 3.328075 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 16.0 | 2.5600 | 4.096000 | 6.55360000 | 6355 | 19 | 770 | 4 |\n", - "| 21.600000 | 3.072693 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 27.0 | 7.2900 | 19.683000 | 53.14410000 | 6320 | 19 | 770 | 4 |\n", - "| 8.653846 | 2.158004 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 3.5 | 0.1225 | 0.042875 | 0.01500625 | 7410 | 20 | 570 | 3 |\n", - "| 19.230769 | 2.956512 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 6.0 | 0.3600 | 0.216000 | 0.12960000 | 7000 | 20 | 4690 | 9 |\n", - "| 13.186813 | 2.579217 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 8.0 | 0.6400 | 0.512000 | 0.40960000 | 4000 | 13 | 8680 | 20 |\n", - "| 10.683761 | 2.368725 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 6.5 | 0.4225 | 0.274625 | 0.17850625 | 4300 | 15 | 8590 | 19 |\n", - "| 11.538462 | 2.445686 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 8.5 | 0.7225 | 0.614125 | 0.52200625 | 3600 | 11 | 8190 | 18 |\n", - "| 17.788462 | 2.878550 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 11.0 | 1.2100 | 1.331000 | 1.46410000 | 2310 | 8 | 7860 | 17 |\n", - "| 19.230769 | 2.956512 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 11.0 | 1.2100 | 1.331000 | 1.46410000 | 6200 | 19 | 770 | 4 |\n", - "| 16.304348 | 2.791432 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 24.5 | 6.0025 | 14.706125 | 36.03000625 | 20 | 1 | 5480 | 9 |\n", - "| 19.230769 | 2.956512 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 8.0 | 0.6400 | 0.512000 | 0.40960000 | 4800 | 16 | 7390 | 14 |\n", - "| 14.423077 | 2.668829 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5.0 | 0.2500 | 0.125000 | 0.06250000 | 430 | 1 | 7390 | 14 |\n", - "| 12.000000 | 2.484907 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 4.5 | 0.2025 | 0.091125 | 0.04100625 | 6330 | 19 | 770 | 4 |\n", - "| 16.826923 | 2.822980 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 8.0 | 0.6400 | 0.512000 | 0.40960000 | 5400 | 17 | 4870 | 9 |\n", - "| 19.670330 | 2.979111 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 26.0 | 6.7600 | 17.576000 | 45.69760000 | 6230 | 19 | 6570 | 11 |\n", - "| ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |\n", - "| 13.986014 | 2.638058 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 7.0 | 0.4900 | 0.343000 | 0.2401000 | 3820 | 12 | 9590 | 22 |\n", - "| 15.865385 | 2.764140 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 2.0 | 0.0400 | 0.008000 | 0.0016000 | 2200 | 8 | 7870 | 17 |\n", - "| 38.461538 | 3.649659 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 7.5 | 0.5625 | 0.421875 | 0.3164062 | 220 | 1 | 770 | 4 |\n", - "| 28.846154 | 3.361977 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 5.0 | 0.2500 | 0.125000 | 0.0625000 | 1360 | 4 | 770 | 4 |\n", - "| 24.475524 | 3.197674 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 24.0 | 5.7600 | 13.824000 | 33.1776000 | 2310 | 8 | 7860 | 17 |\n", - "| 27.884615 | 3.328075 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 28.5 | 8.1225 | 23.149125 | 65.9750063 | 8140 | 21 | 770 | 4 |\n", - "| 8.653846 | 2.158004 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 25.0 | 6.2500 | 15.625000 | 39.0625000 | 3930 | 12 | 7680 | 16 |\n", - "| 12.019231 | 2.486508 | 0 
| 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 19.0 | 3.6100 | 6.859000 | 13.0321000 | 2750 | 9 | 8560 | 19 |\n", - "| 38.461538 | 3.649659 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 8.0 | 0.6400 | 0.512000 | 0.4096000 | 710 | 2 | 6870 | 12 |\n", - "| 12.500000 | 2.525729 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 12.0 | 1.4400 | 1.728000 | 2.0736000 | 4010 | 13 | 8680 | 20 |\n", - "| 35.256410 | 3.562647 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 6.0 | 0.3600 | 0.216000 | 0.1296000 | 2320 | 8 | 7860 | 17 |\n", - "| 48.076923 | 3.872802 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 25.0 | 6.2500 | 15.625000 | 39.0625000 | 3255 | 10 | 8170 | 18 |\n", - "| 9.615385 | 2.263364 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 20.0 | 4.0000 | 8.000000 | 16.0000000 | 4110 | 13 | 8680 | 20 |\n", - "| 12.019231 | 2.486508 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 17.0 | 2.8900 | 4.913000 | 8.3521000 | 9620 | 22 | 6290 | 10 |\n", - "| 12.019231 | 2.486508 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 31.5 | 9.9225 | 31.255875 | 98.4560063 | 5510 | 17 | 6380 | 10 |\n", - "| 12.980769 | 2.563469 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 15.0 | 2.2500 | 3.375000 | 5.0625000 | 2010 | 6 | 9370 | 22 |\n", - "| 26.442308 | 3.274965 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 34.0 | 11.5600 | 39.304000 | 133.6336000 | 2310 | 8 | 7860 | 17 |\n", - "| 13.461538 | 2.599837 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 8.0 | 0.6400 | 0.512000 | 0.4096000 | 4720 | 16 | 8590 | 19 |\n", - "| 19.711538 | 2.981204 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 8.0 | 0.6400 | 0.512000 | 0.4096000 | 2000 | 6 | 8090 | 18 |\n", - "| 21.153846 | 3.051822 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 28.0 | 7.8400 | 21.952000 | 61.4656000 | 40 | 1 | 9170 | 21 |\n", - "| 45.546559 | 3.818735 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 5.0 | 0.2500 | 0.125000 | 0.0625000 | 3255 | 10 | 8190 | 18 |\n", - "| 22.596154 | 3.117780 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 15.0 | 2.2500 | 3.375000 | 5.0625000 | 9620 | 22 | 5390 | 9 |\n", - "| 16.826923 | 2.822980 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 11.0 | 1.2100 | 1.331000 | 1.4641000 | 7150 | 20 | 8770 | 21 |\n", - "| 24.038462 | 3.179655 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 17.0 | 2.8900 | 4.913000 | 8.3521000 | 2550 | 8 | 9480 | 22 |\n", - "| 13.846154 | 2.628007 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 10.0 | 1.0000 | 1.000000 | 1.0000000 | 800 | 2 | 770 | 4 |\n", - "| 14.769231 | 2.692546 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 9.0 | 0.8100 | 0.729000 | 0.6561000 | 4700 | 16 | 4970 | 9 |\n", - "| 23.076923 | 3.138833 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 12.0 | 1.4400 | 1.728000 | 2.0736000 | 4110 | 13 | 8680 | 20 |\n", - "| 38.461538 | 3.649659 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 11.0 | 1.2100 | 1.331000 | 1.4641000 | 1550 | 4 | 3680 | 6 |\n", - "| 32.967033 | 3.495508 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 10.0 | 1.0000 | 1.000000 | 1.0000000 | 2920 | 9 | 6570 | 11 |\n", - "| 17.307692 | 2.851151 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 14.0 | 1.9600 | 2.744000 | 3.8416000 | 1610 | 5 | 7460 | 14 |\n", - "\n" - ], - "text/plain": [ - " wage lwage sex shs hsg scl clg ad mw so we ne exp1 exp2 \n", - "1 9.615385 2.263364 1 0 0 0 1 0 0 0 0 1 7.0 0.4900\n", - "2 48.076923 3.872802 0 0 0 0 1 0 0 0 0 1 31.0 9.6100\n", - "3 11.057692 2.403126 0 0 1 0 0 0 0 0 0 1 18.0 3.2400\n", - "4 13.942308 2.634928 1 0 0 0 0 1 0 0 0 1 25.0 6.2500\n", - "5 28.846154 3.361977 1 0 0 0 1 0 0 0 0 1 22.0 4.8400\n", - "6 11.730769 2.462215 1 0 0 0 1 0 0 0 0 1 1.0 0.0100\n", - "7 19.230769 2.956512 1 0 1 0 0 0 0 0 0 1 
42.0 17.6400\n", - "8 19.230769 2.956512 0 0 1 0 0 0 0 0 0 1 37.0 13.6900\n", - "9 12.000000 2.484907 1 0 1 0 0 0 0 0 0 1 31.0 9.6100\n", - "10 19.230769 2.956512 1 0 0 0 1 0 0 0 0 1 4.0 0.1600\n", - "11 17.307692 2.851151 1 0 1 0 0 0 0 0 0 1 7.0 0.4900\n", - "12 12.019231 2.486508 0 0 1 0 0 0 0 0 0 1 30.0 9.0000\n", - "13 12.019231 2.486508 1 0 0 1 0 0 0 0 0 1 5.5 0.3025\n", - "14 13.461538 2.599837 1 0 0 1 0 0 0 0 0 1 20.5 4.2025\n", - "15 16.346154 2.793993 1 0 0 0 1 0 0 0 0 1 25.0 6.2500\n", - "16 27.884615 3.328075 0 0 1 0 0 0 0 0 0 1 16.0 2.5600\n", - "17 21.600000 3.072693 0 0 1 0 0 0 0 0 0 1 27.0 7.2900\n", - "18 8.653846 2.158004 0 0 0 1 0 0 0 0 0 1 3.5 0.1225\n", - "19 19.230769 2.956512 0 0 1 0 0 0 0 0 0 1 6.0 0.3600\n", - "20 13.186813 2.579217 0 0 1 0 0 0 0 0 0 1 8.0 0.6400\n", - "21 10.683761 2.368725 1 0 0 1 0 0 0 0 0 1 6.5 0.4225\n", - "22 11.538462 2.445686 1 0 0 1 0 0 0 0 0 1 8.5 0.7225\n", - "23 17.788462 2.878550 1 0 0 0 0 1 0 0 0 1 11.0 1.2100\n", - "24 19.230769 2.956512 0 0 0 0 1 0 0 0 0 1 11.0 1.2100\n", - "25 16.304348 2.791432 1 0 0 1 0 0 0 0 0 1 24.5 6.0025\n", - "26 19.230769 2.956512 0 0 1 0 0 0 0 0 0 1 8.0 0.6400\n", - "27 14.423077 2.668829 1 0 1 0 0 0 0 0 0 1 5.0 0.2500\n", - "28 12.000000 2.484907 0 0 0 1 0 0 0 0 0 1 4.5 0.2025\n", - "29 16.826923 2.822980 0 0 0 0 1 0 0 0 0 1 8.0 0.6400\n", - "30 19.670330 2.979111 0 0 0 0 1 0 0 0 0 1 26.0 6.7600\n", - "⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ \n", - "5121 13.986014 2.638058 1 0 0 0 0 1 0 0 1 0 7.0 0.4900\n", - "5122 15.865385 2.764140 0 0 0 0 0 1 0 0 1 0 2.0 0.0400\n", - "5123 38.461538 3.649659 1 0 0 1 0 0 0 0 1 0 7.5 0.5625\n", - "5124 28.846154 3.361977 0 0 0 0 1 0 0 0 1 0 5.0 0.2500\n", - "5125 24.475524 3.197674 1 0 0 0 0 1 0 0 1 0 24.0 5.7600\n", - "5126 27.884615 3.328075 0 0 0 1 0 0 0 0 1 0 28.5 8.1225\n", - "5127 8.653846 2.158004 1 0 1 0 0 0 0 0 1 0 25.0 6.2500\n", - "5128 12.019231 2.486508 0 0 1 0 0 0 0 0 1 0 19.0 3.6100\n", - "5129 38.461538 3.649659 0 0 0 0 1 0 0 0 1 0 8.0 0.6400\n", - "5130 12.500000 2.525729 1 0 1 0 0 0 0 0 1 0 12.0 1.4400\n", - "5131 35.256410 3.562647 0 0 0 0 0 1 0 0 1 0 6.0 0.3600\n", - "5132 48.076923 3.872802 1 0 0 0 1 0 0 0 1 0 25.0 6.2500\n", - "5133 9.615385 2.263364 0 0 1 0 0 0 0 0 1 0 20.0 4.0000\n", - "5134 12.019231 2.486508 0 0 1 0 0 0 0 0 1 0 17.0 2.8900\n", - "5135 12.019231 2.486508 0 0 0 1 0 0 0 0 1 0 31.5 9.9225\n", - "5136 12.980769 2.563469 0 0 1 0 0 0 0 0 1 0 15.0 2.2500\n", - "5137 26.442308 3.274965 1 0 0 0 1 0 0 0 1 0 34.0 11.5600\n", - "5138 13.461538 2.599837 1 0 1 0 0 0 0 0 1 0 8.0 0.6400\n", - "5139 19.711538 2.981204 1 0 0 0 0 1 0 0 1 0 8.0 0.6400\n", - "5140 21.153846 3.051822 0 0 0 0 1 0 0 0 1 0 28.0 7.8400\n", - "5141 45.546559 3.818735 1 0 0 0 1 0 0 0 1 0 5.0 0.2500\n", - "5142 22.596154 3.117780 0 0 1 0 0 0 0 0 1 0 15.0 2.2500\n", - "5143 16.826923 2.822980 0 0 1 0 0 0 0 0 1 0 11.0 1.2100\n", - "5144 24.038462 3.179655 1 0 0 0 1 0 0 0 1 0 17.0 2.8900\n", - "5145 13.846154 2.628007 0 0 0 0 1 0 0 0 1 0 10.0 1.0000\n", - "5146 14.769231 2.692546 0 0 0 0 1 0 0 0 1 0 9.0 0.8100\n", - "5147 23.076923 3.138833 1 0 0 1 0 0 0 0 1 0 12.0 1.4400\n", - "5148 38.461538 3.649659 0 0 0 0 0 1 0 0 1 0 11.0 1.2100\n", - "5149 32.967033 3.495508 0 0 1 0 0 0 0 0 1 0 10.0 1.0000\n", - "5150 17.307692 2.851151 0 0 0 0 0 1 0 0 1 0 14.0 1.9600\n", - " exp3 exp4 occ occ2 ind ind2\n", - "1 0.343000 0.24010000 3600 11 8370 18 \n", - "2 29.791000 92.35210000 3050 10 5070 9 \n", - "3 5.832000 10.49760000 6260 19 770 4 \n", - "4 15.625000 39.06250000 420 1 6990 12 \n", - "5 
10.648000 23.42560000 2015 6 9470 22 \n", - "6 0.001000 0.00010000 1650 5 7460 14 \n", - "7 74.088000 311.16960000 5120 17 7280 14 \n", - "8 50.653000 187.41610000 5240 17 5680 9 \n", - "9 29.791000 92.35210000 4040 13 8590 19 \n", - "10 0.064000 0.02560000 3255 10 8190 18 \n", - "11 0.343000 0.24010000 4020 13 8270 18 \n", - "12 27.000000 81.00000000 4220 14 8270 18 \n", - "13 0.166375 0.09150625 3600 11 8270 18 \n", - "14 8.615125 17.66100625 3645 11 8190 18 \n", - "15 15.625000 39.06250000 110 1 7870 17 \n", - "16 4.096000 6.55360000 6355 19 770 4 \n", - "17 19.683000 53.14410000 6320 19 770 4 \n", - "18 0.042875 0.01500625 7410 20 570 3 \n", - "19 0.216000 0.12960000 7000 20 4690 9 \n", - "20 0.512000 0.40960000 4000 13 8680 20 \n", - "21 0.274625 0.17850625 4300 15 8590 19 \n", - "22 0.614125 0.52200625 3600 11 8190 18 \n", - "23 1.331000 1.46410000 2310 8 7860 17 \n", - "24 1.331000 1.46410000 6200 19 770 4 \n", - "25 14.706125 36.03000625 20 1 5480 9 \n", - "26 0.512000 0.40960000 4800 16 7390 14 \n", - "27 0.125000 0.06250000 430 1 7390 14 \n", - "28 0.091125 0.04100625 6330 19 770 4 \n", - "29 0.512000 0.40960000 5400 17 4870 9 \n", - "30 17.576000 45.69760000 6230 19 6570 11 \n", - "⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ \n", - "5121 0.343000 0.2401000 3820 12 9590 22 \n", - "5122 0.008000 0.0016000 2200 8 7870 17 \n", - "5123 0.421875 0.3164062 220 1 770 4 \n", - "5124 0.125000 0.0625000 1360 4 770 4 \n", - "5125 13.824000 33.1776000 2310 8 7860 17 \n", - "5126 23.149125 65.9750063 8140 21 770 4 \n", - "5127 15.625000 39.0625000 3930 12 7680 16 \n", - "5128 6.859000 13.0321000 2750 9 8560 19 \n", - "5129 0.512000 0.4096000 710 2 6870 12 \n", - "5130 1.728000 2.0736000 4010 13 8680 20 \n", - "5131 0.216000 0.1296000 2320 8 7860 17 \n", - "5132 15.625000 39.0625000 3255 10 8170 18 \n", - "5133 8.000000 16.0000000 4110 13 8680 20 \n", - "5134 4.913000 8.3521000 9620 22 6290 10 \n", - "5135 31.255875 98.4560063 5510 17 6380 10 \n", - "5136 3.375000 5.0625000 2010 6 9370 22 \n", - "5137 39.304000 133.6336000 2310 8 7860 17 \n", - "5138 0.512000 0.4096000 4720 16 8590 19 \n", - "5139 0.512000 0.4096000 2000 6 8090 18 \n", - "5140 21.952000 61.4656000 40 1 9170 21 \n", - "5141 0.125000 0.0625000 3255 10 8190 18 \n", - "5142 3.375000 5.0625000 9620 22 5390 9 \n", - "5143 1.331000 1.4641000 7150 20 8770 21 \n", - "5144 4.913000 8.3521000 2550 8 9480 22 \n", - "5145 1.000000 1.0000000 800 2 770 4 \n", - "5146 0.729000 0.6561000 4700 16 4970 9 \n", - "5147 1.728000 2.0736000 4110 13 8680 20 \n", - "5148 1.331000 1.4641000 1550 4 3680 6 \n", - "5149 1.000000 1.0000000 2920 9 6570 11 \n", - "5150 2.744000 3.8416000 1610 5 7460 14 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv\"\n", - "data <- read.csv(file)\n", - "str(data)\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "GLjEqmK8hEU8", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "y <- data$lwage\n", - "Z <- subset(data, select = -c(lwage, wage))" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "-IX2Sy1A-QCX", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "center_colmeans <- function(x) {\n", - " xcenter <- colMeans(x)\n", - " x - rep(xcenter, rep.int(nrow(x), ncol(x)))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": 
"kaygPMYdelFI", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# create the model matrix for the covariates\n", - "controls_formula <- '~ 0 + (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we + exp1 + exp2 + exp3 + exp4)**2'\n", - "Zcontrols <- model.matrix(as.formula(controls_formula), data = Z)\n", - "Zcontrols <- center_colmeans(Zcontrols)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FYUE8DCsh6QL" - }, - "source": [ - "Construct all the variables that we will use to model heterogeneity of effect in a linear manner" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "qVF99n7dhyc-", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# create the model matrix for the linear heterogeneity\n", - "linear_het_formula <- '~ -1 + (shs + hsg + scl + clg + mw + so + we + exp1 + exp2 + exp3 + exp4)'\n", - "Zhet <- model.matrix(as.formula(linear_het_formula), data = Z)\n", - "Zhet <- center_colmeans(Zhet)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ySaY8-X0iXyP" - }, - "source": [ - "Construct all interaction variables between sex and heterogeneity variables" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "jPGR47mfhzBu", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# create the model matrix for the higher order heterogeneity\n", - "Zhet <- as.data.frame(cbind(Zhet, \"sex\" = Z$sex))\n", - "nonlin_het_formula <- '~ -1 + sex + sex * (shs + hsg + scl + clg + mw + so + we + exp1 + exp2 + exp3 + exp4)'\n", - "Zinteractions <- model.matrix(as.formula(nonlin_het_formula), data = Zhet)\n", - "interaction_cols <- Zinteractions[, grepl(\"sex\", colnames(Zinteractions))]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mt96NtDIll5_" - }, - "source": [ - "Put variables all together" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "niEXvfVSlk3v", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "X <- cbind(Zinteractions, Zcontrols)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hWtAmml_Kf2v" - }, - "source": [ - "Get estimates and CIs" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 854 - }, - "id": "YO7Hmw5nllBK", - "outputId": "39873473-1933-4d81-d536-da42d661072e", - "vscode": { - "languageId": "r" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\t\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\n", - "
A matrix: 12 × 4 of type dbl

| | Estimate. | Std. Error | t value | Pr(>|t|) |
|---|---|---|---|---|
| sex | -0.067849974 | 0.015091077 | -4.4960327 | 6.923309e-06 |
| sex:shs | -0.197841195 | 0.108831873 | -1.8178608 | 6.908541e-02 |
| sex:hsg | 0.012340806 | 0.049116741 | 0.2512546 | 8.016173e-01 |
| sex:scl | 0.021446184 | 0.046697210 | 0.4592605 | 6.460471e-01 |
| sex:clg | 0.061623588 | 0.043983877 | 1.4010495 | 1.611993e-01 |
| sex:mw | -0.108545899 | 0.041038649 | -2.6449677 | 8.169872e-03 |
| sex:so | -0.072790206 | 0.039651133 | -1.8357661 | 6.639226e-02 |
| sex:we | -0.050935968 | 0.041895027 | -1.2157999 | 2.240612e-01 |
| sex:exp1 | 0.018014937 | 0.006981997 | 2.5801983 | 9.874358e-03 |
| sex:exp2 | 0.023587076 | 0.049880609 | 0.4728706 | 6.363055e-01 |
| sex:exp3 | -0.054853929 | 0.033450520 | -1.6398528 | 1.010358e-01 |
| sex:exp4 | -0.007313037 | 0.002027920 | -3.6061767 | 3.107416e-04 |
\n" - ], - "text/latex": [ - "A matrix: 12 × 4 of type dbl\n", - "\\begin{tabular}{r|llll}\n", - " & Estimate. & Std. Error & t value & Pr(>\\textbar{}t\\textbar{})\\\\\n", - "\\hline\n", - "\tsex & -0.067849974 & 0.015091077 & -4.4960327 & 6.923309e-06\\\\\n", - "\tsex:shs & -0.197841195 & 0.108831873 & -1.8178608 & 6.908541e-02\\\\\n", - "\tsex:hsg & 0.012340806 & 0.049116741 & 0.2512546 & 8.016173e-01\\\\\n", - "\tsex:scl & 0.021446184 & 0.046697210 & 0.4592605 & 6.460471e-01\\\\\n", - "\tsex:clg & 0.061623588 & 0.043983877 & 1.4010495 & 1.611993e-01\\\\\n", - "\tsex:mw & -0.108545899 & 0.041038649 & -2.6449677 & 8.169872e-03\\\\\n", - "\tsex:so & -0.072790206 & 0.039651133 & -1.8357661 & 6.639226e-02\\\\\n", - "\tsex:we & -0.050935968 & 0.041895027 & -1.2157999 & 2.240612e-01\\\\\n", - "\tsex:exp1 & 0.018014937 & 0.006981997 & 2.5801983 & 9.874358e-03\\\\\n", - "\tsex:exp2 & 0.023587076 & 0.049880609 & 0.4728706 & 6.363055e-01\\\\\n", - "\tsex:exp3 & -0.054853929 & 0.033450520 & -1.6398528 & 1.010358e-01\\\\\n", - "\tsex:exp4 & -0.007313037 & 0.002027920 & -3.6061767 & 3.107416e-04\\\\\n", - "\\end{tabular}\n" - ], - "text/markdown": [ - "\n", - "A matrix: 12 × 4 of type dbl\n", - "\n", - "| | Estimate. | Std. Error | t value | Pr(>|t|) |\n", - "|---|---|---|---|---|\n", - "| sex | -0.067849974 | 0.015091077 | -4.4960327 | 6.923309e-06 |\n", - "| sex:shs | -0.197841195 | 0.108831873 | -1.8178608 | 6.908541e-02 |\n", - "| sex:hsg | 0.012340806 | 0.049116741 | 0.2512546 | 8.016173e-01 |\n", - "| sex:scl | 0.021446184 | 0.046697210 | 0.4592605 | 6.460471e-01 |\n", - "| sex:clg | 0.061623588 | 0.043983877 | 1.4010495 | 1.611993e-01 |\n", - "| sex:mw | -0.108545899 | 0.041038649 | -2.6449677 | 8.169872e-03 |\n", - "| sex:so | -0.072790206 | 0.039651133 | -1.8357661 | 6.639226e-02 |\n", - "| sex:we | -0.050935968 | 0.041895027 | -1.2157999 | 2.240612e-01 |\n", - "| sex:exp1 | 0.018014937 | 0.006981997 | 2.5801983 | 9.874358e-03 |\n", - "| sex:exp2 | 0.023587076 | 0.049880609 | 0.4728706 | 6.363055e-01 |\n", - "| sex:exp3 | -0.054853929 | 0.033450520 | -1.6398528 | 1.010358e-01 |\n", - "| sex:exp4 | -0.007313037 | 0.002027920 | -3.6061767 | 3.107416e-04 |\n", - "\n" - ], - "text/plain": [ - " Estimate. Std. Error t value Pr(>|t|) \n", - "sex -0.067849974 0.015091077 -4.4960327 6.923309e-06\n", - "sex:shs -0.197841195 0.108831873 -1.8178608 6.908541e-02\n", - "sex:hsg 0.012340806 0.049116741 0.2512546 8.016173e-01\n", - "sex:scl 0.021446184 0.046697210 0.4592605 6.460471e-01\n", - "sex:clg 0.061623588 0.043983877 1.4010495 1.611993e-01\n", - "sex:mw -0.108545899 0.041038649 -2.6449677 8.169872e-03\n", - "sex:so -0.072790206 0.039651133 -1.8357661 6.639226e-02\n", - "sex:we -0.050935968 0.041895027 -1.2157999 2.240612e-01\n", - "sex:exp1 0.018014937 0.006981997 2.5801983 9.874358e-03\n", - "sex:exp2 0.023587076 0.049880609 0.4728706 6.363055e-01\n", - "sex:exp3 -0.054853929 0.033450520 -1.6398528 1.010358e-01\n", - "sex:exp4 -0.007313037 0.002027920 -3.6061767 3.107416e-04" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "% latex table generated in R 4.3.2 by xtable 1.8-4 package\n", - "% Tue Feb 13 18:38:15 2024\n", - "\\begin{table}[ht]\n", - "\\centering\n", - "\\begin{tabular}{rrrr}\n", - " \\hline\n", - " & Estimate. & Std. 
Error & Pr($>$$|$t$|$) \\\\ \n", - " \\hline\n", - "sex & -0.07 & 0.02 & 0.00 \\\\ \n", - " sex:shs & -0.20 & 0.11 & 0.07 \\\\ \n", - " sex:hsg & 0.01 & 0.05 & 0.80 \\\\ \n", - " sex:scl & 0.02 & 0.05 & 0.65 \\\\ \n", - " sex:clg & 0.06 & 0.04 & 0.16 \\\\ \n", - " sex:mw & -0.11 & 0.04 & 0.01 \\\\ \n", - " sex:so & -0.07 & 0.04 & 0.07 \\\\ \n", - " sex:we & -0.05 & 0.04 & 0.22 \\\\ \n", - " sex:exp1 & 0.02 & 0.01 & 0.01 \\\\ \n", - " sex:exp2 & 0.02 & 0.05 & 0.64 \\\\ \n", - " sex:exp3 & -0.05 & 0.03 & 0.10 \\\\ \n", - " sex:exp4 & -0.01 & 0.00 & 0.00 \\\\ \n", - " \\hline\n", - "\\end{tabular}\n", - "\\end{table}\n" - ] - } - ], - "source": [ - "# this cell takes 30 minutes to run\n", - "index_gender <- grep(\"sex\", colnames(Zinteractions))\n", - "effects_female <- rlassoEffects(x = X, y = y, index = index_gender, post = FALSE)\n", - "result <- summary(effects_female)\n", - "result$coef\n", - "print(xtable(result$coef[, c(1, 2, 4)], type = \"latex\"), digits = 3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "n5ZjOw5t_eRA" - }, - "source": [ - "Now, we estimate and plot confidence intervals, first \"pointwise\" and then the joint confidence intervals." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 854 - }, - "id": "d88JnYGG_eRA", - "outputId": "a9221fd0-db3b-418b-8823-be9e79ba0b77", - "vscode": { - "languageId": "r" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\t\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\n", - "
A matrix: 12 × 2 of type dbl

| | 2.5 % | 97.5 % |
|---|---|---|
| sex | -0.097427941 | -0.038272007 |
| sex:shs | -0.411147747 | 0.015465357 |
| sex:hsg | -0.083926237 | 0.108607850 |
| sex:scl | -0.070078666 | 0.112971034 |
| sex:clg | -0.024583227 | 0.147830403 |
| sex:mw | -0.188980172 | -0.028111625 |
| sex:so | -0.150504999 | 0.004924587 |
| sex:we | -0.133048712 | 0.031176776 |
| sex:exp1 | 0.004330474 | 0.031699400 |
| sex:exp2 | -0.074177122 | 0.121351274 |
| sex:exp3 | -0.120415743 | 0.010707886 |
| sex:exp4 | -0.011287686 | -0.003338387 |
\n" - ], - "text/latex": [ - "A matrix: 12 × 2 of type dbl\n", - "\\begin{tabular}{r|ll}\n", - " & 2.5 \\% & 97.5 \\%\\\\\n", - "\\hline\n", - "\tsex & -0.097427941 & -0.038272007\\\\\n", - "\tsex:shs & -0.411147747 & 0.015465357\\\\\n", - "\tsex:hsg & -0.083926237 & 0.108607850\\\\\n", - "\tsex:scl & -0.070078666 & 0.112971034\\\\\n", - "\tsex:clg & -0.024583227 & 0.147830403\\\\\n", - "\tsex:mw & -0.188980172 & -0.028111625\\\\\n", - "\tsex:so & -0.150504999 & 0.004924587\\\\\n", - "\tsex:we & -0.133048712 & 0.031176776\\\\\n", - "\tsex:exp1 & 0.004330474 & 0.031699400\\\\\n", - "\tsex:exp2 & -0.074177122 & 0.121351274\\\\\n", - "\tsex:exp3 & -0.120415743 & 0.010707886\\\\\n", - "\tsex:exp4 & -0.011287686 & -0.003338387\\\\\n", - "\\end{tabular}\n" - ], - "text/markdown": [ - "\n", - "A matrix: 12 × 2 of type dbl\n", - "\n", - "| | 2.5 % | 97.5 % |\n", - "|---|---|---|\n", - "| sex | -0.097427941 | -0.038272007 |\n", - "| sex:shs | -0.411147747 | 0.015465357 |\n", - "| sex:hsg | -0.083926237 | 0.108607850 |\n", - "| sex:scl | -0.070078666 | 0.112971034 |\n", - "| sex:clg | -0.024583227 | 0.147830403 |\n", - "| sex:mw | -0.188980172 | -0.028111625 |\n", - "| sex:so | -0.150504999 | 0.004924587 |\n", - "| sex:we | -0.133048712 | 0.031176776 |\n", - "| sex:exp1 | 0.004330474 | 0.031699400 |\n", - "| sex:exp2 | -0.074177122 | 0.121351274 |\n", - "| sex:exp3 | -0.120415743 | 0.010707886 |\n", - "| sex:exp4 | -0.011287686 | -0.003338387 |\n", - "\n" - ], - "text/plain": [ - " 2.5 % 97.5 % \n", - "sex -0.097427941 -0.038272007\n", - "sex:shs -0.411147747 0.015465357\n", - "sex:hsg -0.083926237 0.108607850\n", - "sex:scl -0.070078666 0.112971034\n", - "sex:clg -0.024583227 0.147830403\n", - "sex:mw -0.188980172 -0.028111625\n", - "sex:so -0.150504999 0.004924587\n", - "sex:we -0.133048712 0.031176776\n", - "sex:exp1 0.004330474 0.031699400\n", - "sex:exp2 -0.074177122 0.121351274\n", - "sex:exp3 -0.120415743 0.010707886\n", - "sex:exp4 -0.011287686 -0.003338387" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "% latex table generated in R 4.3.2 by xtable 1.8-4 package\n", - "% Tue Feb 13 18:41:01 2024\n", - "\\begin{table}[ht]\n", - "\\centering\n", - "\\begin{tabular}{rrr}\n", - " \\hline\n", - " & 2.5 \\% & 97.5 \\% \\\\ \n", - " \\hline\n", - "sex & -0.10 & -0.04 \\\\ \n", - " sex:shs & -0.41 & 0.02 \\\\ \n", - " sex:hsg & -0.08 & 0.11 \\\\ \n", - " sex:scl & -0.07 & 0.11 \\\\ \n", - " sex:clg & -0.02 & 0.15 \\\\ \n", - " sex:mw & -0.19 & -0.03 \\\\ \n", - " sex:so & -0.15 & 0.00 \\\\ \n", - " sex:we & -0.13 & 0.03 \\\\ \n", - " sex:exp1 & 0.00 & 0.03 \\\\ \n", - " sex:exp2 & -0.07 & 0.12 \\\\ \n", - " sex:exp3 & -0.12 & 0.01 \\\\ \n", - " sex:exp4 & -0.01 & -0.00 \\\\ \n", - " \\hline\n", - "\\end{tabular}\n", - "\\end{table}\n" - ] - } - ], - "source": [ - "pointwise_ci <- confint(effects_female, level = 0.95)\n", - "pointwise_ci\n", - "print(xtable(pointwise_ci), type = \"latex\")" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "6oWBhx2m_eQ8" + }, + "source": [ + "# Application: Heterogeneous Effect of Sex on Wage Using Double Lasso\n", + "\n", + " We use US census data from the year 2015 to analyse the effect of gender and interaction effects of other variables with gender on wage jointly. The dependent variable is the logarithm of the wage, the target variable is *female* (in combination with other variables). All other variables denote some other socio-economic characteristics, e.g. 
marital status, education, and experience. \n", + "\n", + "\n", + "\n", + "This analysis allows a closer look how the gender wage gap is related to other socio-economic variables.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "markdown", - "metadata": { - "id": "I_QF5kVR_eRA" - }, - "source": [ - "Finally, we compare the pointwise confidence intervals to joint confidence intervals." - ] + "id": "3QN4EOYGQkmz", + "outputId": "a5a8f7f8-def7-4ca6-8c8f-973d00b7bd20", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "install.packages(\"hdm\")\n", + "install.packages(\"xtable\")\n", + "library(hdm)\n", + "library(xtable)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 854 - }, - "id": "az7AJkhE_eRB", - "outputId": "ef8e3e87-3aac-483e-f5f5-4aacd5bf02be", - "vscode": { - "languageId": "r" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\t\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\n", - "
A matrix: 12 × 2 of type dbl
2.5 %97.5 %
sex-0.11304841-0.022651543
sex:shs-0.53156729 0.135884900
sex:hsg-0.13716201 0.161843621
sex:scl-0.12487424 0.167766609
sex:clg-0.07664857 0.199895749
sex:mw-0.22994084 0.012849045
sex:so-0.18897851 0.043398103
sex:we-0.17562318 0.073751241
sex:exp1-0.00200092 0.038030794
sex:exp2-0.11908721 0.166261366
sex:exp3-0.16472153 0.055013668
sex:exp4-0.01224237-0.002383705
\n" - ], - "text/latex": [ - "A matrix: 12 × 2 of type dbl\n", - "\\begin{tabular}{r|ll}\n", - " & 2.5 \\% & 97.5 \\%\\\\\n", - "\\hline\n", - "\tsex & -0.11304841 & -0.022651543\\\\\n", - "\tsex:shs & -0.53156729 & 0.135884900\\\\\n", - "\tsex:hsg & -0.13716201 & 0.161843621\\\\\n", - "\tsex:scl & -0.12487424 & 0.167766609\\\\\n", - "\tsex:clg & -0.07664857 & 0.199895749\\\\\n", - "\tsex:mw & -0.22994084 & 0.012849045\\\\\n", - "\tsex:so & -0.18897851 & 0.043398103\\\\\n", - "\tsex:we & -0.17562318 & 0.073751241\\\\\n", - "\tsex:exp1 & -0.00200092 & 0.038030794\\\\\n", - "\tsex:exp2 & -0.11908721 & 0.166261366\\\\\n", - "\tsex:exp3 & -0.16472153 & 0.055013668\\\\\n", - "\tsex:exp4 & -0.01224237 & -0.002383705\\\\\n", - "\\end{tabular}\n" - ], - "text/markdown": [ - "\n", - "A matrix: 12 × 2 of type dbl\n", - "\n", - "| | 2.5 % | 97.5 % |\n", - "|---|---|---|\n", - "| sex | -0.11304841 | -0.022651543 |\n", - "| sex:shs | -0.53156729 | 0.135884900 |\n", - "| sex:hsg | -0.13716201 | 0.161843621 |\n", - "| sex:scl | -0.12487424 | 0.167766609 |\n", - "| sex:clg | -0.07664857 | 0.199895749 |\n", - "| sex:mw | -0.22994084 | 0.012849045 |\n", - "| sex:so | -0.18897851 | 0.043398103 |\n", - "| sex:we | -0.17562318 | 0.073751241 |\n", - "| sex:exp1 | -0.00200092 | 0.038030794 |\n", - "| sex:exp2 | -0.11908721 | 0.166261366 |\n", - "| sex:exp3 | -0.16472153 | 0.055013668 |\n", - "| sex:exp4 | -0.01224237 | -0.002383705 |\n", - "\n" - ], - "text/plain": [ - " 2.5 % 97.5 % \n", - "sex -0.11304841 -0.022651543\n", - "sex:shs -0.53156729 0.135884900\n", - "sex:hsg -0.13716201 0.161843621\n", - "sex:scl -0.12487424 0.167766609\n", - "sex:clg -0.07664857 0.199895749\n", - "sex:mw -0.22994084 0.012849045\n", - "sex:so -0.18897851 0.043398103\n", - "sex:we -0.17562318 0.073751241\n", - "sex:exp1 -0.00200092 0.038030794\n", - "sex:exp2 -0.11908721 0.166261366\n", - "sex:exp3 -0.16472153 0.055013668\n", - "sex:exp4 -0.01224237 -0.002383705" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "% latex table generated in R 4.3.2 by xtable 1.8-4 package\n", - "% Tue Feb 13 18:41:16 2024\n", - "\\begin{table}[ht]\n", - "\\centering\n", - "\\begin{tabular}{rrr}\n", - " \\hline\n", - " & 2.5 \\% & 97.5 \\% \\\\ \n", - " \\hline\n", - "sex & -0.11 & -0.02 \\\\ \n", - " sex:shs & -0.53 & 0.14 \\\\ \n", - " sex:hsg & -0.14 & 0.16 \\\\ \n", - " sex:scl & -0.12 & 0.17 \\\\ \n", - " sex:clg & -0.08 & 0.20 \\\\ \n", - " sex:mw & -0.23 & 0.01 \\\\ \n", - " sex:so & -0.19 & 0.04 \\\\ \n", - " sex:we & -0.18 & 0.07 \\\\ \n", - " sex:exp1 & -0.00 & 0.04 \\\\ \n", - " sex:exp2 & -0.12 & 0.17 \\\\ \n", - " sex:exp3 & -0.16 & 0.06 \\\\ \n", - " sex:exp4 & -0.01 & -0.00 \\\\ \n", - " \\hline\n", - "\\end{tabular}\n", - "\\end{table}\n" - ] - } - ], - "source": [ - "joint_ci <- confint(effects_female, level = 0.95, joint = TRUE)\n", - "joint_ci\n", - "print(xtable(joint_ci), type = \"latex\")" - ] + "id": "fLiMuKqN_eQ-", + "outputId": "88233975-7a27-4614-d878-c718e6dcb072", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv\"\n", + "data <- read.csv(file)\n", + "str(data)\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GLjEqmK8hEU8", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "y <- data$lwage\n", + "Z <- subset(data, select = 
-c(lwage, wage))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-IX2Sy1A-QCX", + "vscode": { + "languageId": "r" } - ], - "metadata": { + }, + "outputs": [], + "source": [ + "center_colmeans <- function(x) {\n", + " xcenter <- colMeans(x)\n", + " x - rep(xcenter, rep.int(nrow(x), ncol(x)))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kaygPMYdelFI", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# create the model matrix for the covariates\n", + "controls_formula <- '~ 0 + (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we + exp1 + exp2 + exp3 + exp4)**2'\n", + "Zcontrols <- model.matrix(as.formula(controls_formula), data = Z)\n", + "Zcontrols <- center_colmeans(Zcontrols)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FYUE8DCsh6QL" + }, + "source": [ + "Construct all the variables that we will use to model heterogeneity of effect in a linear manner" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qVF99n7dhyc-", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# create the model matrix for the linear heterogeneity\n", + "linear_het_formula <- '~ -1 + (shs + hsg + scl + clg + mw + so + we + exp1 + exp2 + exp3 + exp4)'\n", + "Zhet <- model.matrix(as.formula(linear_het_formula), data = Z)\n", + "Zhet <- center_colmeans(Zhet)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ySaY8-X0iXyP" + }, + "source": [ + "Construct all interaction variables between sex and heterogeneity variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jPGR47mfhzBu", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# create the model matrix for the higher order heterogeneity\n", + "Zhet <- as.data.frame(cbind(Zhet, \"sex\" = Z$sex))\n", + "nonlin_het_formula <- '~ -1 + sex + sex * (shs + hsg + scl + clg + mw + so + we + exp1 + exp2 + exp3 + exp4)'\n", + "Zinteractions <- model.matrix(as.formula(nonlin_het_formula), data = Zhet)\n", + "interaction_cols <- Zinteractions[, grepl(\"sex\", colnames(Zinteractions))]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mt96NtDIll5_" + }, + "source": [ + "Put variables all together" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "niEXvfVSlk3v", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "X <- cbind(Zinteractions, Zcontrols)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hWtAmml_Kf2v" + }, + "source": [ + "Get estimates and CIs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "provenance": [] + "base_uri": "https://localhost:8080/", + "height": 854 }, - "kernelspec": { - "display_name": "R", - "name": "ir" + "id": "YO7Hmw5nllBK", + "outputId": "39873473-1933-4d81-d536-da42d661072e", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# this cell takes 30 minutes to run\n", + "index_gender <- grep(\"sex\", colnames(Zinteractions))\n", + "effects_female <- rlassoEffects(x = X, y = y, index = index_gender, post = FALSE)\n", + "result <- summary(effects_female)\n", + "result$coef\n", + "print(xtable(result$coef[, c(1, 2, 4)], type = \"latex\"), digits = 3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "n5ZjOw5t_eRA" + }, + "source": [ + "Now, we estimate and plot confidence intervals, first 
\"pointwise\" and then the joint confidence intervals." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 854 }, - "language_info": { - "name": "R" + "id": "d88JnYGG_eRA", + "outputId": "a9221fd0-db3b-418b-8823-be9e79ba0b77", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "pointwise_ci <- confint(effects_female, level = 0.95)\n", + "pointwise_ci\n", + "print(xtable(pointwise_ci), type = \"latex\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "I_QF5kVR_eRA" + }, + "source": [ + "Finally, we compare the pointwise confidence intervals to joint confidence intervals." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 854 }, - "papermill": { - "default_parameters": {}, - "duration": 89.365707, - "end_time": "2021-02-28T17:18:51.003711", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2021-02-28T17:17:21.638004", - "version": "2.2.2" + "id": "az7AJkhE_eRB", + "outputId": "ef8e3e87-3aac-483e-f5f5-4aacd5bf02be", + "vscode": { + "languageId": "r" } + }, + "outputs": [], + "source": [ + "joint_ci <- confint(effects_female, level = 0.95, joint = TRUE)\n", + "joint_ci\n", + "print(xtable(joint_ci), type = \"latex\")" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "name": "ir" + }, + "language_info": { + "name": "R" }, - "nbformat": 4, - "nbformat_minor": 0 + "papermill": { + "default_parameters": {}, + "duration": 89.365707, + "end_time": "2021-02-28T17:18:51.003711", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2021-02-28T17:17:21.638004", + "version": "2.2.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/PM2/r_linear_penalized_regs.Rmd b/PM2/r_linear_penalized_regs.Rmd new file mode 100644 index 00000000..b1ed57c1 --- /dev/null +++ b/PM2/r_linear_penalized_regs.Rmd @@ -0,0 +1,704 @@ +--- +title: An R Markdown document converted from "PM2/r_linear_penalized_regs.irnb" +output: html_document +--- + +# Penalized Linear Regressions: A Simulation Experiment + +```{r} +install.packages("xtable") +install.packages("hdm") +install.packages("glmnet") + +library(hdm) +library(xtable) +library(glmnet) +library(ggplot2) +``` + +## Data Generating Process + +We define a simple data generating process that allows for sparse, dense, and sparse+dense coefficients + +```{r} +gen_data <- function(n, p, regime = "sparse") { + # constants chosen to get R^2 of approximately .80 + if (regime == "sparse") { + beta <- (1 / seq(1:p)^2) * 7 + } else if (regime == "dense") { + beta <- rnorm(p) * 0.5 + } else if (regime == "sparsedense") { + beta_1 <- (1 / seq(1:p)^2) * 6.5 + beta_2 <- rnorm(p, 0, 0.5) * 0.7 + beta <- beta_1 + beta_2 + } + + true_fn <- function(x) { + x[, seq_len(dim(x)[2])] %*% beta + } + + X <- matrix(runif(n * p, min = -0.5, max = 0.5), n, p) + gX <- true_fn(X) + y <- gX + rnorm(n) + + Xtest <- matrix(runif(n * p, min = -0.5, max = 0.5), n, p) + gXtest <- true_fn(Xtest) + ytest <- gXtest + rnorm(n) + + Xpop <- matrix(runif(100000 * p, min = -0.5, max = 0.5), 100000, p) + gXpop <- true_fn(Xpop) + ypop <- gXpop + rnorm(100000) + + return(list( + X = X, y = y, gX = 
gX, Xtest = Xtest, ytest = ytest, gXtest = gXtest, + Xpop = Xpop, ypop = ypop, gXpop = gXpop, beta = beta + )) +} +``` + +## Data Generating Process: Approximately Sparse + +```{r} +set.seed(1) +n <- 100 +p <- 400 +res <- gen_data(n, p, regime = "sparse") +``` + +```{r} +X <- res$X +y <- res$y +gX <- res$gX +Xtest <- res$Xtest +ytest <- res$ytest +gXtest <- res$gXtest +Xpop <- res$Xpop +ypop <- res$ypop +gXpop <- res$gXpop +betas <- res$beta +``` + +```{r} +plot(gX, y, xlab = "g(X)", ylab = "y") # plot V vs g(X) +print(c("theoretical R2:", var(gX) / var(y))) # theoretical R-square in the simulation example +``` + +```{r} +# Plot betas +plot(seq_along(betas), abs(betas), + log = "y", pch = 20, col = "blue", + xlab = expression(beta), ylab = "Magnitude (log scale)", + main = expression(paste("Beta Magnitude")) +) +``` + +## Lasso, Ridge, ElasticNet + +We use glmnet's penalized estimators, which choose the penalty parameter via cross-validation (by default 10-fold cross-validation). These methods search over an adaptively chosen grid of hyperparameters. The parameter `alpha` controls what penalty (or allows for a convex combination of `l1` and `l2` penalty). Set `alpha=0.5` for elastic net. + +Features will be standardized (by glmnet) so that penalization does not favor different features asymmetrically. + +```{r} +r2_score <- function(preds, actual, ytrain = y) { + rss <- sum((preds - actual)^2) # residual sum of squares + # total sum of squares, we take mean(ytrain) as mean(actual) is an out-of-sample object + tss <- sum((actual - mean(ytrain))^2) + rsq <- 1 - rss / tss + return(rsq) +} +``` + +```{r} +# family gaussian means that we'll be using square loss +fit_lasso_cv <- cv.glmnet(X, y, family = "gaussian", alpha = 1, nfolds = 5) +# family gaussian means that we'll be using square loss +fit_ridge <- cv.glmnet(X, y, family = "gaussian", alpha = 0, nfolds = 5) +# family gaussian means that we'll be using square loss +fit_elnet <- cv.glmnet(X, y, family = "gaussian", alpha = .5, nfolds = 5) +``` + +We calculate the R-squared on the small test set that we have + +```{r} +cat( + "lassocv R2 (Test): ", r2_score(predict(fit_lasso_cv, newx = Xtest, s = "lambda.min"), ytest), + "\nridge R2 (Test): ", r2_score(predict(fit_ridge, newx = Xtest, s = "lambda.min"), ytest), + "\nelnet R2 (Test): ", r2_score(predict(fit_elnet, newx = Xtest, s = "lambda.min"), ytest) +) +``` + +We also calculate what the R-squared would be in the population limit (in our case for practical purposes when we have a very very large test sample) + +```{r} +r2_lasso_cv <- r2_score(predict(fit_lasso_cv, newx = Xpop, s = "lambda.min"), ypop) +r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = "lambda.min"), ypop) +r2_elnet <- r2_score(predict(fit_elnet, newx = Xpop, s = "lambda.min"), ypop) + +cat( + "lassocv R2 (Pop): ", r2_lasso_cv, + "\nridge R2 (Pop): ", r2_ridge, + "\nelnet R2 (Pop): ", r2_elnet +) +``` + +#### glmnet failure in Ridge + +**Note**: Ridge regression performs worse relatively to the Ridge in the correponding [Python notebook](https://colab.research.google.com/github/CausalAIBook/MetricsMLNotebooks/blob/main/PM2/python_linear_penalized_regs.ipynb). Even if one were to control for the randomness in the data and use the same data for both, R's glmnet fails. + +To understand why, look at the cross-validated MSE curve with different $\lambda$ (). 
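One quick sanity check before looking at the plot (a sketch that only uses the `cv.glmnet` object `fit_ridge` fitted above): ask whether the selected penalty sits on the boundary of the lambda path that glmnet actually searched.

```{r}
# Sketch: if lambda.min coincides with an endpoint of the searched path, the
# default lambda grid was likely too narrow for this problem.
lambda_path <- fit_ridge$lambda
if (fit_ridge$lambda.min %in% range(lambda_path)) {
  message("lambda.min lies on the boundary of the lambda path; consider supplying a wider grid.")
}
```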
+ +```{r} +plot(fit_ridge) +``` + +From the [glmnet documentation](https://glmnet.stanford.edu/articles/glmnet.html): + + + +> This plots the cross-validation curve (red dotted line) along with upper and lower standard deviation curves along the $\lambda$ sequence (error bars). Two special values along the $\lambda$ sequence are indicated by the vertical dotted lines. ```lambda.min``` is the value of $\lambda$ that gives minimum mean cross-validated error, while ```lambda.1se``` is the value of $\lambda$ that gives the most regularized model such that the cross-validated error is within one standard error of the minimum. + +Notice that the chosen ```lambda.min``` is at the boundary of the sequence. An easy way to check this instead of plotting is to extract the $\lambda$ sequence and the minimum chosen $\lambda_{min}$ from the fitted object. + +```{r} +cat("lambda sequence: ", fit_ridge$lambda) +cat("\nChosen minimum lambda: ", fit_ridge$lambda.min) +``` + +In general, it is good practice to examine the lambda sequence that R produces and searches over in cross-validation. When the penalty is chosen at the boundary like we see here, this indicates the generated penalty sequence is likely misspecified. Thus, we choose to supply our own sequence. In particular, we choose values that match up with those in Python's ```sklearn``` Ridge implementation. + + +```glmnet``` minimizes the elastic net loss function as follows: +$$\min_{\beta} \frac{1}{N} \| X\beta - y\|_2^2 + \lambda_{R} \left( \frac{1}{2} (1-\alpha) \|\beta\|_2^2 + \alpha \|\beta\|_1 \right) $$ + +For ridge, $\alpha=0$, so $$\min_{\beta} \frac{1}{N} \| X\beta - y\|_2^2 + \frac{\lambda_{R}}{2} \|\beta\|_2^2 $$ + +Meanwhile, ```sklearn``` minimizes $$\min_{\beta} \frac{1}{N} \|X\beta-y\|_2^2 + \frac{\lambda_{python}}{N} \|\beta\|_2^2$$ where $\lambda_{python}$ is chosen from the grid $(0.1,1,10)$. + +To translate this into R, we must set in glmnet $$\lambda_{R} :=\frac{2}{N} \lambda_{python}$$ + +```{r} +# sklearn lambdas (penalty) +lambdas_sklearn <- c(0.1, 1, 10) # defaults +l_seq <- 2 / nrow(X) * lambdas_sklearn +l_seq # note how different these are to the actual lambdas generated by glmnet +``` + +```{r} +fit_ridge <- cv.glmnet(X, y, family = "gaussian", alpha = 0, nfolds = 5, lambda = l_seq) +r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = "lambda.min"), ypop) +``` + +```{r} +cat( + "lassocv R2 (Pop): ", r2_lasso_cv, + "\nridge R2 (Pop): ", r2_ridge, + "\nelnet R2 (Pop): ", r2_elnet +) +``` + +## Plug-in Hyperparameter Lasso and Post-Lasso OLS + +Here we compute the lasso and ols post lasso using plug-in choices for penalty levels. + +\We use "plug-in" tuning with a theoretically valid choice of penalty $\lambda = 2 \cdot c \hat{\sigma} \sqrt{n} \Phi^{-1}(1-\alpha/2p)$, where $c>1$ and $1-\alpha$ is a confidence level, and $\Phi^{-1}$ denotes the quantile function. Under homoskedasticity, this choice ensures that the Lasso predictor is well behaved, delivering good predictive performance under approximate sparsity. In practice, this formula will work well even in the absence of homoskedasticity, especially when the random variables $\epsilon$ and $X$ in the regression equation decay quickly at the tails. + +In practice, many people choose to use cross-validation, which is perfectly fine for predictive tasks. However, when conducting inference, to make our analysis valid we will require cross-fitting in addition to cross-validation. 
As we have not yet discussed cross-fitting, we rely on this theoretically-driven penalty in order to allow for accurate inference in the upcoming notebooks. + +We pull an anaue of R's rlasso. Rlasso functionality: it is searching the right set of regressors. This function was made for the case of ***p*** regressors and ***n*** observations where ***p >>>> n***. It assumes that the error is i.i.d. The errors may be non-Gaussian or heteroscedastic.\ +The post lasso function makes OLS with the selected ***T*** regressors. +To select those parameters, they use $\lambda$ as variable to penalize\ +**Funny thing: the function rlasso was named like that because it is the "rigorous" Lasso.** + +```{r} +fit_rlasso <- rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level +fit_rlasso_post <- rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level +``` + +```{r} +r2_lasso <- r2_score(predict(fit_rlasso, newdata = Xtest), ytest) +r2_lasso_post <- r2_score(predict(fit_rlasso_post, newdata = Xtest), ytest) + +cat( + "rlasso R2 (Test): ", r2_lasso, + "\nrlasso-post R2 (Test): ", r2_lasso_post +) +``` + +```{r} +r2_lasso <- r2_score(predict(fit_rlasso, newdata = (Xpop)), (ypop)) +r2_lasso_post <- r2_score(predict(fit_rlasso_post, newdata = (Xpop)), (ypop)) + +cat( + "rlasso R2 (Pop): ", r2_lasso, + "\nrlasso-post R2 (Pop): ", r2_lasso_post +) +``` + +## LAVA: Dense + Sparse Coefficients + +Next we code up lava, which alternates the fitting of lasso and ridge + +```{r} +# Define function to compute lava estimator. Doing an iterative scheme with fixed +# number of iteration. Could iterate until a convergence criterion is met. +lava_predict <- function(X, Y, newX, lambda1, lambda2, iter = 5) { + require(glmnet) + + # Need to demean internally + dy <- Y - mean(Y) + dx <- scale(X, scale = FALSE) + + sp1 <- glmnet(dx, dy, lambda = lambda1) # lasso step fits "sparse part" + de1 <- glmnet(dx, dy - predict(sp1, newx = dx), alpha = 0, lambda = lambda2) + + i <- 1 + while (i <= iter) { + sp1 <- glmnet(dx, dy - predict(de1, newx = dx, s = "lambda.min"), lambda = lambda1) + de1 <- glmnet(dx, dy - predict(sp1, newx = dx, s = "lambda.min"), alpha = 0, lambda = lambda2) + i <- i + 1 + } + + bhat <- sp1$beta + de1$beta + a0 <- mean(Y) - sum(colMeans(X) * bhat) + + # Need to add intercept to output + + yhat <- newX %*% bhat + a0 + + return(yhat) +} +``` + +```{r} +# define function to get predictions and r2 scores for lava estimator + +lava_yhat_r2 <- function(xtr_mod, xte_mod, ytr, yte, num_folds = 5) { + # 5-fold CV. glmnet does cross-validation internally and + # relatively efficiently. We're going to write out all the steps to make sure + # we're using the same CV folds across all procedures in a transparent way and + # to keep the overall structure clear as well. + + # Setup for brute force K-Fold CV + n <- length(ytr) + Kf <- num_folds # Number of folds + sampleframe <- rep(1:Kf, ceiling(n / Kf)) + cvgroup <- sample(sampleframe, size = n, replace = FALSE) # CV groups + + + ## ------------------------------------------------------------ + # We're going to take a shortcut and use the range of lambda values that come out + # of the default implementation in glmnet for everything. Could do better here - maybe + + ## Fit ridge on grid of lambda values (chosen by default using glmnet) using basic model. 
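  # The ridge and lasso fits below are used only to harvest candidate penalty values:
  # every 10th value of each default lambda path (starting from the 5th) becomes a
  # grid point, and lava's two penalties are then tuned jointly by brute-force
  # K-fold CV over that (lambda1, lambda2) grid.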
+ ridge_mod <- glmnet(xtr_mod, ytr, alpha = 0) # alpha = 0 gives ridge + ridge_lambda <- ridge_mod$lambda # values of penalty parameter + + ## Fit lasso on grid of lambda values (chosen by default using glmnet) using basic model. + lasso_mod <- glmnet(xtr_mod, ytr) # default is lasso (equivalent to alpha = 1) + lasso_lambda <- lasso_mod$lambda # values of penalty parameter + + ## ------------------------------------------------------------ + + + # Lava - Using a double loop over candidate penalty parameter values. + + lambda1_lava_mod <- lasso_mod$lambda[seq(5, length(lasso_lambda), 10)] + lambda2_lava_mod <- ridge_mod$lambda[seq(5, length(ridge_lambda), 10)] + + cv_mod_lava <- matrix(0, length(lambda1_lava_mod), length(lambda2_lava_mod)) + + for (k in 1:Kf) { + indk <- cvgroup == k + + k_xtr_mod <- xtr_mod[!indk, ] + k_ytr <- ytr[!indk] + k_xte_mod <- xtr_mod[indk, ] + k_yte <- ytr[indk] + + for (ii in seq_along(lambda1_lava_mod)) { + for (jj in seq_along(lambda2_lava_mod)) { + cv_mod_lava[ii, jj] <- cv_mod_lava[ii, jj] + + sum((k_yte - lava_predict(k_xtr_mod, k_ytr, + newX = k_xte_mod, + lambda1 = lambda1_lava_mod[ii], + lambda2 = lambda2_lava_mod[jj]))^2) + } + } + } + + # Get CV min values of tuning parameters + cvmin_lava_mod <- which(cv_mod_lava == min(cv_mod_lava), arr.ind = TRUE) + cvlambda1_lava_mod <- lambda1_lava_mod[cvmin_lava_mod[1]] + cvlambda2_lava_mod <- lambda2_lava_mod[cvmin_lava_mod[2]] + + cat("Min Lava Lasso CV Penalty: ", cvlambda1_lava_mod) + cat("\nMin Lava Ridge CV Penalty: ", cvlambda2_lava_mod) + + + #### Look at performance on test sample + + # Calculate R^2 in training data and in validation data as measures + # Refit on entire training sample + + + #### CV-min model + + # In sample fit + cvmin_yhat_lava_tr <- lava_predict(xtr_mod, ytr, + newX = xtr_mod, + lambda1 = cvlambda1_lava_mod, + lambda2 = cvlambda2_lava_mod + ) + r2_lava_mod <- 1 - sum((ytr - cvmin_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2) + + # Out of sample fit + cvmin_yhat_lava_test <- lava_predict(xtr_mod, ytr, + newX = xte_mod, + lambda1 = cvlambda1_lava_mod, + lambda2 = cvlambda2_lava_mod + ) + r2v_lava_mod <- 1 - sum((yte - cvmin_yhat_lava_test)^2) / sum((yte - mean(ytr))^2) + + + cat("\nIn sample R2 (CV-min): ", r2_lava_mod) + cat("\nOut of Sample R2 (CV-min): ", r2v_lava_mod) + + + #### Use average model across cv-folds and refit model using all training data + ###### we won't report these results. + ###### Averaging is theoretically more solid, but cv-min is more practical. 
+ n_tr <- length(ytr) + n_te <- length(yte) + yhat_tr_lava_mod <- matrix(0, n_tr, Kf) + yhat_te_lava_mod <- matrix(0, n_te, Kf) + + + for (k in 1:Kf) { + indk <- cvgroup == k + + k_xtr_mod <- xtr_mod[!indk, ] + k_ytr <- ytr[!indk] + + # Lava + yhat_tr_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr, + newX = xtr_mod, + lambda1 = cvlambda1_lava_mod, + lambda2 = cvlambda2_lava_mod + )) + yhat_te_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr, + newX = xte_mod, + lambda1 = cvlambda1_lava_mod, + lambda2 = cvlambda2_lava_mod + )) + } + + avg_yhat_lava_tr <- rowMeans(yhat_tr_lava_mod) + avg_yhat_lava_test <- rowMeans(yhat_te_lava_mod) + + r2_cv_ave_lava_mod <- 1 - sum((ytr - avg_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2) + r2v_cv_ave_lava_mod <- 1 - sum((yte - avg_yhat_lava_test)^2) / sum((yte - mean(ytr))^2) + + cat("\nIn sample R2 (Average Across Folds): ", r2_cv_ave_lava_mod) + cat("\nOut of Sample R2 (Average Across Folds): ", r2v_cv_ave_lava_mod) + + return(c( + cvlambda1_lava_mod, + cvlambda2_lava_mod, + cvmin_yhat_lava_tr, # CV_min + cvmin_yhat_lava_test, # CV_min + r2_lava_mod, # CV_min + r2v_lava_mod, # CV_min + avg_yhat_lava_tr, # Average across Folds + avg_yhat_lava_test, # Average across Folds + r2_cv_ave_lava_mod, # Average across Folds + r2v_cv_ave_lava_mod # Average across Folds + )) +} +``` + +```{r} +# Results for Test +cat("Test Results ...\n") +r2_lava_traintest <- lava_yhat_r2(X, Xtest, y, ytest) +``` + +```{r} +# Results for Pop +## note we don't have to re-train the entire model +## this is just due to the way the function is defined above +cat("Population Results ...\n") +r2_lava_pop <- lava_yhat_r2(X, Xpop, y, ypop) +``` + +```{r} +# report R2 using CV min +cat("LAVA R2 (Test): ", r2_lava_traintest[[6]]) +cat("\nLAVA R2 (Pop) ", r2_lava_pop[[6]]) +``` + +## Summarizing Results + +```{r} +table <- matrix(0, 6, 1) +table[1, 1] <- r2_lasso_cv +table[2, 1] <- r2_ridge +table[3, 1] <- r2_elnet +table[4, 1] <- r2_lasso +table[5, 1] <- r2_lasso_post +table[6, 1] <- r2_lava_pop[[6]] + +colnames(table) <- c("R2 (Population)") +rownames(table) <- c( + "Cross-Validated Lasso", "Cross-Validated ridge", "Cross-Validated elnet", + "Lasso", "Post-Lasso", "Lava" +) +tab <- xtable(table, digits = 3) +print(tab, type = "latex") # set type="latex" for printing table in LaTeX +tab +``` + +```{r} +# Creating a data frame with the predicted values for test +data <- data.frame( + gXtest = gXtest, + Ridge = predict(fit_ridge, newx = Xtest, s = "lambda.min"), + ENet = predict(fit_elnet, newx = Xtest, s = "lambda.min"), + RLasso = predict(fit_rlasso, newdata = Xtest), + RLassoPost = predict(fit_rlasso_post, newdata = Xtest), + LassoCV = predict(fit_lasso_cv, newx = Xtest, s = "lambda.min"), + Lava = as.vector(r2_lava_traintest[[4]]) +) +colnames(data) <- c("gXtest", "Ridge", "ENet", "RLasso", "RlassoPost", "LassoCV", "Lava") + +# Reshaping data into longer format for ggplot +data_long <- tidyr::gather(data, Model, Predicted, -gXtest) + +# Plotting +ggplot(data_long, aes(x = gXtest, y = Predicted, color = Model)) + + geom_point(aes(shape = Model)) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") + # gX by gX + scale_color_manual(values = c("brown", "yellow", "red", "green", "blue", "magenta"), + guide = guide_legend(title = "Model")) + + theme_minimal() + + labs( + title = "Comparison of Methods on Predicting gX", + x = "gXtest", + y = "Predictions" + ) + + guides(shape = "none") # Remove the shape legend +``` + +## Data Generating Process: Dense 
Coefficients + +```{r} +set.seed(1) +n <- 100 +p <- 400 +res <- gen_data(n, p, regime = "dense") + +X <- res$X +y <- res$y +gX <- res$gX +Xtest <- res$Xtest +ytest <- res$ytest +gXtest <- res$gXtest +Xpop <- res$Xpop +ypop <- res$ypop +gXpop <- res$gXpop +betas <- res$beta +``` + +```{r} +plot(gX, y, xlab = "g(X)", ylab = "y") # plot V vs g(X) +print(c("theoretical R2:", var(gX) / var(y))) # theoretical R-square in the simulation example +``` + +```{r} +# plot betas +plot(seq_along(betas), abs(betas), + log = "y", pch = 20, col = "blue", + xlab = expression(beta), ylab = "Magnitude (log scale)", + main = expression(paste("Beta Magnitude")) +) +``` + +```{r} +# family gaussian means that we'll be using square loss +fit_lasso_cv <- cv.glmnet(X, y, family = "gaussian", alpha = 1, nfolds = 5) +# family gaussian means that we'll be using square loss +fit_ridge <- cv.glmnet(X, y, family = "gaussian", alpha = 0, nfolds = 5) +# family gaussian means that we'll be using square loss +fit_elnet <- cv.glmnet(X, y, family = "gaussian", alpha = .5, nfolds = 5) +fit_rlasso <- rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level +fit_rlasso_post <- rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level + +r2_lasso_cv <- r2_score(predict(fit_lasso_cv, newx = Xpop, s = "lambda.min"), ypop) +r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = "lambda.min"), ypop) +r2_elnet <- r2_score(predict(fit_elnet, newx = Xpop, s = "lambda.min"), ypop) +r2_rlasso <- r2_score(predict(fit_rlasso, newdata = Xpop), ypop) +r2_rlasso_post <- r2_score(predict(fit_rlasso_post, newdata = Xpop), ypop) +r2_lava <- lava_yhat_r2(X, Xpop, y, ypop)[[6]] +``` + +```{r} +table <- matrix(0, 6, 1) +table[1, 1] <- r2_lasso_cv +table[2, 1] <- r2_ridge +table[3, 1] <- r2_elnet +table[4, 1] <- r2_rlasso +table[5, 1] <- r2_rlasso_post +table[6, 1] <- r2_lava + +colnames(table) <- c("R2") +rownames(table) <- c( + "Cross-Validated Lasso", "Cross-Validated ridge", "Cross-Validated elnet", + "Lasso", "Post-Lasso", "Lava" +) +tab <- xtable(table, digits = 3) +print(tab, type = "latex") # set type="latex" for printing table in LaTeX +tab +``` + +```{r} +# get lava prediction on test set for plot below +lava_yhat <- lava_yhat_r2(X, Xtest, y, ytest)[[4]] +``` + +```{r} +# Creating a data frame with the predicted values for test +data <- data.frame( + gXtest = gXtest, + Ridge = predict(fit_ridge, newx = Xtest, s = "lambda.min"), + ENet = predict(fit_elnet, newx = Xtest, s = "lambda.min"), + RLasso = predict(fit_rlasso, newdata = Xtest), + RLassoPost = predict(fit_rlasso_post, newdata = Xtest), + LassoCV = predict(fit_lasso_cv, newx = Xtest, s = "lambda.min"), + Lava = as.vector(lava_yhat) +) +colnames(data) <- c("gXtest", "Ridge", "ENet", "RLasso", "RlassoPost", "LassoCV", "Lava") + +# Reshaping data into longer format for ggplot +data_long <- tidyr::gather(data, Model, Predicted, -gXtest) + +# Plotting +ggplot(data_long, aes(x = gXtest, y = Predicted, color = Model)) + + geom_point(aes(shape = Model)) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") + # gX by gX + scale_color_manual(values = c("brown", "yellow", "red", "green", "blue", "magenta"), + guide = guide_legend(title = "Model")) + + theme_minimal() + + labs( + title = "Comparison of Methods on Predicting gX", + x = "gXtest", + y = "Predictions" + ) + + guides(shape = "none") # Remove the shape legend +``` + +## Data Generating Process: Approximately Sparse + Small Dense Part + +```{r} +set.seed(1) +n <- 100 +p <- 400 +res <- 
gen_data(n, p, regime = "sparsedense") + +X <- res$X +y <- res$y +gX <- res$gX +Xtest <- res$Xtest +ytest <- res$ytest +gXtest <- res$gXtest +Xpop <- res$Xpop +ypop <- res$ypop +gXpop <- res$gXpop +betas <- res$beta +``` + +```{r} +plot(gX, y, xlab = "g(X)", ylab = "y") # plot V vs g(X) +print(c("theoretical R2:", var(gX) / var(y))) # theoretical R-square in the simulation example +``` + +```{r} +# plot betas +plot(seq_along(betas), abs(betas), + log = "y", pch = 20, col = "blue", + xlab = expression(beta), ylab = "Magnitude (log scale)", + main = expression(paste("Beta Magnitude")) +) +``` + +```{r} +# family gaussian means that we'll be using square loss +fit_lasso_cv <- cv.glmnet(X, y, family = "gaussian", alpha = 1, nfolds = 5) +# family gaussian means that we'll be using square loss +fit_ridge <- cv.glmnet(X, y, family = "gaussian", alpha = 0, nfolds = 5) +# family gaussian means that we'll be using square loss +fit_elnet <- cv.glmnet(X, y, family = "gaussian", alpha = .5, nfolds = 5) +fit_rlasso <- rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level +fit_rlasso_post <- rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level + +r2_lasso_cv <- r2_score(predict(fit_lasso_cv, newx = Xpop, s = "lambda.min"), ypop) +r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = "lambda.min"), ypop) +r2_elnet <- r2_score(predict(fit_elnet, newx = Xpop, s = "lambda.min"), ypop) +r2_rlasso <- r2_score(predict(fit_rlasso, newdata = Xpop), ypop) +r2_rlasso_post <- r2_score(predict(fit_rlasso_post, newdata = Xpop), ypop) +r2_lava <- lava_yhat_r2(X, Xpop, y, ypop)[[6]] +``` + +```{r} +table <- matrix(0, 6, 1) +table[1, 1] <- r2_lasso_cv +table[2, 1] <- r2_ridge +table[3, 1] <- r2_elnet +table[4, 1] <- r2_rlasso +table[5, 1] <- r2_rlasso_post +table[6, 1] <- r2_lava + +colnames(table) <- c("R2") +rownames(table) <- c( + "Cross-Validated Lasso", "Cross-Validated ridge", "Cross-Validated elnet", + "Lasso", "Post-Lasso", "Lava" +) +tab <- xtable(table, digits = 3) +print(tab, type = "latex") # set type="latex" for printing table in LaTeX +tab +``` + +```{r} +# get lava prediction on test set for plot below +lava_yhat <- lava_yhat_r2(X, Xtest, y, ytest)[[4]] +``` + +```{r} +# Creating a data frame with the predicted values for test +data <- data.frame( + gXtest = gXtest, + Ridge = predict(fit_ridge, newx = Xtest, s = "lambda.min"), + ENet = predict(fit_elnet, newx = Xtest, s = "lambda.min"), + RLasso = predict(fit_rlasso, newdata = Xtest), + RLassoPost = predict(fit_rlasso_post, newdata = Xtest), + LassoCV = predict(fit_lasso_cv, newx = Xtest, s = "lambda.min"), + Lava = as.vector(lava_yhat) +) +colnames(data) <- c("gXtest", "Ridge", "ENet", "RLasso", "RlassoPost", "LassoCV", "Lava") + +# Reshaping data into longer format for ggplot +data_long <- tidyr::gather(data, Model, Predicted, -gXtest) + +# Plotting +ggplot(data_long, aes(x = gXtest, y = Predicted, color = Model)) + + geom_point(aes(shape = Model)) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") + # gX by gX + scale_color_manual(values = c("brown", "yellow", "red", "green", "blue", "magenta"), + guide = guide_legend(title = "Model")) + + theme_minimal() + + labs( + title = "Comparison of Methods on Predicting gX", + x = "gXtest", + y = "Predictions" + ) + + guides(shape = "none") # Remove the shape legend +``` + diff --git a/PM2/r_linear_penalized_regs.irnb b/PM2/r_linear_penalized_regs.irnb index f170aa4b..6769569d 100644 --- a/PM2/r_linear_penalized_regs.irnb +++ 
b/PM2/r_linear_penalized_regs.irnb @@ -1,1331 +1,1331 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "_execution_state": "idle", - "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", - "id": "EaMt_4G0ONZ7", - "papermill": { - "duration": 0.010774, - "end_time": "2021-02-15T11:01:41.782833", - "exception": false, - "start_time": "2021-02-15T11:01:41.772059", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "# Penalized Linear Regressions: A Simulation Experiment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Fw3Ya0m6vboO", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "install.packages(\"xtable\")\n", - "install.packages(\"hdm\")\n", - "install.packages(\"glmnet\")\n", - "\n", - "library(hdm)\n", - "library(xtable)\n", - "library(glmnet)\n", - "library(ggplot2)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GNTVs-CtE-U9" - }, - "source": [ - "## Data Generating Process" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UXGpnWeeFAHV" - }, - "source": [ - "We define a simple data generating process that allows for sparse, dense, and sparse+dense coefficients" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "N1TPWyBtBrqB", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "gen_data <- function(n, p, regime = \"sparse\") {\n", - " # constants chosen to get R^2 of approximately .80\n", - " if (regime == \"sparse\") {\n", - " beta <- (1 / seq(1:p)^2) * 7\n", - " } else if (regime == \"dense\") {\n", - " beta <- rnorm(p) * 0.5\n", - " } else if (regime == \"sparsedense\") {\n", - " beta_1 <- (1 / seq(1:p)^2) * 6.5\n", - " beta_2 <- rnorm(p, 0, 0.5) * 0.7\n", - " beta <- beta_1 + beta_2\n", - " }\n", - "\n", - " true_fn <- function(x) {\n", - " x[, seq_len(dim(x)[2])] %*% beta\n", - " }\n", - "\n", - " X <- matrix(runif(n * p, min = -0.5, max = 0.5), n, p)\n", - " gX <- true_fn(X)\n", - " y <- gX + rnorm(n)\n", - "\n", - " Xtest <- matrix(runif(n * p, min = -0.5, max = 0.5), n, p)\n", - " gXtest <- true_fn(Xtest)\n", - " ytest <- gXtest + rnorm(n)\n", - "\n", - " Xpop <- matrix(runif(100000 * p, min = -0.5, max = 0.5), 100000, p)\n", - " gXpop <- true_fn(Xpop)\n", - " ypop <- gXpop + rnorm(100000)\n", - "\n", - " return(list(\n", - " X = X, y = y, gX = gX, Xtest = Xtest, ytest = ytest, gXtest = gXtest,\n", - " Xpop = Xpop, ypop = ypop, gXpop = gXpop, beta = beta\n", - " ))\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5UedfBJpONZ7", - "papermill": { - "duration": 0.010616, - "end_time": "2021-02-15T11:01:41.804126", - "exception": false, - "start_time": "2021-02-15T11:01:41.793510", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Data Generating Process: Approximately Sparse" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "LV521EPdA05z", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "set.seed(1)\n", - "n <- 100\n", - "p <- 400\n", - "res <- gen_data(n, p, regime = \"sparse\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "REt70Qs_zBPl", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "X <- res$X\n", - "y <- res$y\n", - "gX <- res$gX\n", - "Xtest <- res$Xtest\n", - "ytest <- res$ytest\n", - "gXtest <- res$gXtest\n", - "Xpop <- res$Xpop\n", - "ypop <- res$ypop\n", - "gXpop <- res$gXpop\n", - "betas <- res$beta" - 
] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3lvcbHdqv11D", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "plot(gX, y, xlab = \"g(X)\", ylab = \"y\") # plot V vs g(X)\n", - "print(c(\"theoretical R2:\", var(gX) / var(y))) # theoretical R-square in the simulation example" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ry_b39bLDIDT", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Plot betas\n", - "plot(seq_along(betas), abs(betas),\n", - " log = \"y\", pch = 20, col = \"blue\",\n", - " xlab = expression(beta), ylab = \"Magnitude (log scale)\",\n", - " main = expression(paste(\"Beta Magnitude\"))\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "g6jcTnhwUkhl" - }, - "source": [ - "## Lasso, Ridge, ElasticNet" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aRWiO93SUw1G" - }, - "source": [ - "We use glmnet's penalized estimators, which choose the penalty parameter via cross-validation (by default 10-fold cross-validation). These methods search over an adaptively chosen grid of hyperparameters. The parameter `alpha` controls what penalty (or allows for a convex combination of `l1` and `l2` penalty). Set `alpha=0.5` for elastic net.\n", - "\n", - "Features will be standardized (by glmnet) so that penalization does not favor different features asymmetrically." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Dy1XNF6JXPpe", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "r2_score <- function(preds, actual, ytrain = y) {\n", - " rss <- sum((preds - actual)^2) # residual sum of squares\n", - " # total sum of squares, we take mean(ytrain) as mean(actual) is an out-of-sample object\n", - " tss <- sum((actual - mean(ytrain))^2)\n", - " rsq <- 1 - rss / tss\n", - " return(rsq)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Cy7dThUhONZ_", - "papermill": { - "duration": 2.898022, - "end_time": "2021-02-15T11:01:45.358083", - "exception": false, - "start_time": "2021-02-15T11:01:42.460061", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# family gaussian means that we'll be using square loss\n", - "fit_lasso_cv <- cv.glmnet(X, y, family = \"gaussian\", alpha = 1, nfolds = 5)\n", - "# family gaussian means that we'll be using square loss\n", - "fit_ridge <- cv.glmnet(X, y, family = \"gaussian\", alpha = 0, nfolds = 5)\n", - "# family gaussian means that we'll be using square loss\n", - "fit_elnet <- cv.glmnet(X, y, family = \"gaussian\", alpha = .5, nfolds = 5)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a7WQJRJ6l0n4" - }, - "source": [ - "We calculate the R-squared on the small test set that we have" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SMuo4MlvXtxH", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "cat(\n", - " \"lassocv R2 (Test): \", r2_score(predict(fit_lasso_cv, newx = Xtest, s = \"lambda.min\"), ytest),\n", - " \"\\nridge R2 (Test): \", r2_score(predict(fit_ridge, newx = Xtest, s = \"lambda.min\"), ytest),\n", - " \"\\nelnet R2 (Test): \", r2_score(predict(fit_elnet, newx = Xtest, s = \"lambda.min\"), ytest)\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Fw7a-6_-Yhbb" - }, - "source": [ - "We also 
calculate what the R-squared would be in the population limit (in our case for practical purposes when we have a very very large test sample)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UKmjj0fdYiL1", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "r2_lasso_cv <- r2_score(predict(fit_lasso_cv, newx = Xpop, s = \"lambda.min\"), ypop)\n", - "r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = \"lambda.min\"), ypop)\n", - "r2_elnet <- r2_score(predict(fit_elnet, newx = Xpop, s = \"lambda.min\"), ypop)\n", - "\n", - "cat(\n", - " \"lassocv R2 (Pop): \", r2_lasso_cv,\n", - " \"\\nridge R2 (Pop): \", r2_ridge,\n", - " \"\\nelnet R2 (Pop): \", r2_elnet\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QECIRikt3j5y" - }, - "source": [ - "#### glmnet failure in Ridge\n", - "\n", - "**Note**: Ridge regression performs worse relatively to the Ridge in the correponding [Python notebook](https://colab.research.google.com/github/CausalAIBook/MetricsMLNotebooks/blob/main/PM2/python_linear_penalized_regs.ipynb). Even if one were to control for the randomness in the data and use the same data for both, R's glmnet fails.\n", - "\n", - "To understand why, look at the cross-validated MSE curve with different $\\lambda$ ()." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kUvo6YbaHaSN", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "plot(fit_ridge)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mVRvqs8fnRaA" - }, - "source": [ - "From the [glmnet documentation](https://glmnet.stanford.edu/articles/glmnet.html):\n", - "\n", - "\n", - "\n", - "> This plots the cross-validation curve (red dotted line) along with upper and lower standard deviation curves along the $\\lambda$ sequence (error bars). Two special values along the $\\lambda$ sequence are indicated by the vertical dotted lines. ```lambda.min``` is the value of $\\lambda$ that gives minimum mean cross-validated error, while ```lambda.1se``` is the value of $\\lambda$ that gives the most regularized model such that the cross-validated error is within one standard error of the minimum.\n", - "\n", - "Notice that the chosen ```lambda.min``` is at the boundary of the sequence. An easy way to check this instead of plotting is to extract the $\\lambda$ sequence and the minimum chosen $\\lambda_{min}$ from the fitted object." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZsjlfgrynSLx", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "cat(\"lambda sequence: \", fit_ridge$lambda)\n", - "cat(\"\\nChosen minimum lambda: \", fit_ridge$lambda.min)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9VKXxWilod6N" - }, - "source": [ - "In general, it is good practice to examine the lambda sequence that R produces and searches over in cross-validation. When the penalty is chosen at the boundary like we see here, this indicates the generated penalty sequence is likely misspecified. Thus, we choose to supply our own sequence. 
In particular, we choose values that match up with those in Python's ```sklearn``` Ridge implementation.\n", - "\n", - "\n", - "```glmnet``` minimizes the elastic net loss function as follows:\n", - "$$\\min_{\\beta} \\frac{1}{N} \\| X\\beta - y\\|_2^2 + \\lambda_{R} \\left( \\frac{1}{2} (1-\\alpha) \\|\\beta\\|_2^2 + \\alpha \\|\\beta\\|_1 \\right) $$ \n", - "\n", - "For ridge, $\\alpha=0$, so $$\\min_{\\beta} \\frac{1}{N} \\| X\\beta - y\\|_2^2 + \\frac{\\lambda_{R}}{2} \\|\\beta\\|_2^2 $$\n", - "\n", - "Meanwhile, ```sklearn``` minimizes $$\\min_{\\beta} \\frac{1}{N} \\|X\\beta-y\\|_2^2 + \\frac{\\lambda_{python}}{N} \\|\\beta\\|_2^2$$ where $\\lambda_{python}$ is chosen from the grid $(0.1,1,10)$.\n", - "\n", - "To translate this into R, we must set in glmnet $$\\lambda_{R} :=\\frac{2}{N} \\lambda_{python}$$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "o-k2e0zMI65-", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# sklearn lambdas (penalty)\n", - "lambdas_sklearn <- c(0.1, 1, 10) # defaults\n", - "l_seq <- 2 / nrow(X) * lambdas_sklearn\n", - "l_seq # note how different these are to the actual lambdas generated by glmnet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gLH-u5we8QaY", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "fit_ridge <- cv.glmnet(X, y, family = \"gaussian\", alpha = 0, nfolds = 5, lambda = l_seq)\n", - "r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = \"lambda.min\"), ypop)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "snYw1Gg0phee", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "cat(\n", - " \"lassocv R2 (Pop): \", r2_lasso_cv,\n", - " \"\\nridge R2 (Pop): \", r2_ridge,\n", - " \"\\nelnet R2 (Pop): \", r2_elnet\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-GuaTiprcCqq" - }, - "source": [ - "## Plug-in Hyperparameter Lasso and Post-Lasso OLS" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T2te6CvUcEa5" - }, - "source": [ - "Here we compute the lasso and ols post lasso using plug-in choices for penalty levels." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NQGL2JsocEjC" - }, - "source": [ - "\\We use \"plug-in\" tuning with a theoretically valid choice of penalty $\\lambda = 2 \\cdot c \\hat{\\sigma} \\sqrt{n} \\Phi^{-1}(1-\\alpha/2p)$, where $c>1$ and $1-\\alpha$ is a confidence level, and $\\Phi^{-1}$ denotes the quantile function. Under homoskedasticity, this choice ensures that the Lasso predictor is well behaved, delivering good predictive performance under approximate sparsity. In practice, this formula will work well even in the absence of homoskedasticity, especially when the random variables $\\epsilon$ and $X$ in the regression equation decay quickly at the tails.\n", - "\n", - "In practice, many people choose to use cross-validation, which is perfectly fine for predictive tasks. However, when conducting inference, to make our analysis valid we will require cross-fitting in addition to cross-validation. As we have not yet discussed cross-fitting, we rely on this theoretically-driven penalty in order to allow for accurate inference in the upcoming notebooks." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "G7yKoP1IcI5y" - }, - "source": [ - "We pull an anaue of R's rlasso. Rlasso functionality: it is searching the right set of regressors. 
This function was made for the case of ***p*** regressors and ***n*** observations where ***p >>>> n***. It assumes that the error is i.i.d. The errors may be non-Gaussian or heteroscedastic.\\\n", - "The post lasso function makes OLS with the selected ***T*** regressors.\n", - "To select those parameters, they use $\\lambda$ as variable to penalize\\\n", - "**Funny thing: the function rlasso was named like that because it is the \"rigorous\" Lasso.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fHDKDGlVcXBh", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "fit_rlasso <- rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level\n", - "fit_rlasso_post <- rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "YMpfjDycchEp", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "r2_lasso <- r2_score(predict(fit_rlasso, newdata = Xtest), ytest)\n", - "r2_lasso_post <- r2_score(predict(fit_rlasso_post, newdata = Xtest), ytest)\n", - "\n", - "cat(\n", - " \"rlasso R2 (Test): \", r2_lasso,\n", - " \"\\nrlasso-post R2 (Test): \", r2_lasso_post\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7CLOwOKKIgB5", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "r2_lasso <- r2_score(predict(fit_rlasso, newdata = (Xpop)), (ypop))\n", - "r2_lasso_post <- r2_score(predict(fit_rlasso_post, newdata = (Xpop)), (ypop))\n", - "\n", - "cat(\n", - " \"rlasso R2 (Pop): \", r2_lasso,\n", - " \"\\nrlasso-post R2 (Pop): \", r2_lasso_post\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WUaAe00Uc5-r" - }, - "source": [ - "## LAVA: Dense + Sparse Coefficients" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YBN4j8FMONaA", - "papermill": { - "duration": 0.02899, - "end_time": "2021-02-15T11:01:56.880825", - "exception": false, - "start_time": "2021-02-15T11:01:56.851835", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "Next we code up lava, which alternates the fitting of lasso and ridge" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jUqZjZJ-mIaG", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Define function to compute lava estimator. Doing an iterative scheme with fixed\n", - "# number of iteration. 
Could iterate until a convergence criterion is met.\n", - "lava_predict <- function(X, Y, newX, lambda1, lambda2, iter = 5) {\n", - " require(glmnet)\n", - "\n", - " # Need to demean internally\n", - " dy <- Y - mean(Y)\n", - " dx <- scale(X, scale = FALSE)\n", - "\n", - " sp1 <- glmnet(dx, dy, lambda = lambda1) # lasso step fits \"sparse part\"\n", - " de1 <- glmnet(dx, dy - predict(sp1, newx = dx), alpha = 0, lambda = lambda2)\n", - "\n", - " i <- 1\n", - " while (i <= iter) {\n", - " sp1 <- glmnet(dx, dy - predict(de1, newx = dx, s = \"lambda.min\"), lambda = lambda1)\n", - " de1 <- glmnet(dx, dy - predict(sp1, newx = dx, s = \"lambda.min\"), alpha = 0, lambda = lambda2)\n", - " i <- i + 1\n", - " }\n", - "\n", - " bhat <- sp1$beta + de1$beta\n", - " a0 <- mean(Y) - sum(colMeans(X) * bhat)\n", - "\n", - " # Need to add intercept to output\n", - "\n", - " yhat <- newX %*% bhat + a0\n", - "\n", - " return(yhat)\n", - "}\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tr_KBCwwovp6", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# define function to get predictions and r2 scores for lava estimator\n", - "\n", - "lava_yhat_r2 <- function(xtr_mod, xte_mod, ytr, yte, num_folds = 5) {\n", - " # 5-fold CV. glmnet does cross-validation internally and\n", - " # relatively efficiently. We're going to write out all the steps to make sure\n", - " # we're using the same CV folds across all procedures in a transparent way and\n", - " # to keep the overall structure clear as well.\n", - "\n", - " # Setup for brute force K-Fold CV\n", - " n <- length(ytr)\n", - " Kf <- num_folds # Number of folds\n", - " sampleframe <- rep(1:Kf, ceiling(n / Kf))\n", - " cvgroup <- sample(sampleframe, size = n, replace = FALSE) # CV groups\n", - "\n", - "\n", - " ## ------------------------------------------------------------\n", - " # We're going to take a shortcut and use the range of lambda values that come out\n", - " # of the default implementation in glmnet for everything. 
Could do better here - maybe\n", - "\n", - " ## Fit ridge on grid of lambda values (chosen by default using glmnet) using basic model.\n", - " ridge_mod <- glmnet(xtr_mod, ytr, alpha = 0) # alpha = 0 gives ridge\n", - " ridge_lambda <- ridge_mod$lambda # values of penalty parameter\n", - "\n", - " ## Fit lasso on grid of lambda values (chosen by default using glmnet) using basic model.\n", - " lasso_mod <- glmnet(xtr_mod, ytr) # default is lasso (equivalent to alpha = 1)\n", - " lasso_lambda <- lasso_mod$lambda # values of penalty parameter\n", - "\n", - " ## ------------------------------------------------------------\n", - "\n", - "\n", - " # Lava - Using a double loop over candidate penalty parameter values.\n", - "\n", - " lambda1_lava_mod <- lasso_mod$lambda[seq(5, length(lasso_lambda), 10)]\n", - " lambda2_lava_mod <- ridge_mod$lambda[seq(5, length(ridge_lambda), 10)]\n", - "\n", - " cv_mod_lava <- matrix(0, length(lambda1_lava_mod), length(lambda2_lava_mod))\n", - "\n", - " for (k in 1:Kf) {\n", - " indk <- cvgroup == k\n", - "\n", - " k_xtr_mod <- xtr_mod[!indk, ]\n", - " k_ytr <- ytr[!indk]\n", - " k_xte_mod <- xtr_mod[indk, ]\n", - " k_yte <- ytr[indk]\n", - "\n", - " for (ii in seq_along(lambda1_lava_mod)) {\n", - " for (jj in seq_along(lambda2_lava_mod)) {\n", - " cv_mod_lava[ii, jj] <- cv_mod_lava[ii, jj] +\n", - " sum((k_yte - lava_predict(k_xtr_mod, k_ytr,\n", - " newX = k_xte_mod,\n", - " lambda1 = lambda1_lava_mod[ii],\n", - " lambda2 = lambda2_lava_mod[jj]))^2)\n", - " }\n", - " }\n", - " }\n", - "\n", - " # Get CV min values of tuning parameters\n", - " cvmin_lava_mod <- which(cv_mod_lava == min(cv_mod_lava), arr.ind = TRUE)\n", - " cvlambda1_lava_mod <- lambda1_lava_mod[cvmin_lava_mod[1]]\n", - " cvlambda2_lava_mod <- lambda2_lava_mod[cvmin_lava_mod[2]]\n", - "\n", - " cat(\"Min Lava Lasso CV Penalty: \", cvlambda1_lava_mod)\n", - " cat(\"\\nMin Lava Ridge CV Penalty: \", cvlambda2_lava_mod)\n", - "\n", - "\n", - " #### Look at performance on test sample\n", - "\n", - " # Calculate R^2 in training data and in validation data as measures\n", - " # Refit on entire training sample\n", - "\n", - "\n", - " #### CV-min model\n", - "\n", - " # In sample fit\n", - " cvmin_yhat_lava_tr <- lava_predict(xtr_mod, ytr,\n", - " newX = xtr_mod,\n", - " lambda1 = cvlambda1_lava_mod,\n", - " lambda2 = cvlambda2_lava_mod\n", - " )\n", - " r2_lava_mod <- 1 - sum((ytr - cvmin_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2)\n", - "\n", - " # Out of sample fit\n", - " cvmin_yhat_lava_test <- lava_predict(xtr_mod, ytr,\n", - " newX = xte_mod,\n", - " lambda1 = cvlambda1_lava_mod,\n", - " lambda2 = cvlambda2_lava_mod\n", - " )\n", - " r2v_lava_mod <- 1 - sum((yte - cvmin_yhat_lava_test)^2) / sum((yte - mean(ytr))^2)\n", - "\n", - "\n", - " cat(\"\\nIn sample R2 (CV-min): \", r2_lava_mod)\n", - " cat(\"\\nOut of Sample R2 (CV-min): \", r2v_lava_mod)\n", - "\n", - "\n", - " #### Use average model across cv-folds and refit model using all training data\n", - " ###### we won't report these results.\n", - " ###### Averaging is theoretically more solid, but cv-min is more practical.\n", - " n_tr <- length(ytr)\n", - " n_te <- length(yte)\n", - " yhat_tr_lava_mod <- matrix(0, n_tr, Kf)\n", - " yhat_te_lava_mod <- matrix(0, n_te, Kf)\n", - "\n", - "\n", - " for (k in 1:Kf) {\n", - " indk <- cvgroup == k\n", - "\n", - " k_xtr_mod <- xtr_mod[!indk, ]\n", - " k_ytr <- ytr[!indk]\n", - "\n", - " # Lava\n", - " yhat_tr_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr,\n", - " newX = xtr_mod,\n", - " 
lambda1 = cvlambda1_lava_mod,\n", - " lambda2 = cvlambda2_lava_mod\n", - " ))\n", - " yhat_te_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr,\n", - " newX = xte_mod,\n", - " lambda1 = cvlambda1_lava_mod,\n", - " lambda2 = cvlambda2_lava_mod\n", - " ))\n", - " }\n", - "\n", - " avg_yhat_lava_tr <- rowMeans(yhat_tr_lava_mod)\n", - " avg_yhat_lava_test <- rowMeans(yhat_te_lava_mod)\n", - "\n", - " r2_cv_ave_lava_mod <- 1 - sum((ytr - avg_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2)\n", - " r2v_cv_ave_lava_mod <- 1 - sum((yte - avg_yhat_lava_test)^2) / sum((yte - mean(ytr))^2)\n", - "\n", - " cat(\"\\nIn sample R2 (Average Across Folds): \", r2_cv_ave_lava_mod)\n", - " cat(\"\\nOut of Sample R2 (Average Across Folds): \", r2v_cv_ave_lava_mod)\n", - "\n", - " return(c(\n", - " cvlambda1_lava_mod,\n", - " cvlambda2_lava_mod,\n", - " cvmin_yhat_lava_tr, # CV_min\n", - " cvmin_yhat_lava_test, # CV_min\n", - " r2_lava_mod, # CV_min\n", - " r2v_lava_mod, # CV_min\n", - " avg_yhat_lava_tr, # Average across Folds\n", - " avg_yhat_lava_test, # Average across Folds\n", - " r2_cv_ave_lava_mod, # Average across Folds\n", - " r2v_cv_ave_lava_mod # Average across Folds\n", - " ))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5dEsONeRF51R", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Results for Test\n", - "cat(\"Test Results ...\\n\")\n", - "r2_lava_traintest <- lava_yhat_r2(X, Xtest, y, ytest)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kdAQN0yq_ISV", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Results for Pop\n", - "## note we don't have to re-train the entire model\n", - "## this is just due to the way the function is defined above\n", - "cat(\"Population Results ...\\n\")\n", - "r2_lava_pop <- lava_yhat_r2(X, Xpop, y, ypop)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GaTBT7NkhRmH", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# report R2 using CV min\n", - "cat(\"LAVA R2 (Test): \", r2_lava_traintest[[6]])\n", - "cat(\"\\nLAVA R2 (Pop) \", r2_lava_pop[[6]])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Gv0bAoZZiLnH" - }, - "source": [ - "## Summarizing Results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VtzIoSdyS9To", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "table <- matrix(0, 6, 1)\n", - "table[1, 1] <- r2_lasso_cv\n", - "table[2, 1] <- r2_ridge\n", - "table[3, 1] <- r2_elnet\n", - "table[4, 1] <- r2_lasso\n", - "table[5, 1] <- r2_lasso_post\n", - "table[6, 1] <- r2_lava_pop[[6]]\n", - "\n", - "colnames(table) <- c(\"R2 (Population)\")\n", - "rownames(table) <- c(\n", - " \"Cross-Validated Lasso\", \"Cross-Validated ridge\", \"Cross-Validated elnet\",\n", - " \"Lasso\", \"Post-Lasso\", \"Lava\"\n", - ")\n", - "tab <- xtable(table, digits = 3)\n", - "print(tab, type = \"latex\") # set type=\"latex\" for printing table in LaTeX\n", - "tab" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "npU6rAHRUs_s", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Creating a data frame with the predicted values for test\n", - "data <- data.frame(\n", - " gXtest = gXtest,\n", - " Ridge = predict(fit_ridge, newx = Xtest, s = \"lambda.min\"),\n", - " ENet = predict(fit_elnet, newx = Xtest, s = 
\"lambda.min\"),\n", - " RLasso = predict(fit_rlasso, newdata = Xtest),\n", - " RLassoPost = predict(fit_rlasso_post, newdata = Xtest),\n", - " LassoCV = predict(fit_lasso_cv, newx = Xtest, s = \"lambda.min\"),\n", - " Lava = as.vector(r2_lava_traintest[[4]])\n", - ")\n", - "colnames(data) <- c(\"gXtest\", \"Ridge\", \"ENet\", \"RLasso\", \"RlassoPost\", \"LassoCV\", \"Lava\")\n", - "\n", - "# Reshaping data into longer format for ggplot\n", - "data_long <- tidyr::gather(data, Model, Predicted, -gXtest)\n", - "\n", - "# Plotting\n", - "ggplot(data_long, aes(x = gXtest, y = Predicted, color = Model)) +\n", - " geom_point(aes(shape = Model)) +\n", - " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"black\") + # gX by gX\n", - " scale_color_manual(values = c(\"brown\", \"yellow\", \"red\", \"green\", \"blue\", \"magenta\"),\n", - " guide = guide_legend(title = \"Model\")) +\n", - " theme_minimal() +\n", - " labs(\n", - " title = \"Comparison of Methods on Predicting gX\",\n", - " x = \"gXtest\",\n", - " y = \"Predictions\"\n", - " ) +\n", - " guides(shape = \"none\") # Remove the shape legend" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fc8S-gruBnFD" - }, - "source": [ - "## Data Generating Process: Dense Coefficients" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BiEL0vydBowk", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "set.seed(1)\n", - "n <- 100\n", - "p <- 400\n", - "res <- gen_data(n, p, regime = \"dense\")\n", - "\n", - "X <- res$X\n", - "y <- res$y\n", - "gX <- res$gX\n", - "Xtest <- res$Xtest\n", - "ytest <- res$ytest\n", - "gXtest <- res$gXtest\n", - "Xpop <- res$Xpop\n", - "ypop <- res$ypop\n", - "gXpop <- res$gXpop\n", - "betas <- res$beta\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BoHnfTmcDgvw", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "plot(gX, y, xlab = \"g(X)\", ylab = \"y\") # plot V vs g(X)\n", - "print(c(\"theoretical R2:\", var(gX) / var(y))) # theoretical R-square in the simulation example\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qU2g-tf6DjsN", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# plot betas\n", - "plot(seq_along(betas), abs(betas),\n", - " log = \"y\", pch = 20, col = \"blue\",\n", - " xlab = expression(beta), ylab = \"Magnitude (log scale)\",\n", - " main = expression(paste(\"Beta Magnitude\"))\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kGKVHss9BpDr", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# family gaussian means that we'll be using square loss\n", - "fit_lasso_cv <- cv.glmnet(X, y, family = \"gaussian\", alpha = 1, nfolds = 5)\n", - "# family gaussian means that we'll be using square loss\n", - "fit_ridge <- cv.glmnet(X, y, family = \"gaussian\", alpha = 0, nfolds = 5)\n", - "# family gaussian means that we'll be using square loss\n", - "fit_elnet <- cv.glmnet(X, y, family = \"gaussian\", alpha = .5, nfolds = 5)\n", - "fit_rlasso <- rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level\n", - "fit_rlasso_post <- rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level\n", - "\n", - "r2_lasso_cv <- r2_score(predict(fit_lasso_cv, newx = Xpop, s = \"lambda.min\"), ypop)\n", - "r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = \"lambda.min\"), ypop)\n", - "r2_elnet 
<- r2_score(predict(fit_elnet, newx = Xpop, s = \"lambda.min\"), ypop)\n", - "r2_rlasso <- r2_score(predict(fit_rlasso, newdata = Xpop), ypop)\n", - "r2_rlasso_post <- r2_score(predict(fit_rlasso_post, newdata = Xpop), ypop)\n", - "r2_lava <- lava_yhat_r2(X, Xpop, y, ypop)[[6]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "e93xdkcECQN_", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "table <- matrix(0, 6, 1)\n", - "table[1, 1] <- r2_lasso_cv\n", - "table[2, 1] <- r2_ridge\n", - "table[3, 1] <- r2_elnet\n", - "table[4, 1] <- r2_rlasso\n", - "table[5, 1] <- r2_rlasso_post\n", - "table[6, 1] <- r2_lava\n", - "\n", - "colnames(table) <- c(\"R2\")\n", - "rownames(table) <- c(\n", - " \"Cross-Validated Lasso\", \"Cross-Validated ridge\", \"Cross-Validated elnet\",\n", - " \"Lasso\", \"Post-Lasso\", \"Lava\"\n", - ")\n", - "tab <- xtable(table, digits = 3)\n", - "print(tab, type = \"latex\") # set type=\"latex\" for printing table in LaTeX\n", - "tab" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZdSCN8zeCQSR", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# get lava prediction on test set for plot below\n", - "lava_yhat <- lava_yhat_r2(X, Xtest, y, ytest)[[4]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uiDd9oxhVcnc", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Creating a data frame with the predicted values for test\n", - "data <- data.frame(\n", - " gXtest = gXtest,\n", - " Ridge = predict(fit_ridge, newx = Xtest, s = \"lambda.min\"),\n", - " ENet = predict(fit_elnet, newx = Xtest, s = \"lambda.min\"),\n", - " RLasso = predict(fit_rlasso, newdata = Xtest),\n", - " RLassoPost = predict(fit_rlasso_post, newdata = Xtest),\n", - " LassoCV = predict(fit_lasso_cv, newx = Xtest, s = \"lambda.min\"),\n", - " Lava = as.vector(lava_yhat)\n", - ")\n", - "colnames(data) <- c(\"gXtest\", \"Ridge\", \"ENet\", \"RLasso\", \"RlassoPost\", \"LassoCV\", \"Lava\")\n", - "\n", - "# Reshaping data into longer format for ggplot\n", - "data_long <- tidyr::gather(data, Model, Predicted, -gXtest)\n", - "\n", - "# Plotting\n", - "ggplot(data_long, aes(x = gXtest, y = Predicted, color = Model)) +\n", - " geom_point(aes(shape = Model)) +\n", - " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"black\") + # gX by gX\n", - " scale_color_manual(values = c(\"brown\", \"yellow\", \"red\", \"green\", \"blue\", \"magenta\"),\n", - " guide = guide_legend(title = \"Model\")) +\n", - " theme_minimal() +\n", - " labs(\n", - " title = \"Comparison of Methods on Predicting gX\",\n", - " x = \"gXtest\",\n", - " y = \"Predictions\"\n", - " ) +\n", - " guides(shape = \"none\") # Remove the shape legend" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sxZFIhYuONaB", - "papermill": { - "duration": 0.018842, - "end_time": "2021-02-15T11:02:51.941852", - "exception": false, - "start_time": "2021-02-15T11:02:51.923010", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Data Generating Process: Approximately Sparse + Small Dense Part" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nQcWgf3KONaC", - "papermill": { - "duration": 0.207598, - "end_time": "2021-02-15T11:02:52.168536", - "exception": false, - "start_time": "2021-02-15T11:02:51.960938", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } 
- }, - "outputs": [], - "source": [ - "set.seed(1)\n", - "n <- 100\n", - "p <- 400\n", - "res <- gen_data(n, p, regime = \"sparsedense\")\n", - "\n", - "X <- res$X\n", - "y <- res$y\n", - "gX <- res$gX\n", - "Xtest <- res$Xtest\n", - "ytest <- res$ytest\n", - "gXtest <- res$gXtest\n", - "Xpop <- res$Xpop\n", - "ypop <- res$ypop\n", - "gXpop <- res$gXpop\n", - "betas <- res$beta" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yiIrU6SQDkjK", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "plot(gX, y, xlab = \"g(X)\", ylab = \"y\") # plot V vs g(X)\n", - "print(c(\"theoretical R2:\", var(gX) / var(y))) # theoretical R-square in the simulation example" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "X2N8JfHDDkmk", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# plot betas\n", - "plot(seq_along(betas), abs(betas),\n", - " log = \"y\", pch = 20, col = \"blue\",\n", - " xlab = expression(beta), ylab = \"Magnitude (log scale)\",\n", - " main = expression(paste(\"Beta Magnitude\"))\n", - ")\n" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "_execution_state": "idle", + "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", + "id": "EaMt_4G0ONZ7", + "papermill": { + "duration": 0.010774, + "end_time": "2021-02-15T11:01:41.782833", + "exception": false, + "start_time": "2021-02-15T11:01:41.772059", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "obWejQaJONaC", - "papermill": { - "duration": 1.432822, - "end_time": "2021-02-15T11:02:53.626802", - "exception": false, - "start_time": "2021-02-15T11:02:52.193980", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# family gaussian means that we'll be using square loss\n", - "fit_lasso_cv <- cv.glmnet(X, y, family = \"gaussian\", alpha = 1, nfolds = 5)\n", - "# family gaussian means that we'll be using square loss\n", - "fit_ridge <- cv.glmnet(X, y, family = \"gaussian\", alpha = 0, nfolds = 5)\n", - "# family gaussian means that we'll be using square loss\n", - "fit_elnet <- cv.glmnet(X, y, family = \"gaussian\", alpha = .5, nfolds = 5)\n", - "fit_rlasso <- rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level\n", - "fit_rlasso_post <- rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level\n", - "\n", - "r2_lasso_cv <- r2_score(predict(fit_lasso_cv, newx = Xpop, s = \"lambda.min\"), ypop)\n", - "r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = \"lambda.min\"), ypop)\n", - "r2_elnet <- r2_score(predict(fit_elnet, newx = Xpop, s = \"lambda.min\"), ypop)\n", - "r2_rlasso <- r2_score(predict(fit_rlasso, newdata = Xpop), ypop)\n", - "r2_rlasso_post <- r2_score(predict(fit_rlasso_post, newdata = Xpop), ypop)\n", - "r2_lava <- lava_yhat_r2(X, Xpop, y, ypop)[[6]]" - ] + "tags": [] + }, + "source": [ + "# Penalized Linear Regressions: A Simulation Experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Fw3Ya0m6vboO", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "install.packages(\"xtable\")\n", + "install.packages(\"hdm\")\n", + "install.packages(\"glmnet\")\n", + "\n", + "library(hdm)\n", + "library(xtable)\n", + "library(glmnet)\n", + "library(ggplot2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GNTVs-CtE-U9" + }, + "source": [ + "## Data Generating 
Process" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UXGpnWeeFAHV" + }, + "source": [ + "We define a simple data generating process that allows for sparse, dense, and sparse+dense coefficients" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "N1TPWyBtBrqB", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "gen_data <- function(n, p, regime = \"sparse\") {\n", + " # constants chosen to get R^2 of approximately .80\n", + " if (regime == \"sparse\") {\n", + " beta <- (1 / seq(1:p)^2) * 7\n", + " } else if (regime == \"dense\") {\n", + " beta <- rnorm(p) * 0.5\n", + " } else if (regime == \"sparsedense\") {\n", + " beta_1 <- (1 / seq(1:p)^2) * 6.5\n", + " beta_2 <- rnorm(p, 0, 0.5) * 0.7\n", + " beta <- beta_1 + beta_2\n", + " }\n", + "\n", + " true_fn <- function(x) {\n", + " x[, seq_len(dim(x)[2])] %*% beta\n", + " }\n", + "\n", + " X <- matrix(runif(n * p, min = -0.5, max = 0.5), n, p)\n", + " gX <- true_fn(X)\n", + " y <- gX + rnorm(n)\n", + "\n", + " Xtest <- matrix(runif(n * p, min = -0.5, max = 0.5), n, p)\n", + " gXtest <- true_fn(Xtest)\n", + " ytest <- gXtest + rnorm(n)\n", + "\n", + " Xpop <- matrix(runif(100000 * p, min = -0.5, max = 0.5), 100000, p)\n", + " gXpop <- true_fn(Xpop)\n", + " ypop <- gXpop + rnorm(100000)\n", + "\n", + " return(list(\n", + " X = X, y = y, gX = gX, Xtest = Xtest, ytest = ytest, gXtest = gXtest,\n", + " Xpop = Xpop, ypop = ypop, gXpop = gXpop, beta = beta\n", + " ))\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5UedfBJpONZ7", + "papermill": { + "duration": 0.010616, + "end_time": "2021-02-15T11:01:41.804126", + "exception": false, + "start_time": "2021-02-15T11:01:41.793510", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "38KYAe5MONaC", - "papermill": { - "duration": 13.756606, - "end_time": "2021-02-15T11:03:07.405363", - "exception": false, - "start_time": "2021-02-15T11:02:53.648757", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "table <- matrix(0, 6, 1)\n", - "table[1, 1] <- r2_lasso_cv\n", - "table[2, 1] <- r2_ridge\n", - "table[3, 1] <- r2_elnet\n", - "table[4, 1] <- r2_rlasso\n", - "table[5, 1] <- r2_rlasso_post\n", - "table[6, 1] <- r2_lava\n", - "\n", - "colnames(table) <- c(\"R2\")\n", - "rownames(table) <- c(\n", - " \"Cross-Validated Lasso\", \"Cross-Validated ridge\", \"Cross-Validated elnet\",\n", - " \"Lasso\", \"Post-Lasso\", \"Lava\"\n", - ")\n", - "tab <- xtable(table, digits = 3)\n", - "print(tab, type = \"latex\") # set type=\"latex\" for printing table in LaTeX\n", - "tab" - ] + "tags": [] + }, + "source": [ + "## Data Generating Process: Approximately Sparse" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LV521EPdA05z", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "set.seed(1)\n", + "n <- 100\n", + "p <- 400\n", + "res <- gen_data(n, p, regime = \"sparse\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "REt70Qs_zBPl", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "X <- res$X\n", + "y <- res$y\n", + "gX <- res$gX\n", + "Xtest <- res$Xtest\n", + "ytest <- res$ytest\n", + "gXtest <- res$gXtest\n", + "Xpop <- res$Xpop\n", + "ypop <- res$ypop\n", + "gXpop <- res$gXpop\n", + "betas <- res$beta" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "id": "3lvcbHdqv11D", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "plot(gX, y, xlab = \"g(X)\", ylab = \"y\") # plot V vs g(X)\n", + "print(c(\"theoretical R2:\", var(gX) / var(y))) # theoretical R-square in the simulation example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ry_b39bLDIDT", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Plot betas\n", + "plot(seq_along(betas), abs(betas),\n", + " log = \"y\", pch = 20, col = \"blue\",\n", + " xlab = expression(beta), ylab = \"Magnitude (log scale)\",\n", + " main = expression(paste(\"Beta Magnitude\"))\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "g6jcTnhwUkhl" + }, + "source": [ + "## Lasso, Ridge, ElasticNet" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aRWiO93SUw1G" + }, + "source": [ + "We use glmnet's penalized estimators, which choose the penalty parameter via cross-validation (by default 10-fold cross-validation). These methods search over an adaptively chosen grid of hyperparameters. The parameter `alpha` controls what penalty (or allows for a convex combination of `l1` and `l2` penalty). Set `alpha=0.5` for elastic net.\n", + "\n", + "Features will be standardized (by glmnet) so that penalization does not favor different features asymmetrically." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Dy1XNF6JXPpe", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "r2_score <- function(preds, actual, ytrain = y) {\n", + " rss <- sum((preds - actual)^2) # residual sum of squares\n", + " # total sum of squares, we take mean(ytrain) as mean(actual) is an out-of-sample object\n", + " tss <- sum((actual - mean(ytrain))^2)\n", + " rsq <- 1 - rss / tss\n", + " return(rsq)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Cy7dThUhONZ_", + "papermill": { + "duration": 2.898022, + "end_time": "2021-02-15T11:01:45.358083", + "exception": false, + "start_time": "2021-02-15T11:01:42.460061", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oW3kq2xNOone", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# get lava prediction on test set for plot below\n", - "lava_yhat <- lava_yhat_r2(X, Xtest, y, ytest)[[4]]" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# family gaussian means that we'll be using square loss\n", + "fit_lasso_cv <- cv.glmnet(X, y, family = \"gaussian\", alpha = 1, nfolds = 5)\n", + "# family gaussian means that we'll be using square loss\n", + "fit_ridge <- cv.glmnet(X, y, family = \"gaussian\", alpha = 0, nfolds = 5)\n", + "# family gaussian means that we'll be using square loss\n", + "fit_elnet <- cv.glmnet(X, y, family = \"gaussian\", alpha = .5, nfolds = 5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a7WQJRJ6l0n4" + }, + "source": [ + "We calculate the R-squared on the small test set that we have" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SMuo4MlvXtxH", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "cat(\n", + " \"lassocv R2 (Test): \", r2_score(predict(fit_lasso_cv, newx = Xtest, s = \"lambda.min\"), ytest),\n", + " \"\\nridge R2 (Test): \", r2_score(predict(fit_ridge, newx = Xtest, s = \"lambda.min\"), 
ytest),\n", + " \"\\nelnet R2 (Test): \", r2_score(predict(fit_elnet, newx = Xtest, s = \"lambda.min\"), ytest)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Fw7a-6_-Yhbb" + }, + "source": [ + "We also calculate what the R-squared would be in the population limit (in our case for practical purposes when we have a very very large test sample)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UKmjj0fdYiL1", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "r2_lasso_cv <- r2_score(predict(fit_lasso_cv, newx = Xpop, s = \"lambda.min\"), ypop)\n", + "r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = \"lambda.min\"), ypop)\n", + "r2_elnet <- r2_score(predict(fit_elnet, newx = Xpop, s = \"lambda.min\"), ypop)\n", + "\n", + "cat(\n", + " \"lassocv R2 (Pop): \", r2_lasso_cv,\n", + " \"\\nridge R2 (Pop): \", r2_ridge,\n", + " \"\\nelnet R2 (Pop): \", r2_elnet\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QECIRikt3j5y" + }, + "source": [ + "#### glmnet failure in Ridge\n", + "\n", + "**Note**: Ridge regression performs worse relatively to the Ridge in the correponding [Python notebook](https://colab.research.google.com/github/CausalAIBook/MetricsMLNotebooks/blob/main/PM2/python_linear_penalized_regs.ipynb). Even if one were to control for the randomness in the data and use the same data for both, R's glmnet fails.\n", + "\n", + "To understand why, look at the cross-validated MSE curve with different $\\lambda$ ()." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kUvo6YbaHaSN", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "plot(fit_ridge)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mVRvqs8fnRaA" + }, + "source": [ + "From the [glmnet documentation](https://glmnet.stanford.edu/articles/glmnet.html):\n", + "\n", + "\n", + "\n", + "> This plots the cross-validation curve (red dotted line) along with upper and lower standard deviation curves along the $\\lambda$ sequence (error bars). Two special values along the $\\lambda$ sequence are indicated by the vertical dotted lines. ```lambda.min``` is the value of $\\lambda$ that gives minimum mean cross-validated error, while ```lambda.1se``` is the value of $\\lambda$ that gives the most regularized model such that the cross-validated error is within one standard error of the minimum.\n", + "\n", + "Notice that the chosen ```lambda.min``` is at the boundary of the sequence. An easy way to check this instead of plotting is to extract the $\\lambda$ sequence and the minimum chosen $\\lambda_{min}$ from the fitted object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZsjlfgrynSLx", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "cat(\"lambda sequence: \", fit_ridge$lambda)\n", + "cat(\"\\nChosen minimum lambda: \", fit_ridge$lambda.min)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9VKXxWilod6N" + }, + "source": [ + "In general, it is good practice to examine the lambda sequence that R produces and searches over in cross-validation. When the penalty is chosen at the boundary like we see here, this indicates the generated penalty sequence is likely misspecified. Thus, we choose to supply our own sequence. 
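One way to automate this check, rather than eyeballing the printed sequence, is sketched below. This is illustrative only: it assumes the `fit_ridge`, `X`, and `y` objects from the cells above, and the endpoints of the widened grid are arbitrary choices, not values used in the notebook.

```r
# Sketch: flag a boundary solution and refit ridge on a wider, user-supplied grid.
# Assumes fit_ridge, X, y from the cells above; the grid endpoints are arbitrary.
lam <- fit_ridge$lambda
if (fit_ridge$lambda.min %in% range(lam)) {
  wider_grid <- exp(seq(log(min(lam) / 100), log(max(lam) * 100), length.out = 100))
  fit_ridge_wide <- cv.glmnet(X, y, family = "gaussian", alpha = 0, nfolds = 5,
                              lambda = sort(wider_grid, decreasing = TRUE))
  cat("lambda.min on the wider grid: ", fit_ridge_wide$lambda.min)
}
```

Rather than an arbitrary widened grid, this notebook supplies a specific sequence.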
In particular, we choose values that match up with those in Python's ```sklearn``` Ridge implementation.\n", + "\n", + "\n", + "```glmnet``` minimizes the elastic net loss function as follows:\n", + "$$\\min_{\\beta} \\frac{1}{N} \\| X\\beta - y\\|_2^2 + \\lambda_{R} \\left( \\frac{1}{2} (1-\\alpha) \\|\\beta\\|_2^2 + \\alpha \\|\\beta\\|_1 \\right) $$ \n", + "\n", + "For ridge, $\\alpha=0$, so $$\\min_{\\beta} \\frac{1}{N} \\| X\\beta - y\\|_2^2 + \\frac{\\lambda_{R}}{2} \\|\\beta\\|_2^2 $$\n", + "\n", + "Meanwhile, ```sklearn``` minimizes $$\\min_{\\beta} \\frac{1}{N} \\|X\\beta-y\\|_2^2 + \\frac{\\lambda_{python}}{N} \\|\\beta\\|_2^2$$ where $\\lambda_{python}$ is chosen from the grid $(0.1,1,10)$.\n", + "\n", + "To translate this into R, we must set in glmnet $$\\lambda_{R} :=\\frac{2}{N} \\lambda_{python}$$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "o-k2e0zMI65-", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# sklearn lambdas (penalty)\n", + "lambdas_sklearn <- c(0.1, 1, 10) # defaults\n", + "l_seq <- 2 / nrow(X) * lambdas_sklearn\n", + "l_seq # note how different these are to the actual lambdas generated by glmnet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gLH-u5we8QaY", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "fit_ridge <- cv.glmnet(X, y, family = \"gaussian\", alpha = 0, nfolds = 5, lambda = l_seq)\n", + "r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = \"lambda.min\"), ypop)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "snYw1Gg0phee", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "cat(\n", + " \"lassocv R2 (Pop): \", r2_lasso_cv,\n", + " \"\\nridge R2 (Pop): \", r2_ridge,\n", + " \"\\nelnet R2 (Pop): \", r2_elnet\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-GuaTiprcCqq" + }, + "source": [ + "## Plug-in Hyperparameter Lasso and Post-Lasso OLS" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T2te6CvUcEa5" + }, + "source": [ + "Here we compute the lasso and ols post lasso using plug-in choices for penalty levels." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NQGL2JsocEjC" + }, + "source": [ + "\\We use \"plug-in\" tuning with a theoretically valid choice of penalty $\\lambda = 2 \\cdot c \\hat{\\sigma} \\sqrt{n} \\Phi^{-1}(1-\\alpha/2p)$, where $c>1$ and $1-\\alpha$ is a confidence level, and $\\Phi^{-1}$ denotes the quantile function. Under homoskedasticity, this choice ensures that the Lasso predictor is well behaved, delivering good predictive performance under approximate sparsity. In practice, this formula will work well even in the absence of homoskedasticity, especially when the random variables $\\epsilon$ and $X$ in the regression equation decay quickly at the tails.\n", + "\n", + "In practice, many people choose to use cross-validation, which is perfectly fine for predictive tasks. However, when conducting inference, to make our analysis valid we will require cross-fitting in addition to cross-validation. As we have not yet discussed cross-fitting, we rely on this theoretically-driven penalty in order to allow for accurate inference in the upcoming notebooks." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "G7yKoP1IcI5y" + }, + "source": [ + "We pull an anaue of R's rlasso. Rlasso functionality: it is searching the right set of regressors. 
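As a quick illustration, the plug-in formula above can be evaluated directly. The sketch below is only a back-of-the-envelope version: the constants $c = 1.1$ and $\alpha = 0.05$ and the crude scale estimate $\hat{\sigma} = \operatorname{sd}(y)$ are assumptions made here for illustration, while `hdm::rlasso` computes its penalty internally with an iterated estimate of $\hat{\sigma}$.

```r
# Back-of-the-envelope plug-in penalty (illustration only).
# Assumed constants: c = 1.1, alpha = 0.05; sigma is crudely estimated by sd(y).
# hdm::rlasso uses its own internal, iterated sigma estimate.
c_const <- 1.1
alpha_level <- 0.05
sigma_hat <- sd(y)
lambda_plugin <- 2 * c_const * sigma_hat * sqrt(n) * qnorm(1 - alpha_level / (2 * p))
lambda_plugin
```

Returning to `rlasso` itself: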
This function was made for the case of ***p*** regressors and ***n*** observations where ***p >>>> n***. It assumes that the error is i.i.d. The errors may be non-Gaussian or heteroscedastic.\\\n", + "The post lasso function makes OLS with the selected ***T*** regressors.\n", + "To select those parameters, they use $\\lambda$ as variable to penalize\\\n", + "**Funny thing: the function rlasso was named like that because it is the \"rigorous\" Lasso.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fHDKDGlVcXBh", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "fit_rlasso <- rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level\n", + "fit_rlasso_post <- rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YMpfjDycchEp", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "r2_lasso <- r2_score(predict(fit_rlasso, newdata = Xtest), ytest)\n", + "r2_lasso_post <- r2_score(predict(fit_rlasso_post, newdata = Xtest), ytest)\n", + "\n", + "cat(\n", + " \"rlasso R2 (Test): \", r2_lasso,\n", + " \"\\nrlasso-post R2 (Test): \", r2_lasso_post\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7CLOwOKKIgB5", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "r2_lasso <- r2_score(predict(fit_rlasso, newdata = (Xpop)), (ypop))\n", + "r2_lasso_post <- r2_score(predict(fit_rlasso_post, newdata = (Xpop)), (ypop))\n", + "\n", + "cat(\n", + " \"rlasso R2 (Pop): \", r2_lasso,\n", + " \"\\nrlasso-post R2 (Pop): \", r2_lasso_post\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WUaAe00Uc5-r" + }, + "source": [ + "## LAVA: Dense + Sparse Coefficients" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YBN4j8FMONaA", + "papermill": { + "duration": 0.02899, + "end_time": "2021-02-15T11:01:56.880825", + "exception": false, + "start_time": "2021-02-15T11:01:56.851835", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1sYLd-O0V2IC", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Creating a data frame with the predicted values for test\n", - "data <- data.frame(\n", - " gXtest = gXtest,\n", - " Ridge = predict(fit_ridge, newx = Xtest, s = \"lambda.min\"),\n", - " ENet = predict(fit_elnet, newx = Xtest, s = \"lambda.min\"),\n", - " RLasso = predict(fit_rlasso, newdata = Xtest),\n", - " RLassoPost = predict(fit_rlasso_post, newdata = Xtest),\n", - " LassoCV = predict(fit_lasso_cv, newx = Xtest, s = \"lambda.min\"),\n", - " Lava = as.vector(lava_yhat)\n", - ")\n", - "colnames(data) <- c(\"gXtest\", \"Ridge\", \"ENet\", \"RLasso\", \"RlassoPost\", \"LassoCV\", \"Lava\")\n", - "\n", - "# Reshaping data into longer format for ggplot\n", - "data_long <- tidyr::gather(data, Model, Predicted, -gXtest)\n", - "\n", - "# Plotting\n", - "ggplot(data_long, aes(x = gXtest, y = Predicted, color = Model)) +\n", - " geom_point(aes(shape = Model)) +\n", - " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"black\") + # gX by gX\n", - " scale_color_manual(values = c(\"brown\", \"yellow\", \"red\", \"green\", \"blue\", \"magenta\"),\n", - " guide = guide_legend(title = \"Model\")) +\n", - " theme_minimal() +\n", - " labs(\n", - " title = \"Comparison of Methods on Predicting gX\",\n", - " x = \"gXtest\",\n", - " y = 
\"Predictions\"\n", - " ) +\n", - " guides(shape = \"none\") # Remove the shape legend" - ] + "tags": [] + }, + "source": [ + "Next we code up lava, which alternates the fitting of lasso and ridge" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jUqZjZJ-mIaG", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Define function to compute lava estimator. Doing an iterative scheme with fixed\n", + "# number of iteration. Could iterate until a convergence criterion is met.\n", + "lava_predict <- function(X, Y, newX, lambda1, lambda2, iter = 5) {\n", + " require(glmnet)\n", + "\n", + " # Need to demean internally\n", + " dy <- Y - mean(Y)\n", + " dx <- scale(X, scale = FALSE)\n", + "\n", + " sp1 <- glmnet(dx, dy, lambda = lambda1) # lasso step fits \"sparse part\"\n", + " de1 <- glmnet(dx, dy - predict(sp1, newx = dx), alpha = 0, lambda = lambda2)\n", + "\n", + " i <- 1\n", + " while (i <= iter) {\n", + " sp1 <- glmnet(dx, dy - predict(de1, newx = dx, s = \"lambda.min\"), lambda = lambda1)\n", + " de1 <- glmnet(dx, dy - predict(sp1, newx = dx, s = \"lambda.min\"), alpha = 0, lambda = lambda2)\n", + " i <- i + 1\n", + " }\n", + "\n", + " bhat <- sp1$beta + de1$beta\n", + " a0 <- mean(Y) - sum(colMeans(X) * bhat)\n", + "\n", + " # Need to add intercept to output\n", + "\n", + " yhat <- newX %*% bhat + a0\n", + "\n", + " return(yhat)\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tr_KBCwwovp6", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# define function to get predictions and r2 scores for lava estimator\n", + "\n", + "lava_yhat_r2 <- function(xtr_mod, xte_mod, ytr, yte, num_folds = 5) {\n", + " # 5-fold CV. glmnet does cross-validation internally and\n", + " # relatively efficiently. We're going to write out all the steps to make sure\n", + " # we're using the same CV folds across all procedures in a transparent way and\n", + " # to keep the overall structure clear as well.\n", + "\n", + " # Setup for brute force K-Fold CV\n", + " n <- length(ytr)\n", + " Kf <- num_folds # Number of folds\n", + " sampleframe <- rep(1:Kf, ceiling(n / Kf))\n", + " cvgroup <- sample(sampleframe, size = n, replace = FALSE) # CV groups\n", + "\n", + "\n", + " ## ------------------------------------------------------------\n", + " # We're going to take a shortcut and use the range of lambda values that come out\n", + " # of the default implementation in glmnet for everything. 
Could do better here - maybe\n", + "\n", + " ## Fit ridge on grid of lambda values (chosen by default using glmnet) using basic model.\n", + " ridge_mod <- glmnet(xtr_mod, ytr, alpha = 0) # alpha = 0 gives ridge\n", + " ridge_lambda <- ridge_mod$lambda # values of penalty parameter\n", + "\n", + " ## Fit lasso on grid of lambda values (chosen by default using glmnet) using basic model.\n", + " lasso_mod <- glmnet(xtr_mod, ytr) # default is lasso (equivalent to alpha = 1)\n", + " lasso_lambda <- lasso_mod$lambda # values of penalty parameter\n", + "\n", + " ## ------------------------------------------------------------\n", + "\n", + "\n", + " # Lava - Using a double loop over candidate penalty parameter values.\n", + "\n", + " lambda1_lava_mod <- lasso_mod$lambda[seq(5, length(lasso_lambda), 10)]\n", + " lambda2_lava_mod <- ridge_mod$lambda[seq(5, length(ridge_lambda), 10)]\n", + "\n", + " cv_mod_lava <- matrix(0, length(lambda1_lava_mod), length(lambda2_lava_mod))\n", + "\n", + " for (k in 1:Kf) {\n", + " indk <- cvgroup == k\n", + "\n", + " k_xtr_mod <- xtr_mod[!indk, ]\n", + " k_ytr <- ytr[!indk]\n", + " k_xte_mod <- xtr_mod[indk, ]\n", + " k_yte <- ytr[indk]\n", + "\n", + " for (ii in seq_along(lambda1_lava_mod)) {\n", + " for (jj in seq_along(lambda2_lava_mod)) {\n", + " cv_mod_lava[ii, jj] <- cv_mod_lava[ii, jj] +\n", + " sum((k_yte - lava_predict(k_xtr_mod, k_ytr,\n", + " newX = k_xte_mod,\n", + " lambda1 = lambda1_lava_mod[ii],\n", + " lambda2 = lambda2_lava_mod[jj]))^2)\n", + " }\n", + " }\n", + " }\n", + "\n", + " # Get CV min values of tuning parameters\n", + " cvmin_lava_mod <- which(cv_mod_lava == min(cv_mod_lava), arr.ind = TRUE)\n", + " cvlambda1_lava_mod <- lambda1_lava_mod[cvmin_lava_mod[1]]\n", + " cvlambda2_lava_mod <- lambda2_lava_mod[cvmin_lava_mod[2]]\n", + "\n", + " cat(\"Min Lava Lasso CV Penalty: \", cvlambda1_lava_mod)\n", + " cat(\"\\nMin Lava Ridge CV Penalty: \", cvlambda2_lava_mod)\n", + "\n", + "\n", + " #### Look at performance on test sample\n", + "\n", + " # Calculate R^2 in training data and in validation data as measures\n", + " # Refit on entire training sample\n", + "\n", + "\n", + " #### CV-min model\n", + "\n", + " # In sample fit\n", + " cvmin_yhat_lava_tr <- lava_predict(xtr_mod, ytr,\n", + " newX = xtr_mod,\n", + " lambda1 = cvlambda1_lava_mod,\n", + " lambda2 = cvlambda2_lava_mod\n", + " )\n", + " r2_lava_mod <- 1 - sum((ytr - cvmin_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2)\n", + "\n", + " # Out of sample fit\n", + " cvmin_yhat_lava_test <- lava_predict(xtr_mod, ytr,\n", + " newX = xte_mod,\n", + " lambda1 = cvlambda1_lava_mod,\n", + " lambda2 = cvlambda2_lava_mod\n", + " )\n", + " r2v_lava_mod <- 1 - sum((yte - cvmin_yhat_lava_test)^2) / sum((yte - mean(ytr))^2)\n", + "\n", + "\n", + " cat(\"\\nIn sample R2 (CV-min): \", r2_lava_mod)\n", + " cat(\"\\nOut of Sample R2 (CV-min): \", r2v_lava_mod)\n", + "\n", + "\n", + " #### Use average model across cv-folds and refit model using all training data\n", + " ###### we won't report these results.\n", + " ###### Averaging is theoretically more solid, but cv-min is more practical.\n", + " n_tr <- length(ytr)\n", + " n_te <- length(yte)\n", + " yhat_tr_lava_mod <- matrix(0, n_tr, Kf)\n", + " yhat_te_lava_mod <- matrix(0, n_te, Kf)\n", + "\n", + "\n", + " for (k in 1:Kf) {\n", + " indk <- cvgroup == k\n", + "\n", + " k_xtr_mod <- xtr_mod[!indk, ]\n", + " k_ytr <- ytr[!indk]\n", + "\n", + " # Lava\n", + " yhat_tr_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr,\n", + " newX = xtr_mod,\n", + " 
lambda1 = cvlambda1_lava_mod,\n", + " lambda2 = cvlambda2_lava_mod\n", + " ))\n", + " yhat_te_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr,\n", + " newX = xte_mod,\n", + " lambda1 = cvlambda1_lava_mod,\n", + " lambda2 = cvlambda2_lava_mod\n", + " ))\n", + " }\n", + "\n", + " avg_yhat_lava_tr <- rowMeans(yhat_tr_lava_mod)\n", + " avg_yhat_lava_test <- rowMeans(yhat_te_lava_mod)\n", + "\n", + " r2_cv_ave_lava_mod <- 1 - sum((ytr - avg_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2)\n", + " r2v_cv_ave_lava_mod <- 1 - sum((yte - avg_yhat_lava_test)^2) / sum((yte - mean(ytr))^2)\n", + "\n", + " cat(\"\\nIn sample R2 (Average Across Folds): \", r2_cv_ave_lava_mod)\n", + " cat(\"\\nOut of Sample R2 (Average Across Folds): \", r2v_cv_ave_lava_mod)\n", + "\n", + " return(c(\n", + " cvlambda1_lava_mod,\n", + " cvlambda2_lava_mod,\n", + " cvmin_yhat_lava_tr, # CV_min\n", + " cvmin_yhat_lava_test, # CV_min\n", + " r2_lava_mod, # CV_min\n", + " r2v_lava_mod, # CV_min\n", + " avg_yhat_lava_tr, # Average across Folds\n", + " avg_yhat_lava_test, # Average across Folds\n", + " r2_cv_ave_lava_mod, # Average across Folds\n", + " r2v_cv_ave_lava_mod # Average across Folds\n", + " ))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5dEsONeRF51R", + "vscode": { + "languageId": "r" } - ], - "metadata": { - "colab": { - "provenance": [] + }, + "outputs": [], + "source": [ + "# Results for Test\n", + "cat(\"Test Results ...\\n\")\n", + "r2_lava_traintest <- lava_yhat_r2(X, Xtest, y, ytest)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kdAQN0yq_ISV", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Results for Pop\n", + "## note we don't have to re-train the entire model\n", + "## this is just due to the way the function is defined above\n", + "cat(\"Population Results ...\\n\")\n", + "r2_lava_pop <- lava_yhat_r2(X, Xpop, y, ypop)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GaTBT7NkhRmH", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# report R2 using CV min\n", + "cat(\"LAVA R2 (Test): \", r2_lava_traintest[[6]])\n", + "cat(\"\\nLAVA R2 (Pop) \", r2_lava_pop[[6]])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Gv0bAoZZiLnH" + }, + "source": [ + "## Summarizing Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VtzIoSdyS9To", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "table <- matrix(0, 6, 1)\n", + "table[1, 1] <- r2_lasso_cv\n", + "table[2, 1] <- r2_ridge\n", + "table[3, 1] <- r2_elnet\n", + "table[4, 1] <- r2_lasso\n", + "table[5, 1] <- r2_lasso_post\n", + "table[6, 1] <- r2_lava_pop[[6]]\n", + "\n", + "colnames(table) <- c(\"R2 (Population)\")\n", + "rownames(table) <- c(\n", + " \"Cross-Validated Lasso\", \"Cross-Validated ridge\", \"Cross-Validated elnet\",\n", + " \"Lasso\", \"Post-Lasso\", \"Lava\"\n", + ")\n", + "tab <- xtable(table, digits = 3)\n", + "print(tab, type = \"latex\") # set type=\"latex\" for printing table in LaTeX\n", + "tab" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "npU6rAHRUs_s", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Creating a data frame with the predicted values for test\n", + "data <- data.frame(\n", + " gXtest = gXtest,\n", + " Ridge = predict(fit_ridge, newx = Xtest, s = \"lambda.min\"),\n", + " ENet 
= predict(fit_elnet, newx = Xtest, s = \"lambda.min\"),\n", + " RLasso = predict(fit_rlasso, newdata = Xtest),\n", + " RLassoPost = predict(fit_rlasso_post, newdata = Xtest),\n", + " LassoCV = predict(fit_lasso_cv, newx = Xtest, s = \"lambda.min\"),\n", + " Lava = as.vector(r2_lava_traintest[[4]])\n", + ")\n", + "colnames(data) <- c(\"gXtest\", \"Ridge\", \"ENet\", \"RLasso\", \"RlassoPost\", \"LassoCV\", \"Lava\")\n", + "\n", + "# Reshaping data into longer format for ggplot\n", + "data_long <- tidyr::gather(data, Model, Predicted, -gXtest)\n", + "\n", + "# Plotting\n", + "ggplot(data_long, aes(x = gXtest, y = Predicted, color = Model)) +\n", + " geom_point(aes(shape = Model)) +\n", + " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"black\") + # gX by gX\n", + " scale_color_manual(values = c(\"brown\", \"yellow\", \"red\", \"green\", \"blue\", \"magenta\"),\n", + " guide = guide_legend(title = \"Model\")) +\n", + " theme_minimal() +\n", + " labs(\n", + " title = \"Comparison of Methods on Predicting gX\",\n", + " x = \"gXtest\",\n", + " y = \"Predictions\"\n", + " ) +\n", + " guides(shape = \"none\") # Remove the shape legend" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fc8S-gruBnFD" + }, + "source": [ + "## Data Generating Process: Dense Coefficients" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BiEL0vydBowk", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "set.seed(1)\n", + "n <- 100\n", + "p <- 400\n", + "res <- gen_data(n, p, regime = \"dense\")\n", + "\n", + "X <- res$X\n", + "y <- res$y\n", + "gX <- res$gX\n", + "Xtest <- res$Xtest\n", + "ytest <- res$ytest\n", + "gXtest <- res$gXtest\n", + "Xpop <- res$Xpop\n", + "ypop <- res$ypop\n", + "gXpop <- res$gXpop\n", + "betas <- res$beta\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BoHnfTmcDgvw", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "plot(gX, y, xlab = \"g(X)\", ylab = \"y\") # plot V vs g(X)\n", + "print(c(\"theoretical R2:\", var(gX) / var(y))) # theoretical R-square in the simulation example\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qU2g-tf6DjsN", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# plot betas\n", + "plot(seq_along(betas), abs(betas),\n", + " log = \"y\", pch = 20, col = \"blue\",\n", + " xlab = expression(beta), ylab = \"Magnitude (log scale)\",\n", + " main = expression(paste(\"Beta Magnitude\"))\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kGKVHss9BpDr", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# family gaussian means that we'll be using square loss\n", + "fit_lasso_cv <- cv.glmnet(X, y, family = \"gaussian\", alpha = 1, nfolds = 5)\n", + "# family gaussian means that we'll be using square loss\n", + "fit_ridge <- cv.glmnet(X, y, family = \"gaussian\", alpha = 0, nfolds = 5)\n", + "# family gaussian means that we'll be using square loss\n", + "fit_elnet <- cv.glmnet(X, y, family = \"gaussian\", alpha = .5, nfolds = 5)\n", + "fit_rlasso <- rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level\n", + "fit_rlasso_post <- rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level\n", + "\n", + "r2_lasso_cv <- r2_score(predict(fit_lasso_cv, newx = Xpop, s = \"lambda.min\"), ypop)\n", + "r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = 
\"lambda.min\"), ypop)\n", + "r2_elnet <- r2_score(predict(fit_elnet, newx = Xpop, s = \"lambda.min\"), ypop)\n", + "r2_rlasso <- r2_score(predict(fit_rlasso, newdata = Xpop), ypop)\n", + "r2_rlasso_post <- r2_score(predict(fit_rlasso_post, newdata = Xpop), ypop)\n", + "r2_lava <- lava_yhat_r2(X, Xpop, y, ypop)[[6]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "e93xdkcECQN_", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "table <- matrix(0, 6, 1)\n", + "table[1, 1] <- r2_lasso_cv\n", + "table[2, 1] <- r2_ridge\n", + "table[3, 1] <- r2_elnet\n", + "table[4, 1] <- r2_rlasso\n", + "table[5, 1] <- r2_rlasso_post\n", + "table[6, 1] <- r2_lava\n", + "\n", + "colnames(table) <- c(\"R2\")\n", + "rownames(table) <- c(\n", + " \"Cross-Validated Lasso\", \"Cross-Validated ridge\", \"Cross-Validated elnet\",\n", + " \"Lasso\", \"Post-Lasso\", \"Lava\"\n", + ")\n", + "tab <- xtable(table, digits = 3)\n", + "print(tab, type = \"latex\") # set type=\"latex\" for printing table in LaTeX\n", + "tab" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZdSCN8zeCQSR", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# get lava prediction on test set for plot below\n", + "lava_yhat <- lava_yhat_r2(X, Xtest, y, ytest)[[4]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uiDd9oxhVcnc", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Creating a data frame with the predicted values for test\n", + "data <- data.frame(\n", + " gXtest = gXtest,\n", + " Ridge = predict(fit_ridge, newx = Xtest, s = \"lambda.min\"),\n", + " ENet = predict(fit_elnet, newx = Xtest, s = \"lambda.min\"),\n", + " RLasso = predict(fit_rlasso, newdata = Xtest),\n", + " RLassoPost = predict(fit_rlasso_post, newdata = Xtest),\n", + " LassoCV = predict(fit_lasso_cv, newx = Xtest, s = \"lambda.min\"),\n", + " Lava = as.vector(lava_yhat)\n", + ")\n", + "colnames(data) <- c(\"gXtest\", \"Ridge\", \"ENet\", \"RLasso\", \"RlassoPost\", \"LassoCV\", \"Lava\")\n", + "\n", + "# Reshaping data into longer format for ggplot\n", + "data_long <- tidyr::gather(data, Model, Predicted, -gXtest)\n", + "\n", + "# Plotting\n", + "ggplot(data_long, aes(x = gXtest, y = Predicted, color = Model)) +\n", + " geom_point(aes(shape = Model)) +\n", + " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"black\") + # gX by gX\n", + " scale_color_manual(values = c(\"brown\", \"yellow\", \"red\", \"green\", \"blue\", \"magenta\"),\n", + " guide = guide_legend(title = \"Model\")) +\n", + " theme_minimal() +\n", + " labs(\n", + " title = \"Comparison of Methods on Predicting gX\",\n", + " x = \"gXtest\",\n", + " y = \"Predictions\"\n", + " ) +\n", + " guides(shape = \"none\") # Remove the shape legend" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sxZFIhYuONaB", + "papermill": { + "duration": 0.018842, + "end_time": "2021-02-15T11:02:51.941852", + "exception": false, + "start_time": "2021-02-15T11:02:51.923010", + "status": "completed" }, - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" + "tags": [] + }, + "source": [ + "## Data Generating Process: Approximately Sparse + Small Dense Part" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nQcWgf3KONaC", + "papermill": { + "duration": 0.207598, + "end_time": "2021-02-15T11:02:52.168536", + "exception": false, + "start_time": 
"2021-02-15T11:02:51.960938", + "status": "completed" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "3.6.3" + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "set.seed(1)\n", + "n <- 100\n", + "p <- 400\n", + "res <- gen_data(n, p, regime = \"sparsedense\")\n", + "\n", + "X <- res$X\n", + "y <- res$y\n", + "gX <- res$gX\n", + "Xtest <- res$Xtest\n", + "ytest <- res$ytest\n", + "gXtest <- res$gXtest\n", + "Xpop <- res$Xpop\n", + "ypop <- res$ypop\n", + "gXpop <- res$gXpop\n", + "betas <- res$beta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yiIrU6SQDkjK", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "plot(gX, y, xlab = \"g(X)\", ylab = \"y\") # plot V vs g(X)\n", + "print(c(\"theoretical R2:\", var(gX) / var(y))) # theoretical R-square in the simulation example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "X2N8JfHDDkmk", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# plot betas\n", + "plot(seq_along(betas), abs(betas),\n", + " log = \"y\", pch = 20, col = \"blue\",\n", + " xlab = expression(beta), ylab = \"Magnitude (log scale)\",\n", + " main = expression(paste(\"Beta Magnitude\"))\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "obWejQaJONaC", + "papermill": { + "duration": 1.432822, + "end_time": "2021-02-15T11:02:53.626802", + "exception": false, + "start_time": "2021-02-15T11:02:52.193980", + "status": "completed" }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# family gaussian means that we'll be using square loss\n", + "fit_lasso_cv <- cv.glmnet(X, y, family = \"gaussian\", alpha = 1, nfolds = 5)\n", + "# family gaussian means that we'll be using square loss\n", + "fit_ridge <- cv.glmnet(X, y, family = \"gaussian\", alpha = 0, nfolds = 5)\n", + "# family gaussian means that we'll be using square loss\n", + "fit_elnet <- cv.glmnet(X, y, family = \"gaussian\", alpha = .5, nfolds = 5)\n", + "fit_rlasso <- rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level\n", + "fit_rlasso_post <- rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level\n", + "\n", + "r2_lasso_cv <- r2_score(predict(fit_lasso_cv, newx = Xpop, s = \"lambda.min\"), ypop)\n", + "r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = \"lambda.min\"), ypop)\n", + "r2_elnet <- r2_score(predict(fit_elnet, newx = Xpop, s = \"lambda.min\"), ypop)\n", + "r2_rlasso <- r2_score(predict(fit_rlasso, newdata = Xpop), ypop)\n", + "r2_rlasso_post <- r2_score(predict(fit_rlasso_post, newdata = Xpop), ypop)\n", + "r2_lava <- lava_yhat_r2(X, Xpop, y, ypop)[[6]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "38KYAe5MONaC", "papermill": { - "default_parameters": {}, - "duration": 157.995397, - "end_time": "2021-02-15T11:04:16.324442", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2021-02-15T11:01:38.329045", - "version": "2.2.2" + "duration": 13.756606, + "end_time": "2021-02-15T11:03:07.405363", + "exception": false, + "start_time": "2021-02-15T11:02:53.648757", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": 
[], + "source": [ + "table <- matrix(0, 6, 1)\n", + "table[1, 1] <- r2_lasso_cv\n", + "table[2, 1] <- r2_ridge\n", + "table[3, 1] <- r2_elnet\n", + "table[4, 1] <- r2_rlasso\n", + "table[5, 1] <- r2_rlasso_post\n", + "table[6, 1] <- r2_lava\n", + "\n", + "colnames(table) <- c(\"R2\")\n", + "rownames(table) <- c(\n", + " \"Cross-Validated Lasso\", \"Cross-Validated ridge\", \"Cross-Validated elnet\",\n", + " \"Lasso\", \"Post-Lasso\", \"Lava\"\n", + ")\n", + "tab <- xtable(table, digits = 3)\n", + "print(tab, type = \"latex\") # set type=\"latex\" for printing table in LaTeX\n", + "tab" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oW3kq2xNOone", + "vscode": { + "languageId": "r" } + }, + "outputs": [], + "source": [ + "# get lava prediction on test set for plot below\n", + "lava_yhat <- lava_yhat_r2(X, Xtest, y, ytest)[[4]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1sYLd-O0V2IC", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Creating a data frame with the predicted values for test\n", + "data <- data.frame(\n", + " gXtest = gXtest,\n", + " Ridge = predict(fit_ridge, newx = Xtest, s = \"lambda.min\"),\n", + " ENet = predict(fit_elnet, newx = Xtest, s = \"lambda.min\"),\n", + " RLasso = predict(fit_rlasso, newdata = Xtest),\n", + " RLassoPost = predict(fit_rlasso_post, newdata = Xtest),\n", + " LassoCV = predict(fit_lasso_cv, newx = Xtest, s = \"lambda.min\"),\n", + " Lava = as.vector(lava_yhat)\n", + ")\n", + "colnames(data) <- c(\"gXtest\", \"Ridge\", \"ENet\", \"RLasso\", \"RlassoPost\", \"LassoCV\", \"Lava\")\n", + "\n", + "# Reshaping data into longer format for ggplot\n", + "data_long <- tidyr::gather(data, Model, Predicted, -gXtest)\n", + "\n", + "# Plotting\n", + "ggplot(data_long, aes(x = gXtest, y = Predicted, color = Model)) +\n", + " geom_point(aes(shape = Model)) +\n", + " geom_abline(slope = 1, intercept = 0, linetype = \"dashed\", color = \"black\") + # gX by gX\n", + " scale_color_manual(values = c(\"brown\", \"yellow\", \"red\", \"green\", \"blue\", \"magenta\"),\n", + " guide = guide_legend(title = \"Model\")) +\n", + " theme_minimal() +\n", + " labs(\n", + " title = \"Comparison of Methods on Predicting gX\",\n", + " x = \"gXtest\",\n", + " y = \"Predictions\"\n", + " ) +\n", + " guides(shape = \"none\") # Remove the shape legend" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.6.3" }, - "nbformat": 4, - "nbformat_minor": 0 + "papermill": { + "default_parameters": {}, + "duration": 157.995397, + "end_time": "2021-02-15T11:04:16.324442", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2021-02-15T11:01:38.329045", + "version": "2.2.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/PM2/r_ml_for_wage_prediction.Rmd b/PM2/r_ml_for_wage_prediction.Rmd new file mode 100644 index 00000000..8e10ee86 --- /dev/null +++ b/PM2/r_ml_for_wage_prediction.Rmd @@ -0,0 +1,446 @@ +--- +title: An R Markdown document converted from "PM2/r_ml_for_wage_prediction.irnb" +output: html_document +--- + +# A Simple Case Study using Wage Data from 2015 + +We illustrate how to predict an outcome 
variable $Y$ in a high-dimensional setting, where the number of covariates $p$ is large in relation to the sample size $n$. We use linear prediction rules for estimation, including OLS and the penalized linear methods we've studied. Later, we will also consider nonlinear prediction rules including tree-based methods and neural nets. + +```{r} +install.packages("xtable") +install.packages("hdm") +install.packages("glmnet") +install.packages("MLmetrics") + +library(hdm) +library(xtable) +library(glmnet) +library(MLmetrics) +``` + +## Data + +Again, we consider data from the U.S. March Supplement of the Current Population Survey (CPS) in 2015. +The preproccessed sample consists of $5150$ never-married individuals. + +```{r} +file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv" +data <- read.csv(file) +dim(data) +``` + +The outcomes $Y_i$'s are hourly (log) wages of never-married workers living in the U.S. The raw regressors $Z_i$'s consist of a variety of characteristics, including experience, education and industry and occupation indicators. + +```{r} +Z <- subset(data, select = -c(lwage, wage)) # regressors +colnames(Z) +``` + +The following figure shows the weekly wage distribution from the US survey data. + +```{r} +hist(data$wage, xlab = "hourly wage", main = "Empirical wage distribution from the US survey data", breaks = 35) +``` + +Wages show a high degree of skewness. Hence, wages are transformed in almost all studies by +the logarithm. + +## Analysis + +Due to the skewness of the data, we are considering log wages which leads to the following regression model + +$$log(wage) = g(Z) + \epsilon.$$ + +In this notebook, we will evaluate *linear* prediction rules. In later notebooks, we will also utilize nonlinear prediction methods. In linear models, we estimate the prediction rule of the form + +$$\hat g(Z) = \hat \beta'X.$$ + +Again, we generate $X$ in three ways: + +1. Basic Model: $X$ consists of a set of raw regressors (e.g. gender, experience, education indicators, regional indicators). + + +2. Flexible Model: $X$ consists of all raw regressors from the basic model plus occupation and industry indicators, transformations (e.g., $\operatorname{exp}^2$ and $\operatorname{exp}^3$) and additional two-way interactions. + +3. Extra Flexible Model: $X$ takes the flexible model and takes all pairwise interactions. + +To evaluate the out-of-sample performance, we split the data first. + +```{r} +set.seed(1234) +training <- sample(nrow(data), nrow(data) * (3 / 4), replace = FALSE) + +data_train <- data[training, ] +data_test <- data[-training, ] +``` + +```{r} +y_train <- data_train$lwage +y_test <- data_test$lwage +``` + +We are starting by running a simple OLS regression. We fit the basic and flexible model to our training data by running an ols regression and compute the R-squared on the test sample + +As known from our first lab, the basic model consists of $51$ regressors and the flexible model of $246$ regressors. Let us fit our models to the training sample using the two different model specifications. We are starting by running a simple ols regression and computing the mean squared error and $R^2$ on the test sample. 
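Because the same out-of-sample summary (MSE, its standard error, and $R^2$) is recomputed for every specification below, a small helper along the following lines can keep those cells compact. This is a sketch added for exposition, not part of the original analysis; it uses the same device as the cells below, regressing the squared prediction errors on a constant to obtain a standard error for the MSE.

```r
# Sketch of a reusable out-of-sample summary: MSE, its standard error, and R^2.
oos_summary <- function(yhat, y_test) {
  e2 <- as.numeric(y_test - yhat)^2   # squared prediction errors
  reg <- summary(lm(e2 ~ 1))          # intercept estimate = MSE, with a standard error
  c(MSE = reg$coef[1, 1], SE = reg$coef[1, 2], R2 = 1 - reg$coef[1, 1] / var(y_test))
}
```

For example, `oos_summary(yhat_lm_basic, y_test)` applied in the basic-model cell below should agree with `mse_lm_basic` and `r2_lm_basic` up to rounding.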
+ +### Low dimensional specification (basic) + +```{r} +x_basic <- "sex + exp1 + shs + hsg+ scl + clg + mw + so + we + C(occ2)+ C(ind2)" +formula_basic <- as.formula(paste("lwage", "~", x_basic)) +model_x_basic_train <- model.matrix(formula_basic, data_train) +model_x_basic_test <- model.matrix(formula_basic, data_test) +p_basic <- dim(model_x_basic_train)[2] +p_basic +``` + +```{r} +# ols (basic model) +fit_lm_basic <- lm(formula_basic, data_train) +# Compute the Out-Of-Sample Performance +yhat_lm_basic <- predict(fit_lm_basic, newdata = data_test) +cat("Basic model MSE (OLS): ", mean((y_test - yhat_lm_basic)^2)) # MSE OLS (basic model) +``` + +To determine the out-of-sample $MSE$ and the standard error in one step, we can use the function *lm*: + +```{r} +mse_lm_basic <- summary(lm((y_test - yhat_lm_basic)^2 ~ 1))$coef[1:2] +mse_lm_basic +``` + +We also compute the out-of-sample $R^2$: + +```{r} +r2_lm_basic <- 1 - mse_lm_basic[1] / var(y_test) +cat("Basic model R^2 (OLS): ", r2_lm_basic) # MSE OLS (basic model) +``` + +### High-dimensional specification (flexible) + +```{r} +x_flex <- "sex + exp1 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we " + + "+ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)" +formula_flex <- as.formula(paste("lwage", "~", x_flex)) +model_x_flex_train <- model.matrix(formula_flex, data_train) +model_x_flex_test <- model.matrix(formula_flex, data_test) +p_flex <- dim(model_x_flex_train)[2] +p_flex +``` + +We repeat the same procedure for the flexible model. + +```{r} +# ols (flexible model) +fit_lm_flex <- lm(formula_flex, data_train) +# Compute the Out-Of-Sample Performance +options(warn = -1) +yhat_lm_flex <- predict(fit_lm_flex, newdata = data_test) +mse_lm_flex <- summary(lm((y_test - yhat_lm_flex)^2 ~ 1))$coef[1:2] +r2_lm_flex <- 1 - mse_lm_flex[1] / var(y_test) +cat("Flexible model R^2 (OLS): ", r2_lm_flex) # MSE OLS (flexible model) +``` + +### Penalized regressions (flexible model) + + + + +We observe that ols regression works better for the basic model with smaller $p/n$ ratio. We proceed by running penalized regressions first for the flexible model, tuned via cross-validation. + +```{r} +fit_lasso_cv_flex <- cv.glmnet(model_x_flex_train, y_train, family = "gaussian", alpha = 1) +fit_ridge_flex <- cv.glmnet(model_x_flex_train, y_train, family = "gaussian", alpha = 0) +fit_elnet_flex <- cv.glmnet(model_x_flex_train, y_train, family = "gaussian", alpha = .5) + +yhat_lasso_cv_flex <- predict(fit_lasso_cv_flex, newx = model_x_flex_test) +yhat_ridge_flex <- predict(fit_ridge_flex, newx = model_x_flex_test) +yhat_elnet_flex <- predict(fit_elnet_flex, newx = model_x_flex_test) + +mse_lasso_cv_flex <- summary(lm((y_test - yhat_lasso_cv_flex)^2 ~ 1))$coef[1:2] +mse_ridge_flex <- summary(lm((y_test - yhat_ridge_flex)^2 ~ 1))$coef[1:2] +mse_elnet_flex <- summary(lm((y_test - yhat_elnet_flex)^2 ~ 1))$coef[1:2] + +r2_lasso_cv_flex <- 1 - mse_lasso_cv_flex[1] / var(y_test) +r2_ridge_flex <- 1 - mse_ridge_flex[1] / var(y_test) +r2_elnet_flex <- 1 - mse_elnet_flex[1] / var(y_test) + +# R^2 using cross-validation (flexible model) +cat("Flexible model R^2 (Lasso): ", r2_lasso_cv_flex) +cat("\nFlexible model R^2 (Ridge): ", r2_ridge_flex) +cat("\nFlexible model R^2 (Elastic Net): ", r2_elnet_flex) +``` + +We can also try a variant of the `l1` penalty, where the weight is chosen based on theoretical derivations. 
We use the package *hdm* and the function *rlasso*, relying on a theoretically grounded choice of the penalty level $\lambda$ in the lasso regression.
+
+Specifically, we use "plug-in" tuning with a theoretically valid choice of penalty $\lambda = 2 \cdot c \hat{\sigma} \sqrt{n} \Phi^{-1}(1-\alpha/(2p))$, where $c>1$ and $1-\alpha$ is a confidence level, $\Phi^{-1}$ denotes the quantile function, and $\hat{\sigma}$ is estimated in an iterative manner (see the corresponding notes in the book). Under homoskedasticity, this choice ensures that the Lasso predictor is well behaved, delivering good predictive performance under approximate sparsity. In practice, this formula will work well even in the absence of homoskedasticity, especially when the random variables $\epsilon$ and $X$ in the regression equation decay quickly at the tails.
+
+In practice, many people choose to use cross-validation, which is perfectly fine for predictive tasks. However, when conducting inference, to make our analysis valid we will require cross-fitting in addition to cross-validation. As we have not yet discussed cross-fitting, we rely on this theoretically driven penalty in order to allow for accurate inference in the upcoming notebooks.
+
+Now, we repeat the same procedure for the flexible model.
+
+```{r}
+fit_rlasso_flex <- rlasso(formula_flex, data_train, post = FALSE)
+fit_rlasso_post_flex <- rlasso(formula_flex, data_train, post = TRUE)
+yhat_rlasso_flex <- predict(fit_rlasso_flex, newdata = data_test)
+yhat_rlasso_post_flex <- predict(fit_rlasso_post_flex, newdata = data_test)
+
+mse_lasso_flex <- summary(lm((y_test - yhat_rlasso_flex)^2 ~ 1))$coef[1:2]
+mse_lasso_post_flex <- summary(lm((y_test - yhat_rlasso_post_flex)^2 ~ 1))$coef[1:2]
+
+r2_lasso_flex <- 1 - mse_lasso_flex[1] / var(y_test)
+r2_lasso_post_flex <- 1 - mse_lasso_post_flex[1] / var(y_test)
+# R^2 theoretically chosen penalty (flexible model)
+cat("Flexible model R^2 (RLasso): ", r2_lasso_flex)
+cat("\nFlexible model R^2 (RLasso post): ", r2_lasso_post_flex)
+```
+
+Finally, we try a combination of a sparse and a dense coefficient vector using the LAVA method.
+
+```{r}
+# Define function to compute the lava estimator. We use an iterative scheme with a
+# fixed number of iterations. One could instead iterate until a convergence criterion is met.
+lava_predict <- function(X, Y, newX, lambda1, lambda2, iter = 5) {
+  require(glmnet)
+
+  # Need to demean internally
+  dy <- Y - mean(Y)
+  dx <- scale(X, scale = FALSE)
+
+  sp1 <- glmnet(dx, dy, lambda = lambda1) # lasso step fits "sparse part"
+  de1 <- glmnet(dx, dy - predict(sp1, newx = dx), alpha = 0, lambda = lambda2)
+
+  i <- 1
+  while (i <= iter) {
+    sp1 <- glmnet(dx, dy - predict(de1, newx = dx, s = "lambda.min"), lambda = lambda1)
+    de1 <- glmnet(dx, dy - predict(sp1, newx = dx, s = "lambda.min"), alpha = 0, lambda = lambda2)
+    i <- i + 1
+  }
+
+  bhat <- sp1$beta + de1$beta
+  a0 <- mean(Y) - sum(colMeans(X) * bhat)
+
+  # Need to add intercept to output
+
+  yhat <- newX %*% bhat + a0
+
+  return(yhat)
+}
+```
+
+```{r}
+# Define function to get predictions and R^2 scores for the lava estimator
+lava_yhat_r2 <- function(xtr_mod, xte_mod, ytr, yte, num_folds = 5) {
+  # 5-fold CV. glmnet does cross-validation internally and
+  # relatively efficiently. We're going to write out all the steps to make sure
+  # we're using the same CV folds across all procedures in a transparent way and
+  # to keep the overall structure clear as well.
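+  #
+  # In outline, the steps below are:
+  #   (1) randomly assign each training observation to one of num_folds folds;
+  #   (2) reuse the default glmnet lambda grids for the lasso ("sparse") part and
+  #       the ridge ("dense") part, thinned to a coarser grid;
+  #   (3) for every (lambda1, lambda2) pair, accumulate the held-out squared error
+  #       across folds and keep the CV-minimizing pair;
+  #   (4) refit lava on the full training sample at the chosen pair and report
+  #       in-sample and out-of-sample R^2 (plus the fold-averaged variant further down).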
+ + # Setup for brute force K-Fold CV + n <- length(ytr) + Kf <- num_folds # Number of folds + sampleframe <- rep(1:Kf, ceiling(n / Kf)) + cvgroup <- sample(sampleframe, size = n, replace = FALSE) # CV groups + + + ## ------------------------------------------------------------ + # We're going to take a shortcut and use the range of lambda values that come out + # of the default implementation in glmnet for everything. Could do better here - maybe + + ## Fit ridge on grid of lambda values (chosen by default using glmnet) using basic model. + ridge_mod <- glmnet(xtr_mod, ytr, alpha = 0) # alpha = 0 gives ridge + ridge_lambda <- ridge_mod$lambda # values of penalty parameter + + ## Fit lasso on grid of lambda values (chosen by default using glmnet) using basic model. + lasso_mod <- glmnet(xtr_mod, ytr) # default is lasso (equivalent to alpha = 1) + lasso_lambda <- lasso_mod$lambda # values of penalty parameter + + ## ------------------------------------------------------------ + + + # Lava - Using a double loop over candidate penalty parameter values. + + lambda1_lava_mod <- lasso_mod$lambda[seq(5, length(lasso_lambda), 10)] + lambda2_lava_mod <- ridge_mod$lambda[seq(5, length(ridge_lambda), 10)] + + cv_mod_lava <- matrix(0, length(lambda1_lava_mod), length(lambda2_lava_mod)) + + for (k in 1:Kf) { + indk <- cvgroup == k + + k_xtr_mod <- xtr_mod[!indk, ] + k_ytr <- ytr[!indk] + k_xte_mod <- xtr_mod[indk, ] + k_yte <- ytr[indk] + + for (ii in seq_along(lambda1_lava_mod)) { + for (jj in seq_along(lambda2_lava_mod)) { + cv_mod_lava[ii, jj] <- cv_mod_lava[ii, jj] + + sum((k_yte - lava_predict(k_xtr_mod, k_ytr, + newX = k_xte_mod, + lambda1 = lambda1_lava_mod[ii], + lambda2 = lambda2_lava_mod[jj]))^2) + } + } + } + + # Get CV min values of tuning parameters + cvmin_lava_mod <- which(cv_mod_lava == min(cv_mod_lava), arr.ind = TRUE) + cvlambda1_lava_mod <- lambda1_lava_mod[cvmin_lava_mod[1]] + cvlambda2_lava_mod <- lambda2_lava_mod[cvmin_lava_mod[2]] + + #### Look at performance on test sample + + # Calculate R^2 in training data and in validation data as measures + # Refit on entire training sample + + #### CV-min model + + # In sample fit + cvmin_yhat_lava_tr <- lava_predict(xtr_mod, ytr, + newX = xtr_mod, + lambda1 = cvlambda1_lava_mod, + lambda2 = cvlambda2_lava_mod + ) + r2_lava_mod <- 1 - sum((ytr - cvmin_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2) + + # Out of sample fit + cvmin_yhat_lava_test <- lava_predict(xtr_mod, ytr, + newX = xte_mod, + lambda1 = cvlambda1_lava_mod, + lambda2 = cvlambda2_lava_mod + ) + r2v_lava_mod <- 1 - sum((yte - cvmin_yhat_lava_test)^2) / sum((yte - mean(ytr))^2) + + #### Use average model across cv-folds and refit model using all training data + ###### we won't report these results. + ###### Averaging is theoretically more solid, but cv-min is more practical. 
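+  ###### Concretely: refit lava on each fold's training subsample at the CV-chosen
+  ###### penalties, predict on the full training and test samples, and average the
+  ###### num_folds prediction vectors before computing R^2.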
+ n_tr <- length(ytr) + n_te <- length(yte) + yhat_tr_lava_mod <- matrix(0, n_tr, Kf) + yhat_te_lava_mod <- matrix(0, n_te, Kf) + + + for (k in 1:Kf) { + indk <- cvgroup == k + + k_xtr_mod <- xtr_mod[!indk, ] + k_ytr <- ytr[!indk] + + # Lava + yhat_tr_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr, + newX = xtr_mod, + lambda1 = cvlambda1_lava_mod, + lambda2 = cvlambda2_lava_mod + )) + yhat_te_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr, + newX = xte_mod, + lambda1 = cvlambda1_lava_mod, + lambda2 = cvlambda2_lava_mod + )) + } + + avg_yhat_lava_tr <- rowMeans(yhat_tr_lava_mod) + avg_yhat_lava_test <- rowMeans(yhat_te_lava_mod) + + r2_cv_ave_lava_mod <- 1 - sum((ytr - avg_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2) + r2v_cv_ave_lava_mod <- 1 - sum((yte - avg_yhat_lava_test)^2) / sum((yte - mean(ytr))^2) + + return(c( + cvlambda1_lava_mod, + cvlambda2_lava_mod, + cvmin_yhat_lava_tr, # CV_min + cvmin_yhat_lava_test, # CV_min + r2_lava_mod, # CV_min + r2v_lava_mod, # CV_min + avg_yhat_lava_tr, # Average across Folds + avg_yhat_lava_test, # Average across Folds + r2_cv_ave_lava_mod, # Average across Folds + r2v_cv_ave_lava_mod # Average across Folds + )) +} +``` + +```{r} +fit_lava_flex <- lava_yhat_r2(model_x_flex_train, model_x_flex_test, y_train, y_test) +cat("Flexible model R^2 (LAVA): ", fit_lava_flex[[6]]) # using CV_min +``` + + + +We find that for this dataset the low dimensional OLS is sufficient. The high-dimensional approaches did not manage to substantively increase predictive power. + +### Extra high-dimensional specification (extra flexible) + +We repeat the same procedure for the extra flexible model. + + + + + + +```{r} +x_extra <- " sex + (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)^2" +formula_extra <- as.formula(paste("lwage", "~", x_extra)) +model_x_extra_train <- model.matrix(formula_extra, data_train) +model_x_extra_test <- model.matrix(formula_extra, data_test) +p_extra <- dim(model_x_extra_train)[2] +p_extra +``` + +```{r} +# ols (extra flexible model) +fit_lm_extra <- lm(formula_extra, data_train) +options(warn = -1) +yhat_lm_extra <- predict(fit_lm_extra, newdata = data_test) +mse_lm_extra <- summary(lm((y_test - yhat_lm_extra)^2 ~ 1))$coef[1:2] +r2_lm_extra <- 1 - mse_lm_extra[1] / var(y_test) +cat("Extra flexible model R^2 (OLS): ", r2_lm_extra) +``` + +#### Penalized regressions (extra flexible model) + +Now let's repeat our penalized regression analysis for the extra flexible model. Note this block takes a while ~ 1 hour 15 minutes. To reduce time substantially, reduce the number of folds in LAVA. 
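+
+For example, since `lava_yhat_r2` exposes `num_folds` as an argument, a 2-fold run is a quicker (if noisier) way to pick the two penalty levels. This is only a sketch of the speed-up; the chunk below keeps the default of 5 folds.
+
+```{r}
+# Optional speed-up (sketch): fewer CV folds for the LAVA grid search.
+# Uncomment to use in place of the 5-fold call in the next chunk.
+# fit_lava_extra <- lava_yhat_r2(model_x_extra_train, model_x_extra_test,
+#                                y_train, y_test, num_folds = 2)
+```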
+ +```{r} +# penalized regressions +fit_lasso_cv_extra <- cv.glmnet(model_x_extra_train, y_train, family = "gaussian", alpha = 1) +fit_ridge_extra <- cv.glmnet(model_x_extra_train, y_train, family = "gaussian", alpha = 0) +fit_elnet_extra <- cv.glmnet(model_x_extra_train, y_train, family = "gaussian", alpha = .5) +fit_rlasso_extra <- rlasso(formula_extra, data_train, post = FALSE) +fit_rlasso_post_extra <- rlasso(formula_extra, data_train, post = TRUE) +fit_lava_extra <- lava_yhat_r2(model_x_extra_train, model_x_extra_test, y_train, y_test) + +yhat_lasso_cv_extra <- predict(fit_lasso_cv_extra, newx = model_x_extra_test) +yhat_ridge_extra <- predict(fit_ridge_extra, newx = model_x_extra_test) +yhat_elnet_extra <- predict(fit_elnet_extra, newx = model_x_extra_test) +yhat_rlasso_extra <- predict(fit_rlasso_extra, newdata = data_test) +yhat_rlasso_post_extra <- predict(fit_rlasso_post_extra, newdata = data_test) +yhat_lava_extra <- fit_lava_extra[[4]] + +mse_lasso_cv_extra <- summary(lm((y_test - yhat_lasso_cv_extra)^2 ~ 1))$coef[1:2] +mse_ridge_extra <- summary(lm((y_test - yhat_ridge_extra)^2 ~ 1))$coef[1:2] +mse_elnet_extra <- summary(lm((y_test - yhat_elnet_extra)^2 ~ 1))$coef[1:2] +mse_lasso_extra <- summary(lm((y_test - yhat_rlasso_extra)^2 ~ 1))$coef[1:2] +mse_lasso_post_extra <- summary(lm((y_test - yhat_rlasso_post_extra)^2 ~ 1))$coef[1:2] +mse_lava_extra <- summary(lm(as.vector(y_test - yhat_lava_extra)^2 ~ 1))$coef[1:2] + +r2_lasso_cv_extra <- 1 - mse_lasso_cv_extra[1] / var(y_test) +r2_ridge_extra <- 1 - mse_ridge_extra[1] / var(y_test) +r2_elnet_extra <- 1 - mse_elnet_extra[1] / var(y_test) +r2_lasso_extra <- 1 - mse_lasso_extra[1] / var(y_test) +r2_lasso_post_extra <- 1 - mse_lasso_post_extra[1] / var(y_test) +r2_lava_extra <- 1 - mse_lava_extra[1] / var(y_test) + +# R^2 (extra flexible) +cat("\nExtra flexible model R^2 (Lasso): ", r2_lasso_cv_extra) +cat("\nExtra flexible model R^2 (Ridge): ", r2_ridge_extra) +cat("\nExtra flexible model R^2 (Elastic Net): ", r2_elnet_extra) +cat("\nExtra flexible model R^2 (RLasso): ", r2_lasso_extra) +cat("\nExtra flexible model R^2 (RLasso post): ", r2_lasso_post_extra) +cat("\nExtra flexible model R^2 (LAVA): ", r2_lava_extra) # using CV_min +``` + + + diff --git a/PM2/r_ml_for_wage_prediction.irnb b/PM2/r_ml_for_wage_prediction.irnb index 768a40ae..75c80aa0 100644 --- a/PM2/r_ml_for_wage_prediction.irnb +++ b/PM2/r_ml_for_wage_prediction.irnb @@ -1,1123 +1,1123 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "_execution_state": "idle", - "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", - "id": "dtt9U13qNGOn", - "papermill": { - "duration": 0.036479, - "end_time": "2021-02-13T18:19:43.396666", - "exception": false, - "start_time": "2021-02-13T18:19:43.360187", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "# A Simple Case Study using Wage Data from 2015" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LCKYnHWrNGOn", - "papermill": { - "duration": 0.036639, - "end_time": "2021-02-13T18:19:43.468425", - "exception": false, - "start_time": "2021-02-13T18:19:43.431786", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "We illustrate how to predict an outcome variable $Y$ in a high-dimensional setting, where the number of covariates $p$ is large in relation to the sample size $n$. We use linear prediction rules for estimation, including OLS and the penalized linear methods we've studied. 
Later, we will also consider nonlinear prediction rules including tree-based methods and neural nets." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VPwV7nNDS_nz", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "install.packages(\"xtable\")\n", - "install.packages(\"hdm\")\n", - "install.packages(\"glmnet\")\n", - "install.packages(\"MLmetrics\")\n", - "\n", - "library(hdm)\n", - "library(xtable)\n", - "library(glmnet)\n", - "library(MLmetrics)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gRYGynhrNGOo", - "papermill": { - "duration": 0.034705, - "end_time": "2021-02-13T18:19:43.537814", - "exception": false, - "start_time": "2021-02-13T18:19:43.503109", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "S_YMURKqNGOo", - "papermill": { - "duration": 0.036082, - "end_time": "2021-02-13T18:19:43.609347", - "exception": false, - "start_time": "2021-02-13T18:19:43.573265", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "Again, we consider data from the U.S. March Supplement of the Current Population Survey (CPS) in 2015.\n", - "The preproccessed sample consists of $5150$ never-married individuals." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "B09_5wrUNGOo", - "papermill": { - "duration": 0.279387, - "end_time": "2021-02-13T18:19:43.923823", - "exception": false, - "start_time": "2021-02-13T18:19:43.644436", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv\"\n", - "data <- read.csv(file)\n", - "dim(data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Ol9dToMQNGOq", - "papermill": { - "duration": 0.034902, - "end_time": "2021-02-13T18:19:43.994834", - "exception": false, - "start_time": "2021-02-13T18:19:43.959932", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "The outcomes $Y_i$'s are hourly (log) wages of never-married workers living in the U.S. The raw regressors $Z_i$'s consist of a variety of characteristics, including experience, education and industry and occupation indicators." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "bsqnI6d0NGOq", - "papermill": { - "duration": 0.091723, - "end_time": "2021-02-13T18:19:44.123394", - "exception": false, - "start_time": "2021-02-13T18:19:44.031671", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "Z <- subset(data, select = -c(lwage, wage)) # regressors\n", - "colnames(Z)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VeU2XMYENGOr", - "papermill": { - "duration": 0.037074, - "end_time": "2021-02-13T18:19:44.196749", - "exception": false, - "start_time": "2021-02-13T18:19:44.159675", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "The following figure shows the weekly wage distribution from the US survey data." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "d3sbTfpRNGOr", - "papermill": { - "duration": 0.443391, - "end_time": "2021-02-13T18:19:44.677379", - "exception": false, - "start_time": "2021-02-13T18:19:44.233988", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "hist(data$wage, xlab = \"hourly wage\", main = \"Empirical wage distribution from the US survey data\", breaks = 35)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BmGfrWFNNGOs", - "papermill": { - "duration": 0.036602, - "end_time": "2021-02-13T18:19:44.752465", - "exception": false, - "start_time": "2021-02-13T18:19:44.715863", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "Wages show a high degree of skewness. Hence, wages are transformed in almost all studies by\n", - "the logarithm." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "msBSjtuKNGOs", - "papermill": { - "duration": 0.036009, - "end_time": "2021-02-13T18:19:44.826260", - "exception": false, - "start_time": "2021-02-13T18:19:44.790251", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Analysis" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "B-XZMCogNGOs", - "papermill": { - "duration": 0.036925, - "end_time": "2021-02-13T18:19:44.899159", - "exception": false, - "start_time": "2021-02-13T18:19:44.862234", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "Due to the skewness of the data, we are considering log wages which leads to the following regression model\n", - "\n", - "$$log(wage) = g(Z) + \\epsilon.$$" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pNLS-C_7NGOt", - "papermill": { - "duration": 0.036183, - "end_time": "2021-02-13T18:19:44.971528", - "exception": false, - "start_time": "2021-02-13T18:19:44.935345", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "In this notebook, we will evaluate *linear* prediction rules. In later notebooks, we will also utilize nonlinear prediction methods. In linear models, we estimate the prediction rule of the form\n", - "\n", - "$$\\hat g(Z) = \\hat \\beta'X.$$\n", - "\n", - "Again, we generate $X$ in three ways:\n", - "\n", - "1. Basic Model: $X$ consists of a set of raw regressors (e.g. gender, experience, education indicators, regional indicators).\n", - "\n", - "\n", - "2. Flexible Model: $X$ consists of all raw regressors from the basic model plus occupation and industry indicators, transformations (e.g., $\\operatorname{exp}^2$ and $\\operatorname{exp}^3$) and additional two-way interactions.\n", - "\n", - "3. Extra Flexible Model: $X$ takes the flexible model and takes all pairwise interactions." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-kGLyGXvNGOt", - "papermill": { - "duration": 0.037318, - "end_time": "2021-02-13T18:19:45.044959", - "exception": false, - "start_time": "2021-02-13T18:19:45.007641", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "To evaluate the out-of-sample performance, we split the data first." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oAqJIgmlNGOt", - "papermill": { - "duration": 0.062188, - "end_time": "2021-02-13T18:19:45.143118", - "exception": false, - "start_time": "2021-02-13T18:19:45.080930", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "set.seed(1234)\n", - "training <- sample(nrow(data), nrow(data) * (3 / 4), replace = FALSE)\n", - "\n", - "data_train <- data[training, ]\n", - "data_test <- data[-training, ]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "C6qC_wyjNGOu", - "papermill": { - "duration": 0.060969, - "end_time": "2021-02-13T18:19:45.445389", - "exception": false, - "start_time": "2021-02-13T18:19:45.384420", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "y_train <- data_train$lwage\n", - "y_test <- data_test$lwage" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5-CCjJnbNGOt", - "papermill": { - "duration": 0.038774, - "end_time": "2021-02-13T18:19:45.217757", - "exception": false, - "start_time": "2021-02-13T18:19:45.178983", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "We are starting by running a simple OLS regression. We fit the basic and flexible model to our training data by running an ols regression and compute the R-squared on the test sample" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qbummAR-NGOu", - "papermill": { - "duration": 0.037704, - "end_time": "2021-02-13T18:19:45.622370", - "exception": false, - "start_time": "2021-02-13T18:19:45.584666", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "As known from our first lab, the basic model consists of $51$ regressors and the flexible model of $246$ regressors. Let us fit our models to the training sample using the two different model specifications. We are starting by running a simple ols regression and computing the mean squared error and $R^2$ on the test sample." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4LNs__OcfmFV" - }, - "source": [ - "### Low dimensional specification (basic)" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "_execution_state": "idle", + "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", + "id": "dtt9U13qNGOn", + "papermill": { + "duration": 0.036479, + "end_time": "2021-02-13T18:19:43.396666", + "exception": false, + "start_time": "2021-02-13T18:19:43.360187", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WD7tshOlNGOt", - "papermill": { - "duration": 0.094135, - "end_time": "2021-02-13T18:19:45.347955", - "exception": false, - "start_time": "2021-02-13T18:19:45.253820", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "x_basic <- \"sex + exp1 + shs + hsg+ scl + clg + mw + so + we + C(occ2)+ C(ind2)\"\n", - "formula_basic <- as.formula(paste(\"lwage\", \"~\", x_basic))\n", - "model_x_basic_train <- model.matrix(formula_basic, data_train)\n", - "model_x_basic_test <- model.matrix(formula_basic, data_test)\n", - "p_basic <- dim(model_x_basic_train)[2]\n", - "p_basic" - ] + "tags": [] + }, + "source": [ + "# A Simple Case Study using Wage Data from 2015" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LCKYnHWrNGOn", + "papermill": { + "duration": 0.036639, + "end_time": "2021-02-13T18:19:43.468425", + "exception": false, + "start_time": "2021-02-13T18:19:43.431786", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kx1xoWHFNGOv", - "papermill": { - "duration": 0.069537, - "end_time": "2021-02-13T18:19:45.887169", - "exception": false, - "start_time": "2021-02-13T18:19:45.817632", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# ols (basic model)\n", - "fit_lm_basic <- lm(formula_basic, data_train)\n", - "# Compute the Out-Of-Sample Performance\n", - "yhat_lm_basic <- predict(fit_lm_basic, newdata = data_test)\n", - "cat(\"Basic model MSE (OLS): \", mean((y_test - yhat_lm_basic)^2)) # MSE OLS (basic model)" - ] + "tags": [] + }, + "source": [ + "We illustrate how to predict an outcome variable $Y$ in a high-dimensional setting, where the number of covariates $p$ is large in relation to the sample size $n$. We use linear prediction rules for estimation, including OLS and the penalized linear methods we've studied. Later, we will also consider nonlinear prediction rules including tree-based methods and neural nets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VPwV7nNDS_nz", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "install.packages(\"xtable\")\n", + "install.packages(\"hdm\")\n", + "install.packages(\"glmnet\")\n", + "install.packages(\"MLmetrics\")\n", + "\n", + "library(hdm)\n", + "library(xtable)\n", + "library(glmnet)\n", + "library(MLmetrics)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gRYGynhrNGOo", + "papermill": { + "duration": 0.034705, + "end_time": "2021-02-13T18:19:43.537814", + "exception": false, + "start_time": "2021-02-13T18:19:43.503109", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "id": "hDlMXF0ANGOw", - "papermill": { - "duration": 0.052764, - "end_time": "2021-02-13T18:19:46.122829", - "exception": false, - "start_time": "2021-02-13T18:19:46.070065", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "To determine the out-of-sample $MSE$ and the standard error in one step, we can use the function *lm*:" - ] + "tags": [] + }, + "source": [ + "## Data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "S_YMURKqNGOo", + "papermill": { + "duration": 0.036082, + "end_time": "2021-02-13T18:19:43.609347", + "exception": false, + "start_time": "2021-02-13T18:19:43.573265", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ERCs22oDNGOw", - "papermill": { - "duration": 0.076484, - "end_time": "2021-02-13T18:19:46.239015", - "exception": false, - "start_time": "2021-02-13T18:19:46.162531", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "mse_lm_basic <- summary(lm((y_test - yhat_lm_basic)^2 ~ 1))$coef[1:2]\n", - "mse_lm_basic" - ] + "tags": [] + }, + "source": [ + "Again, we consider data from the U.S. March Supplement of the Current Population Survey (CPS) in 2015.\n", + "The preproccessed sample consists of $5150$ never-married individuals." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "B09_5wrUNGOo", + "papermill": { + "duration": 0.279387, + "end_time": "2021-02-13T18:19:43.923823", + "exception": false, + "start_time": "2021-02-13T18:19:43.644436", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "id": "3PQ-f_waNGOw", - "papermill": { - "duration": 0.039088, - "end_time": "2021-02-13T18:19:46.317915", - "exception": false, - "start_time": "2021-02-13T18:19:46.278827", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "We also compute the out-of-sample $R^2$:" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv\"\n", + "data <- read.csv(file)\n", + "dim(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ol9dToMQNGOq", + "papermill": { + "duration": 0.034902, + "end_time": "2021-02-13T18:19:43.994834", + "exception": false, + "start_time": "2021-02-13T18:19:43.959932", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "YLgvLE2BNGOw", - "papermill": { - "duration": 0.057098, - "end_time": "2021-02-13T18:19:46.413754", - "exception": false, - "start_time": "2021-02-13T18:19:46.356656", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "r2_lm_basic <- 1 - mse_lm_basic[1] / var(y_test)\n", - "cat(\"Basic model R^2 (OLS): \", r2_lm_basic) # MSE OLS (basic model)" - ] + "tags": [] + }, + "source": [ + "The outcomes $Y_i$'s are hourly (log) wages of never-married workers living in the U.S. The raw regressors $Z_i$'s consist of a variety of characteristics, including experience, education and industry and occupation indicators." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bsqnI6d0NGOq", + "papermill": { + "duration": 0.091723, + "end_time": "2021-02-13T18:19:44.123394", + "exception": false, + "start_time": "2021-02-13T18:19:44.031671", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "id": "tTZyELgyf51J" - }, - "source": [ - "### High-dimensional specification (flexible)" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "Z <- subset(data, select = -c(lwage, wage)) # regressors\n", + "colnames(Z)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VeU2XMYENGOr", + "papermill": { + "duration": 0.037074, + "end_time": "2021-02-13T18:19:44.196749", + "exception": false, + "start_time": "2021-02-13T18:19:44.159675", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "J8Rffx0ef3nM", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "x_flex <- \"sex + exp1 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we \" +\n", - " \"+ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\"\n", - "formula_flex <- as.formula(paste(\"lwage\", \"~\", x_flex))\n", - "model_x_flex_train <- model.matrix(formula_flex, data_train)\n", - "model_x_flex_test <- model.matrix(formula_flex, data_test)\n", - "p_flex <- dim(model_x_flex_train)[2]\n", - "p_flex" - ] + "tags": [] + }, + "source": [ + "The following figure shows the weekly wage distribution from the US survey data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "d3sbTfpRNGOr", + "papermill": { + "duration": 0.443391, + "end_time": "2021-02-13T18:19:44.677379", + "exception": false, + "start_time": "2021-02-13T18:19:44.233988", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "id": "77G7YfbnNGOw", - "papermill": { - "duration": 0.039585, - "end_time": "2021-02-13T18:19:46.492903", - "exception": false, - "start_time": "2021-02-13T18:19:46.453318", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "We repeat the same procedure for the flexible model." - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "hist(data$wage, xlab = \"hourly wage\", main = \"Empirical wage distribution from the US survey data\", breaks = 35)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BmGfrWFNNGOs", + "papermill": { + "duration": 0.036602, + "end_time": "2021-02-13T18:19:44.752465", + "exception": false, + "start_time": "2021-02-13T18:19:44.715863", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KpRtjaAlNGOw", - "papermill": { - "duration": 0.198636, - "end_time": "2021-02-13T18:19:46.730717", - "exception": false, - "start_time": "2021-02-13T18:19:46.532081", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# ols (flexible model)\n", - "fit_lm_flex <- lm(formula_flex, data_train)\n", - "# Compute the Out-Of-Sample Performance\n", - "options(warn = -1)\n", - "yhat_lm_flex <- predict(fit_lm_flex, newdata = data_test)\n", - "mse_lm_flex <- summary(lm((y_test - yhat_lm_flex)^2 ~ 1))$coef[1:2]\n", - "r2_lm_flex <- 1 - mse_lm_flex[1] / var(y_test)\n", - "cat(\"Flexible model R^2 (OLS): \", r2_lm_flex) # MSE OLS (flexible model)" - ] + "tags": [] + }, + "source": [ + "Wages show a high degree of skewness. Hence, wages are transformed in almost all studies by\n", + "the logarithm." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "msBSjtuKNGOs", + "papermill": { + "duration": 0.036009, + "end_time": "2021-02-13T18:19:44.826260", + "exception": false, + "start_time": "2021-02-13T18:19:44.790251", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "id": "sShiB-i9NGOx", - "papermill": { - "duration": 0.042521, - "end_time": "2021-02-13T18:19:46.935859", - "exception": false, - "start_time": "2021-02-13T18:19:46.893338", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "### Penalized regressions (flexible model)\n", - "\n", - "\n", - "\n" - ] + "tags": [] + }, + "source": [ + "## Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "B-XZMCogNGOs", + "papermill": { + "duration": 0.036925, + "end_time": "2021-02-13T18:19:44.899159", + "exception": false, + "start_time": "2021-02-13T18:19:44.862234", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "id": "5wgFVRkkNGOx", - "papermill": { - "duration": 0.051953, - "end_time": "2021-02-13T18:19:46.853182", - "exception": false, - "start_time": "2021-02-13T18:19:46.801229", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "We observe that ols regression works better for the basic model with smaller $p/n$ ratio. We proceed by running penalized regressions first for the flexible model, tuned via cross-validation." 
- ] + "tags": [] + }, + "source": [ + "Due to the skewness of the data, we are considering log wages which leads to the following regression model\n", + "\n", + "$$log(wage) = g(Z) + \\epsilon.$$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pNLS-C_7NGOt", + "papermill": { + "duration": 0.036183, + "end_time": "2021-02-13T18:19:44.971528", + "exception": false, + "start_time": "2021-02-13T18:19:44.935345", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "n3jvO5HQmzbf", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "fit_lasso_cv_flex <- cv.glmnet(model_x_flex_train, y_train, family = \"gaussian\", alpha = 1)\n", - "fit_ridge_flex <- cv.glmnet(model_x_flex_train, y_train, family = \"gaussian\", alpha = 0)\n", - "fit_elnet_flex <- cv.glmnet(model_x_flex_train, y_train, family = \"gaussian\", alpha = .5)\n", - "\n", - "yhat_lasso_cv_flex <- predict(fit_lasso_cv_flex, newx = model_x_flex_test)\n", - "yhat_ridge_flex <- predict(fit_ridge_flex, newx = model_x_flex_test)\n", - "yhat_elnet_flex <- predict(fit_elnet_flex, newx = model_x_flex_test)\n", - "\n", - "mse_lasso_cv_flex <- summary(lm((y_test - yhat_lasso_cv_flex)^2 ~ 1))$coef[1:2]\n", - "mse_ridge_flex <- summary(lm((y_test - yhat_ridge_flex)^2 ~ 1))$coef[1:2]\n", - "mse_elnet_flex <- summary(lm((y_test - yhat_elnet_flex)^2 ~ 1))$coef[1:2]\n", - "\n", - "r2_lasso_cv_flex <- 1 - mse_lasso_cv_flex[1] / var(y_test)\n", - "r2_ridge_flex <- 1 - mse_ridge_flex[1] / var(y_test)\n", - "r2_elnet_flex <- 1 - mse_elnet_flex[1] / var(y_test)\n", - "\n", - "# R^2 using cross-validation (flexible model)\n", - "cat(\"Flexible model R^2 (Lasso): \", r2_lasso_cv_flex)\n", - "cat(\"\\nFlexible model R^2 (Ridge): \", r2_ridge_flex)\n", - "cat(\"\\nFlexible model R^2 (Elastic Net): \", r2_elnet_flex)" - ] + "tags": [] + }, + "source": [ + "In this notebook, we will evaluate *linear* prediction rules. In later notebooks, we will also utilize nonlinear prediction methods. In linear models, we estimate the prediction rule of the form\n", + "\n", + "$$\\hat g(Z) = \\hat \\beta'X.$$\n", + "\n", + "Again, we generate $X$ in three ways:\n", + "\n", + "1. Basic Model: $X$ consists of a set of raw regressors (e.g. gender, experience, education indicators, regional indicators).\n", + "\n", + "\n", + "2. Flexible Model: $X$ consists of all raw regressors from the basic model plus occupation and industry indicators, transformations (e.g., $\\operatorname{exp}^2$ and $\\operatorname{exp}^3$) and additional two-way interactions.\n", + "\n", + "3. Extra Flexible Model: $X$ takes the flexible model and takes all pairwise interactions." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-kGLyGXvNGOt", + "papermill": { + "duration": 0.037318, + "end_time": "2021-02-13T18:19:45.044959", + "exception": false, + "start_time": "2021-02-13T18:19:45.007641", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZytMS-aCNGOx", - "papermill": { - "duration": 0.040161, - "end_time": "2021-02-13T18:19:47.015626", - "exception": false, - "start_time": "2021-02-13T18:19:46.975465", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "We can also try a variant of the `l1` penalty, where the weight is chosen based on theoretical derivations. We use package *hdm* and the function *rlasso*, relying on a theoretical based choice of the penalty level $\\lambda$ in the lasso regression." 
- ] + "tags": [] + }, + "source": [ + "To evaluate the out-of-sample performance, we split the data first." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oAqJIgmlNGOt", + "papermill": { + "duration": 0.062188, + "end_time": "2021-02-13T18:19:45.143118", + "exception": false, + "start_time": "2021-02-13T18:19:45.080930", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "id": "n-D_fRJBnkEH" - }, - "source": [ - "Specifically, we use \"plug-in\" tuning with a theoretically valid choice of penalty $\\lambda = 2 \\cdot c \\hat{\\sigma} \\sqrt{n} \\Phi^{-1}(1-\\alpha/2p)$, where $c>1$ and $1-\\alpha$ is a confidence level, $\\Phi^{-1}$ denotes the quantile function, and $\\hat{\\sigma}$ is estimated in an iterative manner (see corresponding notes in book). Under homoskedasticity, this choice ensures that the Lasso predictor is well behaved, delivering good predictive performance under approximate sparsity. In practice, this formula will work well even in the absence of homoskedasticity, especially when the random variables $\\epsilon$ and $X$ in the regression equation decay quickly at the tails.\n", - "\n", - "In practice, many people choose to use cross-validation, which is perfectly fine for predictive tasks. However, when conducting inference, to make our analysis valid we will require cross-fitting in addition to cross-validation. As we have not yet discussed cross-fitting, we rely on this theoretically-driven penalty in order to allow for accurate inference in the upcoming notebooks." - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "set.seed(1234)\n", + "training <- sample(nrow(data), nrow(data) * (3 / 4), replace = FALSE)\n", + "\n", + "data_train <- data[training, ]\n", + "data_test <- data[-training, ]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "C6qC_wyjNGOu", + "papermill": { + "duration": 0.060969, + "end_time": "2021-02-13T18:19:45.445389", + "exception": false, + "start_time": "2021-02-13T18:19:45.384420", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "id": "yBiZ3q3INGOy", - "papermill": { - "duration": 0.049543, - "end_time": "2021-02-13T18:19:47.757271", - "exception": false, - "start_time": "2021-02-13T18:19:47.707728", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "Now, we repeat the same procedure for the flexible model." 
- ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "y_train <- data_train$lwage\n", + "y_test <- data_test$lwage" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5-CCjJnbNGOt", + "papermill": { + "duration": 0.038774, + "end_time": "2021-02-13T18:19:45.217757", + "exception": false, + "start_time": "2021-02-13T18:19:45.178983", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PlTdJh5PNGOy", - "papermill": { - "duration": 3.430649, - "end_time": "2021-02-13T18:19:51.229007", - "exception": false, - "start_time": "2021-02-13T18:19:47.798358", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "fit_rlasso_flex <- rlasso(formula_flex, data_train, post = FALSE)\n", - "fit_rlasso_post_flex <- rlasso(formula_flex, data_train, post = TRUE)\n", - "yhat_rlasso_flex <- predict(fit_rlasso_flex, newdata = data_test)\n", - "yhat_rlasso_post_flex <- predict(fit_rlasso_post_flex, newdata = data_test)\n", - "\n", - "mse_lasso_flex <- summary(lm((y_test - yhat_rlasso_flex)^2 ~ 1))$coef[1:2]\n", - "mse_lasso_post_flex <- summary(lm((y_test - yhat_rlasso_post_flex)^2 ~ 1))$coef[1:2]\n", - "\n", - "r2_lasso_flex <- 1 - mse_lasso_flex[1] / var(y_test)\n", - "r2_lasso_post_flex <- 1 - mse_lasso_post_flex[1] / var(y_test)\n", - "# R^2 theoretically chosen penalty (flexible model)\n", - "cat(\"Flexible model R^2 (RLasso): \", r2_lasso_flex)\n", - "cat(\"\\nFlexible model R^2 (RLasso post): \", r2_lasso_post_flex)" - ] + "tags": [] + }, + "source": [ + "We are starting by running a simple OLS regression. We fit the basic and flexible model to our training data by running an ols regression and compute the R-squared on the test sample" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qbummAR-NGOu", + "papermill": { + "duration": 0.037704, + "end_time": "2021-02-13T18:19:45.622370", + "exception": false, + "start_time": "2021-02-13T18:19:45.584666", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "id": "aude922IfxBG" - }, - "source": [ - "Finally, we try the combination of a sparse and a dense coefficient using the LAVA method" - ] + "tags": [] + }, + "source": [ + "As known from our first lab, the basic model consists of $51$ regressors and the flexible model of $246$ regressors. Let us fit our models to the training sample using the two different model specifications. We are starting by running a simple ols regression and computing the mean squared error and $R^2$ on the test sample." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4LNs__OcfmFV" + }, + "source": [ + "### Low dimensional specification (basic)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WD7tshOlNGOt", + "papermill": { + "duration": 0.094135, + "end_time": "2021-02-13T18:19:45.347955", + "exception": false, + "start_time": "2021-02-13T18:19:45.253820", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WgBPFQ72ftBz", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Define function to compute lava estimator. Doing an iterative scheme with fixed\n", - "# number of iteration. 
Could iterate until a convergence criterion is met.\n", - "lava_predict <- function(X, Y, newX, lambda1, lambda2, iter = 5) {\n", - " require(glmnet)\n", - "\n", - " # Need to demean internally\n", - " dy <- Y - mean(Y)\n", - " dx <- scale(X, scale = FALSE)\n", - "\n", - " sp1 <- glmnet(dx, dy, lambda = lambda1) # lasso step fits \"sparse part\"\n", - " de1 <- glmnet(dx, dy - predict(sp1, newx = dx), alpha = 0, lambda = lambda2)\n", - "\n", - " i <- 1\n", - " while (i <= iter) {\n", - " sp1 <- glmnet(dx, dy - predict(de1, newx = dx, s = \"lambda.min\"), lambda = lambda1)\n", - " de1 <- glmnet(dx, dy - predict(sp1, newx = dx, s = \"lambda.min\"), alpha = 0, lambda = lambda2)\n", - " i <- i + 1\n", - " }\n", - "\n", - " bhat <- sp1$beta + de1$beta\n", - " a0 <- mean(Y) - sum(colMeans(X) * bhat)\n", - "\n", - " # Need to add intercept to output\n", - "\n", - " yhat <- newX %*% bhat + a0\n", - "\n", - " return(yhat)\n", - "}" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "x_basic <- \"sex + exp1 + shs + hsg+ scl + clg + mw + so + we + C(occ2)+ C(ind2)\"\n", + "formula_basic <- as.formula(paste(\"lwage\", \"~\", x_basic))\n", + "model_x_basic_train <- model.matrix(formula_basic, data_train)\n", + "model_x_basic_test <- model.matrix(formula_basic, data_test)\n", + "p_basic <- dim(model_x_basic_train)[2]\n", + "p_basic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kx1xoWHFNGOv", + "papermill": { + "duration": 0.069537, + "end_time": "2021-02-13T18:19:45.887169", + "exception": false, + "start_time": "2021-02-13T18:19:45.817632", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2HFE2EbdkMjj", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# define function to get predictions and r2 scores for lava estimator\n", - "lava_yhat_r2 <- function(xtr_mod, xte_mod, ytr, yte, num_folds = 5) {\n", - " # 5-fold CV. glmnet does cross-validation internally and\n", - " # relatively efficiently. We're going to write out all the steps to make sure\n", - " # we're using the same CV folds across all procedures in a transparent way and\n", - " # to keep the overall structure clear as well.\n", - "\n", - " # Setup for brute force K-Fold CV\n", - " n <- length(ytr)\n", - " Kf <- num_folds # Number of folds\n", - " sampleframe <- rep(1:Kf, ceiling(n / Kf))\n", - " cvgroup <- sample(sampleframe, size = n, replace = FALSE) # CV groups\n", - "\n", - "\n", - " ## ------------------------------------------------------------\n", - " # We're going to take a shortcut and use the range of lambda values that come out\n", - " # of the default implementation in glmnet for everything. 
Could do better here - maybe\n", - "\n", - " ## Fit ridge on grid of lambda values (chosen by default using glmnet) using basic model.\n", - " ridge_mod <- glmnet(xtr_mod, ytr, alpha = 0) # alpha = 0 gives ridge\n", - " ridge_lambda <- ridge_mod$lambda # values of penalty parameter\n", - "\n", - " ## Fit lasso on grid of lambda values (chosen by default using glmnet) using basic model.\n", - " lasso_mod <- glmnet(xtr_mod, ytr) # default is lasso (equivalent to alpha = 1)\n", - " lasso_lambda <- lasso_mod$lambda # values of penalty parameter\n", - "\n", - " ## ------------------------------------------------------------\n", - "\n", - "\n", - " # Lava - Using a double loop over candidate penalty parameter values.\n", - "\n", - " lambda1_lava_mod <- lasso_mod$lambda[seq(5, length(lasso_lambda), 10)]\n", - " lambda2_lava_mod <- ridge_mod$lambda[seq(5, length(ridge_lambda), 10)]\n", - "\n", - " cv_mod_lava <- matrix(0, length(lambda1_lava_mod), length(lambda2_lava_mod))\n", - "\n", - " for (k in 1:Kf) {\n", - " indk <- cvgroup == k\n", - "\n", - " k_xtr_mod <- xtr_mod[!indk, ]\n", - " k_ytr <- ytr[!indk]\n", - " k_xte_mod <- xtr_mod[indk, ]\n", - " k_yte <- ytr[indk]\n", - "\n", - " for (ii in seq_along(lambda1_lava_mod)) {\n", - " for (jj in seq_along(lambda2_lava_mod)) {\n", - " cv_mod_lava[ii, jj] <- cv_mod_lava[ii, jj] +\n", - " sum((k_yte - lava_predict(k_xtr_mod, k_ytr,\n", - " newX = k_xte_mod,\n", - " lambda1 = lambda1_lava_mod[ii],\n", - " lambda2 = lambda2_lava_mod[jj]))^2)\n", - " }\n", - " }\n", - " }\n", - "\n", - " # Get CV min values of tuning parameters\n", - " cvmin_lava_mod <- which(cv_mod_lava == min(cv_mod_lava), arr.ind = TRUE)\n", - " cvlambda1_lava_mod <- lambda1_lava_mod[cvmin_lava_mod[1]]\n", - " cvlambda2_lava_mod <- lambda2_lava_mod[cvmin_lava_mod[2]]\n", - "\n", - " #### Look at performance on test sample\n", - "\n", - " # Calculate R^2 in training data and in validation data as measures\n", - " # Refit on entire training sample\n", - "\n", - " #### CV-min model\n", - "\n", - " # In sample fit\n", - " cvmin_yhat_lava_tr <- lava_predict(xtr_mod, ytr,\n", - " newX = xtr_mod,\n", - " lambda1 = cvlambda1_lava_mod,\n", - " lambda2 = cvlambda2_lava_mod\n", - " )\n", - " r2_lava_mod <- 1 - sum((ytr - cvmin_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2)\n", - "\n", - " # Out of sample fit\n", - " cvmin_yhat_lava_test <- lava_predict(xtr_mod, ytr,\n", - " newX = xte_mod,\n", - " lambda1 = cvlambda1_lava_mod,\n", - " lambda2 = cvlambda2_lava_mod\n", - " )\n", - " r2v_lava_mod <- 1 - sum((yte - cvmin_yhat_lava_test)^2) / sum((yte - mean(ytr))^2)\n", - "\n", - " #### Use average model across cv-folds and refit model using all training data\n", - " ###### we won't report these results.\n", - " ###### Averaging is theoretically more solid, but cv-min is more practical.\n", - " n_tr <- length(ytr)\n", - " n_te <- length(yte)\n", - " yhat_tr_lava_mod <- matrix(0, n_tr, Kf)\n", - " yhat_te_lava_mod <- matrix(0, n_te, Kf)\n", - "\n", - "\n", - " for (k in 1:Kf) {\n", - " indk <- cvgroup == k\n", - "\n", - " k_xtr_mod <- xtr_mod[!indk, ]\n", - " k_ytr <- ytr[!indk]\n", - "\n", - " # Lava\n", - " yhat_tr_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr,\n", - " newX = xtr_mod,\n", - " lambda1 = cvlambda1_lava_mod,\n", - " lambda2 = cvlambda2_lava_mod\n", - " ))\n", - " yhat_te_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr,\n", - " newX = xte_mod,\n", - " lambda1 = cvlambda1_lava_mod,\n", - " lambda2 = cvlambda2_lava_mod\n", - " ))\n", - " }\n", - "\n", - " 
avg_yhat_lava_tr <- rowMeans(yhat_tr_lava_mod)\n", - " avg_yhat_lava_test <- rowMeans(yhat_te_lava_mod)\n", - "\n", - " r2_cv_ave_lava_mod <- 1 - sum((ytr - avg_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2)\n", - " r2v_cv_ave_lava_mod <- 1 - sum((yte - avg_yhat_lava_test)^2) / sum((yte - mean(ytr))^2)\n", - "\n", - " return(c(\n", - " cvlambda1_lava_mod,\n", - " cvlambda2_lava_mod,\n", - " cvmin_yhat_lava_tr, # CV_min\n", - " cvmin_yhat_lava_test, # CV_min\n", - " r2_lava_mod, # CV_min\n", - " r2v_lava_mod, # CV_min\n", - " avg_yhat_lava_tr, # Average across Folds\n", - " avg_yhat_lava_test, # Average across Folds\n", - " r2_cv_ave_lava_mod, # Average across Folds\n", - " r2v_cv_ave_lava_mod # Average across Folds\n", - " ))\n", - "}" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ols (basic model)\n", + "fit_lm_basic <- lm(formula_basic, data_train)\n", + "# Compute the Out-Of-Sample Performance\n", + "yhat_lm_basic <- predict(fit_lm_basic, newdata = data_test)\n", + "cat(\"Basic model MSE (OLS): \", mean((y_test - yhat_lm_basic)^2)) # MSE OLS (basic model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hDlMXF0ANGOw", + "papermill": { + "duration": 0.052764, + "end_time": "2021-02-13T18:19:46.122829", + "exception": false, + "start_time": "2021-02-13T18:19:46.070065", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Uw3LMCiskJzV", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "fit_lava_flex <- lava_yhat_r2(model_x_flex_train, model_x_flex_test, y_train, y_test)\n", - "cat(\"Flexible model R^2 (LAVA): \", fit_lava_flex[[6]]) # using CV_min" - ] + "tags": [] + }, + "source": [ + "To determine the out-of-sample $MSE$ and the standard error in one step, we can use the function *lm*:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ERCs22oDNGOw", + "papermill": { + "duration": 0.076484, + "end_time": "2021-02-13T18:19:46.239015", + "exception": false, + "start_time": "2021-02-13T18:19:46.162531", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "id": "f8lYhGaWfpYR" - }, - "source": [ - "\n", - "\n", - "We find that for this dataset the low dimensional OLS is sufficient. The high-dimensional approaches did not manage to substantively increase predictive power." 
- ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "mse_lm_basic <- summary(lm((y_test - yhat_lm_basic)^2 ~ 1))$coef[1:2]\n", + "mse_lm_basic" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3PQ-f_waNGOw", + "papermill": { + "duration": 0.039088, + "end_time": "2021-02-13T18:19:46.317915", + "exception": false, + "start_time": "2021-02-13T18:19:46.278827", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "id": "bxuPZI4Zx0Vm" - }, - "source": [ - "### Extra high-dimensional specification (extra flexible)\n", - "\n", - "We repeat the same procedure for the extra flexible model.\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] + "tags": [] + }, + "source": [ + "We also compute the out-of-sample $R^2$:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YLgvLE2BNGOw", + "papermill": { + "duration": 0.057098, + "end_time": "2021-02-13T18:19:46.413754", + "exception": false, + "start_time": "2021-02-13T18:19:46.356656", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JsFhSsM_rGjN", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "x_extra <- \" sex + (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)^2\"\n", - "formula_extra <- as.formula(paste(\"lwage\", \"~\", x_extra))\n", - "model_x_extra_train <- model.matrix(formula_extra, data_train)\n", - "model_x_extra_test <- model.matrix(formula_extra, data_test)\n", - "p_extra <- dim(model_x_extra_train)[2]\n", - "p_extra" - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "r2_lm_basic <- 1 - mse_lm_basic[1] / var(y_test)\n", + "cat(\"Basic model R^2 (OLS): \", r2_lm_basic) # MSE OLS (basic model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tTZyELgyf51J" + }, + "source": [ + "### High-dimensional specification (flexible)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "J8Rffx0ef3nM", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "x_flex <- \"sex + exp1 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we \" +\n", + " \"+ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\"\n", + "formula_flex <- as.formula(paste(\"lwage\", \"~\", x_flex))\n", + "model_x_flex_train <- model.matrix(formula_flex, data_train)\n", + "model_x_flex_test <- model.matrix(formula_flex, data_test)\n", + "p_flex <- dim(model_x_flex_train)[2]\n", + "p_flex" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "77G7YfbnNGOw", + "papermill": { + "duration": 0.039585, + "end_time": "2021-02-13T18:19:46.492903", + "exception": false, + "start_time": "2021-02-13T18:19:46.453318", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eheA1UPBsHfL", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# ols (extra flexible model)\n", - "fit_lm_extra <- lm(formula_extra, data_train)\n", - "options(warn = -1)\n", - "yhat_lm_extra <- predict(fit_lm_extra, newdata = data_test)\n", - "mse_lm_extra <- summary(lm((y_test - yhat_lm_extra)^2 ~ 1))$coef[1:2]\n", - "r2_lm_extra <- 1 - mse_lm_extra[1] / var(y_test)\n", - "cat(\"Extra flexible model R^2 (OLS): \", r2_lm_extra)" - ] + "tags": [] + }, + "source": [ + "We repeat the same procedure for the flexible model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KpRtjaAlNGOw", + "papermill": { + "duration": 0.198636, + "end_time": "2021-02-13T18:19:46.730717", + "exception": false, + "start_time": "2021-02-13T18:19:46.532081", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "id": "Scm5monOrJu5" - }, - "source": [ - "#### Penalized regressions (extra flexible model)\n", - "\n", - "Now let's repeat our penalized regression analysis for the extra flexible model. Note this block takes a while ~ 1 hour 15 minutes. To reduce time substantially, reduce the number of folds in LAVA." - ] + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ols (flexible model)\n", + "fit_lm_flex <- lm(formula_flex, data_train)\n", + "# Compute the Out-Of-Sample Performance\n", + "options(warn = -1)\n", + "yhat_lm_flex <- predict(fit_lm_flex, newdata = data_test)\n", + "mse_lm_flex <- summary(lm((y_test - yhat_lm_flex)^2 ~ 1))$coef[1:2]\n", + "r2_lm_flex <- 1 - mse_lm_flex[1] / var(y_test)\n", + "cat(\"Flexible model R^2 (OLS): \", r2_lm_flex) # MSE OLS (flexible model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sShiB-i9NGOx", + "papermill": { + "duration": 0.042521, + "end_time": "2021-02-13T18:19:46.935859", + "exception": false, + "start_time": "2021-02-13T18:19:46.893338", + "status": "completed" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tOKoNLKFovrI", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# penalized regressions\n", - "fit_lasso_cv_extra <- cv.glmnet(model_x_extra_train, y_train, family = \"gaussian\", alpha = 1)\n", - "fit_ridge_extra <- cv.glmnet(model_x_extra_train, y_train, family = \"gaussian\", alpha = 0)\n", - "fit_elnet_extra <- cv.glmnet(model_x_extra_train, y_train, family = \"gaussian\", alpha = .5)\n", - "fit_rlasso_extra <- rlasso(formula_extra, data_train, post = FALSE)\n", - "fit_rlasso_post_extra <- rlasso(formula_extra, data_train, post = TRUE)\n", - "fit_lava_extra <- lava_yhat_r2(model_x_extra_train, model_x_extra_test, y_train, y_test)\n", - "\n", - "yhat_lasso_cv_extra <- predict(fit_lasso_cv_extra, newx = model_x_extra_test)\n", - "yhat_ridge_extra <- predict(fit_ridge_extra, newx = model_x_extra_test)\n", - "yhat_elnet_extra <- predict(fit_elnet_extra, newx = model_x_extra_test)\n", - "yhat_rlasso_extra <- predict(fit_rlasso_extra, newdata = data_test)\n", - "yhat_rlasso_post_extra <- predict(fit_rlasso_post_extra, newdata = data_test)\n", - "yhat_lava_extra <- fit_lava_extra[[4]]\n", - "\n", - "mse_lasso_cv_extra <- summary(lm((y_test - yhat_lasso_cv_extra)^2 ~ 1))$coef[1:2]\n", - "mse_ridge_extra <- summary(lm((y_test - yhat_ridge_extra)^2 ~ 1))$coef[1:2]\n", - "mse_elnet_extra <- summary(lm((y_test - yhat_elnet_extra)^2 ~ 1))$coef[1:2]\n", - "mse_lasso_extra <- summary(lm((y_test - yhat_rlasso_extra)^2 ~ 1))$coef[1:2]\n", - "mse_lasso_post_extra <- summary(lm((y_test - yhat_rlasso_post_extra)^2 ~ 1))$coef[1:2]\n", - "mse_lava_extra <- summary(lm(as.vector(y_test - yhat_lava_extra)^2 ~ 1))$coef[1:2]\n", - "\n", - "r2_lasso_cv_extra <- 1 - mse_lasso_cv_extra[1] / var(y_test)\n", - "r2_ridge_extra <- 1 - mse_ridge_extra[1] / var(y_test)\n", - "r2_elnet_extra <- 1 - mse_elnet_extra[1] / var(y_test)\n", - "r2_lasso_extra <- 1 - mse_lasso_extra[1] / var(y_test)\n", - "r2_lasso_post_extra <- 1 - mse_lasso_post_extra[1] / var(y_test)\n", - "r2_lava_extra <- 1 - mse_lava_extra[1] / var(y_test)\n", - 
"\n", - "# R^2 (extra flexible)\n", - "cat(\"\\nExtra flexible model R^2 (Lasso): \", r2_lasso_cv_extra)\n", - "cat(\"\\nExtra flexible model R^2 (Ridge): \", r2_ridge_extra)\n", - "cat(\"\\nExtra flexible model R^2 (Elastic Net): \", r2_elnet_extra)\n", - "cat(\"\\nExtra flexible model R^2 (RLasso): \", r2_lasso_extra)\n", - "cat(\"\\nExtra flexible model R^2 (RLasso post): \", r2_lasso_post_extra)\n", - "cat(\"\\nExtra flexible model R^2 (LAVA): \", r2_lava_extra) # using CV_min" - ] + "tags": [] + }, + "source": [ + "### Penalized regressions (flexible model)\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5wgFVRkkNGOx", + "papermill": { + "duration": 0.051953, + "end_time": "2021-02-13T18:19:46.853182", + "exception": false, + "start_time": "2021-02-13T18:19:46.801229", + "status": "completed" }, - { - "cell_type": "markdown", - "metadata": { - "id": "Btez-AI8yE7S" - }, - "source": [ - "" - ] + "tags": [] + }, + "source": [ + "We observe that ols regression works better for the basic model with smaller $p/n$ ratio. We proceed by running penalized regressions first for the flexible model, tuned via cross-validation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "n3jvO5HQmzbf", + "vscode": { + "languageId": "r" } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" + }, + "outputs": [], + "source": [ + "fit_lasso_cv_flex <- cv.glmnet(model_x_flex_train, y_train, family = \"gaussian\", alpha = 1)\n", + "fit_ridge_flex <- cv.glmnet(model_x_flex_train, y_train, family = \"gaussian\", alpha = 0)\n", + "fit_elnet_flex <- cv.glmnet(model_x_flex_train, y_train, family = \"gaussian\", alpha = .5)\n", + "\n", + "yhat_lasso_cv_flex <- predict(fit_lasso_cv_flex, newx = model_x_flex_test)\n", + "yhat_ridge_flex <- predict(fit_ridge_flex, newx = model_x_flex_test)\n", + "yhat_elnet_flex <- predict(fit_elnet_flex, newx = model_x_flex_test)\n", + "\n", + "mse_lasso_cv_flex <- summary(lm((y_test - yhat_lasso_cv_flex)^2 ~ 1))$coef[1:2]\n", + "mse_ridge_flex <- summary(lm((y_test - yhat_ridge_flex)^2 ~ 1))$coef[1:2]\n", + "mse_elnet_flex <- summary(lm((y_test - yhat_elnet_flex)^2 ~ 1))$coef[1:2]\n", + "\n", + "r2_lasso_cv_flex <- 1 - mse_lasso_cv_flex[1] / var(y_test)\n", + "r2_ridge_flex <- 1 - mse_ridge_flex[1] / var(y_test)\n", + "r2_elnet_flex <- 1 - mse_elnet_flex[1] / var(y_test)\n", + "\n", + "# R^2 using cross-validation (flexible model)\n", + "cat(\"Flexible model R^2 (Lasso): \", r2_lasso_cv_flex)\n", + "cat(\"\\nFlexible model R^2 (Ridge): \", r2_ridge_flex)\n", + "cat(\"\\nFlexible model R^2 (Elastic Net): \", r2_elnet_flex)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZytMS-aCNGOx", + "papermill": { + "duration": 0.040161, + "end_time": "2021-02-13T18:19:47.015626", + "exception": false, + "start_time": "2021-02-13T18:19:46.975465", + "status": "completed" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "3.6.3" + "tags": [] + }, + "source": [ + "We can also try a variant of the `l1` penalty, where the weight is chosen based on theoretical derivations. We use package *hdm* and the function *rlasso*, relying on a theoretical based choice of the penalty level $\\lambda$ in the lasso regression." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "n-D_fRJBnkEH" + }, + "source": [ + "Specifically, we use \"plug-in\" tuning with a theoretically valid choice of penalty $\\lambda = 2 \\cdot c \\hat{\\sigma} \\sqrt{n} \\Phi^{-1}(1-\\alpha/2p)$, where $c>1$ and $1-\\alpha$ is a confidence level, $\\Phi^{-1}$ denotes the quantile function, and $\\hat{\\sigma}$ is estimated in an iterative manner (see corresponding notes in book). Under homoskedasticity, this choice ensures that the Lasso predictor is well behaved, delivering good predictive performance under approximate sparsity. In practice, this formula will work well even in the absence of homoskedasticity, especially when the random variables $\\epsilon$ and $X$ in the regression equation decay quickly at the tails.\n", + "\n", + "In practice, many people choose to use cross-validation, which is perfectly fine for predictive tasks. However, when conducting inference, to make our analysis valid we will require cross-fitting in addition to cross-validation. As we have not yet discussed cross-fitting, we rely on this theoretically-driven penalty in order to allow for accurate inference in the upcoming notebooks." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yBiZ3q3INGOy", + "papermill": { + "duration": 0.049543, + "end_time": "2021-02-13T18:19:47.757271", + "exception": false, + "start_time": "2021-02-13T18:19:47.707728", + "status": "completed" }, + "tags": [] + }, + "source": [ + "Now, we repeat the same procedure for the flexible model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PlTdJh5PNGOy", "papermill": { - "default_parameters": {}, - "duration": 90.376935, - "end_time": "2021-02-13T18:21:10.266455", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2021-02-13T18:19:39.889520", - "version": "2.2.2" + "duration": 3.430649, + "end_time": "2021-02-13T18:19:51.229007", + "exception": false, + "start_time": "2021-02-13T18:19:47.798358", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "fit_rlasso_flex <- rlasso(formula_flex, data_train, post = FALSE)\n", + "fit_rlasso_post_flex <- rlasso(formula_flex, data_train, post = TRUE)\n", + "yhat_rlasso_flex <- predict(fit_rlasso_flex, newdata = data_test)\n", + "yhat_rlasso_post_flex <- predict(fit_rlasso_post_flex, newdata = data_test)\n", + "\n", + "mse_lasso_flex <- summary(lm((y_test - yhat_rlasso_flex)^2 ~ 1))$coef[1:2]\n", + "mse_lasso_post_flex <- summary(lm((y_test - yhat_rlasso_post_flex)^2 ~ 1))$coef[1:2]\n", + "\n", + "r2_lasso_flex <- 1 - mse_lasso_flex[1] / var(y_test)\n", + "r2_lasso_post_flex <- 1 - mse_lasso_post_flex[1] / var(y_test)\n", + "# R^2 theoretically chosen penalty (flexible model)\n", + "cat(\"Flexible model R^2 (RLasso): \", r2_lasso_flex)\n", + "cat(\"\\nFlexible model R^2 (RLasso post): \", r2_lasso_post_flex)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aude922IfxBG" + }, + "source": [ + "Finally, we try the combination of a sparse and a dense coefficient using the LAVA method" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WgBPFQ72ftBz", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Define function to compute lava estimator. Doing an iterative scheme with fixed\n", + "# number of iteration. 
Could iterate until a convergence criterion is met.\n", + "lava_predict <- function(X, Y, newX, lambda1, lambda2, iter = 5) {\n", + " require(glmnet)\n", + "\n", + " # Need to demean internally\n", + " dy <- Y - mean(Y)\n", + " dx <- scale(X, scale = FALSE)\n", + "\n", + " sp1 <- glmnet(dx, dy, lambda = lambda1) # lasso step fits \"sparse part\"\n", + " de1 <- glmnet(dx, dy - predict(sp1, newx = dx), alpha = 0, lambda = lambda2)\n", + "\n", + " i <- 1\n", + " while (i <= iter) {\n", + " sp1 <- glmnet(dx, dy - predict(de1, newx = dx, s = \"lambda.min\"), lambda = lambda1)\n", + " de1 <- glmnet(dx, dy - predict(sp1, newx = dx, s = \"lambda.min\"), alpha = 0, lambda = lambda2)\n", + " i <- i + 1\n", + " }\n", + "\n", + " bhat <- sp1$beta + de1$beta\n", + " a0 <- mean(Y) - sum(colMeans(X) * bhat)\n", + "\n", + " # Need to add intercept to output\n", + "\n", + " yhat <- newX %*% bhat + a0\n", + "\n", + " return(yhat)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2HFE2EbdkMjj", + "vscode": { + "languageId": "r" } + }, + "outputs": [], + "source": [ + "# define function to get predictions and r2 scores for lava estimator\n", + "lava_yhat_r2 <- function(xtr_mod, xte_mod, ytr, yte, num_folds = 5) {\n", + " # 5-fold CV. glmnet does cross-validation internally and\n", + " # relatively efficiently. We're going to write out all the steps to make sure\n", + " # we're using the same CV folds across all procedures in a transparent way and\n", + " # to keep the overall structure clear as well.\n", + "\n", + " # Setup for brute force K-Fold CV\n", + " n <- length(ytr)\n", + " Kf <- num_folds # Number of folds\n", + " sampleframe <- rep(1:Kf, ceiling(n / Kf))\n", + " cvgroup <- sample(sampleframe, size = n, replace = FALSE) # CV groups\n", + "\n", + "\n", + " ## ------------------------------------------------------------\n", + " # We're going to take a shortcut and use the range of lambda values that come out\n", + " # of the default implementation in glmnet for everything. 
Could do better here - maybe\n", + "\n", + " ## Fit ridge on grid of lambda values (chosen by default using glmnet) using basic model.\n", + " ridge_mod <- glmnet(xtr_mod, ytr, alpha = 0) # alpha = 0 gives ridge\n", + " ridge_lambda <- ridge_mod$lambda # values of penalty parameter\n", + "\n", + " ## Fit lasso on grid of lambda values (chosen by default using glmnet) using basic model.\n", + " lasso_mod <- glmnet(xtr_mod, ytr) # default is lasso (equivalent to alpha = 1)\n", + " lasso_lambda <- lasso_mod$lambda # values of penalty parameter\n", + "\n", + " ## ------------------------------------------------------------\n", + "\n", + "\n", + " # Lava - Using a double loop over candidate penalty parameter values.\n", + "\n", + " lambda1_lava_mod <- lasso_mod$lambda[seq(5, length(lasso_lambda), 10)]\n", + " lambda2_lava_mod <- ridge_mod$lambda[seq(5, length(ridge_lambda), 10)]\n", + "\n", + " cv_mod_lava <- matrix(0, length(lambda1_lava_mod), length(lambda2_lava_mod))\n", + "\n", + " for (k in 1:Kf) {\n", + " indk <- cvgroup == k\n", + "\n", + " k_xtr_mod <- xtr_mod[!indk, ]\n", + " k_ytr <- ytr[!indk]\n", + " k_xte_mod <- xtr_mod[indk, ]\n", + " k_yte <- ytr[indk]\n", + "\n", + " for (ii in seq_along(lambda1_lava_mod)) {\n", + " for (jj in seq_along(lambda2_lava_mod)) {\n", + " cv_mod_lava[ii, jj] <- cv_mod_lava[ii, jj] +\n", + " sum((k_yte - lava_predict(k_xtr_mod, k_ytr,\n", + " newX = k_xte_mod,\n", + " lambda1 = lambda1_lava_mod[ii],\n", + " lambda2 = lambda2_lava_mod[jj]))^2)\n", + " }\n", + " }\n", + " }\n", + "\n", + " # Get CV min values of tuning parameters\n", + " cvmin_lava_mod <- which(cv_mod_lava == min(cv_mod_lava), arr.ind = TRUE)\n", + " cvlambda1_lava_mod <- lambda1_lava_mod[cvmin_lava_mod[1]]\n", + " cvlambda2_lava_mod <- lambda2_lava_mod[cvmin_lava_mod[2]]\n", + "\n", + " #### Look at performance on test sample\n", + "\n", + " # Calculate R^2 in training data and in validation data as measures\n", + " # Refit on entire training sample\n", + "\n", + " #### CV-min model\n", + "\n", + " # In sample fit\n", + " cvmin_yhat_lava_tr <- lava_predict(xtr_mod, ytr,\n", + " newX = xtr_mod,\n", + " lambda1 = cvlambda1_lava_mod,\n", + " lambda2 = cvlambda2_lava_mod\n", + " )\n", + " r2_lava_mod <- 1 - sum((ytr - cvmin_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2)\n", + "\n", + " # Out of sample fit\n", + " cvmin_yhat_lava_test <- lava_predict(xtr_mod, ytr,\n", + " newX = xte_mod,\n", + " lambda1 = cvlambda1_lava_mod,\n", + " lambda2 = cvlambda2_lava_mod\n", + " )\n", + " r2v_lava_mod <- 1 - sum((yte - cvmin_yhat_lava_test)^2) / sum((yte - mean(ytr))^2)\n", + "\n", + " #### Use average model across cv-folds and refit model using all training data\n", + " ###### we won't report these results.\n", + " ###### Averaging is theoretically more solid, but cv-min is more practical.\n", + " n_tr <- length(ytr)\n", + " n_te <- length(yte)\n", + " yhat_tr_lava_mod <- matrix(0, n_tr, Kf)\n", + " yhat_te_lava_mod <- matrix(0, n_te, Kf)\n", + "\n", + "\n", + " for (k in 1:Kf) {\n", + " indk <- cvgroup == k\n", + "\n", + " k_xtr_mod <- xtr_mod[!indk, ]\n", + " k_ytr <- ytr[!indk]\n", + "\n", + " # Lava\n", + " yhat_tr_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr,\n", + " newX = xtr_mod,\n", + " lambda1 = cvlambda1_lava_mod,\n", + " lambda2 = cvlambda2_lava_mod\n", + " ))\n", + " yhat_te_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr,\n", + " newX = xte_mod,\n", + " lambda1 = cvlambda1_lava_mod,\n", + " lambda2 = cvlambda2_lava_mod\n", + " ))\n", + " }\n", + "\n", + " 
avg_yhat_lava_tr <- rowMeans(yhat_tr_lava_mod)\n", + " avg_yhat_lava_test <- rowMeans(yhat_te_lava_mod)\n", + "\n", + " r2_cv_ave_lava_mod <- 1 - sum((ytr - avg_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2)\n", + " r2v_cv_ave_lava_mod <- 1 - sum((yte - avg_yhat_lava_test)^2) / sum((yte - mean(ytr))^2)\n", + "\n", + " return(c(\n", + " cvlambda1_lava_mod,\n", + " cvlambda2_lava_mod,\n", + " cvmin_yhat_lava_tr, # CV_min\n", + " cvmin_yhat_lava_test, # CV_min\n", + " r2_lava_mod, # CV_min\n", + " r2v_lava_mod, # CV_min\n", + " avg_yhat_lava_tr, # Average across Folds\n", + " avg_yhat_lava_test, # Average across Folds\n", + " r2_cv_ave_lava_mod, # Average across Folds\n", + " r2v_cv_ave_lava_mod # Average across Folds\n", + " ))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Uw3LMCiskJzV", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "fit_lava_flex <- lava_yhat_r2(model_x_flex_train, model_x_flex_test, y_train, y_test)\n", + "cat(\"Flexible model R^2 (LAVA): \", fit_lava_flex[[6]]) # using CV_min" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f8lYhGaWfpYR" + }, + "source": [ + "\n", + "\n", + "We find that for this dataset the low dimensional OLS is sufficient. The high-dimensional approaches did not manage to substantively increase predictive power." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bxuPZI4Zx0Vm" + }, + "source": [ + "### Extra high-dimensional specification (extra flexible)\n", + "\n", + "We repeat the same procedure for the extra flexible model.\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JsFhSsM_rGjN", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "x_extra <- \" sex + (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)^2\"\n", + "formula_extra <- as.formula(paste(\"lwage\", \"~\", x_extra))\n", + "model_x_extra_train <- model.matrix(formula_extra, data_train)\n", + "model_x_extra_test <- model.matrix(formula_extra, data_test)\n", + "p_extra <- dim(model_x_extra_train)[2]\n", + "p_extra" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eheA1UPBsHfL", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# ols (extra flexible model)\n", + "fit_lm_extra <- lm(formula_extra, data_train)\n", + "options(warn = -1)\n", + "yhat_lm_extra <- predict(fit_lm_extra, newdata = data_test)\n", + "mse_lm_extra <- summary(lm((y_test - yhat_lm_extra)^2 ~ 1))$coef[1:2]\n", + "r2_lm_extra <- 1 - mse_lm_extra[1] / var(y_test)\n", + "cat(\"Extra flexible model R^2 (OLS): \", r2_lm_extra)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Scm5monOrJu5" + }, + "source": [ + "#### Penalized regressions (extra flexible model)\n", + "\n", + "Now let's repeat our penalized regression analysis for the extra flexible model. Note this block takes a while ~ 1 hour 15 minutes. To reduce time substantially, reduce the number of folds in LAVA." 
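
As a concrete illustration of the speed-up suggested above (a sketch only; the object name `fit_lava_extra_fast` is hypothetical): the helper `lava_yhat_r2()` defined earlier exposes a `num_folds` argument with default 5, so the double grid search over the two LAVA penalties can be shortened by passing fewer folds, at some cost in how precisely they are tuned.

```r
# Sketch: fewer CV folds cuts the LAVA tuning time roughly in proportion
# to the number of folds; results will be somewhat noisier.
fit_lava_extra_fast <- lava_yhat_r2(model_x_extra_train, model_x_extra_test,
                                    y_train, y_test, num_folds = 2)
cat("Extra flexible model R^2 (LAVA, 2-fold CV): ", fit_lava_extra_fast[[6]])
```
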
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tOKoNLKFovrI", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# penalized regressions\n", + "fit_lasso_cv_extra <- cv.glmnet(model_x_extra_train, y_train, family = \"gaussian\", alpha = 1)\n", + "fit_ridge_extra <- cv.glmnet(model_x_extra_train, y_train, family = \"gaussian\", alpha = 0)\n", + "fit_elnet_extra <- cv.glmnet(model_x_extra_train, y_train, family = \"gaussian\", alpha = .5)\n", + "fit_rlasso_extra <- rlasso(formula_extra, data_train, post = FALSE)\n", + "fit_rlasso_post_extra <- rlasso(formula_extra, data_train, post = TRUE)\n", + "fit_lava_extra <- lava_yhat_r2(model_x_extra_train, model_x_extra_test, y_train, y_test)\n", + "\n", + "yhat_lasso_cv_extra <- predict(fit_lasso_cv_extra, newx = model_x_extra_test)\n", + "yhat_ridge_extra <- predict(fit_ridge_extra, newx = model_x_extra_test)\n", + "yhat_elnet_extra <- predict(fit_elnet_extra, newx = model_x_extra_test)\n", + "yhat_rlasso_extra <- predict(fit_rlasso_extra, newdata = data_test)\n", + "yhat_rlasso_post_extra <- predict(fit_rlasso_post_extra, newdata = data_test)\n", + "yhat_lava_extra <- fit_lava_extra[[4]]\n", + "\n", + "mse_lasso_cv_extra <- summary(lm((y_test - yhat_lasso_cv_extra)^2 ~ 1))$coef[1:2]\n", + "mse_ridge_extra <- summary(lm((y_test - yhat_ridge_extra)^2 ~ 1))$coef[1:2]\n", + "mse_elnet_extra <- summary(lm((y_test - yhat_elnet_extra)^2 ~ 1))$coef[1:2]\n", + "mse_lasso_extra <- summary(lm((y_test - yhat_rlasso_extra)^2 ~ 1))$coef[1:2]\n", + "mse_lasso_post_extra <- summary(lm((y_test - yhat_rlasso_post_extra)^2 ~ 1))$coef[1:2]\n", + "mse_lava_extra <- summary(lm(as.vector(y_test - yhat_lava_extra)^2 ~ 1))$coef[1:2]\n", + "\n", + "r2_lasso_cv_extra <- 1 - mse_lasso_cv_extra[1] / var(y_test)\n", + "r2_ridge_extra <- 1 - mse_ridge_extra[1] / var(y_test)\n", + "r2_elnet_extra <- 1 - mse_elnet_extra[1] / var(y_test)\n", + "r2_lasso_extra <- 1 - mse_lasso_extra[1] / var(y_test)\n", + "r2_lasso_post_extra <- 1 - mse_lasso_post_extra[1] / var(y_test)\n", + "r2_lava_extra <- 1 - mse_lava_extra[1] / var(y_test)\n", + "\n", + "# R^2 (extra flexible)\n", + "cat(\"\\nExtra flexible model R^2 (Lasso): \", r2_lasso_cv_extra)\n", + "cat(\"\\nExtra flexible model R^2 (Ridge): \", r2_ridge_extra)\n", + "cat(\"\\nExtra flexible model R^2 (Elastic Net): \", r2_elnet_extra)\n", + "cat(\"\\nExtra flexible model R^2 (RLasso): \", r2_lasso_extra)\n", + "cat(\"\\nExtra flexible model R^2 (RLasso post): \", r2_lasso_post_extra)\n", + "cat(\"\\nExtra flexible model R^2 (LAVA): \", r2_lava_extra) # using CV_min" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Btez-AI8yE7S" + }, + "source": [ + "" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.6.3" }, - "nbformat": 4, - "nbformat_minor": 0 + "papermill": { + "default_parameters": {}, + "duration": 90.376935, + "end_time": "2021-02-13T18:21:10.266455", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2021-02-13T18:19:39.889520", + "version": "2.2.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/PM2/r_orthogonal_orig.Rmd b/PM2/r_orthogonal_orig.Rmd new file mode 
100644 index 00000000..9e85eb1d --- /dev/null +++ b/PM2/r_orthogonal_orig.Rmd @@ -0,0 +1,113 @@ +--- +title: An R Markdown document converted from "PM2/r_orthogonal_orig.irnb" +output: html_document +--- + +# Simulation on Orthogonal Estimation + +We compare the performance of the naive and orthogonal methods in a computational experiment where +$p=n=100$, $\beta_j = 1/j^2$, $(\gamma_{DW})_j = 1/j^2$ and $$Y = 1 \cdot D + \beta' W + \epsilon_Y$$ + +where $W \sim N(0,I)$, $\epsilon_Y \sim N(0,1)$, and $$D = \gamma'_{DW} W + \tilde{D}$$ where $\tilde{D} \sim N(0,1)/4$. + +The true treatment effect here is 1. From the plots produced in this notebook (estimate minus ground truth), we show that the naive single-selection estimator is heavily biased (lack of Neyman orthogonality in its estimation strategy), while the orthogonal estimator based on partialling out, is approximately unbiased and Gaussian. + +```{r} +install.packages("hdm") +library(hdm) +library(ggplot2) +``` + +```{r} +# Initialize constants +B <- 10000 # Number of iterations +n <- 100 # Sample size +p <- 100 # Number of features + +# Initialize arrays to store results +Naive <- rep(0, B) +Orthogonal <- rep(0, B) + + +lambdaYs <- rep(0,B) +lambdaDs <- rep(0,B) + +for (i in 1:B) { + # Generate parameters + beta <- 1 / (1:p)^2 + gamma <- 1 / (1:p)^2 + + # Generate covariates / random data + X <- matrix(rnorm(n * p), n, p) + D <- X %*% gamma + rnorm(n) / 4 + + # Generate Y using DGP + Y <- D + X %*% beta + rnorm(n) + + # Single selection method + rlasso_result <- rlasso(Y ~ D + X) # Fit lasso regression + SX_IDs <- which(rlasso_result$coef[-c(1, 2)] != 0) # Selected covariates + + # Check if any Xs are selected + if (sum(SX_IDs) == 0) { + Naive[i] <- lm(Y ~ D)$coef[2] # Fit linear regression with only D if no Xs are selected + } else { + Naive[i] <- lm(Y ~ D + X[, SX_IDs])$coef[2] # Fit linear regression with selected X otherwise + } + + # Partialling out / Double Lasso + + fitY <- rlasso(Y ~ X, post = TRUE) + resY <- fitY$res + #cat("lambda Y mean: ", mean(fitY$lambda)) + + fitD <- rlasso(D ~ X, post = TRUE) + resD <- fitD$res + #cat("\nlambda D mean: ", mean(fitD$lambda)) + + Orthogonal[i] <- lm(resY ~ resD)$coef[2] # Fit linear regression for residuals +} +``` + +## Make a Nice Plot + +```{r} +#Specify ratio +img_width = 15 +img_height = img_width/2 +``` + +```{r} +# Create a data frame for the estimates +df <- data.frame(Method = rep(c("Naive", "Orthogonal"), each = B), Value = c(Naive-1,Orthogonal-1)) + +# Create the histogram using ggplot2 +hist_plot <- ggplot(df, aes(x = Value, fill = Method)) + + geom_histogram(binwidth = 0.1, color = "black", alpha = 0.7) + + facet_wrap(~Method, scales = "fixed") + + labs( + title = "Distribution of Estimates (Centered around Ground Truth)", + x = "Bias", + y = "Frequency" + ) + + scale_x_continuous(breaks = seq(-2, 1.5, 0.5)) + + theme_minimal() + + theme( + plot.title = element_text(hjust = 0.5), # Center the plot title + strip.text = element_text(size = 10), # Increase text size in facet labels + legend.position = "none", # Remove the legend + panel.grid.major = element_blank(), # Make major grid lines invisible + # panel.grid.minor = element_blank(), # Make minor grid lines invisible + strip.background = element_blank() # Make the strip background transparent + ) + + theme(panel.spacing = unit(2, "lines")) # Adjust the ratio to separate subplots wider + +# Set a wider plot size +options(repr.plot.width = img_width, repr.plot.height = img_height) + +# Display the histogram 
+print(hist_plot) +``` + +As we can see from the above bias plots (estimates minus the ground truth effect of 1), the double lasso procedure concentrates around zero whereas the naive estimator does not. + diff --git a/PM2/r_orthogonal_orig.irnb b/PM2/r_orthogonal_orig.irnb index ee5c2147..76911c26 100644 --- a/PM2/r_orthogonal_orig.irnb +++ b/PM2/r_orthogonal_orig.irnb @@ -1,192 +1,190 @@ { - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Simulation on Orthogonal Estimation\n" - ], - "metadata": { - "id": "7HCJkA2ifjEk" - } - }, - { - "cell_type": "markdown", - "source": [ - "We compare the performance of the naive and orthogonal methods in a computational experiment where\n", - "$p=n=100$, $\\beta_j = 1/j^2$, $(\\gamma_{DW})_j = 1/j^2$ and $$Y = 1 \\cdot D + \\beta' W + \\epsilon_Y$$\n", - "\n", - "where $W \\sim N(0,I)$, $\\epsilon_Y \\sim N(0,1)$, and $$D = \\gamma'_{DW} W + \\tilde{D}$$ where $\\tilde{D} \\sim N(0,1)/4$.\n", - "\n", - "The true treatment effect here is 1. From the plots produced in this notebook (estimate minus ground truth), we show that the naive single-selection estimator is heavily biased (lack of Neyman orthogonality in its estimation strategy), while the orthogonal estimator based on partialling out, is approximately unbiased and Gaussian." - ], - "metadata": { - "id": "4sldk16nfXw9" - } - }, - { - "cell_type": "code", - "source": [ - "install.packages(\"hdm\")\n", - "library(hdm)\n", - "library(ggplot2)" - ], - "metadata": { - "id": "dSvVz5Z6D14H" - }, - "execution_count": null, - "outputs": [] - }, - { - "metadata": { - "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", - "_execution_state": "idle", - "trusted": true, - "id": "fAe2EP5VCFN_" - }, - "cell_type": "code", - "source": [ - "# Initialize constants\n", - "B <- 10000 # Number of iterations\n", - "n <- 100 # Sample size\n", - "p <- 100 # Number of features\n", - "\n", - "# Initialize arrays to store results\n", - "Naive <- rep(0, B)\n", - "Orthogonal <- rep(0, B)\n", - "\n", - "\n", - "lambdaYs <- rep(0,B)\n", - "lambdaDs <- rep(0,B)\n", - "\n", - "for (i in 1:B) {\n", - " # Generate parameters\n", - " beta <- 1 / (1:p)^2\n", - " gamma <- 1 / (1:p)^2\n", - "\n", - " # Generate covariates / random data\n", - " X <- matrix(rnorm(n * p), n, p)\n", - " D <- X %*% gamma + rnorm(n) / 4\n", - "\n", - " # Generate Y using DGP\n", - " Y <- D + X %*% beta + rnorm(n)\n", - "\n", - " # Single selection method\n", - " rlasso_result <- rlasso(Y ~ D + X) # Fit lasso regression\n", - " SX_IDs <- which(rlasso_result$coef[-c(1, 2)] != 0) # Selected covariates\n", - "\n", - " # Check if any Xs are selected\n", - " if (sum(SX_IDs) == 0) {\n", - " Naive[i] <- lm(Y ~ D)$coef[2] # Fit linear regression with only D if no Xs are selected\n", - " } else {\n", - " Naive[i] <- lm(Y ~ D + X[, SX_IDs])$coef[2] # Fit linear regression with selected X otherwise\n", - " }\n", - "\n", - " # Partialling out / Double Lasso\n", - "\n", - " fitY <- rlasso(Y ~ X, post = TRUE)\n", - " resY <- fitY$res\n", - " #cat(\"lambda Y mean: \", mean(fitY$lambda))\n", - "\n", - " fitD <- rlasso(D ~ X, post = TRUE)\n", - " resD <- fitD$res\n", - " #cat(\"\\nlambda D mean: \", mean(fitD$lambda))\n", - "\n", - " Orthogonal[i] <- lm(resY ~ resD)$coef[2] # Fit linear regression for residuals\n", - "}\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Make a Nice Plot" - ], - "metadata": { - "id": "Bj174QuEaPb5" - } - }, - { - "cell_type": "code", - "source": [ - "#Specify ratio\n", - 
"img_width = 15\n", - "img_height = img_width/2" - ], - "metadata": { - "id": "MjB3qbGEaRnl" - }, - "execution_count": null, - "outputs": [] - }, - { - "metadata": { - "trusted": true, - "id": "N7bdztt1CFOE" - }, - "cell_type": "code", - "source": [ - "# Create a data frame for the estimates\n", - "df <- data.frame(Method = rep(c(\"Naive\", \"Orthogonal\"), each = B), Value = c(Naive-1,Orthogonal-1))\n", - "\n", - "# Create the histogram using ggplot2\n", - "hist_plot <- ggplot(df, aes(x = Value, fill = Method)) +\n", - " geom_histogram(binwidth = 0.1, color = \"black\", alpha = 0.7) +\n", - " facet_wrap(~Method, scales = \"fixed\") +\n", - " labs(\n", - " title = \"Distribution of Estimates (Centered around Ground Truth)\",\n", - " x = \"Bias\",\n", - " y = \"Frequency\"\n", - " ) +\n", - " scale_x_continuous(breaks = seq(-2, 1.5, 0.5)) +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(hjust = 0.5), # Center the plot title\n", - " strip.text = element_text(size = 10), # Increase text size in facet labels\n", - " legend.position = \"none\", # Remove the legend\n", - " panel.grid.major = element_blank(), # Make major grid lines invisible\n", - " # panel.grid.minor = element_blank(), # Make minor grid lines invisible\n", - " strip.background = element_blank() # Make the strip background transparent\n", - " ) +\n", - " theme(panel.spacing = unit(2, \"lines\")) # Adjust the ratio to separate subplots wider\n", - "\n", - "# Set a wider plot size\n", - "options(repr.plot.width = img_width, repr.plot.height = img_height)\n", - "\n", - "# Display the histogram\n", - "print(hist_plot)\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "As we can see from the above bias plots (estimates minus the ground truth effect of 1), the double lasso procedure concentrates around zero whereas the naive estimator does not." - ], - "metadata": { - "id": "8hrJ3M5mrD8_" - } - } - ], - "metadata": { - "kernelspec": { - "name": "ir", - "display_name": "R", - "language": "R" - }, - "language_info": { - "name": "R", - "codemirror_mode": "r", - "pygments_lexer": "r", - "mimetype": "text/x-r-source", - "file_extension": ".r", - "version": "3.6.3" - }, - "colab": { - "provenance": [] - } + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "7HCJkA2ifjEk" + }, + "source": [ + "# Simulation on Orthogonal Estimation\n" + ] }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + { + "cell_type": "markdown", + "metadata": { + "id": "4sldk16nfXw9" + }, + "source": [ + "We compare the performance of the naive and orthogonal methods in a computational experiment where\n", + "$p=n=100$, $\\beta_j = 1/j^2$, $(\\gamma_{DW})_j = 1/j^2$ and $$Y = 1 \\cdot D + \\beta' W + \\epsilon_Y$$\n", + "\n", + "where $W \\sim N(0,I)$, $\\epsilon_Y \\sim N(0,1)$, and $$D = \\gamma'_{DW} W + \\tilde{D}$$ where $\\tilde{D} \\sim N(0,1)/4$.\n", + "\n", + "The true treatment effect here is 1. From the plots produced in this notebook (estimate minus ground truth), we show that the naive single-selection estimator is heavily biased (lack of Neyman orthogonality in its estimation strategy), while the orthogonal estimator based on partialling out, is approximately unbiased and Gaussian." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dSvVz5Z6D14H" + }, + "outputs": [], + "source": [ + "install.packages(\"hdm\")\n", + "library(hdm)\n", + "library(ggplot2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "_execution_state": "idle", + "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", + "id": "fAe2EP5VCFN_" + }, + "outputs": [], + "source": [ + "# Initialize constants\n", + "B <- 10000 # Number of iterations\n", + "n <- 100 # Sample size\n", + "p <- 100 # Number of features\n", + "\n", + "# Initialize arrays to store results\n", + "Naive <- rep(0, B)\n", + "Orthogonal <- rep(0, B)\n", + "\n", + "\n", + "lambdaYs <- rep(0,B)\n", + "lambdaDs <- rep(0,B)\n", + "\n", + "for (i in 1:B) {\n", + " # Generate parameters\n", + " beta <- 1 / (1:p)^2\n", + " gamma <- 1 / (1:p)^2\n", + "\n", + " # Generate covariates / random data\n", + " X <- matrix(rnorm(n * p), n, p)\n", + " D <- X %*% gamma + rnorm(n) / 4\n", + "\n", + " # Generate Y using DGP\n", + " Y <- D + X %*% beta + rnorm(n)\n", + "\n", + " # Single selection method\n", + " rlasso_result <- rlasso(Y ~ D + X) # Fit lasso regression\n", + " SX_IDs <- which(rlasso_result$coef[-c(1, 2)] != 0) # Selected covariates\n", + "\n", + " # Check if any Xs are selected\n", + " if (sum(SX_IDs) == 0) {\n", + " Naive[i] <- lm(Y ~ D)$coef[2] # Fit linear regression with only D if no Xs are selected\n", + " } else {\n", + " Naive[i] <- lm(Y ~ D + X[, SX_IDs])$coef[2] # Fit linear regression with selected X otherwise\n", + " }\n", + "\n", + " # Partialling out / Double Lasso\n", + "\n", + " fitY <- rlasso(Y ~ X, post = TRUE)\n", + " resY <- fitY$res\n", + " #cat(\"lambda Y mean: \", mean(fitY$lambda))\n", + "\n", + " fitD <- rlasso(D ~ X, post = TRUE)\n", + " resD <- fitD$res\n", + " #cat(\"\\nlambda D mean: \", mean(fitD$lambda))\n", + "\n", + " Orthogonal[i] <- lm(resY ~ resD)$coef[2] # Fit linear regression for residuals\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Bj174QuEaPb5" + }, + "source": [ + "## Make a Nice Plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MjB3qbGEaRnl" + }, + "outputs": [], + "source": [ + "#Specify ratio\n", + "img_width = 15\n", + "img_height = img_width/2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "N7bdztt1CFOE" + }, + "outputs": [], + "source": [ + "# Create a data frame for the estimates\n", + "df <- data.frame(Method = rep(c(\"Naive\", \"Orthogonal\"), each = B), Value = c(Naive-1,Orthogonal-1))\n", + "\n", + "# Create the histogram using ggplot2\n", + "hist_plot <- ggplot(df, aes(x = Value, fill = Method)) +\n", + " geom_histogram(binwidth = 0.1, color = \"black\", alpha = 0.7) +\n", + " facet_wrap(~Method, scales = \"fixed\") +\n", + " labs(\n", + " title = \"Distribution of Estimates (Centered around Ground Truth)\",\n", + " x = \"Bias\",\n", + " y = \"Frequency\"\n", + " ) +\n", + " scale_x_continuous(breaks = seq(-2, 1.5, 0.5)) +\n", + " theme_minimal() +\n", + " theme(\n", + " plot.title = element_text(hjust = 0.5), # Center the plot title\n", + " strip.text = element_text(size = 10), # Increase text size in facet labels\n", + " legend.position = \"none\", # Remove the legend\n", + " panel.grid.major = element_blank(), # Make major grid lines invisible\n", + " # panel.grid.minor = element_blank(), # Make minor grid lines invisible\n", + " strip.background = element_blank() # Make the strip background 
transparent\n", + " ) +\n", + " theme(panel.spacing = unit(2, \"lines\")) # Adjust the ratio to separate subplots wider\n", + "\n", + "# Set a wider plot size\n", + "options(repr.plot.width = img_width, repr.plot.height = img_height)\n", + "\n", + "# Display the histogram\n", + "print(hist_plot)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8hrJ3M5mrD8_" + }, + "source": [ + "As we can see from the above bias plots (estimates minus the ground truth effect of 1), the double lasso procedure concentrates around zero whereas the naive estimator does not." + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 485a79192257c84d5f43569ea2bab8da44693375 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 13:06:24 -0700 Subject: [PATCH 081/261] linting --- .github/workflows/transform-R-to-Rmd.yml | 12 +- ...r_convergence_hypothesis_double_lasso.irnb | 1631 +++++++++++------ 2 files changed, 1083 insertions(+), 560 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index f7d32594..51106fd3 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -76,7 +76,6 @@ jobs: - name: Execute R scripts and log output id: execute - continue-on-error: true run: | log_file="${{ matrix.directory }}_r_script_execution.log" R -e ' @@ -91,7 +90,7 @@ jobs: source(gitrfile) }, error = function(e) { - errors[[length(errors) + 1]] <<- list(file = gitrfile, message = e$message) + errors[[length(errors) + 1]] <<- list(gitrfile = gitrfile, message = e$message) } ) } @@ -103,21 +102,16 @@ jobs: cat("Error found in file:", error$gitrfile, "\n") cat("Error message:", error$message, "\n") } + quit(status = 1, save = "no") # Exit with an error status if errors are found } ' + - name: Upload execution log uses: actions/upload-artifact@v2 with: name: ${{ matrix.directory }}-r-script-execution-log path: ${{ matrix.directory }}_r_script_execution.log - - name: Fail if errors found - if: failure() - run: exit 1 - - - name: Delete execution log - run: rm ${{ matrix.directory }}_r_script_execution.log - - name: Zip .R files run: | mkdir r_scripts diff --git a/PM2/r_convergence_hypothesis_double_lasso.irnb b/PM2/r_convergence_hypothesis_double_lasso.irnb index 6c48fccc..538431b9 100644 --- a/PM2/r_convergence_hypothesis_double_lasso.irnb +++ b/PM2/r_convergence_hypothesis_double_lasso.irnb @@ -1,564 +1,1093 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "79U65py1grzb" - }, - "source": [ - "# Testing the Convergence Hypothesis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "79U65py1grzb" + }, + "source": [ + "# Testing the Convergence Hypothesis" + ] }, - "id": "GK-MMvLseA2Q", - "outputId": "f429014a-9f26-4030-cdb8-6d925704172d", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "install.packages(\"hdm\")\n", - "install.packages(\"xtable\")\n", - "install.packages(\"lmtest\")\n", - "install.packages(\"sandwich\")\n", - "install.packages(\"glmnet\")\n", - "install.packages(\"ggplot2\")\n", 
- "library(hdm)\n", - "library(xtable)\n", - "library(lmtest)\n", - "library(sandwich)\n", - "library(glmnet) # For LassoCV\n", - "library(ggplot2)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nlpSLLV6g1pc" - }, - "source": [ - "## Introduction" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xXkzGJWag02O" - }, - "source": [ - "We provide an additional empirical example of partialling-out with Lasso to estimate the regression coefficient $\\beta_1$ in the high-dimensional linear regression model:\n", - " $$\n", - " Y = \\beta_1 D + \\beta_2'W + \\epsilon.\n", - " $$\n", - " \n", - "Specifically, we are interested in how the rates at which economies of different countries grow ($Y$) are related to the initial wealth levels in each country ($D$) controlling for country's institutional, educational, and other similar characteristics ($W$).\n", - " \n", - "The relationship is captured by $\\beta_1$, the *speed of convergence/divergence*, which measures the speed at which poor countries catch up $(\\beta_1< 0)$ or fall behind $(\\beta_1> 0)$ rich countries, after controlling for $W$. Our inference question here is: do poor countries grow faster than rich countries, controlling for educational and other characteristics? In other words, is the speed of convergence negative: $ \\beta_1 <0?$ This is the Convergence Hypothesis predicted by the Solow Growth Model. This is a structural economic model. Under some strong assumptions, that we won't state here, the predictive exercise we are doing here can be given causal interpretation.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a5Ul2ppLfUBQ" - }, - "source": [ - "## Data Analysis" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9GgPNICafYuK" - }, - "source": [ - "We consider the data set GrowthData which is included in the package *hdm*. First, let us load the data set to get familiar with the data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_B9DWuS6fcVW", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "getdata <- function(...) {\n", - " e <- new.env()\n", - " name <- data(..., envir = e)[1]\n", - " e[[name]]\n", - "}\n", - "\n", - "# now load your data calling getdata()\n", - "growth <- getdata(GrowthData)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "smYhqwpbffVh" - }, - "source": [ - "The sample contains $90$ countries and $63$ controls." 
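
For completeness, the partialling-out logic used later in this notebook can be written out in one line; this is a restatement of standard Frisch-Waugh-Lovell reasoning under the linear model above, not new material. Writing $\tilde{Y}$ and $\tilde{D}$ for the residuals left after projecting $Y$ and $D$ on the controls $W$,

$$
\tilde{Y} = Y - \mathrm{E}[Y \mid W], \qquad \tilde{D} = D - \mathrm{E}[D \mid W], \qquad
\beta_1 = \frac{\mathrm{E}[\tilde{Y}\,\tilde{D}]}{\mathrm{E}[\tilde{D}^2]},
$$

so estimating $\beta_1$ reduces to a regression of the outcome residual on the treatment residual; the Double Lasso procedure below simply replaces the two projections on $W$ with Lasso fits.
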
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GK-MMvLseA2Q", + "outputId": "f429014a-9f26-4030-cdb8-6d925704172d", + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Installing package into ‘/usr/local/lib/R/site-library’\n", + "(as ‘lib’ is unspecified)\n", + "\n", + "also installing the dependencies ‘iterators’, ‘foreach’, ‘shape’, ‘Rcpp’, ‘RcppEigen’, ‘glmnet’, ‘checkmate’, ‘Formula’\n", + "\n", + "\n", + "Installing package into ‘/usr/local/lib/R/site-library’\n", + "(as ‘lib’ is unspecified)\n", + "\n", + "Installing package into ‘/usr/local/lib/R/site-library’\n", + "(as ‘lib’ is unspecified)\n", + "\n", + "also installing the dependency ‘zoo’\n", + "\n", + "\n", + "Installing package into ‘/usr/local/lib/R/site-library’\n", + "(as ‘lib’ is unspecified)\n", + "\n", + "Installing package into ‘/usr/local/lib/R/site-library’\n", + "(as ‘lib’ is unspecified)\n", + "\n", + "Installing package into ‘/usr/local/lib/R/site-library’\n", + "(as ‘lib’ is unspecified)\n", + "\n", + "Loading required package: zoo\n", + "\n", + "\n", + "Attaching package: ‘zoo’\n", + "\n", + "\n", + "The following objects are masked from ‘package:base’:\n", + "\n", + " as.Date, as.Date.numeric\n", + "\n", + "\n", + "Loading required package: Matrix\n", + "\n", + "Loaded glmnet 4.1-8\n", + "\n" + ] + } + ], + "source": [ + "install.packages(\"hdm\")\n", + "install.packages(\"xtable\")\n", + "install.packages(\"lmtest\")\n", + "install.packages(\"sandwich\")\n", + "install.packages(\"glmnet\")\n", + "install.packages(\"ggplot2\")\n", + "library(hdm)\n", + "library(xtable)\n", + "library(lmtest)\n", + "library(sandwich)\n", + "library(glmnet) # For LassoCV\n", + "library(ggplot2)" + ] }, - "id": "1dsF7_R4j-Qv", - "outputId": "c77d3a1a-35e5-482f-d414-75304fc218c3", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "growth" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-AMcbsgefhTg" - }, - "source": [ - "Thus $p \\approx 60$, $n=90$ and $p/n$ is not small. We expect the least squares method to provide a poor estimate of $\\beta_1$. We expect the method based on partialling-out with Lasso to provide a high quality estimate of $\\beta_1$.\n", - "To check this hypothesis, we analyze the relation between the output variable $Y$ and the other country's characteristics by running a linear regression in the first step." 
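
A quick sanity check of these counts (a two-line sketch; `growth` is the data frame loaded above):

```r
dim(growth)       # rows = countries, columns = all variables in GrowthData
ncol(growth) - 2  # regressors remaining once Outcome and the intercept column are dropped
```
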
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DncWsRS9mgAp", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "## Create the outcome variable y and covariates x\n", - "y <- growth$Outcome\n", - "X <- growth[-which(colnames(growth) %in% c(\"intercept\"))]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + { + "cell_type": "markdown", + "metadata": { + "id": "nlpSLLV6g1pc" + }, + "source": [ + "## Introduction" + ] }, - "id": "vPO08MjomqfZ", - "outputId": "afa89548-e9ef-4060-d5db-eb8e632e8e95", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "fit <- lm(Outcome ~ ., data = X)\n", - "est <- summary(fit)$coef[\"gdpsh465\", 1]\n", - "\n", - "hcv_coefs <- vcovHC(fit, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", - "se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors\n", - "\n", - "# print unconditional effect of gdpsh465 and the corresponding standard error\n", - "cat(\"The estimated coefficient on gdpsh465 is\", est,\n", - " \" and the corresponding robust standard error is\", se)\n", - "\n", - "# Calculate the 95% confidence interval for 'gdpsh465'\n", - "lower_ci <- est - 1.96 * se\n", - "upper_ci <- est + 1.96 * se\n", - "\n", - "cat(\"95% Confidence Interval: [\", lower_ci, \",\", upper_ci, \"]\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "D7nJZzhGfjQT" - }, - "source": [ - "## Summarize OLS results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + { + "cell_type": "markdown", + "metadata": { + "id": "xXkzGJWag02O" + }, + "source": [ + "We provide an additional empirical example of partialling-out with Lasso to estimate the regression coefficient $\\beta_1$ in the high-dimensional linear regression model:\n", + " $$\n", + " Y = \\beta_1 D + \\beta_2'W + \\epsilon.\n", + " $$\n", + " \n", + "Specifically, we are interested in how the rates at which economies of different countries grow ($Y$) are related to the initial wealth levels in each country ($D$) controlling for country's institutional, educational, and other similar characteristics ($W$).\n", + " \n", + "The relationship is captured by $\\beta_1$, the *speed of convergence/divergence*, which measures the speed at which poor countries catch up $(\\beta_1< 0)$ or fall behind $(\\beta_1> 0)$ rich countries, after controlling for $W$. Our inference question here is: do poor countries grow faster than rich countries, controlling for educational and other characteristics? In other words, is the speed of convergence negative: $ \\beta_1 <0?$ This is the Convergence Hypothesis predicted by the Solow Growth Model. This is a structural economic model. Under some strong assumptions, that we won't state here, the predictive exercise we are doing here can be given causal interpretation.\n" + ] }, - "id": "EwGVcIVAfRe5", - "outputId": "87f41279-8907-415b-f8eb-589f736089b2", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Create an empty data frame with column names\n", - "table <- data.frame(\n", - " Method = character(0),\n", - " Estimate = character(0),\n", - " `Std. 
Error` = numeric(0),\n", - " `Lower Bound CI` = numeric(0),\n", - " `Upper Bound CI` = numeric(0)\n", - ")\n", - "\n", - "# Add OLS results to the table\n", - "table <- rbind(table, c(\"OLS\", est, se, lower_ci, upper_ci))\n", - "\n", - "# Rename the columns to match the Python table\n", - "colnames(table) <- c(\"Method\", \"Estimate\", \"Std. Error\", \"lower bound CI\", \"upper bound CI\")\n", - "\n", - "# Print the table\n", - "print(table)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KfrhJqKhfwKB" - }, - "source": [ - "Least squares provides a rather noisy estimate (high standard error) of the\n", - "speed of convergence, and does not allow us to answer the question\n", - "about the convergence hypothesis since the confidence interval includes zero.\n", - "\n", - "In contrast, we can use the partialling-out approach based on lasso regression (\"Double Lasso\")." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "D9Y2U1Ldf1eB", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "y <- growth$Outcome\n", - "W <- growth[-which(colnames(growth) %in% c(\"Outcome\", \"intercept\", \"gdpsh465\"))]\n", - "D <- growth$gdpsh465" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8yNU2UgefzCZ" - }, - "source": [ - "## Method 1: Lasso with Theoretical Penalty using HDM" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tQPxdzQ2f84M" - }, - "source": [ - "While cross validation is commonly employed for choosing penalty parameters in Lasso, it can be very noisy and tends to choose relatively small penalty leading to some overfitting. For this reason, one should not use cross validation to choose tuning parameters unless sample splitting is employed. We illustrate the use of sample combined with cross validation in later chapters in the book. Since we are using the full sample here, it is much better (and theoretically valid) to use penalties that provably control overfitting, which is what we do here.\n", - "\n", - "We report the results using cross validation at the end of this notebook for comparison. There, we observe overfitting for the prediction of the outcome." 
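
To make the "theoretical penalty" concrete, here is a rough sketch of a plug-in rule of the form $\lambda = 2 c \hat{\sigma} \sqrt{n} \Phi^{-1}(1-\alpha/2p)$, as described elsewhere in these notebooks. The function name `plugin_lambda` and the values $c = 1.1$ and $\alpha = 0.05$ are illustrative assumptions; `rlasso` estimates $\hat{\sigma}$ iteratively and handles penalty loadings internally, so this is not a substitute for the package.

```r
# Sketch of a plug-in penalty level: it grows only slowly in p (roughly like
# sqrt(log p)), which is what keeps overfitting under control when p/n is large.
plugin_lambda <- function(sigma_hat, n, p, c = 1.1, alpha = 0.05) {
  2 * c * sigma_hat * sqrt(n) * qnorm(1 - alpha / (2 * p))
}
plugin_lambda(sigma_hat = sd(y), n = length(y), p = ncol(W))
```
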
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DIzy51tZsoWp", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "double_lasso <- function(y, D, W) {\n", - " require(hdm)\n", - "\n", - " # residualize outcome with Lasso\n", - " yfit_rlasso <- rlasso(W, y, post = FALSE)\n", - " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))\n", - " yres <- y - as.numeric(yhat_rlasso)\n", - "\n", - "\n", - " # residualize treatment with Lasso\n", - " dfit_rlasso <- rlasso(W, D, post = FALSE)\n", - " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W))\n", - " dres <- D - as.numeric(dhat_rlasso)\n", - "\n", - " # rest is the same as in the OLS case\n", - " hat <- mean(yres * dres) / mean(dres^2)\n", - " epsilon <- yres - hat * dres\n", - " V <- mean(epsilon^2 * dres^2) / mean(dres^2)^2\n", - " stderr <- sqrt(V / length(y))\n", - "\n", - " return(list(hat = hat, stderr = stderr))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ncz7Uqn5sqqU", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "results <- double_lasso(y, D, W)\n", - "hat <- results$hat\n", - "stderr <- results$stderr\n", - "# Calculate the 95% confidence interval\n", - "ci_lower <- hat - 1.96 * stderr\n", - "ci_upper <- hat + 1.96 * stderr" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "P5PEjKw9gLvC" - }, - "source": [ - "The least square method provides a rather noisy estimate of the speed of convergence. We can not answer the question if poor countries grow faster than rich countries. The least square method does not work when the ratio $p/n$ is large.\n", - "\n", - "In sharp contrast, partialling-out via Lasso provides a more precise estimate. The Lasso based point estimate is $-5\\%$ and the $95\\%$ confidence interval for the (annual) rate of convergence $[-7.8\\%,-2.2\\%]$ only includes negative numbers. This empirical evidence does support the convergence hypothesis." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + { + "cell_type": "markdown", + "metadata": { + "id": "a5Ul2ppLfUBQ" + }, + "source": [ + "## Data Analysis" + ] }, - "id": "tNLVM4WEgL9v", - "outputId": "1f2683b7-630a-43c5-e110-74c527603850", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Add Double Lasso results to the table\n", - "table <- rbind(table, c(\"Double Lasso\", hat, stderr, ci_lower, ci_upper))\n", - "\n", - "# Print the table\n", - "print(table)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "smPkxqCpgMR8" - }, - "source": [ - "## Method 2: Lasso with Cross-Validation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MH-eUye8liRq" - }, - "source": [ - "This section is for illustration purposes only. Given that we are using the full sample, cross validation *should not* be used for choosing tuning parameters here. Cross validation tends to (mildly) overfit, and this overfitting can lead to substantial problems when inference about parameters is the goal." 
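
To illustrate the sample-splitting remedy mentioned above, here is a simplified single-split sketch (not the full cross-fitting procedure referred to in these notes; the object names `idx`, `lam_y`, `res_y_split`, and so on are hypothetical and introduced only for this illustration):

```r
# Sketch: choose the penalties by CV on one half of the sample, then residualize
# and run the final regression on the other half, so that the noise in the
# tuning step does not contaminate the inference sample.
set.seed(1)
idx <- sample(seq_along(y), size = floor(length(y) / 2))
lam_y <- cv.glmnet(as.matrix(W[idx, ]), y[idx], alpha = 1)$lambda.min
lam_d <- cv.glmnet(as.matrix(W[idx, ]), D[idx], alpha = 1)$lambda.min
fit_y <- glmnet(as.matrix(W[-idx, ]), y[-idx], alpha = 1, lambda = lam_y)
fit_d <- glmnet(as.matrix(W[-idx, ]), D[-idx], alpha = 1, lambda = lam_d)
res_y_split <- y[-idx] - as.numeric(predict(fit_y, newx = as.matrix(W[-idx, ])))
res_d_split <- D[-idx] - as.numeric(predict(fit_d, newx = as.matrix(W[-idx, ])))
summary(lm(res_y_split ~ res_d_split))$coef[2, ]  # estimate, SE, t, p for the slope
```
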
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "YhpTUkE_wQz9", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Choose penalty based on KFold cross validation\n", - "set.seed(123)\n", - "# Given small sample size, we use an aggressive number of 20 folds\n", - "n_folds <- 20\n", - "\n", - "\n", - "# Define LassoCV models for y and D\n", - "model_y <- cv.glmnet(\n", - " x = as.matrix(W),\n", - " y = y,\n", - " alpha = 1, # Lasso penalty\n", - " nfolds = n_folds,\n", - " family = \"gaussian\"\n", - ")\n", - "\n", - "model_d <- cv.glmnet(\n", - " x = as.matrix(W),\n", - " y = D,\n", - " alpha = 1, # Lasso penalty\n", - " nfolds = n_folds,\n", - " family = \"gaussian\"\n", - ")\n", - "\n", - "# Get the best lambda values for y and D\n", - "best_lambda_y <- model_y$lambda.min\n", - "best_lambda_d <- model_d$lambda.min\n", - "\n", - "# Fit Lasso models with the best lambda values\n", - "lasso_model_y <- glmnet(as.matrix(W), y, alpha = 1, lambda = best_lambda_y)\n", - "lasso_model_d <- glmnet(as.matrix(W), D, alpha = 1, lambda = best_lambda_d)\n", - "\n", - "# Calculate the residuals\n", - "res_y <- y - predict(lasso_model_y, s = best_lambda_y, newx = as.matrix(W))\n", - "res_d <- D - predict(lasso_model_d, s = best_lambda_d, newx = as.matrix(W))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cbVsr86tyqTY", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "tmp_df <- as.data.frame(cbind(res_y, res_d))\n", - "colnames(tmp_df) = c(\"res_y\", \"res_D\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "D7SzuZ2P0P0X", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "fit_cv <- lm(res_y ~ res_d, data = tmp_df)\n", - "est_cv <- summary(fit_cv)$coef[\"res_D\", 1]\n", - "\n", - "hcv_cv_coefs <- vcovHC(fit_cv, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", - "se_cv <- sqrt(diag(hcv_cv_coefs))[2] # Estimated std errors\n", - "\n", - "# Calculate the 95% confidence interval for 'gdpsh465'\n", - "lower_ci_cv <- est_cv - 1.96 * se_cv\n", - "upper_ci_cv <- est_cv + 1.96 * se_cv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + { + "cell_type": "markdown", + "metadata": { + "id": "9GgPNICafYuK" + }, + "source": [ + "We consider the data set GrowthData which is included in the package *hdm*. First, let us load the data set to get familiar with the data." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "_B9DWuS6fcVW", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "getdata <- function(...) {\n", + " e <- new.env()\n", + " name <- data(..., envir = e)[1]\n", + " e[[name]]\n", + "}\n", + "\n", + "# now load your data calling getdata()\n", + "growth <- getdata(GrowthData)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "smYhqwpbffVh" + }, + "source": [ + "The sample contains $90$ countries and $63$ controls." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "1dsF7_R4j-Qv", + "outputId": "c77d3a1a-35e5-482f-d414-75304fc218c3", + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "
A data.frame: 90 × 63
Outcome <dbl> | intercept <int> | gdpsh465 <dbl> | bmp1l <dbl> | freeop <dbl> | freetar <dbl> | h65 <dbl> | hm65 <dbl> | hf65 <dbl> | p65 <dbl> | ⋯ | seccf65 <dbl> | syr65 <dbl> | syrm65 <dbl> | syrf65 <dbl> | teapri65 <dbl> | teasec65 <dbl> | ex1 <dbl> | im1 <dbl> | xr65 <dbl> | tot1 <dbl>
[90 data rows of the HTML rendering of the growth data frame; the same values appear in the text/plain rendering of this output further below.]
\n" + ], + "text/latex": [ + "A data.frame: 90 × 63\n", + "\\begin{tabular}{lllllllllllllllllllll}\n", + " Outcome & intercept & gdpsh465 & bmp1l & freeop & freetar & h65 & hm65 & hf65 & p65 & ⋯ & seccf65 & syr65 & syrm65 & syrf65 & teapri65 & teasec65 & ex1 & im1 & xr65 & tot1\\\\\n", + " & & & & & & & & & & ⋯ & & & & & & & & & & \\\\\n", + "\\hline\n", + "\t -0.02433575 & 1 & 6.591674 & 0.2837 & 0.153491 & 0.043888 & 0.007 & 0.013 & 0.001 & 0.29 & ⋯ & 0.04 & 0.033 & 0.057 & 0.010 & 47.6 & 17.3 & 0.0729 & 0.0667 & 0.348 & -0.014727\\\\\n", + "\t 0.10047257 & 1 & 6.829794 & 0.6141 & 0.313509 & 0.061827 & 0.019 & 0.032 & 0.007 & 0.91 & ⋯ & 0.64 & 0.173 & 0.274 & 0.067 & 57.1 & 18.0 & 0.0940 & 0.1438 & 0.525 & 0.005750\\\\\n", + "\t 0.06705148 & 1 & 8.895082 & 0.0000 & 0.204244 & 0.009186 & 0.260 & 0.325 & 0.201 & 1.00 & ⋯ & 18.14 & 2.573 & 2.478 & 2.667 & 26.5 & 20.7 & 0.1741 & 0.1750 & 1.082 & -0.010040\\\\\n", + "\t 0.06408917 & 1 & 7.565275 & 0.1997 & 0.248714 & 0.036270 & 0.061 & 0.070 & 0.051 & 1.00 & ⋯ & 2.63 & 0.438 & 0.453 & 0.424 & 27.8 & 22.7 & 0.1265 & 0.1496 & 6.625 & -0.002195\\\\\n", + "\t 0.02792955 & 1 & 7.162397 & 0.1740 & 0.299252 & 0.037367 & 0.017 & 0.027 & 0.007 & 0.82 & ⋯ & 2.11 & 0.257 & 0.287 & 0.229 & 34.5 & 17.6 & 0.1211 & 0.1308 & 2.500 & 0.003283\\\\\n", + "\t 0.04640744 & 1 & 7.218910 & 0.0000 & 0.258865 & 0.020880 & 0.023 & 0.038 & 0.006 & 0.50 & ⋯ & 1.46 & 0.160 & 0.174 & 0.146 & 34.3 & 8.1 & 0.0634 & 0.0762 & 1.000 & -0.001747\\\\\n", + "\t 0.06733234 & 1 & 7.853605 & 0.0000 & 0.182525 & 0.014385 & 0.039 & 0.063 & 0.014 & 0.92 & ⋯ & 1.59 & 0.342 & 0.484 & 0.207 & 46.6 & 14.7 & 0.0342 & 0.0428 & 12.499 & 0.009092\\\\\n", + "\t 0.02097768 & 1 & 7.703910 & 0.2776 & 0.215275 & 0.029713 & 0.024 & 0.035 & 0.013 & 0.69 & ⋯ & 1.63 & 0.184 & 0.219 & 0.152 & 34.0 & 16.1 & 0.0864 & 0.0931 & 7.000 & 0.011630\\\\\n", + "\t 0.03355124 & 1 & 9.063463 & 0.0000 & 0.109614 & 0.002171 & 0.402 & 0.488 & 0.314 & 1.00 & ⋯ & 24.72 & 3.206 & 3.154 & 3.253 & 28.2 & 20.6 & 0.0594 & 0.0460 & 1.000 & 0.008169\\\\\n", + "\t 0.03914652 & 1 & 8.151910 & 0.1484 & 0.110885 & 0.028579 & 0.145 & 0.173 & 0.114 & 1.00 & ⋯ & 6.76 & 0.703 & 0.785 & 0.620 & 20.3 & 7.2 & 0.0524 & 0.0523 & 2.119 & 0.007584\\\\\n", + "\t 0.07612651 & 1 & 6.929517 & 0.0296 & 0.165784 & 0.020115 & 0.046 & 0.066 & 0.025 & 0.73 & ⋯ & 6.21 & 1.316 & 1.683 & 0.969 & 27.8 & 17.2 & 0.0560 & 0.0826 & 11.879 & 0.086032\\\\\n", + "\t 0.12795121 & 1 & 7.237778 & 0.2151 & 0.078488 & 0.011581 & 0.022 & 0.031 & 0.014 & 1.00 & ⋯ & 3.96 & 0.594 & 0.674 & 0.515 & 28.2 & 14.8 & 0.0270 & 0.0275 & 1.938 & 0.007666\\\\\n", + "\t -0.02432609 & 1 & 8.115820 & 0.4318 & 0.137482 & 0.026547 & 0.059 & 0.073 & 0.045 & 1.00 & ⋯ & 11.36 & 1.132 & 1.126 & 1.138 & 52.1 & 18.8 & 0.0804 & 0.0930 & 0.003 & 0.016968\\\\\n", + "\t 0.07829342 & 1 & 7.271704 & 0.1689 & 0.164598 & 0.044446 & 0.029 & 0.045 & 0.013 & 0.84 & ⋯ & 3.10 & 0.568 & 0.695 & 0.450 & 35.9 & 13.1 & 0.0617 & 0.0678 & 10.479 & 0.004573\\\\\n", + "\t 0.11291155 & 1 & 7.121252 & 0.1832 & 0.188016 & 0.045678 & 0.033 & 0.051 & 0.015 & 0.91 & ⋯ & 3.16 & 0.440 & 0.512 & 0.369 & 37.4 & 12.7 & 0.0775 & 0.0780 & 18.476 & -0.020322\\\\\n", + "\t 0.05230819 & 1 & 6.977281 & 0.0962 & 0.204611 & 0.077852 & 0.037 & 0.043 & 0.030 & 1.00 & ⋯ & 2.40 & 0.419 & 0.548 & 0.299 & 30.3 & 7.9 & 0.0668 & 0.0787 & 125.990 & 0.028916\\\\\n", + "\t 0.03639089 & 1 & 7.649693 & 0.0227 & 0.136287 & 0.046730 & 0.081 & 0.105 & 0.056 & 0.99 & ⋯ & 3.51 & 0.562 & 0.699 & 0.427 & 35.7 & 14.7 & 0.0872 & 0.0938 & 26.800 & 
0.020228\\\\\n", + "\t 0.02973823 & 1 & 8.056744 & 0.0208 & 0.197853 & 0.037224 & 0.083 & 0.097 & 0.069 & 1.00 & ⋯ & 3.30 & 0.722 & 0.765 & 0.680 & 36.6 & 12.6 & 0.0557 & 0.0624 & 0.052 & 0.013407\\\\\n", + "\t -0.05664358 & 1 & 8.780941 & 0.2654 & 0.189867 & 0.031747 & 0.068 & 0.089 & 0.046 & 0.94 & ⋯ & 2.99 & 0.372 & 0.462 & 0.281 & 34.0 & 20.3 & 0.3178 & 0.1583 & 4.500 & -0.024761\\\\\n", + "\t 0.01920480 & 1 & 6.287859 & 0.4207 & 0.130682 & 0.109921 & 0.053 & 0.039 & 0.011 & 0.74 & ⋯ & 0.34 & 0.142 & 0.223 & 0.055 & 35.5 & 19.1 & 0.0201 & 0.0341 & 4.762 & -0.021656\\\\\n", + "\t 0.08520600 & 1 & 6.137727 & 0.1371 & 0.123818 & 0.015897 & 0.028 & 0.025 & 0.007 & 0.72 & ⋯ & 0.56 & 0.148 & 0.232 & 0.065 & 41.3 & 21.3 & 0.0298 & 0.0297 & 4.125 & -0.054872\\\\\n", + "\t 0.13398221 & 1 & 8.128880 & 0.0000 & 0.167210 & 0.003311 & 0.129 & 0.196 & 0.063 & 1.00 & ⋯ & 13.16 & 1.727 & 1.910 & 1.560 & 28.1 & 23.2 & 0.0570 & 0.0609 & 360.000 & -0.054874\\\\\n", + "\t 0.17302474 & 1 & 6.680855 & 0.4713 & 0.228424 & 0.029328 & 0.062 & 0.090 & 0.032 & 1.00 & ⋯ & 3.95 & 0.974 & 1.526 & 0.470 & 62.4 & 34.9 & 0.0206 & 0.0618 & 265.690 & 0.018194\\\\\n", + "\t 0.10969915 & 1 & 7.177019 & 0.0178 & 0.185240 & 0.015453 & 0.020 & 0.026 & 0.013 & 0.90 & ⋯ & 1.89 & 0.571 & 0.843 & 0.286 & 26.9 & 24.1 & 0.2295 & 0.1990 & 3.061 & -0.034733\\\\\n", + "\t 0.01598990 & 1 & 6.648985 & 0.4762 & 0.171181 & 0.058937 & 0.018 & 0.028 & 0.007 & 0.40 & ⋯ & 0.76 & 0.357 & 0.512 & 0.185 & 39.9 & 26.9 & 0.0178 & 0.0634 & 4.762 & -0.000222\\\\\n", + "\t 0.06224977 & 1 & 6.879356 & 0.2927 & 0.179508 & 0.035842 & 0.188 & 0.169 & 0.208 & 1.00 & ⋯ & 3.69 & 0.651 & 0.759 & 0.547 & 31.4 & 31.2 & 0.0695 & 0.0728 & 4.017 & 0.033636\\\\\n", + "\t 0.10987069 & 1 & 7.347300 & 0.1017 & 0.247626 & 0.037392 & 0.080 & 0.133 & 0.027 & 0.78 & ⋯ & 0.72 & 0.195 & 0.303 & 0.085 & 36.2 & 21.6 & 0.0860 & 0.0898 & 3.177 & 0.010162\\\\\n", + "\t 0.09210628 & 1 & 6.725034 & 0.0266 & 0.179933 & 0.046376 & 0.015 & 0.020 & 0.010 & 0.78 & ⋯ & 0.86 & 0.258 & 0.382 & 0.137 & 34.6 & 16.5 & 0.0558 & 0.0613 & 20.800 & -0.018514\\\\\n", + "\t 0.08337604 & 1 & 8.451053 & 0.0000 & 0.358556 & 0.016468 & 0.090 & 0.133 & 0.044 & 1.00 & ⋯ & 2.91 & 0.766 & 1.087 & 0.510 & 21.9 & 15.3 & 0.1687 & 0.1635 & 26.000 & 0.010943\\\\\n", + "\t 0.07623345 & 1 & 8.602453 & 0.0000 & 0.416234 & 0.014721 & 0.148 & 0.194 & 0.100 & 1.00 & ⋯ & 12.17 & 1.554 & 1.724 & 1.398 & 20.6 & 7.2 & 0.2629 & 0.2698 & 50.000 & -0.001521\\\\\n", + "\t ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋱ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮\\\\\n", + "\t -0.03404539 & 1 & 8.346168 & 0.3199 & 0.110885 & 0.028579 & 0.272 & 0.289 & 0.272 & 1.00 & ⋯ & 9.58 & 0.919 & 0.936 & 0.902 & 18.2 & 7.7 & 0.0625 & 0.0578 & 36.603 & 0.014286\\\\\n", + "\t -0.03380635 & 1 & 7.303170 & 0.3133 & 0.165784 & 0.020115 & 0.112 & 0.132 & 0.065 & 0.85 & ⋯ & 5.60 & 1.158 & 1.473 & 0.862 & 22.7 & 18.2 & 0.1071 & 0.1028 & 20.000 & 0.111198\\\\\n", + "\t 0.06991488 & 1 & 7.859027 & 0.1222 & 0.078488 & 0.011581 & 0.107 & 0.103 & 0.092 & 0.88 & ⋯ & 2.80 & 0.596 & 0.645 & 0.548 & 21.7 & 14.4 & 0.0357 & 0.0466 & 8.127 & 0.006002\\\\\n", + "\t -0.08172560 & 1 & 7.998335 & 1.6378 & 0.137482 & 0.026547 & 0.156 & 0.181 & 0.150 & 1.00 & ⋯ & 13.74 & 1.339 & 1.222 & 1.445 & 34.8 & 15.2 & 0.0783 & 0.0847 & 4.911 & -0.127025\\\\\n", + "\t 0.04601005 & 1 & 7.655864 & 0.1345 & 0.164598 & 0.044446 & 0.080 & 0.097 & 0.058 & 1.00 & ⋯ & 8.25 & 1.076 & 1.143 & 1.013 & 32.1 & 19.4 & 0.0525 & 0.0572 & 30.929 & -0.004592\\\\\n", + "\t 0.06659809 & 1 & 7.675082 
& 0.0898 & 0.188016 & 0.045678 & 0.269 & 0.338 & 0.200 & 1.00 & ⋯ & 5.80 & 0.687 & 0.745 & 0.630 & 37.5 & 16.4 & 0.0906 & 0.0959 & 25.000 & 0.191066\\\\\n", + "\t -0.01138424 & 1 & 7.830028 & 0.4880 & 0.136287 & 0.046730 & 0.146 & 0.193 & 0.094 & 1.00 & ⋯ & 6.42 & 0.950 & 1.129 & 0.772 & 39.1 & 23.8 & 0.0764 & 0.0866 & 40.500 & -0.007018\\\\\n", + "\t -0.10098990 & 1 & 8.498622 & 0.0010 & 0.189867 & 0.031747 & 0.181 & 0.190 & 0.159 & 0.97 & ⋯ & 7.63 & 0.801 & 0.850 & 0.752 & 30.2 & 16.8 & 0.2131 & 0.1437 & 4.285 & 0.168536\\\\\n", + "\t 0.05475087 & 1 & 6.216606 & 0.7557 & 0.214345 & 0.073495 & 0.023 & 0.051 & 0.006 & 0.73 & ⋯ & 0.44 & 0.282 & 0.488 & 0.051 & 50.6 & 21.8 & 0.0232 & 0.0407 & 8.876 & -0.084064\\\\\n", + "\t 0.09461817 & 1 & 8.414496 & 0.0000 & 0.374328 & 0.000000 & 0.101 & 0.147 & 0.053 & 1.00 & ⋯ & 11.80 & 1.846 & 2.369 & 1.301 & 31.1 & 24.3 & 0.5958 & 0.5819 & 4.935 & 0.021808\\\\\n", + "\t 0.04571529 & 1 & 6.383507 & 0.3556 & 0.130682 & 0.109921 & 0.086 & 0.130 & 0.042 & 0.79 & ⋯ & 1.00 & 0.446 & 0.713 & 0.163 & 42.3 & 19.8 & 0.0188 & 0.0222 & 8.653 & -0.012443\\\\\n", + "\t 0.06549111 & 1 & 8.782323 & 0.0000 & 0.167210 & 0.003311 & 0.246 & 0.331 & 0.160 & 0.99 & ⋯ & 15.52 & 1.969 & 2.121 & 1.828 & 25.3 & 17.5 & 0.1032 & 0.0958 & 296.800 & -0.057094\\\\\n", + "\t 0.02124651 & 1 & 7.251345 & 0.0516 & 0.263813 & 0.045225 & 0.090 & 0.053 & 0.030 & 0.88 & ⋯ & 4.00 & 0.817 & 1.205 & 0.413 & 34.7 & 21.1 & 0.0730 & 0.2227 & 0.320 & 0.128443\\\\\n", + "\t 0.14144548 & 1 & 7.511525 & 0.1053 & 0.228424 & 0.029328 & 0.103 & 0.139 & 0.054 & 1.00 & ⋯ & 9.33 & 1.700 & 2.369 & 1.060 & 51.7 & 37.1 & 0.0903 & 0.1229 & 484.000 & 0.007257\\\\\n", + "\t 0.09681623 & 1 & 7.713785 & 0.0050 & 0.185240 & 0.015453 & 0.031 & 0.042 & 0.016 & 0.91 & ⋯ & 4.35 & 0.891 & 1.255 & 0.517 & 31.8 & 27.3 & 0.1922 & 0.1821 & 2.402 & 0.030424\\\\\n", + "\t 0.04053420 & 1 & 6.728629 & 0.6190 & 0.171181 & 0.058937 & 0.019 & 0.027 & 0.009 & 0.46 & ⋯ & 1.01 & 0.670 & 1.039 & 0.271 & 40.1 & 18.0 & 0.0281 & 0.0459 & 9.900 & -0.012137\\\\\n", + "\t 0.01058841 & 1 & 7.186144 & 0.0760 & 0.179508 & 0.035842 & 0.184 & 0.173 & 0.217 & 1.00 & ⋯ & 5.34 & 0.943 & 1.049 & 0.837 & 29.0 & 31.5 & 0.0703 & 0.0716 & 7.248 & 0.009640\\\\\n", + "\t 0.18552649 & 1 & 8.326033 & 0.0050 & 0.321658 & 0.005106 & 0.090 & 0.109 & 0.075 & 1.00 & ⋯ & 4.64 & 1.127 & 1.427 & 0.817 & 30.5 & 23.1 & 0.7470 & 0.8489 & 2.371 & 0.051395\\\\\n", + "\t 0.09310491 & 1 & 7.894691 & 0.1062 & 0.247626 & 0.037392 & 0.121 & 0.175 & 0.063 & 0.96 & ⋯ & 1.47 & 0.481 & 0.761 & 0.200 & 33.8 & 19.6 & 0.0797 & 0.1018 & 3.017 & 0.207492\\\\\n", + "\t 0.06522856 & 1 & 7.175490 & 0.0000 & 0.179933 & 0.046376 & 0.035 & 0.040 & 0.027 & 0.83 & ⋯ & 1.23 & 0.332 & 0.451 & 0.219 & 27.9 & 27.2 & 0.0636 & 0.0721 & 20.379 & 0.018019\\\\\n", + "\t 0.03809502 & 1 & 9.030974 & 0.0000 & 0.293138 & 0.005517 & 0.245 & 0.251 & 0.238 & 1.00 & ⋯ & 7.50 & 1.167 & 1.210 & 1.128 & 22.5 & 15.5 & 0.1662 & 0.1617 & 4.286 & -0.006642\\\\\n", + "\t 0.03421300 & 1 & 8.995537 & 0.0000 & 0.304720 & 0.011658 & 0.246 & 0.260 & 0.190 & 1.00 & ⋯ & 6.75 & 0.667 & 0.776 & 0.575 & 23.5 & 15.0 & 0.2597 & 0.2288 & 2.460 & -0.003241\\\\\n", + "\t 0.05275914 & 1 & 8.234830 & 0.0363 & 0.288405 & 0.011589 & 0.183 & 0.222 & 0.142 & 1.00 & ⋯ & 8.18 & 1.010 & 1.220 & 0.821 & 30.2 & 28.3 & 0.1044 & 0.1796 & 32.051 & -0.034352\\\\\n", + "\t 0.03841564 & 1 & 8.332549 & 0.0000 & 0.345485 & 0.006503 & 0.188 & 0.248 & 0.136 & 1.00 & ⋯ & 13.12 & 1.576 & 1.567 & 1.585 & 31.0 & 14.3 & 0.2866 & 0.3500 & 0.452 & 
-0.001660\\\\\n", + "\t 0.03189479 & 1 & 8.645586 & 0.0000 & 0.288440 & 0.005995 & 0.256 & 0.301 & 0.199 & 1.00 & ⋯ & 6.91 & 1.307 & 1.579 & 1.062 & 18.9 & 11.3 & 0.1296 & 0.1458 & 652.850 & -0.046278\\\\\n", + "\t 0.03119598 & 1 & 8.991064 & 0.0000 & 0.371898 & 0.014586 & 0.255 & 0.336 & 0.170 & 0.98 & ⋯ & 11.41 & 2.226 & 2.494 & 1.971 & 27.5 & 15.9 & 0.4407 & 0.4257 & 2.529 & -0.011883\\\\\n", + "\t 0.03409566 & 1 & 8.025189 & 0.0050 & 0.296437 & 0.013615 & 0.108 & 0.117 & 0.093 & 1.00 & ⋯ & 1.95 & 0.510 & 0.694 & 0.362 & 20.2 & 15.7 & 0.1669 & 0.2201 & 25.553 & -0.039080\\\\\n", + "\t 0.04690046 & 1 & 9.030137 & 0.0000 & 0.265778 & 0.008629 & 0.288 & 0.337 & 0.237 & 1.00 & ⋯ & 25.64 & 2.727 & 2.664 & 2.788 & 20.4 & 9.4 & 0.3238 & 0.3134 & 4.152 & 0.005175\\\\\n", + "\t 0.03977337 & 1 & 8.865312 & 0.0000 & 0.282939 & 0.005048 & 0.188 & 0.236 & 0.139 & 1.00 & ⋯ & 10.76 & 1.888 & 1.920 & 1.860 & 20.0 & 16.0 & 0.1845 & 0.1940 & 0.452 & -0.029551\\\\\n", + "\t 0.04064154 & 1 & 8.912339 & 0.0000 & 0.150366 & 0.024377 & 0.257 & 0.338 & 0.215 & 1.00 & ⋯ & 24.40 & 3.051 & 3.235 & 2.875 & 18.5 & 29.1 & 0.1876 & 0.2007 & 0.886 & -0.036482\\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A data.frame: 90 × 63\n", + "\n", + "| Outcome <dbl> | intercept <int> | gdpsh465 <dbl> | bmp1l <dbl> | freeop <dbl> | freetar <dbl> | h65 <dbl> | hm65 <dbl> | hf65 <dbl> | p65 <dbl> | ⋯ ⋯ | seccf65 <dbl> | syr65 <dbl> | syrm65 <dbl> | syrf65 <dbl> | teapri65 <dbl> | teasec65 <dbl> | ex1 <dbl> | im1 <dbl> | xr65 <dbl> | tot1 <dbl> |\n", + "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", + "| -0.02433575 | 1 | 6.591674 | 0.2837 | 0.153491 | 0.043888 | 0.007 | 0.013 | 0.001 | 0.29 | ⋯ | 0.04 | 0.033 | 0.057 | 0.010 | 47.6 | 17.3 | 0.0729 | 0.0667 | 0.348 | -0.014727 |\n", + "| 0.10047257 | 1 | 6.829794 | 0.6141 | 0.313509 | 0.061827 | 0.019 | 0.032 | 0.007 | 0.91 | ⋯ | 0.64 | 0.173 | 0.274 | 0.067 | 57.1 | 18.0 | 0.0940 | 0.1438 | 0.525 | 0.005750 |\n", + "| 0.06705148 | 1 | 8.895082 | 0.0000 | 0.204244 | 0.009186 | 0.260 | 0.325 | 0.201 | 1.00 | ⋯ | 18.14 | 2.573 | 2.478 | 2.667 | 26.5 | 20.7 | 0.1741 | 0.1750 | 1.082 | -0.010040 |\n", + "| 0.06408917 | 1 | 7.565275 | 0.1997 | 0.248714 | 0.036270 | 0.061 | 0.070 | 0.051 | 1.00 | ⋯ | 2.63 | 0.438 | 0.453 | 0.424 | 27.8 | 22.7 | 0.1265 | 0.1496 | 6.625 | -0.002195 |\n", + "| 0.02792955 | 1 | 7.162397 | 0.1740 | 0.299252 | 0.037367 | 0.017 | 0.027 | 0.007 | 0.82 | ⋯ | 2.11 | 0.257 | 0.287 | 0.229 | 34.5 | 17.6 | 0.1211 | 0.1308 | 2.500 | 0.003283 |\n", + "| 0.04640744 | 1 | 7.218910 | 0.0000 | 0.258865 | 0.020880 | 0.023 | 0.038 | 0.006 | 0.50 | ⋯ | 1.46 | 0.160 | 0.174 | 0.146 | 34.3 | 8.1 | 0.0634 | 0.0762 | 1.000 | -0.001747 |\n", + "| 0.06733234 | 1 | 7.853605 | 0.0000 | 0.182525 | 0.014385 | 0.039 | 0.063 | 0.014 | 0.92 | ⋯ | 1.59 | 0.342 | 0.484 | 0.207 | 46.6 | 14.7 | 0.0342 | 0.0428 | 12.499 | 0.009092 |\n", + "| 0.02097768 | 1 | 7.703910 | 0.2776 | 0.215275 | 0.029713 | 0.024 | 0.035 | 0.013 | 0.69 | ⋯ | 1.63 | 0.184 | 0.219 | 0.152 | 34.0 | 16.1 | 0.0864 | 0.0931 | 7.000 | 0.011630 |\n", + "| 0.03355124 | 1 | 9.063463 | 0.0000 | 0.109614 | 0.002171 | 0.402 | 0.488 | 0.314 | 1.00 | ⋯ | 24.72 | 3.206 | 3.154 | 3.253 | 28.2 | 20.6 | 0.0594 | 0.0460 | 1.000 | 0.008169 |\n", + "| 0.03914652 | 1 | 8.151910 | 0.1484 | 0.110885 | 0.028579 | 0.145 | 0.173 | 0.114 | 1.00 | ⋯ | 6.76 | 0.703 | 0.785 | 0.620 | 20.3 | 7.2 | 0.0524 | 0.0523 | 2.119 | 0.007584 |\n", + "| 0.07612651 | 1 | 6.929517 | 0.0296 | 0.165784 
| 0.020115 | 0.046 | 0.066 | 0.025 | 0.73 | ⋯ | 6.21 | 1.316 | 1.683 | 0.969 | 27.8 | 17.2 | 0.0560 | 0.0826 | 11.879 | 0.086032 |\n", + "| 0.12795121 | 1 | 7.237778 | 0.2151 | 0.078488 | 0.011581 | 0.022 | 0.031 | 0.014 | 1.00 | ⋯ | 3.96 | 0.594 | 0.674 | 0.515 | 28.2 | 14.8 | 0.0270 | 0.0275 | 1.938 | 0.007666 |\n", + "| -0.02432609 | 1 | 8.115820 | 0.4318 | 0.137482 | 0.026547 | 0.059 | 0.073 | 0.045 | 1.00 | ⋯ | 11.36 | 1.132 | 1.126 | 1.138 | 52.1 | 18.8 | 0.0804 | 0.0930 | 0.003 | 0.016968 |\n", + "| 0.07829342 | 1 | 7.271704 | 0.1689 | 0.164598 | 0.044446 | 0.029 | 0.045 | 0.013 | 0.84 | ⋯ | 3.10 | 0.568 | 0.695 | 0.450 | 35.9 | 13.1 | 0.0617 | 0.0678 | 10.479 | 0.004573 |\n", + "| 0.11291155 | 1 | 7.121252 | 0.1832 | 0.188016 | 0.045678 | 0.033 | 0.051 | 0.015 | 0.91 | ⋯ | 3.16 | 0.440 | 0.512 | 0.369 | 37.4 | 12.7 | 0.0775 | 0.0780 | 18.476 | -0.020322 |\n", + "| 0.05230819 | 1 | 6.977281 | 0.0962 | 0.204611 | 0.077852 | 0.037 | 0.043 | 0.030 | 1.00 | ⋯ | 2.40 | 0.419 | 0.548 | 0.299 | 30.3 | 7.9 | 0.0668 | 0.0787 | 125.990 | 0.028916 |\n", + "| 0.03639089 | 1 | 7.649693 | 0.0227 | 0.136287 | 0.046730 | 0.081 | 0.105 | 0.056 | 0.99 | ⋯ | 3.51 | 0.562 | 0.699 | 0.427 | 35.7 | 14.7 | 0.0872 | 0.0938 | 26.800 | 0.020228 |\n", + "| 0.02973823 | 1 | 8.056744 | 0.0208 | 0.197853 | 0.037224 | 0.083 | 0.097 | 0.069 | 1.00 | ⋯ | 3.30 | 0.722 | 0.765 | 0.680 | 36.6 | 12.6 | 0.0557 | 0.0624 | 0.052 | 0.013407 |\n", + "| -0.05664358 | 1 | 8.780941 | 0.2654 | 0.189867 | 0.031747 | 0.068 | 0.089 | 0.046 | 0.94 | ⋯ | 2.99 | 0.372 | 0.462 | 0.281 | 34.0 | 20.3 | 0.3178 | 0.1583 | 4.500 | -0.024761 |\n", + "| 0.01920480 | 1 | 6.287859 | 0.4207 | 0.130682 | 0.109921 | 0.053 | 0.039 | 0.011 | 0.74 | ⋯ | 0.34 | 0.142 | 0.223 | 0.055 | 35.5 | 19.1 | 0.0201 | 0.0341 | 4.762 | -0.021656 |\n", + "| 0.08520600 | 1 | 6.137727 | 0.1371 | 0.123818 | 0.015897 | 0.028 | 0.025 | 0.007 | 0.72 | ⋯ | 0.56 | 0.148 | 0.232 | 0.065 | 41.3 | 21.3 | 0.0298 | 0.0297 | 4.125 | -0.054872 |\n", + "| 0.13398221 | 1 | 8.128880 | 0.0000 | 0.167210 | 0.003311 | 0.129 | 0.196 | 0.063 | 1.00 | ⋯ | 13.16 | 1.727 | 1.910 | 1.560 | 28.1 | 23.2 | 0.0570 | 0.0609 | 360.000 | -0.054874 |\n", + "| 0.17302474 | 1 | 6.680855 | 0.4713 | 0.228424 | 0.029328 | 0.062 | 0.090 | 0.032 | 1.00 | ⋯ | 3.95 | 0.974 | 1.526 | 0.470 | 62.4 | 34.9 | 0.0206 | 0.0618 | 265.690 | 0.018194 |\n", + "| 0.10969915 | 1 | 7.177019 | 0.0178 | 0.185240 | 0.015453 | 0.020 | 0.026 | 0.013 | 0.90 | ⋯ | 1.89 | 0.571 | 0.843 | 0.286 | 26.9 | 24.1 | 0.2295 | 0.1990 | 3.061 | -0.034733 |\n", + "| 0.01598990 | 1 | 6.648985 | 0.4762 | 0.171181 | 0.058937 | 0.018 | 0.028 | 0.007 | 0.40 | ⋯ | 0.76 | 0.357 | 0.512 | 0.185 | 39.9 | 26.9 | 0.0178 | 0.0634 | 4.762 | -0.000222 |\n", + "| 0.06224977 | 1 | 6.879356 | 0.2927 | 0.179508 | 0.035842 | 0.188 | 0.169 | 0.208 | 1.00 | ⋯ | 3.69 | 0.651 | 0.759 | 0.547 | 31.4 | 31.2 | 0.0695 | 0.0728 | 4.017 | 0.033636 |\n", + "| 0.10987069 | 1 | 7.347300 | 0.1017 | 0.247626 | 0.037392 | 0.080 | 0.133 | 0.027 | 0.78 | ⋯ | 0.72 | 0.195 | 0.303 | 0.085 | 36.2 | 21.6 | 0.0860 | 0.0898 | 3.177 | 0.010162 |\n", + "| 0.09210628 | 1 | 6.725034 | 0.0266 | 0.179933 | 0.046376 | 0.015 | 0.020 | 0.010 | 0.78 | ⋯ | 0.86 | 0.258 | 0.382 | 0.137 | 34.6 | 16.5 | 0.0558 | 0.0613 | 20.800 | -0.018514 |\n", + "| 0.08337604 | 1 | 8.451053 | 0.0000 | 0.358556 | 0.016468 | 0.090 | 0.133 | 0.044 | 1.00 | ⋯ | 2.91 | 0.766 | 1.087 | 0.510 | 21.9 | 15.3 | 0.1687 | 0.1635 | 26.000 | 0.010943 |\n", + "| 0.07623345 | 1 | 8.602453 | 0.0000 | 0.416234 | 0.014721 | 
0.148 | 0.194 | 0.100 | 1.00 | ⋯ | 12.17 | 1.554 | 1.724 | 1.398 | 20.6 | 7.2 | 0.2629 | 0.2698 | 50.000 | -0.001521 |\n", + "| ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋱ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |\n", + "| -0.03404539 | 1 | 8.346168 | 0.3199 | 0.110885 | 0.028579 | 0.272 | 0.289 | 0.272 | 1.00 | ⋯ | 9.58 | 0.919 | 0.936 | 0.902 | 18.2 | 7.7 | 0.0625 | 0.0578 | 36.603 | 0.014286 |\n", + "| -0.03380635 | 1 | 7.303170 | 0.3133 | 0.165784 | 0.020115 | 0.112 | 0.132 | 0.065 | 0.85 | ⋯ | 5.60 | 1.158 | 1.473 | 0.862 | 22.7 | 18.2 | 0.1071 | 0.1028 | 20.000 | 0.111198 |\n", + "| 0.06991488 | 1 | 7.859027 | 0.1222 | 0.078488 | 0.011581 | 0.107 | 0.103 | 0.092 | 0.88 | ⋯ | 2.80 | 0.596 | 0.645 | 0.548 | 21.7 | 14.4 | 0.0357 | 0.0466 | 8.127 | 0.006002 |\n", + "| -0.08172560 | 1 | 7.998335 | 1.6378 | 0.137482 | 0.026547 | 0.156 | 0.181 | 0.150 | 1.00 | ⋯ | 13.74 | 1.339 | 1.222 | 1.445 | 34.8 | 15.2 | 0.0783 | 0.0847 | 4.911 | -0.127025 |\n", + "| 0.04601005 | 1 | 7.655864 | 0.1345 | 0.164598 | 0.044446 | 0.080 | 0.097 | 0.058 | 1.00 | ⋯ | 8.25 | 1.076 | 1.143 | 1.013 | 32.1 | 19.4 | 0.0525 | 0.0572 | 30.929 | -0.004592 |\n", + "| 0.06659809 | 1 | 7.675082 | 0.0898 | 0.188016 | 0.045678 | 0.269 | 0.338 | 0.200 | 1.00 | ⋯ | 5.80 | 0.687 | 0.745 | 0.630 | 37.5 | 16.4 | 0.0906 | 0.0959 | 25.000 | 0.191066 |\n", + "| -0.01138424 | 1 | 7.830028 | 0.4880 | 0.136287 | 0.046730 | 0.146 | 0.193 | 0.094 | 1.00 | ⋯ | 6.42 | 0.950 | 1.129 | 0.772 | 39.1 | 23.8 | 0.0764 | 0.0866 | 40.500 | -0.007018 |\n", + "| -0.10098990 | 1 | 8.498622 | 0.0010 | 0.189867 | 0.031747 | 0.181 | 0.190 | 0.159 | 0.97 | ⋯ | 7.63 | 0.801 | 0.850 | 0.752 | 30.2 | 16.8 | 0.2131 | 0.1437 | 4.285 | 0.168536 |\n", + "| 0.05475087 | 1 | 6.216606 | 0.7557 | 0.214345 | 0.073495 | 0.023 | 0.051 | 0.006 | 0.73 | ⋯ | 0.44 | 0.282 | 0.488 | 0.051 | 50.6 | 21.8 | 0.0232 | 0.0407 | 8.876 | -0.084064 |\n", + "| 0.09461817 | 1 | 8.414496 | 0.0000 | 0.374328 | 0.000000 | 0.101 | 0.147 | 0.053 | 1.00 | ⋯ | 11.80 | 1.846 | 2.369 | 1.301 | 31.1 | 24.3 | 0.5958 | 0.5819 | 4.935 | 0.021808 |\n", + "| 0.04571529 | 1 | 6.383507 | 0.3556 | 0.130682 | 0.109921 | 0.086 | 0.130 | 0.042 | 0.79 | ⋯ | 1.00 | 0.446 | 0.713 | 0.163 | 42.3 | 19.8 | 0.0188 | 0.0222 | 8.653 | -0.012443 |\n", + "| 0.06549111 | 1 | 8.782323 | 0.0000 | 0.167210 | 0.003311 | 0.246 | 0.331 | 0.160 | 0.99 | ⋯ | 15.52 | 1.969 | 2.121 | 1.828 | 25.3 | 17.5 | 0.1032 | 0.0958 | 296.800 | -0.057094 |\n", + "| 0.02124651 | 1 | 7.251345 | 0.0516 | 0.263813 | 0.045225 | 0.090 | 0.053 | 0.030 | 0.88 | ⋯ | 4.00 | 0.817 | 1.205 | 0.413 | 34.7 | 21.1 | 0.0730 | 0.2227 | 0.320 | 0.128443 |\n", + "| 0.14144548 | 1 | 7.511525 | 0.1053 | 0.228424 | 0.029328 | 0.103 | 0.139 | 0.054 | 1.00 | ⋯ | 9.33 | 1.700 | 2.369 | 1.060 | 51.7 | 37.1 | 0.0903 | 0.1229 | 484.000 | 0.007257 |\n", + "| 0.09681623 | 1 | 7.713785 | 0.0050 | 0.185240 | 0.015453 | 0.031 | 0.042 | 0.016 | 0.91 | ⋯ | 4.35 | 0.891 | 1.255 | 0.517 | 31.8 | 27.3 | 0.1922 | 0.1821 | 2.402 | 0.030424 |\n", + "| 0.04053420 | 1 | 6.728629 | 0.6190 | 0.171181 | 0.058937 | 0.019 | 0.027 | 0.009 | 0.46 | ⋯ | 1.01 | 0.670 | 1.039 | 0.271 | 40.1 | 18.0 | 0.0281 | 0.0459 | 9.900 | -0.012137 |\n", + "| 0.01058841 | 1 | 7.186144 | 0.0760 | 0.179508 | 0.035842 | 0.184 | 0.173 | 0.217 | 1.00 | ⋯ | 5.34 | 0.943 | 1.049 | 0.837 | 29.0 | 31.5 | 0.0703 | 0.0716 | 7.248 | 0.009640 |\n", + "| 0.18552649 | 1 | 8.326033 | 0.0050 | 0.321658 | 0.005106 | 0.090 | 0.109 | 0.075 | 1.00 | ⋯ | 4.64 | 1.127 | 1.427 | 0.817 | 30.5 | 23.1 | 0.7470 | 0.8489 | 2.371 | 
0.051395 |\n", + "| 0.09310491 | 1 | 7.894691 | 0.1062 | 0.247626 | 0.037392 | 0.121 | 0.175 | 0.063 | 0.96 | ⋯ | 1.47 | 0.481 | 0.761 | 0.200 | 33.8 | 19.6 | 0.0797 | 0.1018 | 3.017 | 0.207492 |\n", + "| 0.06522856 | 1 | 7.175490 | 0.0000 | 0.179933 | 0.046376 | 0.035 | 0.040 | 0.027 | 0.83 | ⋯ | 1.23 | 0.332 | 0.451 | 0.219 | 27.9 | 27.2 | 0.0636 | 0.0721 | 20.379 | 0.018019 |\n", + "| 0.03809502 | 1 | 9.030974 | 0.0000 | 0.293138 | 0.005517 | 0.245 | 0.251 | 0.238 | 1.00 | ⋯ | 7.50 | 1.167 | 1.210 | 1.128 | 22.5 | 15.5 | 0.1662 | 0.1617 | 4.286 | -0.006642 |\n", + "| 0.03421300 | 1 | 8.995537 | 0.0000 | 0.304720 | 0.011658 | 0.246 | 0.260 | 0.190 | 1.00 | ⋯ | 6.75 | 0.667 | 0.776 | 0.575 | 23.5 | 15.0 | 0.2597 | 0.2288 | 2.460 | -0.003241 |\n", + "| 0.05275914 | 1 | 8.234830 | 0.0363 | 0.288405 | 0.011589 | 0.183 | 0.222 | 0.142 | 1.00 | ⋯ | 8.18 | 1.010 | 1.220 | 0.821 | 30.2 | 28.3 | 0.1044 | 0.1796 | 32.051 | -0.034352 |\n", + "| 0.03841564 | 1 | 8.332549 | 0.0000 | 0.345485 | 0.006503 | 0.188 | 0.248 | 0.136 | 1.00 | ⋯ | 13.12 | 1.576 | 1.567 | 1.585 | 31.0 | 14.3 | 0.2866 | 0.3500 | 0.452 | -0.001660 |\n", + "| 0.03189479 | 1 | 8.645586 | 0.0000 | 0.288440 | 0.005995 | 0.256 | 0.301 | 0.199 | 1.00 | ⋯ | 6.91 | 1.307 | 1.579 | 1.062 | 18.9 | 11.3 | 0.1296 | 0.1458 | 652.850 | -0.046278 |\n", + "| 0.03119598 | 1 | 8.991064 | 0.0000 | 0.371898 | 0.014586 | 0.255 | 0.336 | 0.170 | 0.98 | ⋯ | 11.41 | 2.226 | 2.494 | 1.971 | 27.5 | 15.9 | 0.4407 | 0.4257 | 2.529 | -0.011883 |\n", + "| 0.03409566 | 1 | 8.025189 | 0.0050 | 0.296437 | 0.013615 | 0.108 | 0.117 | 0.093 | 1.00 | ⋯ | 1.95 | 0.510 | 0.694 | 0.362 | 20.2 | 15.7 | 0.1669 | 0.2201 | 25.553 | -0.039080 |\n", + "| 0.04690046 | 1 | 9.030137 | 0.0000 | 0.265778 | 0.008629 | 0.288 | 0.337 | 0.237 | 1.00 | ⋯ | 25.64 | 2.727 | 2.664 | 2.788 | 20.4 | 9.4 | 0.3238 | 0.3134 | 4.152 | 0.005175 |\n", + "| 0.03977337 | 1 | 8.865312 | 0.0000 | 0.282939 | 0.005048 | 0.188 | 0.236 | 0.139 | 1.00 | ⋯ | 10.76 | 1.888 | 1.920 | 1.860 | 20.0 | 16.0 | 0.1845 | 0.1940 | 0.452 | -0.029551 |\n", + "| 0.04064154 | 1 | 8.912339 | 0.0000 | 0.150366 | 0.024377 | 0.257 | 0.338 | 0.215 | 1.00 | ⋯ | 24.40 | 3.051 | 3.235 | 2.875 | 18.5 | 29.1 | 0.1876 | 0.2007 | 0.886 | -0.036482 |\n", + "\n" + ], + "text/plain": [ + " Outcome intercept gdpsh465 bmp1l freeop freetar h65 hm65 hf65 \n", + "1 -0.02433575 1 6.591674 0.2837 0.153491 0.043888 0.007 0.013 0.001\n", + "2 0.10047257 1 6.829794 0.6141 0.313509 0.061827 0.019 0.032 0.007\n", + "3 0.06705148 1 8.895082 0.0000 0.204244 0.009186 0.260 0.325 0.201\n", + "4 0.06408917 1 7.565275 0.1997 0.248714 0.036270 0.061 0.070 0.051\n", + "5 0.02792955 1 7.162397 0.1740 0.299252 0.037367 0.017 0.027 0.007\n", + "6 0.04640744 1 7.218910 0.0000 0.258865 0.020880 0.023 0.038 0.006\n", + "7 0.06733234 1 7.853605 0.0000 0.182525 0.014385 0.039 0.063 0.014\n", + "8 0.02097768 1 7.703910 0.2776 0.215275 0.029713 0.024 0.035 0.013\n", + "9 0.03355124 1 9.063463 0.0000 0.109614 0.002171 0.402 0.488 0.314\n", + "10 0.03914652 1 8.151910 0.1484 0.110885 0.028579 0.145 0.173 0.114\n", + "11 0.07612651 1 6.929517 0.0296 0.165784 0.020115 0.046 0.066 0.025\n", + "12 0.12795121 1 7.237778 0.2151 0.078488 0.011581 0.022 0.031 0.014\n", + "13 -0.02432609 1 8.115820 0.4318 0.137482 0.026547 0.059 0.073 0.045\n", + "14 0.07829342 1 7.271704 0.1689 0.164598 0.044446 0.029 0.045 0.013\n", + "15 0.11291155 1 7.121252 0.1832 0.188016 0.045678 0.033 0.051 0.015\n", + "16 0.05230819 1 6.977281 0.0962 0.204611 0.077852 0.037 0.043 0.030\n", + "17 
0.03639089 1 7.649693 0.0227 0.136287 0.046730 0.081 0.105 0.056\n", + "18 0.02973823 1 8.056744 0.0208 0.197853 0.037224 0.083 0.097 0.069\n", + "19 -0.05664358 1 8.780941 0.2654 0.189867 0.031747 0.068 0.089 0.046\n", + "20 0.01920480 1 6.287859 0.4207 0.130682 0.109921 0.053 0.039 0.011\n", + "21 0.08520600 1 6.137727 0.1371 0.123818 0.015897 0.028 0.025 0.007\n", + "22 0.13398221 1 8.128880 0.0000 0.167210 0.003311 0.129 0.196 0.063\n", + "23 0.17302474 1 6.680855 0.4713 0.228424 0.029328 0.062 0.090 0.032\n", + "24 0.10969915 1 7.177019 0.0178 0.185240 0.015453 0.020 0.026 0.013\n", + "25 0.01598990 1 6.648985 0.4762 0.171181 0.058937 0.018 0.028 0.007\n", + "26 0.06224977 1 6.879356 0.2927 0.179508 0.035842 0.188 0.169 0.208\n", + "27 0.10987069 1 7.347300 0.1017 0.247626 0.037392 0.080 0.133 0.027\n", + "28 0.09210628 1 6.725034 0.0266 0.179933 0.046376 0.015 0.020 0.010\n", + "29 0.08337604 1 8.451053 0.0000 0.358556 0.016468 0.090 0.133 0.044\n", + "30 0.07623345 1 8.602453 0.0000 0.416234 0.014721 0.148 0.194 0.100\n", + "⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ \n", + "61 -0.03404539 1 8.346168 0.3199 0.110885 0.028579 0.272 0.289 0.272\n", + "62 -0.03380635 1 7.303170 0.3133 0.165784 0.020115 0.112 0.132 0.065\n", + "63 0.06991488 1 7.859027 0.1222 0.078488 0.011581 0.107 0.103 0.092\n", + "64 -0.08172560 1 7.998335 1.6378 0.137482 0.026547 0.156 0.181 0.150\n", + "65 0.04601005 1 7.655864 0.1345 0.164598 0.044446 0.080 0.097 0.058\n", + "66 0.06659809 1 7.675082 0.0898 0.188016 0.045678 0.269 0.338 0.200\n", + "67 -0.01138424 1 7.830028 0.4880 0.136287 0.046730 0.146 0.193 0.094\n", + "68 -0.10098990 1 8.498622 0.0010 0.189867 0.031747 0.181 0.190 0.159\n", + "69 0.05475087 1 6.216606 0.7557 0.214345 0.073495 0.023 0.051 0.006\n", + "70 0.09461817 1 8.414496 0.0000 0.374328 0.000000 0.101 0.147 0.053\n", + "71 0.04571529 1 6.383507 0.3556 0.130682 0.109921 0.086 0.130 0.042\n", + "72 0.06549111 1 8.782323 0.0000 0.167210 0.003311 0.246 0.331 0.160\n", + "73 0.02124651 1 7.251345 0.0516 0.263813 0.045225 0.090 0.053 0.030\n", + "74 0.14144548 1 7.511525 0.1053 0.228424 0.029328 0.103 0.139 0.054\n", + "75 0.09681623 1 7.713785 0.0050 0.185240 0.015453 0.031 0.042 0.016\n", + "76 0.04053420 1 6.728629 0.6190 0.171181 0.058937 0.019 0.027 0.009\n", + "77 0.01058841 1 7.186144 0.0760 0.179508 0.035842 0.184 0.173 0.217\n", + "78 0.18552649 1 8.326033 0.0050 0.321658 0.005106 0.090 0.109 0.075\n", + "79 0.09310491 1 7.894691 0.1062 0.247626 0.037392 0.121 0.175 0.063\n", + "80 0.06522856 1 7.175490 0.0000 0.179933 0.046376 0.035 0.040 0.027\n", + "81 0.03809502 1 9.030974 0.0000 0.293138 0.005517 0.245 0.251 0.238\n", + "82 0.03421300 1 8.995537 0.0000 0.304720 0.011658 0.246 0.260 0.190\n", + "83 0.05275914 1 8.234830 0.0363 0.288405 0.011589 0.183 0.222 0.142\n", + "84 0.03841564 1 8.332549 0.0000 0.345485 0.006503 0.188 0.248 0.136\n", + "85 0.03189479 1 8.645586 0.0000 0.288440 0.005995 0.256 0.301 0.199\n", + "86 0.03119598 1 8.991064 0.0000 0.371898 0.014586 0.255 0.336 0.170\n", + "87 0.03409566 1 8.025189 0.0050 0.296437 0.013615 0.108 0.117 0.093\n", + "88 0.04690046 1 9.030137 0.0000 0.265778 0.008629 0.288 0.337 0.237\n", + "89 0.03977337 1 8.865312 0.0000 0.282939 0.005048 0.188 0.236 0.139\n", + "90 0.04064154 1 8.912339 0.0000 0.150366 0.024377 0.257 0.338 0.215\n", + " p65 ⋯ seccf65 syr65 syrm65 syrf65 teapri65 teasec65 ex1 im1 xr65 \n", + "1 0.29 ⋯ 0.04 0.033 0.057 0.010 47.6 17.3 0.0729 0.0667 0.348\n", + "2 0.91 ⋯ 0.64 0.173 0.274 0.067 57.1 18.0 0.0940 0.1438 0.525\n", + "3 1.00 ⋯ 
18.14 2.573 2.478 2.667 26.5 20.7 0.1741 0.1750 1.082\n", + "4 1.00 ⋯ 2.63 0.438 0.453 0.424 27.8 22.7 0.1265 0.1496 6.625\n", + "5 0.82 ⋯ 2.11 0.257 0.287 0.229 34.5 17.6 0.1211 0.1308 2.500\n", + "6 0.50 ⋯ 1.46 0.160 0.174 0.146 34.3 8.1 0.0634 0.0762 1.000\n", + "7 0.92 ⋯ 1.59 0.342 0.484 0.207 46.6 14.7 0.0342 0.0428 12.499\n", + "8 0.69 ⋯ 1.63 0.184 0.219 0.152 34.0 16.1 0.0864 0.0931 7.000\n", + "9 1.00 ⋯ 24.72 3.206 3.154 3.253 28.2 20.6 0.0594 0.0460 1.000\n", + "10 1.00 ⋯ 6.76 0.703 0.785 0.620 20.3 7.2 0.0524 0.0523 2.119\n", + "11 0.73 ⋯ 6.21 1.316 1.683 0.969 27.8 17.2 0.0560 0.0826 11.879\n", + "12 1.00 ⋯ 3.96 0.594 0.674 0.515 28.2 14.8 0.0270 0.0275 1.938\n", + "13 1.00 ⋯ 11.36 1.132 1.126 1.138 52.1 18.8 0.0804 0.0930 0.003\n", + "14 0.84 ⋯ 3.10 0.568 0.695 0.450 35.9 13.1 0.0617 0.0678 10.479\n", + "15 0.91 ⋯ 3.16 0.440 0.512 0.369 37.4 12.7 0.0775 0.0780 18.476\n", + "16 1.00 ⋯ 2.40 0.419 0.548 0.299 30.3 7.9 0.0668 0.0787 125.990\n", + "17 0.99 ⋯ 3.51 0.562 0.699 0.427 35.7 14.7 0.0872 0.0938 26.800\n", + "18 1.00 ⋯ 3.30 0.722 0.765 0.680 36.6 12.6 0.0557 0.0624 0.052\n", + "19 0.94 ⋯ 2.99 0.372 0.462 0.281 34.0 20.3 0.3178 0.1583 4.500\n", + "20 0.74 ⋯ 0.34 0.142 0.223 0.055 35.5 19.1 0.0201 0.0341 4.762\n", + "21 0.72 ⋯ 0.56 0.148 0.232 0.065 41.3 21.3 0.0298 0.0297 4.125\n", + "22 1.00 ⋯ 13.16 1.727 1.910 1.560 28.1 23.2 0.0570 0.0609 360.000\n", + "23 1.00 ⋯ 3.95 0.974 1.526 0.470 62.4 34.9 0.0206 0.0618 265.690\n", + "24 0.90 ⋯ 1.89 0.571 0.843 0.286 26.9 24.1 0.2295 0.1990 3.061\n", + "25 0.40 ⋯ 0.76 0.357 0.512 0.185 39.9 26.9 0.0178 0.0634 4.762\n", + "26 1.00 ⋯ 3.69 0.651 0.759 0.547 31.4 31.2 0.0695 0.0728 4.017\n", + "27 0.78 ⋯ 0.72 0.195 0.303 0.085 36.2 21.6 0.0860 0.0898 3.177\n", + "28 0.78 ⋯ 0.86 0.258 0.382 0.137 34.6 16.5 0.0558 0.0613 20.800\n", + "29 1.00 ⋯ 2.91 0.766 1.087 0.510 21.9 15.3 0.1687 0.1635 26.000\n", + "30 1.00 ⋯ 12.17 1.554 1.724 1.398 20.6 7.2 0.2629 0.2698 50.000\n", + "⋮ ⋮ ⋱ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ \n", + "61 1.00 ⋯ 9.58 0.919 0.936 0.902 18.2 7.7 0.0625 0.0578 36.603\n", + "62 0.85 ⋯ 5.60 1.158 1.473 0.862 22.7 18.2 0.1071 0.1028 20.000\n", + "63 0.88 ⋯ 2.80 0.596 0.645 0.548 21.7 14.4 0.0357 0.0466 8.127\n", + "64 1.00 ⋯ 13.74 1.339 1.222 1.445 34.8 15.2 0.0783 0.0847 4.911\n", + "65 1.00 ⋯ 8.25 1.076 1.143 1.013 32.1 19.4 0.0525 0.0572 30.929\n", + "66 1.00 ⋯ 5.80 0.687 0.745 0.630 37.5 16.4 0.0906 0.0959 25.000\n", + "67 1.00 ⋯ 6.42 0.950 1.129 0.772 39.1 23.8 0.0764 0.0866 40.500\n", + "68 0.97 ⋯ 7.63 0.801 0.850 0.752 30.2 16.8 0.2131 0.1437 4.285\n", + "69 0.73 ⋯ 0.44 0.282 0.488 0.051 50.6 21.8 0.0232 0.0407 8.876\n", + "70 1.00 ⋯ 11.80 1.846 2.369 1.301 31.1 24.3 0.5958 0.5819 4.935\n", + "71 0.79 ⋯ 1.00 0.446 0.713 0.163 42.3 19.8 0.0188 0.0222 8.653\n", + "72 0.99 ⋯ 15.52 1.969 2.121 1.828 25.3 17.5 0.1032 0.0958 296.800\n", + "73 0.88 ⋯ 4.00 0.817 1.205 0.413 34.7 21.1 0.0730 0.2227 0.320\n", + "74 1.00 ⋯ 9.33 1.700 2.369 1.060 51.7 37.1 0.0903 0.1229 484.000\n", + "75 0.91 ⋯ 4.35 0.891 1.255 0.517 31.8 27.3 0.1922 0.1821 2.402\n", + "76 0.46 ⋯ 1.01 0.670 1.039 0.271 40.1 18.0 0.0281 0.0459 9.900\n", + "77 1.00 ⋯ 5.34 0.943 1.049 0.837 29.0 31.5 0.0703 0.0716 7.248\n", + "78 1.00 ⋯ 4.64 1.127 1.427 0.817 30.5 23.1 0.7470 0.8489 2.371\n", + "79 0.96 ⋯ 1.47 0.481 0.761 0.200 33.8 19.6 0.0797 0.1018 3.017\n", + "80 0.83 ⋯ 1.23 0.332 0.451 0.219 27.9 27.2 0.0636 0.0721 20.379\n", + "81 1.00 ⋯ 7.50 1.167 1.210 1.128 22.5 15.5 0.1662 0.1617 4.286\n", + "82 1.00 ⋯ 6.75 0.667 0.776 0.575 23.5 15.0 0.2597 0.2288 2.460\n", + "83 1.00 ⋯ 
8.18 1.010 1.220 0.821 30.2 28.3 0.1044 0.1796 32.051\n", + "84 1.00 ⋯ 13.12 1.576 1.567 1.585 31.0 14.3 0.2866 0.3500 0.452\n", + "85 1.00 ⋯ 6.91 1.307 1.579 1.062 18.9 11.3 0.1296 0.1458 652.850\n", + "86 0.98 ⋯ 11.41 2.226 2.494 1.971 27.5 15.9 0.4407 0.4257 2.529\n", + "87 1.00 ⋯ 1.95 0.510 0.694 0.362 20.2 15.7 0.1669 0.2201 25.553\n", + "88 1.00 ⋯ 25.64 2.727 2.664 2.788 20.4 9.4 0.3238 0.3134 4.152\n", + "89 1.00 ⋯ 10.76 1.888 1.920 1.860 20.0 16.0 0.1845 0.1940 0.452\n", + "90 1.00 ⋯ 24.40 3.051 3.235 2.875 18.5 29.1 0.1876 0.2007 0.886\n", + " tot1 \n", + "1 -0.014727\n", + "2 0.005750\n", + "3 -0.010040\n", + "4 -0.002195\n", + "5 0.003283\n", + "6 -0.001747\n", + "7 0.009092\n", + "8 0.011630\n", + "9 0.008169\n", + "10 0.007584\n", + "11 0.086032\n", + "12 0.007666\n", + "13 0.016968\n", + "14 0.004573\n", + "15 -0.020322\n", + "16 0.028916\n", + "17 0.020228\n", + "18 0.013407\n", + "19 -0.024761\n", + "20 -0.021656\n", + "21 -0.054872\n", + "22 -0.054874\n", + "23 0.018194\n", + "24 -0.034733\n", + "25 -0.000222\n", + "26 0.033636\n", + "27 0.010162\n", + "28 -0.018514\n", + "29 0.010943\n", + "30 -0.001521\n", + "⋮ ⋮ \n", + "61 0.014286\n", + "62 0.111198\n", + "63 0.006002\n", + "64 -0.127025\n", + "65 -0.004592\n", + "66 0.191066\n", + "67 -0.007018\n", + "68 0.168536\n", + "69 -0.084064\n", + "70 0.021808\n", + "71 -0.012443\n", + "72 -0.057094\n", + "73 0.128443\n", + "74 0.007257\n", + "75 0.030424\n", + "76 -0.012137\n", + "77 0.009640\n", + "78 0.051395\n", + "79 0.207492\n", + "80 0.018019\n", + "81 -0.006642\n", + "82 -0.003241\n", + "83 -0.034352\n", + "84 -0.001660\n", + "85 -0.046278\n", + "86 -0.011883\n", + "87 -0.039080\n", + "88 0.005175\n", + "89 -0.029551\n", + "90 -0.036482" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "growth" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-AMcbsgefhTg" + }, + "source": [ + "Thus $p \\approx 60$, $n=90$ and $p/n$ is not small. We expect the least squares method to provide a poor estimate of $\\beta_1$. We expect the method based on partialling-out with Lasso to provide a high quality estimate of $\\beta_1$.\n", + "To check this hypothesis, we analyze the relation between the output variable $Y$ and the other country's characteristics by running a linear regression in the first step." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "DncWsRS9mgAp", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "## Create the outcome variable y and covariates x\n", + "y <- growth$Outcome\n", + "X <- growth[-which(colnames(growth) %in% c(\"intercept\"))]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vPO08MjomqfZ", + "outputId": "afa89548-e9ef-4060-d5db-eb8e632e8e95", + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The estimated coefficient on gdpsh465 is -0.009377989 and the corresponding robust standard error is 0.032421195% Confidence Interval: [ -0.07292335 , 0.05416737 ]" + ] + } + ], + "source": [ + "fit <- lm(Outcome ~ ., data = X)\n", + "est <- summary(fit)$coef[\"gdpsh465\", 1]\n", + "\n", + "hcv_coefs <- vcovHC(fit, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", + "se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors\n", + "\n", + "# print unconditional effect of gdpsh465 and the corresponding standard error\n", + "cat(\"The estimated coefficient on gdpsh465 is\", est,\n", + " \" and the corresponding robust standard error is\", se)\n", + "\n", + "# Calculate the 95% confidence interval for 'gdpsh465'\n", + "lower_ci <- est - 1.96 * se\n", + "upper_ci <- est + 1.96 * se\n", + "\n", + "cat(\"95% Confidence Interval: [\", lower_ci, \",\", upper_ci, \"]\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D7nJZzhGfjQT" + }, + "source": [ + "## Summarize OLS results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "EwGVcIVAfRe5", + "outputId": "87f41279-8907-415b-f8eb-589f736089b2", + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Method Estimate Std. Error lower bound CI\n", + "1 OLS -0.00937798878257774 0.0324211014253974 -0.0729233475763566\n", + " upper bound CI\n", + "1 0.0541673700112012\n" + ] + } + ], + "source": [ + "# Create an empty data frame with column names\n", + "table <- data.frame(\n", + " Method = character(0),\n", + " Estimate = character(0),\n", + " `Std. Error` = numeric(0),\n", + " `Lower Bound CI` = numeric(0),\n", + " `Upper Bound CI` = numeric(0)\n", + ")\n", + "\n", + "# Add OLS results to the table\n", + "table <- rbind(table, c(\"OLS\", est, se, lower_ci, upper_ci))\n", + "\n", + "# Rename the columns to match the Python table\n", + "colnames(table) <- c(\"Method\", \"Estimate\", \"Std. Error\", \"lower bound CI\", \"upper bound CI\")\n", + "\n", + "# Print the table\n", + "print(table)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KfrhJqKhfwKB" + }, + "source": [ + "Least squares provides a rather noisy estimate (high standard error) of the\n", + "speed of convergence, and does not allow us to answer the question\n", + "about the convergence hypothesis since the confidence interval includes zero.\n", + "\n", + "In contrast, we can use the partialling-out approach based on lasso regression (\"Double Lasso\")." 
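For reference, the partialling-out ("Double Lasso") estimator used below can be written compactly. With $\tilde{Y} = Y - \widehat{E}[Y \mid W]$ and $\tilde{D} = D - \widehat{E}[D \mid W]$ denoting the Lasso residuals, the `double_lasso` helper defined further down computes

$$
\hat{\beta}_1 = \frac{\mathbb{E}_n[\tilde{Y}\tilde{D}]}{\mathbb{E}_n[\tilde{D}^2]}, \qquad
\hat{V} = \frac{\mathbb{E}_n[\hat{\varepsilon}^2 \tilde{D}^2]}{\big(\mathbb{E}_n[\tilde{D}^2]\big)^2}, \qquad
\widehat{\mathrm{se}}(\hat{\beta}_1) = \sqrt{\hat{V}/n},
$$

where $\hat{\varepsilon} = \tilde{Y} - \hat{\beta}_1 \tilde{D}$ and $\mathbb{E}_n$ denotes the sample average; the $95\%$ confidence interval is then $\hat{\beta}_1 \pm 1.96\,\widehat{\mathrm{se}}(\hat{\beta}_1)$.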
+ ] }, - "id": "Ctl5T5vUygRk", - "outputId": "1fc3990f-10c2-4e94-b1e9-a13b7a08cbab", - "vscode": { - "languageId": "r" + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "D9Y2U1Ldf1eB", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "y <- growth$Outcome\n", + "W <- growth[-which(colnames(growth) %in% c(\"Outcome\", \"intercept\", \"gdpsh465\"))]\n", + "D <- growth$gdpsh465" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8yNU2UgefzCZ" + }, + "source": [ + "## Method 1: Lasso with Theoretical Penalty using HDM" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tQPxdzQ2f84M" + }, + "source": [ + "While cross validation is commonly employed for choosing penalty parameters in Lasso, it can be very noisy and tends to choose relatively small penalty leading to some overfitting. For this reason, one should not use cross validation to choose tuning parameters unless sample splitting is employed. We illustrate the use of sample combined with cross validation in later chapters in the book. Since we are using the full sample here, it is much better (and theoretically valid) to use penalties that provably control overfitting, which is what we do here.\n", + "\n", + "We report the results using cross validation at the end of this notebook for comparison. There, we observe overfitting for the prediction of the outcome." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "DIzy51tZsoWp", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "double_lasso <- function(y, D, W) {\n", + " require(rlasso)\n", + "\n", + " # residualize outcome with Lasso\n", + " yfit_rlasso <- rlasso(W, y, post = FALSE)\n", + " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))\n", + " yres <- y - as.numeric(yhat_rlasso)\n", + "\n", + "\n", + " # residualize treatment with Lasso\n", + " dfit_rlasso <- rlasso(W, D, post = FALSE)\n", + " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W))\n", + " dres <- D - as.numeric(dhat_rlasso)\n", + "\n", + " # rest is the same as in the OLS case\n", + " hat <- mean(yres * dres) / mean(dres^2)\n", + " epsilon <- yres - hat * dres\n", + " V <- mean(epsilon^2 * dres^2) / mean(dres^2)^2\n", + " stderr <- sqrt(V / length(y))\n", + "\n", + " return(list(hat = hat, stderr = stderr))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "Ncz7Uqn5sqqU", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "results <- double_lasso(y, D, W)\n", + "hat <- results$hat\n", + "stderr <- results$stderr\n", + "# Calculate the 95% confidence interval\n", + "ci_lower <- hat - 1.96 * stderr\n", + "ci_upper <- hat + 1.96 * stderr" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "P5PEjKw9gLvC" + }, + "source": [ + "The least square method provides a rather noisy estimate of the speed of convergence. We can not answer the question if poor countries grow faster than rich countries. The least square method does not work when the ratio $p/n$ is large.\n", + "\n", + "In sharp contrast, partialling-out via Lasso provides a more precise estimate. The Lasso based point estimate is $-5\\%$ and the $95\\%$ confidence interval for the (annual) rate of convergence $[-7.8\\%,-2.2\\%]$ only includes negative numbers. This empirical evidence does support the convergence hypothesis." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tNLVM4WEgL9v", + "outputId": "1f2683b7-630a-43c5-e110-74c527603850", + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Method Estimate Std. Error lower bound CI\n", + "1 OLS -0.00937798878257774 0.0324211014253974 -0.0729233475763566\n", + "2 Double Lasso -0.0446926781072429 0.0178230525741694 -0.0796258611526148\n", + " upper bound CI\n", + "1 0.0541673700112012\n", + "2 -0.00975949506187093\n" + ] + } + ], + "source": [ + "# Add Double Lasso results to the table\n", + "table <- rbind(table, c(\"Double Lasso\", hat, stderr, ci_lower, ci_upper))\n", + "\n", + "# Print the table\n", + "print(table)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "smPkxqCpgMR8" + }, + "source": [ + "## Method 2: Lasso with Cross-Validation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MH-eUye8liRq" + }, + "source": [ + "This section is for illustration purposes only. Given that we are using the full sample, cross validation *should not* be used for choosing tuning parameters here. Cross validation tends to (mildly) overfit, and this overfitting can lead to substantial problems when inference about parameters is the goal." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "YhpTUkE_wQz9", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Choose penalty based on KFold cross validation\n", + "set.seed(123)\n", + "# Given small sample size, we use an aggressive number of 20 folds\n", + "n_folds <- 20\n", + "\n", + "\n", + "# Define LassoCV models for y and D\n", + "model_y <- cv.glmnet(\n", + " x = as.matrix(W),\n", + " y = y,\n", + " alpha = 1, # Lasso penalty\n", + " nfolds = n_folds,\n", + " family = \"gaussian\"\n", + ")\n", + "\n", + "model_d <- cv.glmnet(\n", + " x = as.matrix(W),\n", + " y = D,\n", + " alpha = 1, # Lasso penalty\n", + " nfolds = n_folds,\n", + " family = \"gaussian\"\n", + ")\n", + "\n", + "# Get the best lambda values for y and D\n", + "best_lambda_y <- model_y$lambda.min\n", + "best_lambda_d <- model_d$lambda.min\n", + "\n", + "# Fit Lasso models with the best lambda values\n", + "lasso_model_y <- glmnet(as.matrix(W), y, alpha = 1, lambda = best_lambda_y)\n", + "lasso_model_d <- glmnet(as.matrix(W), D, alpha = 1, lambda = best_lambda_d)\n", + "\n", + "# Calculate the residuals\n", + "res_y <- y - predict(lasso_model_y, s = best_lambda_y, newx = as.matrix(W))\n", + "res_d <- D - predict(lasso_model_d, s = best_lambda_d, newx = as.matrix(W))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "cbVsr86tyqTY", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "tmp_df <- as.data.frame(cbind(res_y, res_d))\n", + "colnames(tmp_df) <- c(\"res_y\", \"res_D\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "D7SzuZ2P0P0X", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "fit_cv <- lm(res_y ~ res_d, data = tmp_df)\n", + "est_cv <- summary(fit_cv)$coef[\"res_D\", 1]\n", + "\n", + "hcv_cv_coefs <- vcovHC(fit_cv, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", + "se_cv <- sqrt(diag(hcv_cv_coefs))[2] # Estimated std errors\n", + "\n", + "# Calculate the 95% confidence interval for 'gdpsh465'\n", + "lower_ci_cv <- est_cv - 1.96 * se_cv\n", + "upper_ci_cv 
<- est_cv + 1.96 * se_cv" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Ctl5T5vUygRk", + "outputId": "1fc3990f-10c2-4e94-b1e9-a13b7a08cbab", + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Method Estimate Std. Error lower bound CI\n", + "1 OLS -0.00937798878257774 0.0324211014253974 -0.0729233475763566\n", + "2 Double Lasso -0.0446926781072429 0.0178230525741694 -0.0796258611526148\n", + "3 Double Lasso CV -0.00210480949226998 0.00822866735729585 -0.0182329975125698\n", + " upper bound CI\n", + "1 0.0541673700112012\n", + "2 -0.00975949506187093\n", + "3 0.0140233785280299\n" + ] + } + ], + "source": [ + "# Add LassoCV results to the table\n", + "table <- rbind(table, c(\"Double Lasso CV\", est_cv, se_cv, lower_ci_cv, upper_ci_cv))\n", + "\n", + "# Print the table\n", + "print(table)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0LzDsUi8gmQM" + }, + "source": [ + "We find that the outcome model chooses too small of a penalty based on cross-validation, leading to overfitting of the outcome and tiny outcome residuals. This leads to artificially small standard errors and a zero treatment effect. Theoretically driven penalty should be preferred for such small sample sizes." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 857 + }, + "id": "7uzcIGhVgmei", + "outputId": "ecff8a3f-60da-4b92-c6f7-cb40a116ec82", + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA0gAAANICAMAAADKOT/pAAADAFBMVEUAAAABAQECAgIDAwMEBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUWFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJycoKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6Ojo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tMTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1eXl5fX19gYGBhYWFiYmJjY2NkZGRlZWVmZmZnZ2doaGhpaWlqampra2tsbGxtbW1ubm5vb29wcHBxcXFycnJzc3N0dHR1dXV2dnZ3d3d4eHh5eXl6enp7e3t8fHx9fX1+fn5/f3+AgICBgYGCgoKDg4OEhISFhYWGhoaHh4eIiIiJiYmKioqLi4uMjIyNjY2Ojo6Pj4+QkJCRkZGSkpKTk5OUlJSVlZWWlpaXl5eYmJiZmZmampqbm5ucnJydnZ2enp6fn5+goKChoaGioqKjo6OkpKSlpaWmpqanp6eoqKipqamqqqqrq6usrKytra2urq6vr6+wsLCxsbGysrKzs7O0tLS1tbW2tra3t7e4uLi5ubm6urq7u7u8vLy9vb2+vr6/v7/AwMDBwcHCwsLDw8PExMTFxcXGxsbHx8fIyMjJycnKysrLy8vMzMzNzc3Ozs7Pz8/Q0NDR0dHS0tLT09PU1NTV1dXW1tbX19fY2NjZ2dna2trb29vc3Nzd3d3e3t7f39/g4ODh4eHi4uLj4+Pk5OTl5eXm5ubn5+fo6Ojp6enq6urr6+vs7Ozt7e3u7u7v7+/w8PDx8fHy8vLz8/P09PT19fX29vb39/f4+Pj5+fn6+vr7+/v8/Pz9/f3+/v7////isF19AAAACXBIWXMAABJ0AAASdAHeZh94AAAgAElEQVR4nO3deaBM9f/H8fddrddyUdakFLIVkaVIi+qXaEUpUimlpEUqRAqlpH0laVPas5WkolIpVL7JEsp6nTZEcnF+Z5mZ+5l7z3vOLO85M4fX8497586d+7nvc8zDnTt35gzpCKGEo1QPgNCBECAhJBAgISQQICEkECAhJBAgISQQICEkECBF0RLqGH7GUHpMZuFWOeXWJbKA1CAxVmKHsBeMaQOH0FOxjRH/7ou046LdqWGXSxDSlpHtquXkHz9kbclPvfheYksvoSMTW8CoYGSbqrm12j6w1Th9No0Pnd+GpoS+C3ULnjudaCgzSsfwM5x2dTy7ogl1umkrP757zL+5sVVWZRpctyqR5bmihhT1Blq76L66L8c0RgK7z95xzv8wKYD0TBnKbdn5hNKUPabE56pfmdDSEpCmlKWc408/Jpcqf6jrM6lh8PxllP9v6LtkZBcETl+YkQCkeHbFbqpQ6LoREWMhle9h1L1jZSrzUWLfwbFoIUW/gfFcWxLZffaOc/6u3kN6lrLv3W683zWhNI0q9rlfKOWQplLm0G3G+4KBlLVY31ePPgt84ka6pei7HBv8SfVnqSbxQ4prV/xFdd23ImIspMC+23ktHbYvwe/huH7HqC4X9QbGdW1JZPdZO475rp5D2lKG3gic/DAz82ddH0CTzQ8W0tn6BeYti/a6vu+J48uVP8W6Bu95+PjypY4csNE4OYze+7Jj+ap9tu+
f0KBMozH7jbP2P9emfOmGw/4JrR4GadvtDUvn1r/VZKG/0alyTo0zZxU7qaweaHul0E3uIXS1ccOBLrU/2p2fEbrBs4SurNbEPvkk3WRBUlda16NKmeaT7OuNMqG9C7NofSy7IlDR8t2sm1+hUZSNUTa35K66id6a16FC+ZPmhgaJsO/2VaClRbtE+RZru+cbW7aFWoTNGvat76T3nqhZMXx5dYdE/KcxC2ygur6xOcvOrVaq+avWcMErR2AX2b8jhV9Pii4dcfepO0UdOGwFZVRzx9nftT3NsFb4JPRvVHyndqT3rfOn0ykO14JAiUAaZe15u0toQNi/yIzLqc2EacbNJTrmul7l6UVjr51FDW8cfhbVWGd+5R2VegyoTZcNqdO/d675Wf1SqnHLnW3o2O3BFVVIe06ilrfe2IBa7TX/7692zfAr8jNeDDuprh7oSTo+ePLfX403WqnSf1ofvUKd9aLvckV/WmSdbH3oNBOSutKfdajDsGuq97WuN8qEJSBFsyvslOXfH0OVJ0z4O/AJZWPUzS25q4bQ9WW63tItI+uT4CCR9l19+jo0mfIt/qhNne7pX/1a89qjXtHVbz2Sbi57cb+w5cN2SKR/GqvABqrrj6KRFTvfdDbRx7py5QjsIgtS+PVEuXTE3afuFHVgdQV1VHPH2d91Il1grXAtPR34HsV36jPUxzr/MnM7SlwLAiUCqSO9FDo9k44O/xd5w/qxOZXOMqb+uWy5HcZebrtbN/+L6K7rY6mUscG/ZuU0/EPXJ1IXXX+dWhqj7b+ebg+uqEJ6i9oYy/zX0PyvoSmtNs5Zn9cm7KS6eqALlHsXrHrRw9b7k+md0HlLqM83dJ15ajnd/IYJSV3pLuphnNxc3bzeqBPau/Djj3bHsCsCqcuH3TZRNkbd3JK7aihlTjcu+QC1CQwSad+tyMz52+lb3EUXmlt2qLll6qzqtx5DFY3fLcOWV3dIxH8aO3sD1fXHUq65p241r5vKlcPeRRak8OtJ0aUj7z51p6gDqyuoo1o7zvqu28vm/m5cYu8hpf4KrRW+U//MrbzHOHt3xTLbHa4FgRKBVIeWhE5vpOz9DteezrTAPGfCLb/o7cm+1ZKbu8vYvDPM08fSE8bbLdRI10+jOdaez6kRXFGFtPZt64fGELrX+K4Zm83T/+lhJ9XVAx1nL1nUF9TYfLcyo/be0HkGJL1pZfPf5nZaZkFSV2pOC83Td5vXG3XCEreio9gVgdTlwyApG6NubsldNdS+DbK7bMYf9iD8vtvydn3zx6PDt2hOn5snRxSHFP6tLRHq8uoOsWP+aeycIFnDf01tw64cCqTw60nRpSPvPnWnqAOrK6ijFkEyftA8arz9yPqfJbBWsZ16Dn1gnHyXeka4FiQCqVLR7Xt9B9FOh2tPOQper/eXJvu/xqbGLamxNMQ82cHak7vocF0vT/Yt/OPo18AXFL+zYfvmzXebV/TrqOGkzfZZRSfDVg9U3/5HV2pOXxhvb1PvDTAhPUxTjdsMtVvrJiR1pX259vgfmNcbdcISkKLYFQ67IQySsl3q5pbcVUPpNusSzeh7exCnfRfqyv+cvoWxZTvN97OLQwr/1jeZHyrLh+2QkrMW2wRnSNb9PCvpWPXKoUAqdj0punTk3afuFHV/FF8hOKoCaZ75e6Lej94rWqvYTp1KV+nmTfaZEa4FiUCqW3Sl1ddTrsPtmX+odPAC280LmJ1Cs43Ne8A82ZGWG2//NfbHrqJ/+S8DXxAG6Z32pa1PGrtgzzU5RMfctkZXT6qr7+1o9rHegmbp4T1t/oDfc2j2pqKzTEi/lzJ+Z/rQ+Ec0IakrbQuMv8i43oRNWAKS+64ITBW2G6xrQnBcZbvUzS2xq4xvbt9k7Wj852gO4rjvrLu/T6SGP5kflvwW26hUaMvCIYV9a/OefHV5dYdE/qexc4Jkbc4qaq5eORRITtcT89J2JXdfIGWnhO2PsBWUURVI++vRD3phlap7itYqtlN3lq+6V/8375DCCNeCRCCdQc+ETk+nZg7Xnl2UtT9wgR2UY5842fg5WeLa8S9ljAgU/GdQIT1DeYNenTn7Gvvu6U1Pd8uj3NfVk+rqhdZ2TtUvLnE/9D8Vyvytv0kXKWeZkPTumb/plxifMiGpK/0duLp9ZVxvwiYsAcl9VwSmCtsN1jUhOK6yXermOkGaYK3QwfgF2hyE33e7jjCubrrTtwhu2cLikEp+a3V5dYe4/dOYRYSkXDkUSE7XkyJIJXdfIGWnhO0PdQV1VAWScfv2ZuNH8w3KWsV3ai9jV79FA/UI14JEII0vuulq/NwzfrReT8+bp98uuj2TR1rwEmXJ/mWuMX3ncO2oSMX/RK1CqmX/Ceiu0N95/n0qu9Ju9aS6eqBJdGTw73X7Ry233g8wftM4k+aFfZc+5k+j0dvL9NItSOpKe7PsGx/vmtcbdcISkKLYFQ67oeQfQuyNUTfXCdIw67LNaJk9CL/vZtNh24t9yv4We7PtLXvb3DJ1VodvrSwftkPsIv3TBDZQXV+9YitXDuV3JIfrSREkdvepO0XdH+oK6qgqpLUZtfZfptykKLlTZ9J1ek/rEuy1IBFIf1UI3WM4LzP3N/O+Eevn6x1F155O1v+D+phTvzD+r7Dur/8ju8xuh2tHZ3rTWuiP0OoKpN1U3ny3v7W5C9bZN8w60E/qSXX1QLsOCdxu1vV7qZ31/n90UkFW6AEO9nfpY/zGcFirF837Ry1I6koN6Svz9K3m9UadsASkKHZFIHX5MEhFGxO2uU6QrHvvt+dmbbMHibDvLqJrlSmV/dXIvhl4k7llyqxO31pdXt0hVtw/TWC3WBuo7gv1iq1cORRIDtcTBRK3+9Sdog6srBA2qgrJGGRW+Ua6slbxnVpYtc6ucg2L7wy5RzaYDx0w/8fb/WRZ606lJ6mj8cP6p6rmHptO5xrnvEAtjV/O1lYu+6c+hdqZv/feYv7iVvLa8To1ManPzw7dd6L+RMon47q5f2R1GqAvpVPMZbbXzvpdORm2erDpGdTX/N1383VUIfBP2zHjDnpE3QQLkj4i89R6+wOQ1JVutu5NX1PZvvu7aEJ7F376SQhtFLsikLq8ek1QN0bZXEdIWea9KI9Sp9A9tey+21gh4zPHb3ErnW/Mt7qSuWXqrA7fWl1e3SGR/mkCn7Q3UF1fpaFcOexdZEFyuJ4okLjdp+4UdWB1BXXbrB0X/IeZQvVIeWCXw069lkab9/U5XQsCJfZYu5fLU+4J/9e+HJW2HkNQUIHa3tSz/AN0lvFvlJFzRX99Xxeq2/+yPHrO2IJu1GTwnafS0b87XTuM32hq3Tzigpy80J8Pl1DZs+1eMf79jrr33jYNPqQq962/hI4YcNd1delG80ZU6KS6eqg3K1Bm89Ob5FKdHwLnvEbZZf9SLhGAtC6T7tYDkNSVNlal42/oWdH+D1iZsMQfZKPYFYHU5cN+Iikbo26uE6Q+eX3u6Z2VszA4iMO+C/4n9AjV3+X0LTZVoxNu7VVxgLll6qwO31pdPmyHWDH/NHb2Bqrrq1ds5c
ph7yILksP1RIHE7T51p6gDqyuoo1o7LvgPszOPjN+SlbVK7NQFlJextvjOkHz099a721bNyW89LHA/2I+nlC1/wrsanWycvq9qqRbGD8XxzcqU62D9VlL4cIuypRreYV6PHSDte65tXnbt3stDSxfdhTtC/3fokaXqXPe7fnm56j/se6Jd1ayKJz2/33yISeikunpRv49qUzW7coeJwYeo6nuqF3t4lQ1JPz3TvOfYghS20vJulUo3fe4POiF8QgdI7rsikLJ8GCRlY9TNdYL05NyO5ct3nB8axGHfhR4i1LLocYVh++tnc8smLrVEKLM6fOuw5dUdYsX809gFNlBZP4yGcuWwdpH9EKGS1xMFErf71J2iDqyuoI5q77jgP8yV5oN/lLVK7NT9h9NJevGdIQkJeZ/kc5Cifj5EupfYThmjPDAlzgDJdwGSQwntlD11qu52v1TkAMl3AZJDCe2UQSX+4hh7gOS7AMmh+HfK8iEnUvNd7pdzCZB8FyA5FP9O+Tiz/MUJPd3fDpAQEgiQEBIIkBASCJAQEgiQEBIIkBASCJAQEgiQEBIIkBASKAFI//zt1r+F210vE3//7Eri4n8X/pfM1XcndccU7kzi6tuSu2MKtyVx9Z3SO6boifwJQPpbc2u3/qfrZeJvx84kLq7pe5K5+p5k7pjt+o4krv5HYRIX1/bovydx9X+kd8yfgOQWIDEBkhIguQZITICkBEiuARITICkBkmuAxARISoDkGiAxAZISILkGSEyApARIrgESEyApAZJrgMQESEqA5BogMQGSEiC5BkhMgKQESK4BEhMgKQGSa4DEBEhKgOQaIDEBkhIguQZITICkBEiuARITICkBkmuAxARISoDkGiAxAZISILkGSEyApARIrgESEyApAZJrgMQESEqA5BogMQGSEiC5BkhMgKQESK4BEhMgKQGSa4DEBEhKgOQaIDEBkhIguQZITICkBEiuARITICkBkmuAxARISoDkGiAxAZISILkGSEyApARIrgESEyApeQbp3YnCkysBEhMgcfkXUrOsn4VHLwqQmACJy7+QjqYvhEcvCpCYAInLv5COpE+FRy8KkJgAicu/kOrSHOHRiwIkJkDi8i+k2jRTePSiAIkJkLj8C6k6vSM8elGAxARIXP6FVI2mCY9eFCAxARKXfyHl0wPCoxcFSEyAxOVfSBWor/DoRQESEyBx+RdSWeomPHpRgMQESFz+hVSKThIevShAYgIkLv9CyqLjhEcvCpCYAInLt5AKiZoIj14UIDEBEpdvIe0kaig8elGAxARIXL6F9BfREcKjFwVITIDE5VtIBUR1hEcvCpCYAInLt5DWE1UXHr0oQGICJC7fQlpDlC88elGAxARIXL6FtIIoT3j0ogCJCZC4fAtpGVFp4dGLAiQmQOLyLaQlRFnCoxcFSEyAxOVbSIuIqEB49lCAxARIXL6FtNCAtEF49lCAxARIXL6FNN+AtFZ49lCAxARIXL6F9LEBaaXw7KEAiQmQuHwL6QMD0nLh2UMBEhMgcfkW0vsGpB+EZw8FSEyAxOVbSG8ZkBYLzx4KkJgAicu3kF4zIH0tPHsoQGICJC7fQnrRgJS0g38DEhMgcfkW0iTKTN7BvwGJCZC40hPSjr/d2vM0laVPXS8WZ7t2J2tlM31vMlcv3J7ExXfpu5K4+vbk7hh9WxJX/1d6x2yXgLTbtb2PURWa7365+Crcm6yVzfR9yVx9339JXLxQL0zi6v8ld8dEcbWKv0LxHSMBKYqbdg9RLZou/NM0FG7aMeGmHVd63rSLAtL9VD95R9EHJCZA4vItpNHUNHlH0QckJkDi8i2kkdSaXhGePRQgMQESl28hDacONEV49lCAxARIXL6FdAedTs8Lzx4KkJgAicu3kG6js+k54dlDARITIHH5FtLNdB49LTx7KEBiAiQu30K6kXrQE8KzhwIkJkDi8i2kAXQZPSY8eyhAYgIkLt9C6k9X0sPCs4cCJCZA4vItpH7Un8YLzx4KkJgAicu3kPrSjTROePZQgMQESFy+hXQZDaaxwrOHAiQmQOLyLaRLaCiNFp49FCAxARKXbyF1p5E0Snj2UIDEBEhcvoV0AY2lEcKzhwIkJkDi8i2kc+lBGiY8eyhAYgIkLt9COocepTuFZw8FSEyAxOVbSGfTkzREePZQgMQESFy+hXQWPUe3Cs8eCpCYAInLt5A602S6SXj2UIDEBEhcvoV0Gr1CNwrPHgqQmACJy7eQTqFpdL3w7KEAiQmQuHwL6WR6m64Vnj0UIDEBEpdvIXWg6XS18OyhAIkJkLh8C6k9zaarhGcPBUhMgMTlW0jtaC71FZ49FCAxARKXbyGdQJ9Rb+HZQwESEyBx+RZS68wF1Et49lCAxARIXL6FdHz2l9RTePZQgMQESFy+hdQi52u6SHj2UIDEBEhcvoV0bKlv6QLh2UMBEhMgcfkWUvPSS6ib8OyhAIkJkLh8C6lp2R+oi/DsoQCJCZC4fAupcbmf6Czh2UMBEhMgcfkWUqO8FdRZePZQgMQESFy+hdSw4mo6RXj2UIDEBEhcvoV0dOW1dLLw7KEAiQmQuHwL6aj89XSi8OyhAIkJkLh8C+mIqhuprfDsoQCJCZC4fAvp8GoF1Ep49lCAxARIXL6FdNihWkZL4dlDARITIHH5FlLtGlp2c+HZQwESEyBx+RZSzVpaqSbCs4cCJCZA4vItpOp1tLKNhGcPBUhMgMTlW0iH1tUqHC08eyhAYgIkLt9CqlZPq3yE8OyhAIkJkLh8C6nKkVrVw4RnDwVITIDE5VtIletrh9YWnj0UIDEBEpdvIVU8WqtVQ3j2UIDEBEhcvoWU11CrU0149lCAxARIXL6FVO4YrV6+8OyhAIkJkLh8C6lME61+ReHZQwESEyBx+RZSqWZaw/LCs4cCJCZA4vItpJxjtWNKC88eCpCYAInLt5CyjtOa5QrPHgqQmACJy7eQMlpqx2YJzx4KkJgAicu3kOh4rWWG8OyhAIkJkLj8CmkXtdZa0Vbh4YMBEhMgcfkV0k46QTuBNgsPHwyQmACJy6+QdlBbrR1tFB4+tDogOQdIXH6FtJ3aaSfSeuHhgwESEyBx+RXS33Si1pHWCQ8fDJCYAInLr5D+opO0TvSL8PDBAIkJkLj8CukP6qidSquEhw8GSEyAxOVXSL8bkDrTz8LDBwMkJkDi8iskjU7WzqTlwsMHAyQmQOLyK6QC6qSdTcuEhw8GSEyAxOVXSFvoVO0c+l54+GCAxARIXH6FtJlO07rREuHhgwESEyBx+RXSJjpdO5++Ex4+GCAxARKXfyF11i6ib4SHDwZITIDE5VdIG+hMrQd9JTx8MEBiAiQuv0JaT2dpF9MXwsMHAyQmQOLyK6Tf6P+0XjRfePhggMQESFx+hfQrna31ps+Ehw8GSEyAxOVXSOuoi9aX5gkPHwyQmACJy6+Q1tA52pU0V3j4YIDEBEhcfoX0C3XV+tGHwsMHAyQmQOLyK6TV1E3rT7OEhw8GSEyAxOVXSCvpPO06mik8fDBAYgIkLr9CWkHnazfQ+
8LDBwMkJkDi8iukn+kCbRC9Izx8MEBiAiQuv0L6iS7SbqK3hIcPBkhMgMTlV0j/o+7arTRNePhggMQESFx+hbSMemi30WvCwwcDJCZA4vIrpB+pp3YHvSI8fDBAYgIkLr9C+p4u0YbSS8LDBwMkJkDi8iukpdRLG05ThIcPBkhMgMTlV0hLDEgjabLw8MEAiQmQuPwKaTFdqo2iicLDBwMkJkDi8iuk7+gybQw9Izx8MEBiAiQuv0L6lnprY+lJ4eGDARITIHH5FdIi6qM9QI8JDx8MkJgAicuvkL6hy7WH6BHh4YMBEhMgcfkV0tfUV3uEHhIePhggMQESl18hfUVXaI/ROOHhgwESEyBx+RXSQrpSe4rGCg8fDJCYAInLr5C+pKu0Z2m08PDBAIkJkLj8CukL6qdNoruFhw8GSEyAxOVXSJ/T1doLdJfw8MEAiQmQuPwM6SUaKjx8MEBiAiQuv0JaQNdor9IdwsMHAyQmQOLyL6T+2us0WHj4YIDEBEhcfoU0n67V3qSbhYcPBkhMgMTlZ0jv0CDh4YMBEhMgcfkV0mcGpPfpBuHhgwESEyBx+RXSp3SdNsvAlJwAiQmQuPwK6RMD0gfUT3j4YIDEBEhcfoU0j67XPqKrhIcPBkhMgMTlZ0jzqK/w8MEAiQmQuPwK6WO6QZtPlwkPHwyQmACJy6+Q5tJA7XO6RHj4YIDEBEhcfoa0kHoKDx8MkJgAicuvkD6iG7VFdKHw8MEAiQmQuFIIacf4PpfcXWCf/uOBS7vfviIGSHNokLaYzhMePhggMQESVwoh3TNkzcYHBuyzTt805JdND/b6NzZIS6mr8PDBAIkJkLhSB0nr+ovxU+ncpebp7WN+0/Wt56yMHtKHdJP2I3URHj4YIDEBElfqIH15wX7j7fWvh85Y3s382l0bjLQ/3fqQbvlzBZ3lern42rkrSQtb6YXJXL3w7yQu/o++M4mr/7U3iYv/uUf/K4mr75TeMX9HC+mDy823Q58Nfrz9usnmu3ktjb6O/KVGc2mo/jud7Xo5hPzZvtApN0h9zbchSOuvftL8CaUvG2K0fLdbc+j23Vuos+vl4qtwb5IWttL3JXP1ff8lcfFCvTCJq/+3P4mL796nJ3P1QvEdEy2kr+ybdm/YHy29ZLryOfffkWbRrdo66ih8uzQYfkdiwu9IXKn7HemPrqt0fVu3ZdYH/7v4W/Vz7pBm0mBtPZ0oPHwwQGICJK4U3v09dtCaDSNv3q/PeV//r99U82tjuPvbhLSR2ggPHwyQmACJK4WQdk7o3WuMcfFxw/Sl51jNiB7SDLpN20qthIcPBkhMgMTl14cImZC07ObCwwcDJCZA4vIrpOk0RNNKHyM8fDBAYgIkLr9Cet+EVOFo4eGDARITIHH5GlL+4cLDBwMkJkDi8iuk90xIh9YWHj4YIDEBEpd/Id2uaXUOER4+GCAxARKXXyG9ax5Av16+8PDBAIkJkLj8CukdE1KDPOHhgwESEyBx+RfSnZrWuJTw8MEAiQmQuPwK6W0T0nFZwsMHAyQmQOLyL6ShmtaKCoSnDwRITIDE5VdIb5mQ2tMG4ekDARITIHH5FdKbNEzTTqa1wtMHAiQmQOLyL6ThmnYarRSePhAgMQESl18hvWFC+j/6n/D0gQCJCZC4/AvpLk3rSkuFpw8ESEyAxOVXSNNMSBfQIuHpAwESEyBx+RfSCE3rSV8KTx8IkJgAicuvkF43IV1G84WnDwRITIDE5WtIV9Jc4ekDARITIHH5FdJrdLemXUOzhacPBEhMgMTlV0hTTUgDaIbw9IEAiQmQuPwLaZSm3UjvCk8fCJCYAInLr5BeNSHdQm8ITx8IkJgAicu/kO7RtNvoNeHpAwESEyBx+RXSKyakofSy8PSBAIkJkLh8DWkETRaePhAgMQESl18hvUyjNW0UTRSePhAgMQESl18hvWRCGktPC08fCJCYAInL15DG0WPC0wcCJCZA4vIrpBdpjKY9RA8LTx8IkJgAicuvkKaYkB6jB4WnDwRITIDE5V9IYzXtSfNNMgIkJkDi8iukF+g+TXuW7hWePhAgMQESl18hTTYhTaaRwtMHAiQmQOLyL6T7zXschgpPHwiQmACJy9eQXjVf2yUZARITIHH5F9I488ANtwpPHwiQmACJy6+QnqcHzAOADxKePhAgMQESl18hTTL/hPQ+3SA8fSBAYgIkLr9CmkjjNW0W9ReePhAgMQESl18hPWdCmkNXCU8fCJCYAInLv5Ae0rR51Fd4+kCAxARIXH6F9CxN0LT5dKnw9IEAiQmQuPwK6Rnzgd9fUk/h6QMBEhMgcfkV0tMmpG/oIuHpAwESEyBx+RfSI5q2mM4Vnj4QIDEBEpdfIT1Fj2raD9RFePpAgMQESFx+hfSk+Szzn+gs4ekDARITIHH5GtJKOk14+kCAxARIXH6F9IQJ6RfqJDx9IEBiAiQuv0J6nB7XtF+pg/D0gQCJCZC4/ArpMRPSRmonPH0gQGICJC7/QnpC07ZQa+HpAwESEyBx+RXSo/Sk8TajpfD0gQCJCZC4/ArpEXrKeJt9rPD0gQCJCZC4/ArpYeuw36WaCE8fCJCYAInLr5AmWJDKNRKePhAgMQESl38hPWO8rXiU8PSBAIkJkLj8CukhetZ4m19PePpAgMQESFx+hTTegnRIHeHpAwESEyBx+RXSg9aL9dWoJTx9IEBiAiQuv0J6wIJU51Dh6QMBEhMgcfkX0iTj7eFVhKcPBEhMgMTlV0jj6Hnjbf1KwtMHAiQmQOLyK6T7abLxtkF54ekDARITIHH5FdJ9FqRjSgtPHwiQmACJy9+QmuUITx8IkJgAicuvkMbSFONti0zh6QMBEhMgcfkV0hgLUisqEB7fDpCYAInL35Da0kbh8e0AiQmQuPwKaTS9aLw9kX4THt8OkJgAicuvkO61IJ1Ma4XHtwMkJkDi8jekU2ml8Ph2gMQESFx+hXQPvWy8PYN+Fh7fDpCYAInLr5BGWZDOpmXC49sBEhMgcfkX0ivG2660VHh8O0BiAiQuv0K6m1413p5Hi4XHtwMkJkDi8jek7vS18Ph2gMQESFx+hSNkk4kAACAASURBVDSCphpve9KXwuPbARITIHH5G9KlNF94fDtAYgIkLv9Ces14ezl9Ijy+HSAxARKXJ5DKKeXKQLrLgnQVfSQ8vh0gMQESlyeQehg1yGl7wbnHZrS8XgrS68bbq+kD4fHtAIkJkLi8umn3RpNN5rufG74vA2m4Bek6mik8vh0gMQESl1eQmkyz3z/VXBLSQHpXeHw7QGICJC6vIOXODfxkKiUDaRhNM97eRG8Jj28HSEyAxOUVpJq9rHf7e9SQhDTYeisfIDEBEpdXkEZQ04H33DOgEd0uBelN4+3t1uMb5AMkJkDi8grS/vtrkFHV4XslIQ21npUkHyAxARKXd3+Q3f/r11/9si8aRtFAGmr9djTCOiiXfIDEBEhc3kH695u3Nb1QFtK99Jzw+HaAxARIXJ5BejCPaKF+5+VRUXKHdAe9o5lHt3tKeHw7QGICJC6vID1LXZ82
IE3JHicD6XYL0jh6THh8O0BiAiQuryA166//a0DS7zhaBtIQC9JD9LDw+HaAxARIXF5BKv2RDenDHBlIu3Xz+vIoPSg8vh0gMQESl1eQDpluQ5pWQRLSE3Sf8Ph2gMQESFxeQTqt4y4T0h9NOktCeppGC49vB0hMgMTlFaRPsurfSFf0qZDzuSSkiTRKeHw7QGICJC7P7v6ee5z5yIbWn0bjKGpIk+ku4fHtAIkJkLg8fKp5wZIlf+rRFS2kl2io8Ph2gMQESFxeQWo7M0pDMUGaSkOEx7cDJCZA4vIKUu3xyYA0jW4VHt8OkJgAicsrSO81emePPKS3aZDw+HaAxARIXF5BOqkp5dasayYJ6X26QXh8O0BiAiQuryC1P+XUQJKQZtG1wuPbARITIHF5fYDIHSslIX1I/YTHtwMkJkDi8hrS3HxJSB/TFcLj2wESEyBxeQZpRq+T2rdv3yavqiSkz6i38Ph2gMQESFxeQZpK2bWpZmnqFNXfk6KF9DldIjy+HSAxARKXV5Banrldz/qx8NGTt0tC+op6CI9vB0hMgMTlFaS8Gbqe9YOuDxogCelbukB4fDtAYgIkLs+e2Ddb1yvM1/UFNSUhLaFuwuPbARITIHF5Bem4C//TGw/V9ffKSUL6gboIj28HSEyAxOUVpJfoVH14Vr+7a7WThPQTnSk8vh0gMQESl2d3f08dq+88najOomggbf/Trf/0v423q6mz6yXj6Z9/k7JsIL0wmasXbkvi4v/oO5O4+t9J3TF79L+SuPou6R3zNwPJatVP0T1ydfcet/bphcZbjTq7XjKe9u5LyrKB9P3JXH1/YRIX36vvTeLqhcndMXoyV98rvWP+iwQp2qK9afcrdRD+gWqHm3ZMuGnH5dVNuyrB8iQhbaR2wuPbARITIHF5BambVesyTUT/jrSFWguPbwdITIDE5fGDVjd3mCEJSctoKTy+HSAxARKX14/+XtRSFFJOc+Hx7QCJCZC4vIa0uYwopNKNhce3AyQmQOLyGNL+0bVFIZVvKDy+HSAxARKXV5CaWzWpSreKQqpUX3h8O0BiAiQubyEdd8oj/5VQkwik/MOFx7cDJCZA4vL6d6ToihrSIXWEx7cDJCZA4vI5pJo1hce3AyQmQOLyClJ22XJKYpDqHCI8vh0gMQESl1eQrjsmu8355x6bcWzPHkZikA6vIjy+HSAxARKXV5DeaLrRfLe8wXQ3RDFBql9JeHw7QGICJC6vIDV+w37/VHNRSA3KC49vB0hMgMTlFaTcj+z300qJQjqmjPD4doDEBEhcXkGqecl+893ec2qIQmqWKzy+HSAxARKXV5DuovrXjhgx4Bi6QxTScZnC49sBEhMgcXkFad+YGuZryFYbsVcUUivaKjy/FSAxARKXd3+Q3f/r11/9si8aRjFAakObhOe3AiQmQOLyDNLOTbq+a/KDv8hCak/rhee3AiQmQOLyCtLyQ8bqhccTVVwsCqkjrROe3wqQmACJyytI5zddrb9ET65ud6EopFNotfD8VoDEBEhcXkE65BVdP6+Jrr9SRxRSZ1ohPL8VIDEBEpdnf5Cdp++tfJuuz8kVhXQW/SQ8vxUgMQESl1eQ6kzU59A8XZ8k+wfZLvSD8PxWgMQESFxeQbqy+u11j9yrFzST/R3pXFosPL8VIDEBEpdXkDa1oaoLdb1Hxe9FIV1A3wrPbwVITIDE5d0fZLeZx89ftCUaR9FD6kFfCc9vBUhMgMTl4VPNd80uiEpRLJAuoc+F57cCJCZA4vIQ0lp6RxzSZTRfeH4rQGICJC6fQ+pL84TntwIkJkDi8jmkq2iO8PxWgMQESFw+h3QNzRKe3wqQmACJy0NI/y35W4+yqCENoBnC81sBEhMgcXl+gMi1opBupHeF57cCJCZA4vIG0med63eeZZ7Yfa/sy7rcTG8Kz28FSEyAxOUJpIU5GYflZEzT9Q+PogaikAbTNOH5rQCJCZC4PIHUreJSveD4RusvpEoT9ohCup1eFZ7fCpCYAInLE0iH32i8mU2ls67VomEUA6Sh9KLw/FaAxARIXJ5Ayn7CeLOOOv4YHaMYII2gycLzWwESEyBxeQKJnjPebKbZ0TqKHtIomig8vxUgMQESl88hjaGnhee3AiQmQOLyOaT76Qnh+a0AiQmQuLyBdMfChQtn0oSFZqKQHqRHhee3AiQmQOLyBpKaKKSH6SHh+a0AiQmQuDyBNEJNFNJjNE54fitAYgIkLp+/GPOTNFZ4fitAYgIkLp9DepbuFZ7fCpCYAInL55Cep5HC81sBEhMgcfkc0gs0XHh+K0BiAiQun0N6me4Unt8KkJgAicvnkF6n24TntwIkJkDi8g7Sv9+8remFwpDepJuF57cCJCZA4vIM0oN5RAv1Oy+PilLUkN6lgcLzWwESEyBxeQXpWer6tAFpSvY4UUgz6Trh+a0AiQmQuLyC1Ky//q8BSb/jaFFIH1I/4fmtAIkJkLi8glT6IxvShzmikOZRX+H5rQCJCZC4PHvpy+k2pGkVRCHNp8uE57cCJCZA4vIK0mkdd5mQ/mjSWRTSl9RTeH4rQGICJC6vIH2SVf9GuqJPhZzPRSEtoguF57cCJCZA4vLs7u+5x5lPRmr9aTSOooe0mM4Vnt8KkJgAicvDRzYULFnypx5dUUP6kboIz28FSEyAxOXzhwj9TGcIz28FSEyAxOUJpAZqopBW0anC81sBEhMgcXkCqb2aKKR11FF4fitAYgIkLp/ftNtI7YTntwIkJkDi8g7SllmTp3ywRRjSFmolPL8VIDEBEpdXkP66KNu8+zuj1z+ikLTMFsLzWwESEyBxeQXp8pwrp8x495lu1F8WUm5T4fmtAIkJkLi8glR5iv1+SBVZSGUbCc9vBUhMgMTlFaRSm+3388rKQqpwlPD8VoDEBEhcXkFq8YX9/skOspDyDxee3wqQmACJyytIc49fsF/X985s/J0spENrC89vBUhMgMTlFaQ21ajcEUeUoToNo3l0Q/SQalcXnt8KkJgAicuzm3ZtY3l0Q/SQ6lYRnt8KkJgAicvnj2zQ6lcUnt8KkJgAictDSNv/spKF1Kic8PxWgMQESFxeQfrl7HLJeKExrUmu8PxWgMQESFxeQTq5Yq9bh1jJQjouU3h+K0BiAiQuryCV+yIaQLFDakUFwhtgBkhMgMTl2eG4NiYHUlvaKLwBZoDEBEhcXkG65Z7kQOpAvwpvgBkgMQESl1eQ/jut/a1jrWQhnUKrhTfADJCYAInLK0hjiZJyr11n+ll4A8wAiQmQuLyCVOOCz1evtZKFdDYtE94AM0BiAiQuz55GkaQ7G7rREuENMAMkJkDi8grScUuTA+kCWiS8AWaAxARIXF5B+uyU75MCqSctFN4AM0BiAiQuryC1r03l61rJQrqU5gtvgBkgMQESl1eQTjo1mCyky2me8AaYARITIHF5/TSKHStlIfWjD4U3wAyQmACJy2tIc/NlId1A04U3wAyQmACJyzNIM3qd1L59+zZ5VWUhDaZpwhtgBkhMgMTlFaSplF2bapamTjNlId1FLwp
vgBkgMQESl1eQWp65Xc/6sfDRk7fLQhpLzwhvgBkgMQESl1eQ8mboetYPuj5ogCykCfSI8AaYARITIHF5Ban0bF2vMF/XF9SUhfQU3Se8AWaAxARIXJ49ROjC//TGQ3X9vXKykCbTSOENMAMkJkDi8grSS3SqPjyr39212slCep2GCG+AGSAxARKXZ3d/Tx2r7zydqM4iWUjv0Y3CG2AGSEyAxOXtH2RX/bQnGkcxQPqQrhbeADNAYgIkLs8g7dyk67smP/iLMKT51Ft4A8wAiQmQuLyCtPyQsXrh8UQVF8tCWkTdhTfADJCYAInLK0jnN12tv0RPrm53oSykH+kc4Q0wAyQmQOLy7Lh2r+j6eU10/ZU6spBW0WnCG2AGSEyAxOUVpNx5+t7Kt+n6nFxZSBvoROENMAMkJkDi8gpSnYn6HJqn65NqyELSMo8X3gAzQGICJC6vIF1Z/fa6R+7VC5oJ/46klWksvAFmgMQESFxeQdrUhqou1PUeFaM6BkoMkPKPEN4AM0BiAiQu7/4gu838W+yiLdE4igVSrZrCG2AGSEyAxOXhIxt2zS6ISlFskI7MF94AM0BiAiQuDyGtpXeSAKlxGeENMAMkJkDi8j2kVhmbhLdAAyQ2QOLyPaSu9J3wFmiAxAZIXL6HdAO9I7wFGiCxARKXJ5DW79TX/qf/t+TvMC07xve55O7g/Q8bbukWH6RxyThoAyAxARKXJ5BKT9ep5BP67hmyZuMDA/ZZp+f3nhAnpNfpZuEt0ACJDZC4PIFU5rL5NHFBoOCZWtdfjJ9K59ov9/Lx1oVxQlpIFwlvgQZIbIDE5QmkS0gpeOaXF+w33l7/euDDIKTCbUZ//u7Wbv0v+8TGzBNcLxxz/+ySX7MovTCZq+/5K4mL79D/SeLqfyZ3x+h/JHH1ndI75i8HSIUzXqARkwMFz/zgcvPt0GeLQZrX0ujrErcD+Y4s/08Ml0bIH+0LnQq71+7UFcUv+EFf820JSEuvNfpxj1v79MLAqcH0guulY23vXvEllfT9yVw9qYvv1ZO6Z5K7Y/Rkri5+lfnPGZKu/z7j2YkfKAcs/sq+afdGMUhWMfyOpC2gDsK3TvE7Eht+R+Ly6u9I+27JMX9BKjcudM4fXVfp+rZuyxKFpLXK+ER4GwCJC5C4vII0js6bNGvGM2fQlNBZYwet2TDy5v36nPd1Q8Wcbpr2b1yQXqQTpR/cAEhMgMTlFaRGN9vvr24ROmvnhN69xhgXHzdM1688x+y9uCAVtKT8LbIbAUhMgMTlFaRSH9vvZ5bRoygmSNqmk2mx7EYAEhMgcXkFqdx0+/275eUhaddLP94OkJgAicsrSCd2su7O+7fzyUmA9CBNkN0IQGICJC6vIM3MOKz/PaP61cz8KAmQ3pQ+kj4gMQESl2dPo3inoXn3d9OoXkI2VkjfUVfZjQAkJkDi8vD5SBu/ifLQJzFD2pLbTHYjAIkJkLh8f/ATq3p5shsBSEyAxOX7Z8hadaIVohsBSEyAxHVgQLqSPhDdCEBiAiSuAwPSUHpRdCMAiQmQuA4MSA8J/yEJkJgAictDSMUPfiIIaQoNE90IQGICJC6vILX8yX7/ZqNkQJpJ14puBCAxARKXV5ACRxEqvFv4hcbsvhJ+JVlAYgIkLm8gKcc+aeHgJmFIq+hU0Y0AJCZA4vIG0tJHqNuVZlfdtT4ZkLZmHyu6EYDEBEhcXt20O2NlNIDihaRVqyO6EYDEBEhcHt5rl0xIjWRf3AWQmACJyytIVYLlJQXSifSb5EYAEhMgcXkFqZtV6zJNBiQFUjdaIrkRgMQESFwe37Tb3GFGUiBdQXMlNwKQmACJy+vfkRa1TAqkwfS65EYAEhMgcXkNaXMSjiJkdB89IbkRgMQESFweQ9o/unZSID1HoyQ3ApCYAInLK0jNrZpUpVuTAultGiS5EYDEBEhc3kI67pRH/iuhRgLSfLpUciMAiQmQuA6QP8guo/+T3AhAYgIkLs8grX7/lRkbkgZpU0YbyY0AJCZA4vII0ntNrId+t/00SZC0vKMlNwKQmACJyxtI46lsr4cnT7i4bObzSYJUt4rkRgASEyBxeQJpaWb7TdaJje1ySrwEpgykFlkFghsBSEyAxOUJpMsr/x449Xvla5ID6TRaKbgRgMQESFyeQDq8X+jk1fWTA6k7fSW4EYDEBEhcnkAq9UDo5EPJeYiQ1p9mCW4EIDEBEpcnkMqPDZ28LznPR9LupJcENwKQmACJyxNITS8KnTyneXIgPUiPCG4EIDEBEpcnkG7LWRY49WXmsORAmkx3CW4EIDEBEpcnkDZVrDXbfL9van6V30uykYD0Ht0guBGAxARIXN78QXZuBTr8/D5da1DVL6NxFAekBdRLcCMAiQmQuDx6iNC662oRUb1bN0flKA5I/6OzBDcCkJgAicu7R39v27AjOkVxQdqU0VpwIwCJCZC4DpCnUWhaxfqCGwFITIDEdcBAqpcvuBGAxARIXAcMpJaZW+Q2ImZI3z557RMbor0wIDEBklKqIJ0u+XrMsUDaOGtUl0PMJ1vlX78ouq8AJCZAUkoVpIvpc7mNiBbS8hdvOKGUYajKmXdNu7YyZZ7yYjQ/FgGJCZCUUgXpVslDREYFafMNRxqGMhte9pj9wPP1j7Ygqn3n/1y/EJCYAEkpVZAm0ENyGxENpI1dqOyJN09drZ43t1cZyun2rstXAhITICmlCtI0ukVuI6KAtLEznbCm5NmrRx9FdPTYXyJ9KSAxAZJSqiB9Tj3lNsId0vpOdOKvjp/Z+vY5OVR2bISvBSQmQFJKFaR1dJLcRrhC+vVE6rSe/eyPQ/IzJ/NfDEhMgKSUKkhapXpyG+EGad2JdGrEPxvNK1v6I/aTgMQESEopg9Q4d6vYRrhAWn08dd4YeYXJGbV/4j4HSEyApJQySJ1pudhGRIa0qgV12+S2xK3UirMGSEyApJQySFcQf2Mq1iJCWn4Mnb/ZdYmt51J35lOAxARISimDNIxeENuISJB+akSXRnMwyvXH0QjnzwASEyAppQzSUzRabCMiQFp6BPWJ7qCuP9bIfNnxE4DEBEhKKYM0na4T2wge0uLDqH+0d2rMLpW3wOl8QGICJKWUQVpM3cQ2goW0pTndGP0yT1Bdp4ekAxITICmlDNKmzFZiG8FCuj82rQOpncPde4DEBEhKKYOk5cv9RZaDtCK//A+xrFNwJl1c8lxAYgIkpdRBOrKS2EZwkC6mkbEttO4YGlPiTEBiAiSl1EFqleH+150oYyDNyWzg+ofYYi2umjW1+HmAxARISqmD1FnuyebOkLY0pbdjXmpmbom77gCJCZCUUgepBy2U2ghnSGPpgjjWeqzEXXeAxARISqmDJPgSSY6QlleM7Z6GYNdQx/DbnIDEBEhKqYN0Jzk/kCCOHCH1pFFxLbalM10VdgYgMQGSUuogPUCPSW2EE6SZGQ1jvach0NqG9ID6MSAxAZJS6iBNorulNsIB0ubGGe/Hu9w3lXM+UT4EJCZAUkodpLdpkNRGOEAaTRfFv95L1EJ5oCsgMQGSUuogfU
q9pTaiJKSfKub9mMCCXei+og8AiQmQlFIH6Xs6R2ojSkK6KLEnaSyrkFd0jx8gMQGSUuogracTpTaiBKQZGY3ivKch0BjqGjoNSEyApJQ6SFrpY6Q2ojikzY0z3ktsxYLji+6cByQmQFJKIaSaNaQ2ojikUYkfffKznNrrAicBiQmQlFIIqXEpqY0oBmlZXkX22FpR15+uD5wCJCZAUkohpJPI+SDCsVcM0mUOz4WIuXW1sz+1TwESEyAppRBSV1oqtBHhkL7KPszlcJBRNZWOs189CZCYAEkphZAup08cLhlP4ZC60HMiq55N91vvAYkJkJRSCGkQvSW0EWGQ5mQ0ju74W24F/5gESEyApJRCSKNootBGhEE6id4QWvZeOtd8B0hMgKSUQkiP0TihjVAhvU7thFbVClrSKxogsQGSUgohvUJ3CG2EAmlrs4zZQqtq2qfZdX4FJDZAUkohpFl0jdBGKJCeVh7bk3hX00BAYgMkpRRCWkg9hDaiCNKmw7O/ElrUbG3NnPmAxAVISimE9DOdIbQRRZDGUh+hNe1eoBYFgMQESEophLQp4wShjQhB+vWQ0t8LrRnoLHoAkJgASSmFkLRyDYU2IgTpdrln3QZaWi5vIyA5B0hKqYRUq7rQRgQhrcirtFJoyVCj6GJAcg6QlFIJ6ZjSQhsRhHRNrMf6jqItzehd8UWVAIkJkJxyhNSONshsRADS4twav8ksqPZR1mFSj1J3CpCYAMkpR0j/R/+T2YgApB70iMx64Q0U/8VLDZCYAMkpR0gX0xcyG2FDmp9VX+z1LdS21wo+MykZARITIDnlCOlaqaN/25A60xSZ5Yqlv04tZR5P7hQgMQGSU46Q7qBXZTbCgvQdtYj2ZZdjS99zBo1PyspmgMQESE45QrqPnpTZCAvSWBors1rx9D1LylX6OTlrAxIbIDnlCOlpqau+Belk+k5mteLpe7S76IrkrA1IbIDklCOk12iIzEaYkNblNpJZrEQGpI31soq/jp9UgMQESE45Qpot9TwKE9LzSbuT2nys3UTqnKTVAYkJkJxyhLQw8eM42pmQesq9AGCxrAetnkDTkrM6IDEBklOOkJbTmTIbYUAqqFZli8xiJbIgfZBxTHLWByQmQHLKEdJGaiOzEQakWVI/3UpmP43iPHo4KasDEhMgOeUISSsrdP+AAWkQPS+zVslsSEvLVFuTjNUBiQmQnHKGJHUYfQNSo5xfZNYqWeCJfTfSLclYHZCYAMkpZ0iNyshsxI6dS6ijzFIOBSCtPaT0kiSsDkhMgOSUM6S2JHGQbhPS2MReoS9iwaeajxM7WIsaIDEBklPOkM6ixF9+xWzHzk70jchKTgUhbWmU+ZH86oDEBEhOOUPqSV+KbMSOraUaiCzkWOjgJ1PlDuJaFCAxAZJTzpD6C/0RdcdUukFkIceKjiLUKQlP1AAkJkByyhnS7TRVZCN29KbpIgs5VgTp8+zDZX6pUwIkJkByyhnSWHpKZCO2HVo5Kc+NtVOOa3eZ/H0agMQESE45Q3pK6HkUn9BFIus4p0BanldphfDqgMQESE45Q5pKt4tsxG1Cr9HnnHqk1Tuov/DqgMQESE45Q5oldLVslr1aZB3nVEjr6+R8Lbs6IDEBklPOkL6UeaTp9xknSSzDFXbs7yepi+zqgMR0MELa9rtbu/W/HM5dQae7fmUUPUD3SSzDpRcqH2jH0XTR1fc47Ripduj/JHH1PwvdLxN/e/Q/krj6Tukd85cEpD173dqvO527J6uV61dG0Vn0s8QyXPp+9aPPM45139wY2u9+kfjbp+9L5vJJnd35KiPVPukdUygBKd6bdlqVwwR+qK4vc9RO90vFX7GXdelCT0iujpt2TAfjTbu4ITUoL7AJL9FALyF9K3uEcUBiAiSnGEhtJQ6jfxnN8hKS1l/sRaTNAIkJkJxiIHWhxF9gb2uNCn96CmlVfrllcqsDEhMgOcVAupzmJbwFH9H5OzyFpI2my+RWByQmQHKKgXQLvZHwFgympz2GtOnIzI/FVgckJkByioE0hp5OeAuOzV7pMSRtCp0stjogMQGSUwykZxJ/NPWarOM1ryFp7YSe/6EBEhsgOcVAeoNuTnQDptEA7yF9knmU1PM2AIkJkJxiIH1CfRLdgFtpsveQtJ40Tmh1QGICJKcYSD8k/hDQjrQsBZB+KFtF6Dh6gMQESE4xkBI/aPGWvLpaCiBpt9BAmdUBiQmQnGIgaXlHJzj/p+aTY1MAaV31Ut+KrA5ITIDkFAepbn6C899HD6QEkva40BOTAIkJkJziILXMTPDFUi6gz1IDaWsLelNidUBiAiSnOEidKcFXOa6TtyU1kLTZGY0lXjEJkJgAySkO0sX0RULj/0idtBRBMn4YPiSwOiAxAZJTHKTr6f2Exn+ebtNSBemHslUF7gIHJCZAcoqDNIImJzR+f+sXldRA0gbTgMRXByQmQHKKg/SoeadbArXMMl9GL0WQ1tfJ/Srh1QGJCZCc4iC9ktizTdfnNjHfpQiS9rTAq0kDEhMgOcVB+oD6JTL9+3Sl+S5VkLaeQNMSXR2QmADJKQ7St3R+ItMPs4/CnypI2seZDRJ9FDggMQGSUxykNYk9Re4MWmy+SxkkrQfdn+DqgMQESE5xkDT7l5w425p/qPU+dZB+LJe/KrHVAYkJkJxiIdWsnsDwC+kc633qIGl30tWJrQ5ITIDkFAupWc7W+Id/hO6x3qcQ0oY62QsSWh2QmADJKRZSJ0rgplEv+sB6n0JI2iTqmNDqgMQESE6xkLpTAn/TPLq0/ZKuqYSktaPXElkdkJgAySkW0nU0I+7ZV2W2tU+kFNInWUdtSmB1QGICJKdYSMMTeLDdq3SjfSKlkIwbmIkcUwyQmADJKRbSI/Rg3LMPopftE6mFtLxCIq/QDEhMgOQUC+mVBF6PuV1G4FmBqYWk3UVXxL86IDEBklMspA/pqnhH31SmfuBUiiFtPCJrftyrAxITIDnFQvqOzo139Dl0SeBUiiFpUyj+V4MGJCZAcoqF9Fv818HRNCFwKtWQtE70UryrAxITIDnFQtLKNIp39G6h4z2kHNLn2YdvjHN1QGICJKd4SHWqxTt6zUoFgVMph6RdTiPjXB2QmADJKR5Si6w4D2q1mE4Pnkw9pJWVy/8vvtUBiQmQnOIhnU5x/hHmGRoaPJl6SNo91Du+1QGJCZCc4iFdTJ/HN/lV9G7wZBpA2lQ/c25cqwMSEyA5xUO6ochDbDXP/jV4Mg0gaa9Q+7hW9x7ShqVzXn1seP8BsxJ4AosVICmlHNLdNCmuwddmtwidTgdIWqf4HjXoEaRVX06ffN/gvmefUD+Pgh12U2JPpQIkpZRDboLdGAAAGDBJREFUepzui2vw1+i60Om0gPR5dt0NcayeREibfvxi9rOjru95WrMauSE9lY5qc86Vt42bMnNKt9JEjUcsjX99QFJKOaTXaXBcgw+kV0Kn0wKS8UvbsDhWTwakn4df1a3t0fkhPDnVG5/S/dqRj7827wf1z11rH++URZntxq+M89sAklLKIX1MfeMav
GXW6tDp9IC0snL5ZbGvLg9py/2VTD3l6rU664q7Hpz4/hcR7hb9aUxLotyzJq6P5xsBklLKIS0NHMAkxtZlNy/6ID0gaWPp4thXF4c0uxmVG/7BEotGNPfaLbr9KKLyPabF/uc8QFJKOaQN1DaeuV+na4s+SBNImxtkfhDz6sKQfr4kg877IfhRlHd/f3xtDaJq/WIdHpCUUg5JyzsqnrlvDD6pzyxNIGlvUeOYD7wqCsm8Vdfg7aKPo/47UsHbvSoSHX7Ll7F8N0BSSj2kenG9jOzxmUW/IqUNJO0iujvW1SUhzW1JZQardyfE8gfZjS93L0vUYPB3UX8FICmlHlKrjDiOHbIup5nyUdpAWpFfJvrroZ0cpJX9MqnzkrCzYnxkwy+Pd86mzNajo3zQFiAppR7SWRTHwz2nqb8ipQ8k7SE6NcbVpSAVPJ5PRxZ/bYzYHyK0YnzrDMrt/Piv7hcFJLXUQ7rMfF3yWBsU9ky69IG09YRYH98gBGlWUyo/ssSP9rgea/fV4COIKlz8puvdeICklHpIg+M5wGKrTPUArekDSVuQc2hsLywrAsnhVp1VvA9aXTDwUKL83jMiPxoPkJRSD+nhOA7I9VtuU/XDNIJk/KyM7aj6ApAcb9VZxf/o74IZvfOI6gyMdBxcQFJKPaQ36KaYp55G16gfphOk9XWzYno+ReKQnG/VWSX0NIr1z/9fLlHLd9gLAJJS6iEtpO4xTz2IXlQ/TCdI2uvUPJYHCSQKafnFGXT+j8wnE30+0qoJ7TMzevzMfBaQlFIPaX1Gu5inbpUZdg9tWkHSzqUxMVw6MUj8rTorgSf2fXwsVRxd4PgpQFJKPSQt/7BYh/6t2Ov8pRek/1Us/330l04I0kctqOzgCMcvkniG7ObR5am14wEwAUkpDSA1y3H+D4+v2K9IaQZJu5+6RH/hBCBx99UVJfNU8x/Ooex+60qeD0hKaQDp/4i7ic81iKaEfZxmkApaxXC8yLghudyqs5I6ZsPLdahGyb+PAZJSGkDqR7NjHLp1+K9I6QZJm59T2+E/cOfiheR2q85K7OAnvw3OKfnDD5CU0gDSSJoY28y/5TYOPyPdIGkDaEC0F40PkvutOivBowh9ejyVHR5+dyQgKaUBpImxHqX0jeJ/80w7SL8dlj0vyovGAymaW3VWkofj2mp80yZhT1kCJKU0gDSb+sU28830QvgZaQdJe4VaRnkPShyQfmxJ5e+O6iHzsse1W3YuZV21puhjQFJKA0g/0v/FNvMJxX5FSkNIWhe6P7oLxg5pTnXqEuXdM9IHiJx2OFV/PvQRICmlAaSC3Gbs55z6LfeYYuekIaRlFfJ+cL+UFgekiWUyBkd7aEfxI62uH5xLJwUffwdISmkASTsstufIvlnipmAaQtLGRPkKajFC2jo8o9wL7hcLlIRDFn/dkUoH7i0EJKV0gNSOfotl5JtLPOUnHSEVtFQOvBeh2CD9ejbV/Dj6iyfj2N9bH69CR75lngIkpXSA1J0WxjJym4ziT4VOR0jap9l1onmaaUyQvm9OrX6K4fLJOYj+qn6ZGd1/BqSw0gHSTfRGDBOvzy3xGn9pCUm7mgZFcalYIM2qRhfGdFjkZL0axYyGVHn8VkBSSgdID9LDMUz8VsnXQU9PSGtr5kTxYucxQHo0N+ue2EZI2su6bBxamk78CpCKSgdIr8V0+O9r1CPa2aUnJO0Fau3+x6SoIW25liq+HuMESXx9pG9PoVJD43nVgGgDJKciQloQ06F+65UpcddEmkLSzqSHXC8TLaS1Z9DhMb8kW1JfaOzlWnR4NA+viDNAcioipHXUIfqB5zv8+TZdIS0pV2m522WihLSoAbXlnqnKl9xX7Ft7fRZ1juGpV7EFSE5FhKRVrhv9wEPpkRLnpSskbRRd6HaR6CC9VZmuivloyMl/6cs5TaniA7E+myzKAMmpyJBaZ0b9rAOtZWbJ40mmLaTNTelNl4tEBWl8TvbYeL5/0l9DtmB8HjX7KCmrA5JTkSFdTh9GO+9Pma1Knpm2kLQ5WfVcfiGPAtLmflT5bddLOeXBizEv607Z/dYmYXVAcioypPsdbq4xPUTDS56ZvpC0K+mWyBdwh7SyAx0Z01+si/LkVc1frUPV43sd4IgBklORIb1f7BgMETqDvih5ZhpDWlMj12FgJVdIX9WnU2I7eGtRnkCyHsnaebH06oDkVGRIq6K+2259mcMdzk1jSNpEahvxsdpukF6vSP1ifzG9QN5A0rT5rYq9noxAgORUZEhajSpRjvui8lrmRaUzJK0zPRbp0y6QRmflRvzyyHkFyXr67DGzRFcHJKdcIJ1KUT4Usxe953BuWkP6rkx+pNcbighpY086JJGrp2eQNO2nCynzeskfSoDklAukG+itqKYtqFbZ6a8paQ1JG06tI/xZNhKkn1pR06WJfGsPIWnaW3WoScyPveADJKdcID1B90Y17Sy6yOns9Ia06Wyq8wn72QiQPqlN50TzTAw+TyFpa7pT6bHRPnnXNUByygXSPOoV1bQ30vNOZ6c3JPNZrWXZlx/jIb2alzEwwUcNeAtJ0ybnU8fonmHvHiA55QJpY3aLqKZtkOv4p780h2QdZ4EzwUHaOjyz1JOJfl+vIWn/O5XypzhcNI4AySkXSNpRZaP5v3cRneJ4ftpD0j6pw91KYyBtuIiqz0n423oOSds6Ope6R/+ArwgBklNukLrRN1EMew9zjKv0h6QtO56aOB4Z1RnSoubUclni39V7SJr2aSOq94HTJ2IMkJxyg3Q7RXODoH2G831YPoCkbehB1WY6nO8E6eercukiiSfNpQKStuHqjOwhcTxUvViA5JQbpCl0u/usK7ObOn/CD5A0bWRWrsNDCktCWjekPNV5QuRbpgSSpk2rTq0WJbo6IDnlBumbaF5S6EnuKen+gKRNrUDXlHi0T3FIm+6rRvn3CP1lM0WQtBVdqHzUj0NmAiSn3CBtrVPK/VeCrsQc0s0nkLQvjqBOq4udFw5p67OHU5lB8T5GtUSpgqRpj5SnLpEe0OEeIDnlBkkb6/ggurBWl6/F/LXPL5C0VR1LPCMiDNIbzSm7j8CdDMFSB0lb1IqqJ3REB0ByyhXS+mrl3P4DG8T+HuUbSOZz9IodCkiBNLcjZZzzVfEvSaQUQtI235adcXUC95gAklOukLS73J4Bt6xsFe6JmP6BpGkTcrPCnpoYgrS4dya1miH5rVILSdM+PJIaRPsyUSUDJKfcIf2aX6H4rw/hXU7sYQv8BEmbWY0uWF/0YQDSioG51FD8eaaphaSt70c5g+N9lBMgOeUOSbuNhkb69Dc5h7F3ZfkKkra4sXoAbwvS2lvLUZ0n5Q/Hk2JImvZCPnWI84BdgORUFJBWls+9NcJN6vOIf+SZvyBp67pQrdBNHgPSxjFVqMpo4SeYWqUckrasE1WK8SWCAwGSU1FA0l46hOqx9/N8nHEM/x+2zyBpW2/JKBO8Gbfn96frUtlb1kT8gnhLPSRt69jS1COerQMkp6KBpK3um0l3MJ/rRK/yX+g3SJo2qUzGLfZ9
+TOaUk7fkofqkykNIGnagiZ0WBx3ogCSU1FB0rQPazK/KL1DbSJ8mf8gafNqUZd1mjbnJMro9nUS1rdLC0jaxuszs26K6sWj1QDJqSghad/UcDpunba1JTk94DOYDyGZTyNv/HbXDDolhhfgi7n0gGT8P1iLjo310HyA5FS0kLSvq9OIkudOpjMjfZEfIWkbLyaiptNif1XzGEoXSNrq86jsg7GtDkhORQ1JW3govVb8vC+PzIr4kl2+hKRpo497piD2VzWPpbSBpGlPVqAzXF+aQw2QnIoekvZR1pHh9wQv7plFvSN+iU8hWR0skLTFbahahDuMSgRITsUASetDdykf/TywFB02PvKxRgGJKZ0g2c9Cj/6oSIDkVCyQVlQu/2Po9PVlqM4jbk+3BCSmtIKkaXOPoqPmRnthQHIqFkjafcGj160dkkeHjHX/kz8gMaUZJO23vhk5Q6M8kjkgORUTpC3HZJh3dv86qgpVGhbNbQFAYko3SJr2ajVqG90LVwCSUzFB0t6jZpP7t8imcjdFfkB4MEBiSj9I2vIzqEJUx+sDJKdig6SdS0RZzQZFe3cpIDGlISRNe7AMnRfFf5CA5FSMkJZddOu0GF5OEZCY0hKStvBYqvWO66UAyakYIcUYIDGlJyRt8+CsjH5u9yEBklOAxHUwQtK02fWo0aeRLwJITgES18EJSVvTg0r3nhbpIeEHLKQd4/tccndBydOAlFAHKSRNm1iVqGL3F37jPn/AQrpnyJqNDwzYV+I0ICXUQQtJ2zKjX3Wi0p0fd74L70CFpHX9xfhJdO7S4qcBKbEOXkhGBXMH1yfKaj3a4aCYByqkLy/Yb7y9/vXipwEpsQ5qSGYLBjcnymw9vPjzhA9USB9cbr4d+myx0ws6GX273y1dd71I2pbc2ZO7Y/yx+i8T2mcQHXPXorDF/XWV2Rs1pL4KpKLTiy41+r7Qrf36XtfLxN++fUlcvFDfn8zV9ydzx+zVk7pnBHfMmqfOziGqd/2ne0KL63Krl0z8KrMnWkhf2Tfn3ih+2gw37eLvoL9pF2rl451ziWr3ftl+1syBetPuj66rdH1bt2XFTwNSYgGS0m8vd88jyu/+8sYDF5I+dtCaDSNv3q/Peb/oNCAlHCCFt+Hl3tWIynZ+/I8DFdLOCb17jTEuPm5Y0WlASjhAKtGWGf1qEJXu8lBMB0uJLTxEyClAYvInJLMFg4+2/sD0o/tF4wqQnAIkJv9CMnbMsrtaE2U2Hyz60mvBAMkpQGLyNSTjd6TFo1tnEDUYHPURU6IOkJwCJCa/QzL6+fHOOUR1+81gXj84zgDJKUBiOgAgaeYfmLqWJarV++WYD8TPB0hOARLTgQHJaO1z3coRVb3sNbdDHEYbIDkFSEwHDCTN/ANTz3zD0tUyr94BSE4BEtOBBMlo81u9KxE1vOuHxFcHJKcAiekAg2S08eWuOZTZevy6BFcHJKcAienAg2S0anxrotJdX07o1yVAcgqQmA5ISEZfDD6MqEY/l+MQRQqQnAIkpgMVkqYVzOhdjqjB8HgfjgdITgES04ELyWj9pM7ZlNXxcfY4RJECJKcAiemAhmT0w+gm5jG93oz9UQ+A5BQgMR3okIwWDKxKVGfgohhXBySnAInpIICkaVve7F6GqPnoFbGsDkhOARLTQQHJaPXjHTOoVOdJ0T8YD5CcAiSmgwWS0ZLhhxMd2i/aJ1wAklOAxHQQQTKa2y/fvEf8f9FcFpCcAiSmgwuS+cBW8wFEHR93f+FhQHIKkJgONkhGK80HEOW53iMOSE4BEtNBCEkz7xE/lKjmwIjHegAkpwCJ6eCEpGkFb3Yva94j/jN7CUByCpCYDlZIRmvMe8RzO09iXqoWkJwCJKaDGJLR98OPJKrUe4bT5wDJKUBiOrghaeY94lWIjhr8XYlPAJJTgMR00EPStI2Tz8qlzA4Pzw1/Ti0gOQVITIBktmLscWRU++Srxr0dPAQyIDkFSEyAFOiLe3q3rWJqogoteg574ctNgOQUIDEBktrquZMGd22QZXLKrtep3/g3JV/tApBcAyQmv0Gy2/Tty8N7ty5v/Xiq1Lz78Je/LZBYFpBcAyQmf0Ky+2f9jPEDO9fNMDnlNug68PG5cT1jvShAcg2QmHwNyd4xv5i39pqXtn48Hdqx9+g3l8W7ICC5BkhMBwAku83fvjm6d8dqoVt7kxZsiXlBQHINkJgOGEiBVs193Li1l2lyyqnbeeD4GbEczhWQXAMkpgMNkt3GBZOGd29e1v7x1Nq4tfdtVMcoAiTXAInpwIRkt2XRqyMvPSHf4lSx5Q3uCwKSa4DEdCBDCmTe2uvaIOsU90sCkmuAxHQQQLLbGMV9eYDkGiAxHTSQogmQXAMkJkBSAiTXAIkJkJQAyTVAYgIkJUByDZCYAEkJkFwDJCZAUgIk1wCJCZCUAMk1QGICJCVAcg2QmABJCZBcAyQmQFICJNcAiQmQlADJNUBiAiQlQHINkJgASQmQXAMkJkBSAiTXAIkJkJQAyTVAYgIkJUByDZCYAEkJkFwDJCZAUgIk1wCJCZCUAMk1QGICJCVAcg2QmABJCZBcAyQmQFICJNcAiQmQlADJNUBiAiQlQHINkJgASQmQXAMkJkBSAiTXAIkJkJQAyTVAYgIkJUByDZCYAEkJkFwDJCZAUgIk1wCJCZCUAMk1QGICJCURSO69O/pP9wulZ/tHT0n1CHH3w+jvUj1C3L0y+r9UjxBfSYU0rOX6ZC6fzPa17JvqEeJuRstpqR4h7q5ruTPVI8QXIDkHSKkJkJwCpJQESCkIkJwDpNQESAgdzAESQgIBEkICARJCAolC2jG+zyV3F4SfLv4+XYs0+w3nGF2U2vki5TS7vuGWbsU/l4ZFGj3dd3tYopDuGbJm4wMD9oWdLv4+XYs0e9/pmqb9keIBI+Q0+/zeE7oV/1waFmn0dN/tYUlC0rr+Yvy3cu5S9XTx94LfTrRIs+sXLkr1eBFzml3/eOvCbsU+l4ZFGj3dd3t4kpC+vGC/8fb619XTxd8LfjvRIs2+55xHb7xizIaUzhcpp9mNN9a1Mc33e6TR0323hycJ6YPLzbdDn1VPF38v+O1EizT735c9tGLFyMv+Sd10kXOaXQ9cG9N8v0caPd13e3iikKwHAwT2SuB08feC3060SLNbF9h10ZwUjeaa0+x6EFJ67/dIo1ul8W4PTxLSV/bP5jfU08XfC3470SLNbl/iuldTNpxLTrPrgWtjmu/3SKPbpe9uD08S0h9dV+n6tm7L1NPF3wt+O9Eizb7usUJd//eieamekctpdj1wbUzz/R5p9HTf7eGJ3v09dtCaDSNv3q/Peb/odPH36VqE2bdfMmHzhjF9d6d6RDan2f/U5nTTtH/Tfb9HGD3td3tYopB2Tujda8yfuj5uWNHp4u/TtUiz/zKsx6X3bEn1hHxOs19p/jXznPfSfb9HGj3dd3tYeIgQQgIBEkICARJCAgESQgIBEkICARJCAgESQgIBEkICAVKaNILM8jq85XrJ9g2YBRZG832Yr0YJBkhp0gi647n
nnhl2GD3sdkmTwpKS/26AlNIAKU0KONheN+9fl0uaFB4FpDQLkNKkoIOb6Wtd//S0vDLHTTI+OunExafkVetpHhFkaqsyeS2n6haFM4xbgS3bV7FeuKFj1T1hC+hFX96+SqH54Qk19hatCEjJCZDSpKCDYfS5Pjerw/Q5/elBXT+1TquPCt7M6qPrr9F5M2acSTMsCiu70aKfJtGbxuU3Zw4MX0Av+vInyHxW3K8ZNysrAlJyAqQ0KejgxOy/9ePqmwfA7mrcyDvVYGVwqqnrY04xfv5sy+5lU7jS+HfbUf4c43OP0XfhC+hFX65lX22ceICWKCsCUnICpDRpBM3cvHnTN1fQtXoB3fiv0dP0jX5qWfNzfTKDl6p9UhEkvW+2cZPvpCahBQKQlC8/65B9ut6qsXoWICUnQEqT7Lu/Kfu63foSCvS2fmpd83Mmm23Dm1TIyqL2CqQFNF7fmDEutEAAkvLlL9En+lq6Tz0LkJITIKVJI2jC7NkfLPhLNyVcsdBKUyB1yLpz/g8/1lQh6Uc30x/O2hRaIAQp9OU7yg7Q78/4TT0LkJITIKVJyp1uf1Cf4MkQpFXUzzhRWDoM0lha1vrMEgsoX673qKkff3LYWYCUnAApTVL/DNS6ovmDacrQwiJIP9HduvnnozY2havIvGN7U9YlNLXkAkVfrr9H79CksLMAKTkBUpqkQvo0p9mUD4flXK78RNpTp9Z7n99y8sl58/4xKdxFd5v3fZ9NFXYVLXDLY2afKV+u78k/ovS2sBUBKTkBUpoU9sCEBafn5Rw9rlCBpC9qW/bQa7ZNr1p5hUlh/XE5Joi36CplAbsBypfr+tV0UfiKgJScAMnPvW8+DAKlQ4Dk4/Yc3ybVI6BAgOTbfnvvzKxvUj0ECgRIvm1SRr2ZqZ4BBQMkhAQCJIQEAiSEBAIkhAQCJIQEAiSEBAIkhAQCJIQE+n/r20q2hiJ7owAAAABJRU5ErkJggg==", + "text/plain": [ + "plot without title" + ] + }, + "metadata": { + "image/png": { + "height": 420, + "width": 420 + } + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA0gAAANICAMAAADKOT/pAAADAFBMVEUAAAABAQECAgIDAwMEBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUWFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJycoKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6Ojo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tMTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1eXl5fX19gYGBhYWFiYmJjY2NkZGRlZWVmZmZnZ2doaGhpaWlqampra2tsbGxtbW1ubm5vb29wcHBxcXFycnJzc3N0dHR1dXV2dnZ3d3d4eHh5eXl6enp7e3t8fHx9fX1+fn5/f3+AgICBgYGCgoKDg4OEhISFhYWGhoaHh4eIiIiJiYmKioqLi4uMjIyNjY2Ojo6Pj4+QkJCRkZGSkpKTk5OUlJSVlZWWlpaXl5eYmJiZmZmampqbm5ucnJydnZ2enp6fn5+goKChoaGioqKjo6OkpKSlpaWmpqanp6eoqKipqamqqqqrq6usrKytra2urq6vr6+wsLCxsbGysrKzs7O0tLS1tbW2tra3t7e4uLi5ubm6urq7u7u8vLy9vb2+vr6/v7/AwMDBwcHCwsLDw8PExMTFxcXGxsbHx8fIyMjJycnKysrLy8vMzMzNzc3Ozs7Pz8/Q0NDR0dHS0tLT09PU1NTV1dXW1tbX19fY2NjZ2dna2trb29vc3Nzd3d3e3t7f39/g4ODh4eHi4uLj4+Pk5OTl5eXm5ubn5+fo6Ojp6enq6urr6+vs7Ozt7e3u7u7v7+/w8PDx8fHy8vLz8/P09PT19fX29vb39/f4+Pj5+fn6+vr7+/v8/Pz9/f3+/v7////isF19AAAACXBIWXMAABJ0AAASdAHeZh94AAAgAElEQVR4nO3de4CMdf//8fee7bJECNFRRSkVioSi033fodItUXSgRHdHpaJbErrJV+V3d5eSziqdnA9JRZEUKpGzsNbux3FZpz1cv7lm5vp8rtmdw/W55jPNzHW9nn/szs5c3vu5rp2Hnd2dA2kIoaijeC8AIScESAgpCJAQUhAgIaQgQEJIQYCEkIIACSEFAZJUK6lD4BlDaIKawa0yqm6NZoCqhUhW6YCE3FBqBwfT/+SWYf/whTtwVg+qZ7soIA0jUY2IW78zTW66afuVdKbs2ipV8Ezr2pkntxlb6Dn9DxrHz29Nb/PPQl2Nc2cQDQk6xhKkXc9cViejVsvBWyr/+5BHoRld+XBh6OVHLsTXfKX/C5R9zoAN0YwPlWVIlnfQe4ieP/U9qWVEcfh8By74F+avgTSlq6dLqI7+rmfErevdLTfdtL0CSG/nUEbLq8/NpJrzNG0WNTHOX021jvDPkpJe4D99c0oUkF7LpswW11xahdJHVfr3oY7CUapeEnEnwhYSUrVbPHXvUJOyv4zuMwTNKiTrOyh7RZGbXjnfgQv+Wf8aSN4+p06WtttEcsfHvH30kKZQ6pADnvcFD1DaCq3sdPrWf8GD9Kj4LBca36n2ZjWzD2kipT9X5Hl/eHwVerbCZSGPwj46NfJehC0kJP+xK76PTimL8nMEnd/B0naWd1D2iiI3PUjeAxfis8YD0lM07b8NPLfwyl9vXa1Kk6GH9PMOPNGkSmbjQZ6rcDf95kVbbShNW9KhWu0+ReXjz8luOqpcC9jec+nqG+pkNf+Ab+8rAJKYqWlTr6yZUf+62RVOHn+xZbWsMwfmiX9TdAK/yT2Y7vHccKDbfB8drZXCb/CspLvrNPOdfIUe9kIyT9p6y4nZzSf5rjemNfsOdRptN8bsyqap/pPzUlP/0LSBNFn/YCn9I3CvzOO7em9+8aWYdsa0u5UP3sP06cL21au1W8AXYj76FY5dWXVaJQ6J6VNs6V7Ls2e76OKAtQZ86mBfXPMBCful0fPvoHm+6avtWdx/W1at1vFb/oX3/YxkOv4BW4c9fOaDEuLaFbBU/cD5Pmtbmumd8DX/GlU8qB1ouvf8GdSx4rVAGaRn6JGcW/tp2m1U/9GnWtOFnv+Uj7ejFoMePIdalWoz76DW4z/WnqUnT7hlYEO6fXCj/r0z6R0tYPtn6Zka1zz8D6KvjO19mSGZZnr+769z79N31Up5J+Bk2d+oyYNP/43qix8+X6GWxskjf3
resKwqe70fvU/X8I1W0l39abn35CUnfaxDMk/a24jaD7233p3e641pzZUgPeu9EvrqSQMDrpwBe2UeP30U1Rw/fr//AtPOmHe38sEbTPdnd3m0a0ra18ZCTCurdOwa0zK+MtOn2NOQrhzRv959+rXHfEU3f+ogX9yAAxLuS+PNv4Pm+aavtue2NJ07oFc1zz75D5EXkvn4B2wd9vCZD0qIa1fAUvUD5/usb1A374T76FX/56h4UF+jPt7zb9f3I/BaoAzSKKrh+fFD+4haeAaX309PaNqn1Nqz0GNNdMZTvd87R1OWZ/f+TMtoskfT3qDrA7cfTZnvejYapK92aoibduaZ59NGzznbc1sHnJxIbY5q+n9A3fk/6mb67YK3XvSi9/0V9Lnps/T5kQbop9bSI1N1SOZJ/6ZbPCfz6+nXG/OafYf6qy+PGmM60Lt85Cw6O/DKad6rgPEBt01MO2Pe3coHbwilzvBsOZZa+xdiXlnFY7cuNWN/sE/xb7pZ37OT9D0zr9X8qYN8cc0HJOyXxpdvB83zzV/tKfQ3zz/9I6fqQf8h8kIyHyDz1uEPn/mghLp2mZfqPXDez1qUk7nbs0Vp3ax9fFbgQd2bWfO45+yjNbKLKl4LlEEaTd6DdhXN9x63jPqe2wyfef+DH0zPCUjX6udcSP/1vN1FTQO3H+37lrqM2oSGZJ7ZKCVfP31MCzjZlny3iTIzDxv/6CLfJxF9T+fp79anNCw1fZY+2vk19a/NE7TaC8k8qTkt1U8P16835jVXuhXdiFby03mUXh4aknl8ACTTzph3t/LBG+I7YEdzUvb4FmJeWeCx2/VZY/3bY5BP0Zy+008Oqwgp8FNX+uKaD4ivEF8aX8Egia/2NbRYPz3+0U1mSOYDZN46/OEzH5RQ1y7zUgUkzzealz1vv/T+z+KfVeGgdqa5npNfUI9K1wKFkB7W31Uj3+3zi+hP7/ui/Pzh+pXSgDRYP7O997gdptMCtx/t+8l/PV0YGpJ55gBqMinfd5Y4WV6FfP/xnu+/nabpN2qWVlh2c/re8/Zx828DdEgv0hTPbYaGl2g6JPOkskzyspyrX2/Ma64E6QTxo452kKg4JKSAhQZAMu2XeXcrH7wh9Lh3iwvoF99CKh598etvT3cfC/YpPHtWrL+fUxFS4Keu+MUNOCCV11phF4JDEl/tqsT/0xOQAg6Qeevwh898UEJdu8xLNUFaqP+cqPWjaWJWhYM6hfpq+k32WZWuBQoh6b/sPSy+bks8F7at4j1pgjRW37YDrfW8PeLZ+4Dt/ZduoOZhIJlmHr83g+jcxzdr5pNFlOnbsCPNKe2g95V2Mc3WAntV/wZ//KT0neIsHdLuLM/PTPM8X0QdknnSAariPbncc70JWHMlSKcKv9p2fUIlSP5Vmcf7rgnGck37Zd7dSgfP88l9N1k7eP5z1BcSePT9e+X99ffl1GSN/mHlT3GAsvieBUIK+NQVv7jmAxL+S+MrGCT+1T7kH6YnIAUcIPN1w1vlw+fPdFBCXrvMSzVBKj+dftVKTqx9XMyqcFCLq9Uu1Y7k1i2pdC1QCElf5xFKGeZvs+dHs9yHPpg1594wkAK2twLJPFPTdr7aNZcyPzKfPEgZvi2voLkl3v2cot1a6ffQh6pn79c+oX+aztIhad1Tt2k9PRfpkMyT9vuvbj94rjcBa64E6Vp6jZ+eQRcEgeRflXm875pgLNe0X+bdDQZpvHdCe88P0PpCAlYWcOwOn+G5umnBPoWxZ0srQqr8qc3jzQck0pdGLyykw5RWbmwoIAUcoEqQKh8+f6aDEvLaZV6qCZLn9u0jnm/N/zLNqnhQe3kO9af0gFbpWqAYklaDxB+YT/b9uebfYSAFbG8FknmmtyP/Sz/hqPlkDvl+VDyPfjb+0SQ60/h7Xfmza73vB3p+0riOFpr2xAtpHo0syu6leSGZJ5Wm+W58fKFfb8xrrgRpnLgV77kJ4LmVcT+9qZ/+rNLPSOaFVv5DiG9nzLsbDNJQ77YX0GrfQswrCzx2c+iUogoX+T5Fabpvzz7T98y81iCf2jQ+4ID4Cvel8e+geb75q51LzJhi+hnJfIAqQQp5+MwHJdS1y7xUM6QtKSeX3266SVH5oM6iAVoP7xYVrgWqIV1Dn3g/3KP/ubmafqr8knCQTNtbgRQwc6vvhll7WmM+2d7314A96dn8N2mH6/pvN2vac3SZ9/3v1K4gjd/BwfdZ+nh+Yjil1Tv670e9kMyTmtAP+ulB+vXGvOZKkPZV5788XZiauU3/J96bGk9WgmQeHwBJ7EzA7gaD5P3tfVFm2gHfQswrq3Ds/kn3mVZpOl5NfTcDH9b3zLTWYJ/aPN58QLyF+tL4D4t3B83HwvzVvtL7TVIb1el7MyTzAaoMKdThMx+UENeugKWaIXkWMrtaU800q+JBLand6HDVJhUPRgwgfUTNdKiL0m/WtFrkuR6VP1NP/3XRDLpBC3ZdMG9vPli+7X2ZvyOZZq6ijvqPz0UN03abTmpv02X66Ue9Pxb6m5FCd+o/++YPoOr+L22HlCfpJfOeeCFpw1I7nV7uh2Se9Ij3t+mba/p+/S3W7IP0zdccrfdeFPp//kdfyfH+fu0V6uAZuKa2fuUx71XAePM1wbwz5kMYDFKa/luUl+lK/ptasbIKxy6vesq3QT/FILrJs76NJ+h7Zl5rkE9tHm8+IOG+NP4LfTtonm/+ar9FLTw/uW+pmbPXf4i8kMwHqDKkUIfPfFBCXbvM++Y9cMYX5m06nUx37ApyUO+jkfrv+ipdC1RD8vw8cvIjw7pl5C7Tj/VZzz3X+px5dOLz2zemZNzVP8h1wby9eVd92/taSTn/8PV+wMyedMbAfw84lR7Ub0Txk+VdqdljT3Wis3eblvlJdUptfnWzTGr0q/+cDyk9Z59pCz+krak0XPNDMk/Kq00t/9Wjhu8/YNOaK/1BVtPeq0aZl/69bVWq4r07RUF1avNwj2pj6W+BexUwPuA7kmlnzLsbDFKf3D4jeqdlLDUWYlqZsVfGf0IvUePDwT7Fzjp06aBeNQbqe2Zea5BPbR4fcEC8hfjS+PLtoHm++atddj2d2v/2XHrdOEReSOYDVBlSqMNnPiihrl3mpXoPnPGFKc4lz0/JplmVDupiyk3ZUvFgxAJS2ettctMb9vZ+sYecmdVowG7tjqr1ftWer511cTBIpu0DDpZ3e1/iV7jDAmaW/fey2mk12r1Zrt/FhJ/USl68OCeryZMBSrTdz7aunV6z/RvGXVS14/Uq3L3KB0m7OlX/zbEXUsCktV1PqHL+63vo0sA1B4GkFQ5vUzuj1iVD/b8S/K1jTrVLv2B0ReBeBYwPgGTaGfPuBoP0yoIO1ap1WMQXYlqZsVf8LkItxP0KA47XH/qevbHKK8K01iCfOmC8+YB4C/Gl8eXfQdP8gK92ybgLsqu29/7I6j1EvrsImQ5QZUihDp/5oIS6dpmX6jtwxhfmbv3OP6ZZlQ5q+WnUTqt4MBRAQnFM5WOQLD8eItGL7qCMMt0xRSpASuIAKUhRHZTjjWofj
bxVsAApiQOkIEV1UB6q9BdHqwFSEgdIQbJ/UNYOvpyaH468XdAAKYkDpCDZPyhfpVa71fbD/QEJIQUBEkIKAiSEFARICCkIkBBSECAhpCBAQkhBgISQggAJIQVFAenQ/kgdLimKuI2tDhyNzVzPig/GZvCBY7GZu784VivefzxGc4tLIl9z7BWrFR8KuWLx8P0oIO1nkTqi7Yu4ja32HI/NXHbYwl7ZandJbOayYu1AjCaXxmjuIa0oRpPLYjT3oHYwxCV7ASl4gCQCJCNAkg6QRIBkBEjSAZIIkIwASTpAEgGSESBJB0giQDICJOkASQRIRoAkHSCJAMkIkKQDJBEgGQGSdIAkAiQjQJIOkESAZARI0gGSCJCMAEk6QBIBkhEgSQdIIkAyAiTpAEkESEaAJB0giQDJCJCkAyQRIBkBknSAJAIkI0CSDpBEgGQESNIBkgiQjABJOkASAZIRIEkHSCJAMgIk6QBJBEhGgCQdIIkAyQiQpAMkESAZAZJ0gCQCJCNAkg6QRIBkBEjSAZIIkIwASTpAEgGSESBJB0giQDICJOkASQRIRoAkHSCJAMkoMSEtenXwUyNfm7Mumj0DJB4giVwEaecbl5C/c/p+kGd3zwCJB0gi10D64O56RO2fe2/K68/2vTyLKLfbh4W29gyQeIAkcgmkjdcT5fRZZHy4/dN+DYjOemGbjT0DJB4gidwBqaATtfx8R8DlhbNvzKDaz/wpvWeAxAMkkTsgPUqX51feZNX91aj2szsqXxA2QOIBksgFkBa/eQM1WB10o3UPVqVT3pLbM0DiAZLI+ZC6E1HTFaE2+6NfOl3xvcyeARIPkESOhzSZzhnywa4wGy5qRxmDg9zwCxUg8QBJ5HRIBeekLY606aST6ELr35QAiQdIIqdDeoVuirzYdTdQlREFFvcMkHiAJHI2pKUfnZT5o5Xlvl6LrrR4xyFA4gGSyNGQ3iaiIdbWu7oDNZhtaUtA4gGSyNGQbqRe71q9H9CuQakZI6xsDEg8QBI5GlLdE2XuTvdxLbp+a+TNAIkHSCInQ/qe/i615lWt6PxfIm4FSDxAEjkZ0nM0Tm7RebdQvQWRNgIkHiCJnAzpRloiu+whKTmR7jEESDxAEjkZUsu0ndLrnlQl9d/htwAkHiCJnAypQT0bC59Xl24I+ysHQOIBksjBkPZmNLez8lUt6OzvwlwOSDxAEjkY0la60tbS8x+gqhNDXwxIPEASORjSCrrZ5uJfzaHeIZ8cBZB4gCRyMKSvqK/d1S9qTBf9EOIyQOIBksjBkD6lR20vf/MNlPNC8LtFABIPkEQOhjSJnotiByadQFf+FuwCQOIBksjBkP6PJkSzByvaUO33gpwPSDxAEjkY0jB6O6pdKHg6k3r8XulsQOIBksjBkB6mz6PciYVNqOqQik/XBUg8QBI5GNJdtDDavdg5siY1ej3wlw6AxAMkkYMh3UQhn4TLehseyKSLAx46C0g8QBI5GFIn2qhiT5ZcRSkdJ4u7vwISD5BEDobUIs3qEwNF6OOLier8a5n/I0DiAZLIwZDOqKVsbxbeWZ1S2r7q/b0DIPEASeRgSCecqXB/tk24hKhm368ByRQgiZwLqTS1pdpd+q7/iUQXPL8JkIwASeRcSHvoKrW7xFjem53SKKvHp4p+9qoQIIkAySj+kDZRN7W75O2XJ08jqt/nPTuv9RchQBIBkpEKSAfH9ek5vMB3evuzvW554ncZSD/TXWp3yd/u+f+sTlTl6rGrFA8GJBEgGamANGLw5ryxA8v0k+X9JhQffe+fRRKQvqYH1e6Svz3H2c7P7juTiJo9PDvc68XIBkgiQDJSAIl12eT5rnTDKi+czms1bW/ndRKQpll92m/J/L+1W/ZsuwyimjdOWKNqMCCJAMlIAaQl3co9b+//yPvBY+OLjnzQ95gEpHfpebW75E/8+nvTGz3qEqU0f3iWkm9MgCQCJCMFkObeob8dMtH7wZ6BnTv33qifWtjC07Lw/1TvFXon8kbRVr5iZLt0zzem7h8Vx/6TIWSqjJ+KBOlO/a0PUsmDE/YXT+2lI1x+m6dfSiJV9h/6JOJGtiqv8DGbckd9oqq3fHIwurllWml0A0JWccWqwopFWozmlmllIS45bhXSD76bdlP10yu6HPG8vWu6cZmFm3bD6GO132T9BbtnQ+GCfzUiyu3+gfxTu4pw006Em3ZGCm7a7emyQdMOdF2tn/65s37TqbcMpMdoltpd8hfiLkKFc+71fF+q2Wuq7Z+XAEkESEYqfv09+qHNO555pFybP10r7j3h4LFPu+2UgDSQvlK7S/5C39euYMbddYjqPmzzD0yAJAIkIxWQisf37jXKs/mYoZq2dXivHo//yi+yAOkusv5K5TKFvdPqrk9vy6W0v38i8wJnRoAkAiSj+N9F6Fb6We0u+Yt07++tY5sSNR4p/6BCQBIBklH8Id1Iq9Xukj8LD6NY0D2Dsrp/LTkYkESAZBR/SH+jDWp3yZ+lxyOteboRUfNx22UGA5IIkIziD6kjxeAe2szyA/vyJ7dLoXrDLby6sxEgiQDJKP6QLiOV9ygVWX+E7Pd3Z1PNQeusbg5IIkAyij+kFplq98hI5qHmfzxSg3LusfjrcEASAZJR/CGdW13tHhnJPWfD1pH1KaO7pV/EA5IIkIziD+mMumr3yEj2yU/yXjiNUq//MvKGgCQCJKP4Q6rfUO0eGck/i1DBexcSXRLslS0CAiQRIBnFH1Ktxmr3yMjO03EVfnApUevZ4TcCJBEgGcUfUtWmavfIyObz2s28kugfS8JtAUgiQDKKP6T0C9XukZHtJ4icfSmldgnzvP6AJAIko7hDKqZWavfIyP4zrRZOOoOyHwp5JzxAEgGSUdwh7aXL1e6RUTRPWbxzTF2q9Wxe8AsBSQRIRnGHlE+d1O6RUXTP/b3t6erUYFzQ+1wAkgiQjOIOaTtdp3aPjKJ9Ev0/+mVS0ylBLgAkESAZxR3SRuqido+Mon81ip+6pdJl8yqdDUgiQDKKO6Q1dLPaPTJS8bIuC6+k1EovmA5IIkAyijuk7+h2tXtkpOb1kT4+m3KfCfytAyCJAMko7pCG0r1q98hI0QuN7XyuBjX+0HwOIIkAySjukN6gx9TukZGyV+xbe1sqXbNMfAxIIkAyijukgnuXRdzGVgpf+nJBK8p8YIvxESCJAMko7pCOaPvU7pGRyteQLZzUkE4a538BQEASAZIRIFlr22NZdKHvbuGAJAIkI0Cy2vIulNJd/1U4IIkAyQiQrPfRWVR9RD4gmQIkI0CSaOfw6nTePEASAZIRIEm15mZK7X8AkIwAyQiQJJt2FtV/NSaTAUkESI6HxHYMyaKrwzyA1n6AxAMk50Nih9e3p+zHonnNvxABEg+Q3ABJ2/f/atF5c5UPBiQeILkC0n62pjul9t6seDAg8QDJJZAY+6IxnfT/1A4GJB4guQYS2/5YJl2j9JcOgMQDJPdAYmxRK8p+WuFr0gASD5DcBIkVjK1Bzb9VNhiQeIDkKkiMre5KGYNCPP+ddIDEAySXQWJsSgNq
UvmZhmwFSDxAch0ktql3Snq/P1UMBiQeILkPEmMfN6TTvlAwGJB4gORGSGzbA6kpvSVeFz1EgMQDJFdCYmzmmXTKp9EOBiQeILkUEtv+QFpKtPcZAiQeILkVEmOzz6Z670Y1GJB4gOReSGzHYxnUZV0UgwGJB0guhsTYN+dT3cn2BwMSD5BcDYntfDqTuvxhdzAg8QDJ3ZC835RqT7I5GJB4gOR2SCx/SCb9fbWtwYDEAyTXQ2Lsu1ZU46VCG4MBiQdIgMRY4biq1MbGa2wAEg+QAElvxRVURf4hf4DEAyRA8jWpFrX6TnIwIPEACZD8/X49ZTwg95A/QOIBEiDx3qtHTefLDAYkHiABkmhjb5J6yB8g8QAJkMx91JBO+8zyYEDiARIgBeR9yJ/VR1cAEg+QAKlCs86iem9b2xSQeIAESBWz/ugKQOIBEiBVbsF5VMfKHVkBiQdIgBSknU9mUrdNETcDJB4gAVLQvmtOjaZH2giQeIAESMHLfywtpV+EOzoAEg+QAClUs0+jpovCbgFIPEACpJBt7k1VRoZ7nBIg8QAJkMI0qSZd+VvoiwGJB0iAFK6VbenE0M99B0g8QAKksBWOzKTuoe7HCkg8QAKkCC1qSmctCH4RIPEACZAitaNfSvpjQR+GDkg8QAKkyH1cj1otD3I+IPEACZAs9Md1lDuu8tmAxAOkCh3YHakj2v6I29hqb0ls5u4+bGGvIvXfHOq6oeKZe0qjnhu8Yq0oRpPLYjT3UNKt2AMpxCX7VEA6VhKpMq004jb2Ko/RXCUrXt+GTllQ8cyEXnHQkm/FWozmlmllIS45rgISbtqFKsid73DTjoebdoBkuRmN6IIl5jMAiQdIgGS9TTdTNfMD/gCJB0iAJNOEKikD8vlHgMQDJECS6pvTqDV/DRhA4gESIMm1+XqqbTzzHSDxAAmQJCt8Oi39ad9JQOIBEiBJ90Ud+rv3mVEAiQdIgCTfLy2p8WIGSKYACZBslNfP+3twQOIBEiDZ6pXslH47AYkHSIBkr69PpTZrAMkIkADJZpv/QfXDP1uX/QBJBEiyJRkkVvh0qvF7cNUBkgiQZEs2SIx9UZdukniBP+sBkgiQZEs+SLs3t6CzZF8K3UqAJAIk2ZIQUkleb6r2pvrBgCQCJNmSERJj/0//PbjqwYAkAiTZkhMSW3gqXfa74sGAJAIk2ZIUElvfkerPVjsYkESAJFuyQmIFg1Iz/6N0MCCJAEm2pIXE2Acn0N1Bn4vVZoAkAiTZkhgS++lsukbhX5QASQRIsiUzJLbhcjr3F2WDAUkESLIlNSSW153qf61qMCCJAEm25IbECh9LqfahosGAJAIk2ZIcEmMTMtJfUDMYkESAJFvSQ2Kf1qB+4V682XKAJAIk2ZIfElvciLruUDAYkESAJJsDILHVzemSddEPBiQRIMnmBEjsz2vpnBVRDwYkESDJ5ghIbFdfqjs/2sGAJAIk2ZwBibGRqTnvRDkYkESAJJtTILHJVdJGRzcYkESAJJtjILG5talfQTSDAUkESLI5BxL76Sz6+7YoBgOSCJBkcxAktr4NXbzG/mBAEgGSbE6CxPK60Snf2x4MSCJAks1RkFjhY3TCF3YHA5IIkGRzFiTGXsrIfMXmYEASAZJsToPEpuamPGZvMCCJAEk2x0Fii06mW2095x0giQBJNudBYr+dTx022RgMSCJAks2BkNjWq6npSvnBgCQCJNmcCInt7EX1F0sPBiQRIMnmSEiMPZVSa6HsYEASAZJsDoXEXkitLvuMxoAkAiTZnAqJvZqe84ncYEASAZJsjoXE3sjIfFdqMCCJAEk250JiH2RlTpYZDEgiQJLNwZDYF1XTJkgMBiQRIMnmZEhsZm7qi9YHA5IIkGRzNCS2oFbKc5Y3BiQRIMnmbEjsu5NosNVtAUkESLI5HBJb2oAesLgpIIkASTanQ2IrTqO7rT01OCCJAEk2x0Niq86g3paeXQiQRIAkm/MhsTXnUrd8C9sBkgiQZHMBJLbhYupq4aF+gCQCJNncAIltakVXR37hF0ASAZJsroDE/uxAbbdG2giQRIAkmzsgsbzr6NLNEbYBJBEgyeYSSCzvemoe4cXIAEkESLK5BRLb1YPOWR12C0ASAZJsroHECm6jxr+E2wCQRIAkm3sgscJ7qdHyMJcDkgiQZHMRJMYG0ck/hL4UkESAJJurILGnqc63IS8EJBEgyeYuSOz5lBPmhboMkESAJJvLILFxoZ+mC5BEgCSb2yCFeZouQBIBkmyug8Tezcx8J+gFgCQCJNncB4m9n5X5VrDzAUkESLK5EBL7JCcj2BPeAZIIkGRzIyQ2LagkQBIlMqSD4/r0HF7g/2BW3xvv/xGQ7KQAEptZNW1ipTMBSZTIkEYM3pw3dmCZ9/SC3ssLvuhXDEg2UgGJzaia8XbF8wBJlMCQWJdNnu9KN6zyftDvq4DLAEkiJZCCSQIkUQJDWtKt3PP2/o/007s7f/Wvmx9dq58sOeBp7+5IHdH2R9zGVnuPx2bu7sPagdgM3lOiZMyMnIx3A88p1oqUTK5caYzmHorZiscXYRQAACAASURBVMtiNNcDKcQl+6xCmnuH/nbIRP3tus5Pbi+a2GO/5+TCFp6Whf+nKBZ9WzXzi3ivAfHK+KlIkO7U3xqQPLfwSm9d4Dm56j5Pvx2PVJlWEnEbe5XHaG7ir3h2duZn5o9LE37FFSvVSmM0+a9f8TGrkH7w3bSbqp9mnTd43g6calyGn5EkUvMzkt7UKgEvRYafkUQJ/DPSni4ePAe6rvZ+G+s9Q9OOdV8ESDZSB4l9nJX5nvgIkEQJDEkb/dDmHc88Uq7Nn65pU3utZC/3PgJINlIIKVASIIkSGVLx+N69Rnk2HzPU8y3p7dtvfGIbvwiQJFIJKUASIIkSGVKYAEkipZB0Se/7TwKSCJBkczsk9lFW5ge+U4AkAiTZXA9JSAIkESDJBkjsw8wq3sfMApIIkGQDJC4JkESAJBsgMa+kTwHJHCDJBkh6UzKzPwMkU4AkGyB5+8AjCZBEgCQbIPmanJE9G5B4gCQbIPmbnJGzEJCMAEk2QDLySJoZm8mAxAMk6ZIOEns/PfuL2EwGJCNAki75IBVPTc+JjSRAMgIk6ZIQkvZOes60WEwGJCNAki4ZIR2YFBtJgGQESNIlJST2RnrOdPWTAckIkKRLTkgeSblzlE8GJCNAki5JIcVEEiAZAZJ0yQqJvZ5efa7iyYBkBEjSJS0kNjFNtSRAMgIk6ZIXEpuQWmOB0smAZARI0iUxJI+kE5eqnAxIRoAkXTJDYmOowQqFkwHJCJCkS2pI7DE6Y426yYBkBEjSJTck1p8u2qJsMiAZAZJ0SQ6psAddvkPVZEAyAiTpkhwS23k1/S1f0WRAMgIk6ZIdEtvemnoUqpkMSEaAJF3SQ2KbzqcH1UwGJCNAki75IbG1Z9IwJZMByQiQpHMAJPbTSSnjVUwGJCNAks4JkNiimmlvKpgMSEaAJJ0jILG5OZkfRz8ZkIzkIFU1lQlIivtLIbFPMqt9GfVkQDKSg3SLp3M
y2nS74cKUFvcDkuL+WkjstdRa30c7GZCMpG/aTW22U3/3R5PpgKS4vxiSijuwApKRNKRmH/ve/685ICnur4ak4A6sgGQkDSlzgf87UxYgKe4vhxT9HVgByUgaUoNe3nflt9QHJMX99ZCivgMrIBlJQxpG5z8wYsTApvQEICnur4cU9R1YAclIGlL5f+qTp9pPlwKS4uIASb8D6y1R3IEVkIxs/EG2/M9lP2wqs8IIkKSKB6Qo78AKSEY2IB358TOmlQCS8uICKbo7sAKSkTykF3KJlmpP3WGJEiBJFB9IUd2BFZCMpCFNpC6veiC9nT4GkBQXJ0jR3IEVkIykIV3QXzvigaQ9eTYgKS5ekKK4AysgGUlDqvKlD9K8DEBSXNwg2b8DKyAZSUOqO8MH6ePqgKS4+EGyfQdWQDKShnRVh8M6pD3NrgEkxcURkt07sAKSkTSkr9MaP0h39ame8R0gKS6ekNjjtu7ACkhG8r/+XnCRfs+GS76x4giQZIorJHt3YAUkIzsPNS9YuXKvZi1Akii+kGzdgRWQjKQhtZll0RAgyRZfSLbuwApIRtKQGo4DJGdCsnMHVkAykoY0rennxwEpJsUbkn4H1gfkJgOSkTSkdudTZoNT9QBJcXGHxNY2lrwDKyAZSUNq27GTP0BSXPwhSd+BFZCMbD9B5MH1gKS4BICk34F1ksRkQDKyDWlBLUBSXCJAYnOrytyBFZCM5CHN7NWubdu2rXNrA5LiEgIS+zCj2kLLGwOSkTSkKZTekBpUoSst/T0JkCRKDEjsfyknWb7bHSAZSUNqcV2RlvZbyctXFAGS4hIEEhtKZ6+3uCkgGUlDyp2paWm/atpDAwFJcYkCifWl1hbvLARIRvIP7JujadUXadriBoCkuISBVHA9dSmwtCUgGUlDuujmY9p5QzRtWlVAUlzCQGLbW9F9ljYEJCNpSO9SJ+3ptH7DT74MkBSXOJDYusb0nJXtAMlI/tffU0ZrxVcTNVoOSIpLIEjspzqpVp5ZCJCMbP5BdsMaa/dcBSSJEgkS+zIna2bkrQDJCK8hK507ILEP0msuibgRIBlJQzrRKBeQFJdYkNj/UaPfI20DSEbSkLp6uyS7Gf6OpLoEg8Qeogu2RtgEkIzs3rTLbz8TkBSXaJAKe1CnCI89ByQj2z8jLW8BSIpLNEhs5xXUK/wWgGRkG1J+NiApLuEgsc3N6PGwGwCSkV1I5SMbWoFUtDdSR7UDEbex1f6S2Mzde8TCXtlqX2ls5u49rB20+S/XNEyZEO7yMptzI3VYOxSjybFacbFWHOKS/cEhNffWrDYNsgLpWEmkyrTSiNvYqzxGc1214l9qZswMc3ECrjhCWozmlmllIS4Rf3ENAumiji8dswIJN+0kSrybdp6mZVb7KvSluGlnhD/ISucuSGxiapjH+QGSESBJ5zJIYR/nB0hG0pDSc6qaAiSFJSikcI/zAyQjaUgDzk1vfdMNF6Zc2OMWT4CksESFFOZxfoBkJA1p6vl5+ru158yIhAiQJEtUSGEe5wdIRtKQzpvqe/+/5oCkuISFxNadGeJxfoBkJA0p80vf+4+zAElxiQsp5OP8AMlIGlKDnuX6u9LO9QFJcQkMKdTj/ADJSBrSv6nxfcOGDTyXngQkxSUypBCP8wMkI2lIZaPq668hW2dYKSApLqEhBX+cHyAZ2fiDbPmfy37YVGaFESBJldiQgj7OD5CM5CEV79S0w5Nf2ARIqktwSMEe5wdIRtKQ1tYdrZW0JKqxApAUl+CQgj3OD5CMpCHddP5G7V16ZeNlNwOS4hIdEtt8XsXH+QGSkTSkuu9r2o3NNO39RoCkuISHxH5rmPJSwBmAZCT/B9mFWmnNxzVtfiYgKS7xIbHFJ2R8ZP4YkIykITV6Q5tPCzVtEv4gq7okgFTxcX6AZCQN6e56T5x6ZqlWcAF+RlJdMkCq8Dg/QDKShrSzNdVeqmm31PgFkBSXFJACH+cHSEY2/iB7QH82h+W7rDgCJJmSA1LA4/wAycjOQ80PzymwpAiQ5EoSSObH+QGSkR1IW+hzQIpBSQLJ/Dg/QDICJOlcD8n0OD9AMgIk6QBJPM4PkIwASTpAEo/zAyQjO5COrdyvWQyQJEoiSMbj/ADJyP4TRG4BJMUlEyQ2hk75HZBEkpC+vabxNbP1E0efw8u6qC6pILEH6aJtgMSTg7Q0I+WUjJSPNW3eWXQOICkuuSAVdqerdwGSkRykrjVWaQUtm26/mU4YfzwYHECKouSCxPIup/6AZCQH6bQHPW/mUJW0+5gVRoAkVZJBYusb04uxmMucDyn9v543W6nDb9YYAZJUyQaJ/VQ79Z2YDHY8JHrd8yaf5lh1BEgyJR0kNi8ne15MBgMSINkv+SCxD1NPWhmLuYAESPZLQkilQ+mcjTGY63hITy5dunQWjV+qB0iKS0ZI7E5qm6d+ruMhmQMkxSUlpF3XUE/1c50OaZg5QFJcUkJiW86jocrnOh2SdIAkUXJCYr81SHlF9VxAAiT7JSkktrBq5ueK5wISINkvWSGxD9Nr/aB2LiABkv2SFhIbR6f+oXQuIAGS/ZIXEutPl+4Iu6FkgARI9ktiSAXXU9eCsFvK5QpIR378jGklgKS8JIbEtrekhxTOdQOkF3KJlmpP3WGJEiBJlMyQ2LrTaay6uS6ANJG6vOqB9Hb6GEBSXFJDYt+fkDFV2VwXQLqgv3bEA0l78mxAUlxyQ2LTM3O/VTXXBZCqfOmDNC8DkBSX5JDYayn1f1E01wWQ6s7wQfq4OiApLtkhsUF0wVY1c10A6aoOh3VIe5pdA0iKS3pIhbfQVbuUzHUBpK/TGj9Id/WpnvEdICku6SGxne2pj5K5LoCkLbhIfzDSJd9YcQRIMiU/JLapKY1UMdcNkDStYOXKvZq1AEkiB0BiP9dJfUvBXHdAkgiQJHICJPZldpU50c91OqRzzAGS4hwBiU1OPXF51HOdDqmtOUBSnDMgsWF09oZo5zodknSAJJFDILG76bJon1jIFZB2zZ789txdgKQ8p0DadR3dGuVcF0Da9890/dffKb0OAZLinAKJbWlGT0Y31wWQ7si4++2ZX7zWlfoDkuIcA4n9dnLKf6Oa6wJINd/2vR98IiApzjmQ2KLqmZ9FM9cFkLLyfe8X5gCS4hwEiX2UXnNpFHNdAOni733vX2kPSIpzEiQ2nk5Za3+uCyAtaLm4XNNKZ533MyApzlGQ2ED9xZrt5gJIretQ1TPOyKZGTazcuwGQJHIWpMKbqLPtJxZyAaSL28jcuwGQJHIWJLa9FT1gd64LIMkFSBI5DBJbdwb9x+Zcd0Aq2ucNkBTnNEhsea209+zNdQGkTf+oihcai81gx0FiMzKrfWNrrgsgXVGj16DB3gBJcc6DxCam1FtlZ64LIFX93gogQLKRAyGxwdR0k425LoBUNw+QAMkoIqTCHtQpX36uCyA9OgKQAMkoIiS2swPdLj/XBZCOXdV20Ghv/KyD4/r0HF7AP1zQeSkg2cmRkNjmpvSs9FwXQBpNVPG3diMGb84bO7DM/9G+27sBkq
2cCYmtqJM6WXauCyDV7/bdxi3ejHNYl02e70o3rDKkTbodkGzlUEhsXnaVLyXnugBSVqVfNizpVu55e/9H/o/6HvFBKjngae/uSB3R9kfcxlZ7j8dm7u7D2oHYDN5TEpu5u4u1ohhNLrW01Vspp2yQm3soZisui9FcD6QQl4h7LgRAumiVVqG5d+hvh0z0fnCw90rNB2lhC0/LKm6M3NhTdFVpvNcQt8r4qQBI33b8pcKGc+/U3/ohvfii5oe0Wv+j7dqjkSrVjkXcxlbHymIz17Pi47EZfKw8NnOPlsRqxUctrvjwVTRUam7cVyxdiVYS6qLgkNo2pGqnejPO+cF3026qfnpl7yIDkjf8jCSRU39G8rS2QeqHMnNd8DNSu05Gxjl7umzQtANdV+unx3Tr2bNnl+6jAMlGDobE5mbW/FlirgsgGR1cz0+OfmjzjmceKdfmT/ft/m3zDwCSjZwMiY2iiySeNdJFkBbU4ieLx/fuNcqz+Zihvo9x085ejobEutOd1jd2A6SZvdq1bdu2dW7tYLwqBkgSORvSn01oguWNXQBpCqU3pAZV6MpZgKQ4Z0NiP+RmLbS6rQsgtbiuSEv7reTlK4oASXEOh8TeoNOtPqTCBZByZ2pa2q+a9tBAQFKc0yGxe+m6QmtbugBSlTmaVn2Rpi1uAEiKczyk/NY0zNqWLoB00c3HtPOGaNq0qoCkOMdDYr+flD7N0oYugPQuddKeTus3/OTLAElxzofEvkiv86uV7VwASZsyWiu+mqjRckBSnAsgsaHUysrfZd0AyduGNcetOAIkmdwAqfB6GmBhMzdAKt6paYcnv7AJkFTnBkhsy1kpkyJv5QJIa+uO1kpaEtVYAUiKcwUktii76vcRN3IBpJvO36i9S69svOxmQFKcOyCx/1HjLZG2cQGkuu9r2o3NNO39RoCkOJdAYr3pxkibuABS5kKttObjmjY/E5AU5xZIeRfS8xE2cQGkRm9o82mhpk2qD0iKcwsktrJWxqzwW7gA0t31njj1zFKt4AL8jKQ610Bin6Sd/EfYDVwAaWdrqr1U026pUfE5UAAp2twDiT1K7XeFu9wFkDTtgP632OW7rDgCJJlcBKmgIw0Kd7krIGmH5xRoFgMkiVwEia1vlPJOmIvdAWkLfQ5IMchNkNi8zBN+Cn0pIAGS/VwFiT1P520PeSEgAZL93AWJ9aDeIS8DJECyn8sgbW9GL4e6zOmQthdrW45px1buB6QY5DJIbFn1rK9CXOR0SFVmaGTpAX2AZCO3QWLvpDRaH/wSp0PKvn0RvbHYHyApznWQ2AC6JvjTCjkdUk8yBUiKcx+k/DY0NOgFTodUMvMtGjbZHyApzn2Q2Jp6qVODne90SJ46rbMCCJBs5EJIbHZG7WBPK+QCSJq2e+bEN+ZaesJiQJLKjZDYMGoZ5GmFXACp7NEM/QekqmMASXWuhFTYme6tfK4LII2hGyfNnvnatfQ2ICnOlZDYlrPplUpnugBS00d87++5GJAU505IbFFO1e8qnucCSFlf+d7PygYkxbkUEnuNGm+ucJYLIFWd4Xv/RTVAUpxbIbE76YYK57gA0uVXHtPfHbnmCkBSnGsh7WxFowLPcQGkWSmn9B/xbL8GqV8CkuJcC4mtqpUxM+AMF0DSPm+i//r7fEsvIQtIMrkXEvs0re5q88dugKRpeT9afOoTQJLKxZDY43S5+WmF3AEJT34Sm8FuhlTQiR4yfegOSHiEbGwGuxkS23BKylviI0ACJPu5GhL7ukqN5fwDQAIk+7kbEnuBzt1mnAYkQLKfyyGxW+mfxkl3QMKTn8RmsNsh7biAxvtPugBSizW+9580BSTFuR0S+7FG1gLfKRdA8j+LUMlwvNCY6lwPib2X0mid94TjIZme+wQPo1AdILEH6OoC/b3jIa16ibrerdf339sBSXGAxHZdQU/q7x0PSdOuXW8FECDZCJAYW1s/9WPmCkhyAZJEgORpTmbNFa6AdKJRLiApDpD0RtDFeW6A1NXbJdnNBgKS4gDJW1fq6wZI/vLbzwQkxQGSty1n0UT3QNKWtwAkxQGSr8XZNda6B1I+nkVIdYDk7z90WalbIJWPbAhIigMkf4VX0wuOh9TcW7PaNAiQFAdIRmvrZC2JzeREg3RRx5eOAZLiAIk3hZruiM3khIEkFyBJBEi8Q33o/thMThxIG6e/P3MHIMUgQOIdOnhG6ucxmZwokKY18971u803gKQ8QOId0uanNdgQi8kJAmkc5fR6cfL4W3NS3wQk1QES75BW9AB1j8XkxIC0KrXtTu+JvMsyLL0EJiBJBEg8D6SdF9LrMZicGJDuqLnbf2p3zXsBSXGAxNPvIvRdlRqr1E9ODEin9eMn72kMSIoDJJ73vnYj6IpC5ZMTA1LWWH7y/3AXIdUBEs8LqbAjjVY+OTEgVRvNTz6PxyOpDpB4vnt//1oza5HqyYkB6fx/8pOdmwOS4gCJ538YxWQ6P0/x5MSA9HjGav+pJalDAUlxgMQzHo90Ez2seHJiQNpZ4+Q5+vuyKbVO3F2ZDSBFFSDxDEgbG6ZOVzs5MSBpC6rTaTf16VKfai+x4giQZAIkHn+E7PTUU7YonZwgkLStA04motMH5VtyBEgyARJPPNS8P/VSOjlRIHk6sOOgNUWAJBcg8QSkvHPpTZWTEwiSTEX7InXUwja2OlASm7meFR+MzeD9pbGZu++IdihGk8tiNPewWPHSrBPXKZwcqxUXa8UhLjmgAtLR45Eq00oibmOrkvLYzI3dio/HasWlSbjiUn56JF1zTN3kv2LFgYlHwOKBfQHhpp0o9jftGCtoS+PUTU7Sm3aAJBEg8QKe127VCTk/KJsMSLIBEi/JIbEJdOFOVZMBSTZA4iU7JNaVBquaDEiyARIv6SFtaJA+V9FkQJINkHhJD4lNTTltq5rJgCQbIPGSHxK7i+5SMxmQZAMkngMgbWuc8oGSyYAkGyDxHACJfZVRe62KyYAkGyDxnACJPUF/VzEZkGQDJJ4jIOW3ogkKJgOSbIDEcwQk9lO1qj9GPxmQZAMknjMgsXF0ya6oJwOSbIDEcwgkdi0NjXoyIMkGSDynQPqjTsb8aCcDkmyAxHMKJPYunb09ysmAJBsg8RwDid1G90Y5GZBkAySecyD9eUbKh9FNBiTZAInnHEhsdlr99VFNBiTZAInnIEjsEbo+qsmAJBsg8ZwEKf9iejWayYAkGyDxnASJLcmuvjKKyYAkGyDxHAWJjabWBfYnA5JsgMRzFqTCq2iE/cmAJBsg8ZwFif1WK/Nb25MBSTZA4jkMEnuLmuywOxmQZAMkntMgsVvoX3YnA5JsgMRzHKRNjVK/sDkZkGQDJJ7jILGZaQ022JsMSLIBEs95kNj91MPeZECSDZB4DoSUdx69YWsyIMkGSDwHQmLfVTlhlZ3JgCQbIPGcCIk9S1cU2pgMSLIBEs+RkAra0fM2JgOSbIDEcyQk9mvNrMXykwFJNkDiORMS+x9dkCc9GZBkAySeQyGxG+lR6cmAJBsg8ZwKaWPD1OmykwFJNkDiORUS+
zTl1C2SkwFJNkDiORYSu4dul5wMSLIBEs+5kPKa0mS5yYAkGyDxnAuJLcw88XepyYAkGyDxHAyJDaGOUndwACTZAInnZEgFl9H/yUwGJNkAiedkSOzn3JwfJCYDkmyAxHM0JPYyXbjT+mRAkg2QeM6GxLrQk9Y3BiTZAInncEjrTkqfa3ljQJINkHgOh8Q+Tjl9q9VtAUk2QOI5HRK7k+62uikgyQZIPMdD2tY4ZYrFTQFJNkDiOR4Sm5d+0jprWwKSbIDEcz4k9jjdYm1DQJINkHgugJTfjKZa2hCQZAMkngsgsXlpjf60sh0gyQZIPDdAYn3pISubAZJsgMRzBaQtJ6cvtLAZIMkGSDxXQGJT6KJdkbcCJNkAiecOSOx6Gh15I0CSDZB4LoH0e41qkZ8OHJBkAySeSyCxMXRVxG0ASTZA4rkFUsGl9GakbQBJNkDiuQUSW5pZN9IL+QGSbIDEcw0k9ijdEWELQJINkHjugZR3durM8FsAkmyAxHMPJDY9pXH4V6gAJNkAieciSKwXDQ57OSDJBkg8N0HaVC/z+3CXA5JsgMRzEyQ2kdqEe+pVQJINkHiugsSupfFhLgUk2QCJ5y5IK6tW/y30pYAkGyDx3AWJPUs3hr4QkGQDJJ7LIO1qTu+GvBCQZAMknssgsW8zGoZ8wkhAkg2QeG6DxAZS/1AXJTKkg+P69Bxe4Du9Z+xt3Z9YB0h2AiRelJC2nZo6J8RFiQxpxODNeWMHlnlPPzx4084Xeh0BJBsBEi9KSOxjOjfES70kMCTWZZPnu9INq/TTRaO2aVph5/WAZCNA4kULiXWjYcEvSGBIS7qVe97e/xE/Y21X/d8e3uGJ7Y3UUe1AxG1sta8kNnP3HtGKYjN4X2ls5u49rB2M0eTYrfhQdAM21M7+OegFZdHNDVmxVhzikv1WIc29Q387ZKLxcdGAyfq7hS08LQv/TxGKUW/StfFegr8yfioSpDv1txzS9nte0b9DaasHe1p7NFKl2rGI29jqWFls5npWfDw2g4+Vx2bu0ZJYrfho4q74SEeaHOz82K24JNRFViH94LtpN9X30aqeM0yX4WckifAzEi/qn5EYW1al1h9Bzk7gn5H2dNmgaQe6rvZ+8PutP5kvAySJAImnABJ7im4Ncm4CQ9JGP7R5xzOPlGvzp2vH+nlf8Am//rYTIPFUQMpvlvJJ5XMTGVLx+N69Rnk2HzNUW9XZ20xAshEg8VRAYvPSzthe6cxEhhQmQJIIkHhKILG7g7xABSDJBkg8t0La3CD964rnAZJsgMRzKyT2AV1c8QUqAEk2QOK5FhL7R6UXqAAk2QCJ515Iq2tU+yXwHECSDZB47oXE/kNXB54BSLIBEs/FkAouockBZwCSbIDEczEktiTzpI3mjwFJNkDiuRkSe4TuMn8ISLIBEs/VkPLOTp1l+hCQZAMknqshsWkpZ5leoAKQZAMknrshsZ70hPgAkGQDJJ7LIa2vY3qBCkCSDZB4LofEXjW9QAUgyQZIPLdDYtfQi8ZJQJINkHiuh7Sias01/pOAJBsg8VwPiQ2nm/ynAEk2QOIB0q7m9J7vFCDJBkg8QGLfGC9QAUiyARIPkBgbQAO87wFJNkDiAZL+AhVpC/T3gCQbIPEAydNHdJ7+AhWAJBsg8QBJ7yYazgBJPkDiAZLeulrZPwGSfIDEAyRvL9KVgCQfIPEAyVthe3oVkKQDJB4g+VqWVWsdIMkGSDxA8vcE9QQk2QCJB0j+8s9L+TIWcxkg2QiQREkGic1NPWtHTAYDknyAJEo2SOwueiQ2gwFJOkASJR2kzQ3Tv4nJYECSDpBESQeJfUItCmIxF5CkAyRR8kEq+zv9JxZzAUk6QBIlIaTVNXJ/ibyZdIAkHSCJkhASG13xBSqUBEjSAZIoGSEVtKK31M8FJOkASZSMkNiijMAXqFASIEkHSKKkhMQeoruVzwUk6QBJlJyQ8s4KeIEKJQGSdIAkSk5IbFpK07xIG0oGSNIBkihJIbFb6SnFcwFJOkASJSuk9bUzl6idC0jSAZIoWSGx/9FlheE3lAyQpAMkUdJCYtfQy0rnApJ0gCRKXkgrqtZcq3IuIEkHSKLkhcSGUTeVcwFJOkASJTGk/AvofYVzAUk6QBIlMST2TXqjrermApJ0gCRKZkisPw1UNxeQpAMkUVJD2naK7wUqlARI0gGSKKkhsQ/pvHxVcwFJOkASJTckdiM9q2ouIEkHSKIkh7TmBP0FKpQESNIBkijJIbHx1FHRXECSDpBEyQ6psB1NVDMXkKQDJFGyQ2JLM+usVzIXkKQDJFHSQ2JP0C1K5gKSdIAkSn5I+RfQOyrmApJ0gCRKfkjsm4x6Kp5TCJCkAySRAyCxh6iPgrmAJB0giZwAKa9JytTo5wKSdIAkcgIkNjdNwd3AAUk6QBI5AhLrT/dEPReQpAMkkTMgbTs9dUa0cwFJOkASOQMSm5Zy5vYo5wKSdIAkcggkdic9EOVcQJIOkEROgfTnqWnzo5sLSNIBksgpkNinKWdH92zggCQdIIkcA4n1oMeimgtI0gGSyDmQNtVPXxjNXECSDpBEzoHE3qdmO6OYC0jSAZLIQZDYjTQ0irmAJB0giZwEaV3tzO/sz40xpKJ9kTpqYRtbHSiJzVzPig/GZvD+0tjM3XdEOxSjyWUxmns4HiueRC13255brBWHuOSACkhHI1aqHYu8kZ2OlcVmrmfFx2M0uTxGc0uwYl64Fd9AY23PLdFKQl2kAhJu2kmEm3a8eNy0Y+z3mtk/2p2Ln5GkAySRsyCxCdTW7uv4AZJ0gCRyGCTWkV6wOReQpAMkkdMg/Vojd6W9uYAk0i8KqwAAC59JREFUHSCJnAaJjaEO9m7cAZJ0gCRyHKTCDjTB1lxAkg6QRI6DxH6uWv1XO3MBSTpAEjkPEhtB19qZC0jSAZLIgZAKLqXXbcwFJOkASeRASGxpVq218nMBSTpAEjkREhtC3eTnApJ0gCRyJKT8C+kt6bmAJB0giRwJiX2bedIG2bmAJB0giZwJiQ2inrJzAUk6QBI5FFJeU/pIci4gSQdIIodCYgszGm6RmwtI0gGSyKmQ2L/oLrm5gCQdIIkcCynvnJRPpOYCknSAJHIsJDY79ZQ/ZeYCknSAJHIuJNaXBsjMBSTpAEnkYEjbTkudJTEXkKQDJJGDIbHPU87aYX0uIEkHSCInQ2K308PWNwYk6QBJ5GhIm09OX2B5Y0CSDpBEjobEptC5lp9XH5CkAySRsyGx7vSE1U0BSTpAEjkc0vq6mYssbgpI0gGSyOGQ2Lt08S5rWwKSdIAkcjok1pmesbYhIEkHSCLHQ1p3Yub3ljYEJOkASeR4SOwValVgZTtAkg6QRM6HxP5Oo61sBkjSAZLIBZB+OyFnuYXNAEk6QBK5ABIbT+0sPK8+IEkHSCI3QCq8ksZH3gqQpAMkkRsgsRXVcn+JuBEgSQdIIldAYqPp6ojbAJJ0gCRyB6SCNvS/SNsAknSAJHIHJPZDlVprImwCSNIBksglkNgw6hJhC0CSDpBEboFU0Iomh98CkKQD
JJFbILElWSf+EXYDQJIOkESugcSeoO5hLwck6QBJ5B5I+c3pnXCXA5J0gCRyDyT2TUa9jWEuBiTpAEnkIkjsIeod5lJAkg6QRG6ClNck5ePQlwKSdIAkchMkNjet0daQFwKSdIAkchUkdh/1C3kZIEkHSCJ3Qdp2RuqMUJcBknSAJHIXJDYt5cztIS4CJOkASeQySOwu+leISwBJOkASuQ3Sn6emzQ9+CSBJB0git0Fin6WcnRf0AkCSDpBEroPEbqXHgp4PSNIBksh9kDY1SP8q2PmAJB0gidwHib1PzYK9aBIgSQdIIhdCYjfRkCDnApJ0gCRyI6R1dTIXVz4XkKQDJJEbIbHXqUXlF00CJOkASeRKSOx6GlHpPECSDpBE7oT0e83sHyueB0jSAZLInZDYBGpb8Xn1AUk6QBK5FBK7jsZWOAeQpAMkkVsh/Vojd2XgOYAkHSCJ3AqJjaX2gTfuAEk6QBK5FlLhFTQh4AxAkg6QRK6FxFZUrf6r+WNAkg6QRO6FxJ6ja8wfApJ0gCRyMaSCS2mi6UNAkg6QRC6GxJZm1VorPgIk6QBJ5GZIbAjdJD4AJOkASeRqSPkX0lv8A0CSDpBErobEvs08aYNxGpCkAySRuyGxx+hW46QKSAfH9ek5vKDyaUCSC5B4SQIprylN859UAWnE4M15YweWVToNSHIBEi9JILH5T+X7TymAxLps8nwnumFVxdOAJBkg8ZIFkkgBpCXdyj1v7/+owumiNZ4K9kXqqFYUcRtbHSiJzVzPig/GZvD+0tjM3XdEOxSjyWUxmns46VZcrBWHuOSAVUhz79DfDplY4fTCFp6Whf+nCDk+/nNOREh36m/9kMTpdSM9rT8SqVLtWMRtbHW0LDZzj5TEasVHkm/F5TGaW6Idj9HkWK34eOgVW4X0g+/m3NSKp/XwM5JE+BmJ58qfkfZ02aBpB7qurngakCQDJJ4rIWmjH9q845lHyrX508VpQJIPkHjuhFQ8vnevUZ7NxwwVpwFJPkDiuRNSmABJIkDiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiARIg2Q+QeIAESPYDJB4gAZL9AIkHSIBkP0DiAZJ0c0buiuX4GPTVyD/jvQTJFo/cGO8lSLZs5O/xXoJkK0eujLhNTCGNbvFHLMfHoJda/BzvJUg2scV38V6CZO+3mBfvJUj2eYvPI24DSAEBUuwDJOkAKfYBUuwDJOkAKfYBEkIoRICEkIIACSEFARJCCooJpIPj+vQcXlD5dOJmXuWesbd1f2JdfNcTuQrHdUHnpXFcjKUCVjyr7433/xjX5VjJvOTtz/a65YnQf0qOCaQRgzfnjR1YVul04mZe5cODN+18odeROK8oUoHHdd/t3RIeknnFC3ovL/iiX3GcVxQx05LL+00oPvreP4tCbRoLSKzLJg/mG1ZVPJ24mVdZNGqbphV2Xh/vNYWvwnEdPen2RIcUsOJ+X8V5NZYyL3l/57WatrdzyFsqsYC0pFu55+39H1U8nbhVWuXarntDb50IBa54Sd8jCQ/JvOLdnb/6182Pro3ziiIWcJAfG1905IO+x0JtGwtIc+/Q3w6ZWPF04lZxlUUDJsdtLdYKWPHB3iu1hIdkXvG6zk9uL5rYY398VxSxgIO8Z2Dnzr1D30E4JpDuFJ/ffDpxq7DK7fe8Uh7H1VgpYMUvvqglASTTitd19txcKr11QXxXFDHzkksenLC/eGqvkDdUYgHpB993xKkVTydugatc1XNGXFdjJfOKV/YuSgJI5hWzzhs8bwcm+LUiYMkruui/frpreqhtYwFpTxfPYTrQdXXF04lbwCp/v/WnOC/HQuYVj+nWs2fPLt1HxXtN4TOvuKy357+qY90XxXtNETIv+efO+u8Ye/+lkLTRD23e8cwj5dr86eJ0Ymda8bF+U/THPib6r79NK/Y+4PS2+QfivaQIma8VU3utZC/3TvRjbF5yce8JB4992m1nqE1jAql4fO9eozy3JscMFacTO9OKV3X2NjPeS4qQ+RjrJfxNu4AVl719+41PbIv3iiJmXvLW4b16PP5ryE1xFyGEFARICCkIkBBSECAhpCBAQkhBgISQggAJIQUBEkIKAqQEaRjp5bb/NOKWbc8JMcDSn2RD/GsUZYCUIA2jJ19//bWhp9CLkbbUKays/HUDpLgGSAmS30HRqbmR7oGmU3gZkBIsQEqQDAeP0DJN++aq3OyLJnk+anf5io65dXroz78xpVV2bospmpfCtZ5bgS3anuh9uGaH2scDBmjin7c9sUT/8NL6pWIiIMUmQEqQDAdD6TttQVr7GfP70wua1qlRqy8LPknro2kf0o0zZ15HM70U1nel5Wsm0See7fNTHwgcoIl//l+a7/nwz5RHTBMBKTYBUoJkOLg8fb92UWP9sS9dPDfyOpH+zN6dGmjaqI6e7z8H0nv5KNzt+bodrNbZc9kE+jlwgCb+OUu/x3NiLK00TQSk2ARICdIwmpWfv/PHu+g+rYAePOLpVfpR65SjX9Yn1diqYTsBSbsz3XOTr10zPsAPyfTP/1a3TNNanWc+C5BiEyAlSL5ff1P6gKPaSvL3mdbpVP0ync2Bp5tVT0ujtiZIi2mclpcyhg/wQzL983fpa20LPW8+C5BiEyAlSMNo/Jw5cxfv03QJdy31xkyQ2qc9tejX3xqYIWlnX6C9mLaTD+CQ+D8/mDNQ+0/KNvNZgBSbAClBMv3SbQ/1MU5ySBuon+dESZUASKNp9SXXVRpg+ufaLQ20llcEnAVIsQmQEiTzn4EuqaF/Y3p7SImAtIaGa/qfj1r7KPQl/RfbO9N60pTKA8Q/16bR5zQp4CxAik2AlCCZIX2TccHb84Zm3GH6jnS80cnTvnv0iityFx7SKfybhuu/+/4HVT8sBjw6Qe9b0z/Xjtc6o8qBgImAFJsAKUEKuGPC4qtzM84eU2KCpC1vk3PSvQdm1K65Tqew/aIMHcSn1Nc0wNdA0z/XtHvon4ETASk2AVIyN12/GwRKhAApiTvesnW8l4D8AVLStm3adWmJ/1pdbgmQkrZJKafPivcakBEgIaQgQEJIQYCEkIIACSEFARJCCgIkhBQESAgpCJAQUtD/B4iXRsAuE6zrAAAAAElFTkSuQmCC", + "text/plain": [ + "plot without title" + ] + }, + "metadata": { + "image/png": { + "height": 420, + "width": 420 + } + }, + "output_type": "display_data" + } + ], + "source": [ + "# Create a data frame to store the results\n", + "results_y <- data.frame(\n", + 
" Alphas = model_y$lambda,\n", + " OutOfSampleR2 = 1 - model_y$cvm / var(y)\n", + ")\n", + "\n", + "results_d <- data.frame(\n", + " Alphas = model_d$lambda,\n", + " OutOfSampleR2 = 1 - model_d$cvm / var(D)\n", + ")\n", + "\n", + "# Plot Outcome Lasso-CV Model\n", + "ggplot(data = results_y, aes(x = Alphas, y = OutOfSampleR2)) +\n", + " geom_line() +\n", + " labs(\n", + " title = \"Outcome Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n", + " x = \"Penalty Level\",\n", + " y = \"Out-of-sample R-squared\"\n", + " )\n", + "\n", + "# Plot Treatment Lasso-CV Model\n", + "ggplot(data = results_d, aes(x = (Alphas), y = OutOfSampleR2)) +\n", + " geom_line() +\n", + " labs(\n", + " title = \"Treatment Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n", + " x = \"Penalty Level\",\n", + " y = \"Out-of-sample R-squared\"\n", + " )\n" + ] } - }, - "outputs": [], - "source": [ - "# Add LassoCV results to the table\n", - "table <- rbind(table, c(\"Double Lasso CV\", est_cv, se_cv, lower_ci_cv, upper_ci_cv))\n", - "\n", - "# Print the table\n", - "print(table)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0LzDsUi8gmQM" - }, - "source": [ - "We find that the outcome model chooses too small of a penalty based on cross-validation, leading to overfitting of the outcome and tiny outcome residuals. This leads to artificially small standard errors and a zero treatment effect. Theoretically driven penalty should be preferred for such small sample sizes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { + ], + "metadata": { "colab": { - "base_uri": "https://localhost:8080/", - "height": 857 + "provenance": [] }, - "id": "7uzcIGhVgmei", - "outputId": "ecff8a3f-60da-4b92-c6f7-cb40a116ec82", - "vscode": { - "languageId": "r" + "kernelspec": { + "display_name": "R", + "name": "ir" + }, + "language_info": { + "name": "R" } - }, - "outputs": [], - "source": [ - "# Create a data frame to store the results\n", - "results_y <- data.frame(\n", - " Alphas = model_y$lambda,\n", - " OutOfSampleR2 = 1 - model_y$cvm / var(y)\n", - ")\n", - "\n", - "results_d <- data.frame(\n", - " Alphas = model_d$lambda,\n", - " OutOfSampleR2 = 1 - model_d$cvm / var(D)\n", - ")\n", - "\n", - "# Plot Outcome Lasso-CV Model\n", - "ggplot(data = results_y, aes(x = Alphas, y = OutOfSampleR2)) +\n", - " geom_line() +\n", - " labs(\n", - " title = \"Outcome Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n", - " x = \"Penalty Level\",\n", - " y = \"Out-of-sample R-squared\"\n", - " )\n", - "\n", - "# Plot Treatment Lasso-CV Model\n", - "ggplot(data = results_d, aes(x = (Alphas), y = OutOfSampleR2)) +\n", - " geom_line() +\n", - " labs(\n", - " title = \"Treatment Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n", - " x = \"Penalty Level\",\n", - " y = \"Out-of-sample R-squared\"\n", - " )\n" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "R", - "name": "ir" }, - "language_info": { - "name": "R" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "nbformat": 4, + "nbformat_minor": 0 } From 498cd672870bfc8a02efa76f22c31f891f93c62b Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 13:28:26 -0700 Subject: [PATCH 082/261] Linting errors --- .lintr | 1 + PM2/r_experiment_non_orthogonal.irnb | 27 ++++++++++++++------------- PM2/r_heterogenous_wage_effects.irnb | 10 +++++----- PM2/r_linear_penalized_regs.irnb | 15 +++++++-------- 4 files 
changed, 27 insertions(+), 26 deletions(-) diff --git a/.lintr b/.lintr index 9e79e4da..d5b122eb 100644 --- a/.lintr +++ b/.lintr @@ -2,3 +2,4 @@ linters: linters_with_defaults( line_length_linter(120), object_name_linter(styles = c("snake_case", "CamelCase", "camelCase")) ) + diff --git a/PM2/r_experiment_non_orthogonal.irnb b/PM2/r_experiment_non_orthogonal.irnb index a8e7be1c..3a2ac434 100644 --- a/PM2/r_experiment_non_orthogonal.irnb +++ b/PM2/r_experiment_non_orthogonal.irnb @@ -51,7 +51,7 @@ " y0 <- base - X[, 1] + rnorm(n, mean = 0, sd = 0.1)\n", " y1 <- delta + base - X[, 1] + rnorm(n, mean = 0, sd = 0.1)\n", " y <- y1 * D + y0 * (1 - D)\n", - " return(list(y=y, D=D, X=X))\n", + " return(list(y = y, D = D, X = X))\n", "}" ] }, @@ -99,7 +99,8 @@ " V0 <- var(y[D == 0]) / mean(1 - D) # asymptotic variance of the mean of outcome of untreated\n", " V1 <- var(y[D == 1]) / mean(D) # asymptotic variance of the mean of outcome of treated\n", " hat <- hat1 - hat0 # estimate of the treatment effect\n", - " stderr <- sqrt((V0 + V1) / n) # standard error of the estimate of the treatment effect\n", + " # standard error of the estimate of the treatment effect\n", + " stderr <- sqrt((V0 + V1) / length(y))\n", " return(list(hat = hat, stderr = stderr))\n", "}" ] @@ -211,7 +212,7 @@ "# Now we simply replace OLS with Lasso to implement the Double Lasso process\n", "\n", "double_lasso <- function(y, D, W) {\n", - " require(hdm)\n", + " require(rlasso)\n", "\n", " # residualize outcome with Lasso\n", " yfit_rlasso <- rlasso(W, y, post = FALSE)\n", @@ -621,7 +622,7 @@ "# Now we simply replace OLS with Lasso to implement the Double Lasso process\n", "\n", "double_lasso <- function(y, D, W) {\n", - " require(hdm)\n", + " require(rlasso)\n", "\n", " # residualize outcome with Lasso\n", " yfit_rlasso <- rlasso(W, y, post = FALSE)\n", @@ -630,7 +631,7 @@ "\n", "\n", " # residualize treatment with Lasso\n", - " dfit_rlasso <- rlasso(W, D, post=FALSE)\n", + " dfit_rlasso <- rlasso(W, D, post = FALSE)\n", " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W))\n", " Dres <- D - as.numeric(dhat_rlasso)\n", "\n", @@ -638,7 +639,7 @@ " hat <- mean(yres * Dres) / mean(Dres^2)\n", " epsilon <- yres - hat * Dres\n", " V <- mean(epsilon^2 * Dres^2) / mean(Dres^2)^2\n", - " stderr = sqrt(V / length(y))\n", + " stderr <- sqrt(V / length(y))\n", "\n", " return(list(hat = hat, stderr = stderr))\n", "}" @@ -743,8 +744,8 @@ "\n", "\n", " # run a big lasso y ~ D, X\n", - " DX = cbind(D,X)\n", - " yfit_rlasso <- rlasso(DX, y, post=FALSE) # could just use this functionality\n", + " DX <- cbind(D, X)\n", + " yfit_rlasso <- rlasso(DX, y, post = FALSE) # could just use this functionality\n", " coefs <- yfit_rlasso$coefficients[2:n]\n", " selected_columns <- X[, abs(coefs) > 0.0]\n", " # run OLS on y ~ D, X[chosen by lasso]\n", @@ -843,7 +844,7 @@ }, "outputs": [], "source": [ - "gen_data_nonRCT <- function(n, d, p, delta, base) {\n", + "gen_data_non_rct <- function(n, d, p, delta, base) {\n", " X <- matrix(rnorm(n * d), nrow = n, ncol = d)\n", " D <- X[, 1] + rnorm(n, mean = 0, sd = 1/4)\n", " y <- delta * D + base - X[, 1] + rnorm(n, mean = 0, sd = 1)\n", @@ -870,15 +871,15 @@ "\n", "for (i in 1:n_experiments) {\n", " # Generate data for each experiment\n", - " data <- gen_data_nonRCT(n, d, p, delta, base)\n", + " data <- gen_data_non_rct(n, d, p, delta, base)\n", " y <- data$y\n", " D <- data$D\n", " X <- data$X\n", "\n", "\n", " # run a big lasso y ~ D, X\n", - " DX = cbind(D, X)\n", - " yfit_rlasso <- rlasso(DX, y, 
post=FALSE) # could just use this functionality\n", + " DX <- cbind(D, X)\n", + " yfit_rlasso <- rlasso(DX, y, post = FALSE) # could just use this functionality\n", " coefs <- yfit_rlasso$coefficients[2:n]\n", " selected_columns <- X[, abs(coefs) > 0.0]\n", " # run OLS on y ~ D, X[chosen by lasso]\n", @@ -958,7 +959,7 @@ "\n", "for (i in 1:n_experiments) {\n", " # Generate data for each experiment\n", - " data <- gen_data_nonRCT(n, d, p, delta, base)\n", + " data <- gen_data_non_rct(n, d, p, delta, base)\n", " y <- data$y\n", " D <- data$D\n", " X <- data$X\n", diff --git a/PM2/r_heterogenous_wage_effects.irnb b/PM2/r_heterogenous_wage_effects.irnb index 0deee78b..a6426423 100644 --- a/PM2/r_heterogenous_wage_effects.irnb +++ b/PM2/r_heterogenous_wage_effects.irnb @@ -86,8 +86,8 @@ "outputs": [], "source": [ "center_colmeans <- function(x) {\n", - " xcenter <- colMeans(x)\n", - " x - rep(xcenter, rep.int(nrow(x), ncol(x)))\n", + " xcenter <- colMeans(x)\n", + " x - rep(xcenter, rep.int(nrow(x), ncol(x)))\n", "}" ] }, @@ -103,7 +103,7 @@ "outputs": [], "source": [ "# create the model matrix for the covariates\n", - "controls_formula <- '~ 0 + (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we + exp1 + exp2 + exp3 + exp4)**2'\n", + "controls_formula <- \"~ 0 + (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we + exp1 + exp2 + exp3 + exp4)**2\"\n", "Zcontrols <- model.matrix(as.formula(controls_formula), data = Z)\n", "Zcontrols <- center_colmeans(Zcontrols)" ] @@ -129,7 +129,7 @@ "outputs": [], "source": [ "# create the model matrix for the linear heterogeneity\n", - "linear_het_formula <- '~ -1 + (shs + hsg + scl + clg + mw + so + we + exp1 + exp2 + exp3 + exp4)'\n", + "linear_het_formula <- \"~ -1 + (shs + hsg + scl + clg + mw + so + we + exp1 + exp2 + exp3 + exp4)\"\n", "Zhet <- model.matrix(as.formula(linear_het_formula), data = Z)\n", "Zhet <- center_colmeans(Zhet)" ] @@ -156,7 +156,7 @@ "source": [ "# create the model matrix for the higher order heterogeneity\n", "Zhet <- as.data.frame(cbind(Zhet, \"sex\" = Z$sex))\n", - "nonlin_het_formula <- '~ -1 + sex + sex * (shs + hsg + scl + clg + mw + so + we + exp1 + exp2 + exp3 + exp4)'\n", + "nonlin_het_formula <- \"~ -1 + sex + sex * (shs + hsg + scl + clg + mw + so + we + exp1 + exp2 + exp3 + exp4)\"\n", "Zinteractions <- model.matrix(as.formula(nonlin_het_formula), data = Zhet)\n", "interaction_cols <- Zinteractions[, grepl(\"sex\", colnames(Zinteractions))]" ] diff --git a/PM2/r_linear_penalized_regs.irnb b/PM2/r_linear_penalized_regs.irnb index 6769569d..4cc301fe 100644 --- a/PM2/r_linear_penalized_regs.irnb +++ b/PM2/r_linear_penalized_regs.irnb @@ -582,19 +582,18 @@ "# Define function to compute lava estimator. Doing an iterative scheme with fixed\n", "# number of iteration. 
Could iterate until a convergence criterion is met.\n", "lava_predict <- function(X, Y, newX, lambda1, lambda2, iter = 5) {\n", - " require(glmnet)\n", "\n", " # Need to demean internally\n", " dy <- Y - mean(Y)\n", " dx <- scale(X, scale = FALSE)\n", "\n", - " sp1 <- glmnet(dx, dy, lambda = lambda1) # lasso step fits \"sparse part\"\n", - " de1 <- glmnet(dx, dy - predict(sp1, newx = dx), alpha = 0, lambda = lambda2)\n", + " sp1 <- glmnet::glmnet(dx, dy, lambda = lambda1) # lasso step fits \"sparse part\"\n", + " de1 <- glmnet::glmnet(dx, dy - predict(sp1, newx = dx), alpha = 0, lambda = lambda2)\n", "\n", " i <- 1\n", " while (i <= iter) {\n", - " sp1 <- glmnet(dx, dy - predict(de1, newx = dx, s = \"lambda.min\"), lambda = lambda1)\n", - " de1 <- glmnet(dx, dy - predict(sp1, newx = dx, s = \"lambda.min\"), alpha = 0, lambda = lambda2)\n", + " sp1 <- glmnet::glmnet(dx, dy - predict(de1, newx = dx, s = \"lambda.min\"), lambda = lambda1)\n", + " de1 <- glmnet::glmnet(dx, dy - predict(sp1, newx = dx, s = \"lambda.min\"), alpha = 0, lambda = lambda2)\n", " i <- i + 1\n", " }\n", "\n", @@ -606,7 +605,7 @@ " yhat <- newX %*% bhat + a0\n", "\n", " return(yhat)\n", - "}\n" + "}" ] }, { @@ -640,11 +639,11 @@ " # of the default implementation in glmnet for everything. Could do better here - maybe\n", "\n", " ## Fit ridge on grid of lambda values (chosen by default using glmnet) using basic model.\n", - " ridge_mod <- glmnet(xtr_mod, ytr, alpha = 0) # alpha = 0 gives ridge\n", + " ridge_mod <- glmnet::glmnet(xtr_mod, ytr, alpha = 0) # alpha = 0 gives ridge\n", " ridge_lambda <- ridge_mod$lambda # values of penalty parameter\n", "\n", " ## Fit lasso on grid of lambda values (chosen by default using glmnet) using basic model.\n", - " lasso_mod <- glmnet(xtr_mod, ytr) # default is lasso (equivalent to alpha = 1)\n", + " lasso_mod <- glmnet::glmnet(xtr_mod, ytr) # default is lasso (equivalent to alpha = 1)\n", " lasso_lambda <- lasso_mod$lambda # values of penalty parameter\n", "\n", " ## ------------------------------------------------------------\n", From 2e6d334e3725603ad626fc96c120c51e28234e4c Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 13:30:34 -0700 Subject: [PATCH 083/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 51106fd3..0fa3495b 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5', 'CM1', 'CM2', 'CM3', 'AC1', 'AC2', 'T'] + directory: ['PM1', 'PM2'] #, 'PM3', 'PM4', 'PM5', 'CM1', 'CM2', 'CM3', 'AC1', 'AC2', 'T'] steps: - name: Checkout repository uses: actions/checkout@v2 From d42c676ba466d0503a42443c6c38b6b30b13ce3a Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 13:43:53 -0700 Subject: [PATCH 084/261] linting errors --- ...r_convergence_hypothesis_double_lasso.irnb | 5 +- PM2/r_experiment_non_orthogonal.irnb | 16 +++--- PM2/r_heterogenous_wage_effects.irnb | 2 +- PM2/r_linear_penalized_regs.irnb | 8 +-- PM2/r_ml_for_wage_prediction.irnb | 21 ++++---- PM2/r_orthogonal_orig.irnb | 49 ++++++++++++------- 6 files changed, 54 insertions(+), 47 deletions(-) diff --git a/PM2/r_convergence_hypothesis_double_lasso.irnb b/PM2/r_convergence_hypothesis_double_lasso.irnb index 538431b9..4a8e690b 100644 --- 
a/PM2/r_convergence_hypothesis_double_lasso.irnb +++ b/PM2/r_convergence_hypothesis_double_lasso.irnb @@ -762,16 +762,15 @@ "outputs": [], "source": [ "double_lasso <- function(y, D, W) {\n", - " require(rlasso)\n", "\n", " # residualize outcome with Lasso\n", - " yfit_rlasso <- rlasso(W, y, post = FALSE)\n", + " yfit_rlasso <- hdm::rlasso(W, y, post = FALSE)\n", " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))\n", " yres <- y - as.numeric(yhat_rlasso)\n", "\n", "\n", " # residualize treatment with Lasso\n", - " dfit_rlasso <- rlasso(W, D, post = FALSE)\n", + " dfit_rlasso <- hdm::rlasso(W, D, post = FALSE)\n", " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W))\n", " dres <- D - as.numeric(dhat_rlasso)\n", "\n", diff --git a/PM2/r_experiment_non_orthogonal.irnb b/PM2/r_experiment_non_orthogonal.irnb index 3a2ac434..383e4871 100644 --- a/PM2/r_experiment_non_orthogonal.irnb +++ b/PM2/r_experiment_non_orthogonal.irnb @@ -212,16 +212,15 @@ "# Now we simply replace OLS with Lasso to implement the Double Lasso process\n", "\n", "double_lasso <- function(y, D, W) {\n", - " require(rlasso)\n", "\n", " # residualize outcome with Lasso\n", - " yfit_rlasso <- rlasso(W, y, post = FALSE)\n", + " yfit_rlasso <- hdm::rlasso(W, y, post = FALSE)\n", " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))\n", " yres <- y - as.numeric(yhat_rlasso)\n", "\n", "\n", " # residualize treatment with Lasso\n", - " dfit_rlasso <- rlasso(W, D, post = FALSE)\n", + " dfit_rlasso <- hdm::rlasso(W, D, post = FALSE)\n", " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W))\n", " Dres <- D - as.numeric(dhat_rlasso)\n", "\n", @@ -622,16 +621,15 @@ "# Now we simply replace OLS with Lasso to implement the Double Lasso process\n", "\n", "double_lasso <- function(y, D, W) {\n", - " require(rlasso)\n", "\n", " # residualize outcome with Lasso\n", - " yfit_rlasso <- rlasso(W, y, post = FALSE)\n", + " yfit_rlasso <- hdm::rlasso(W, y, post = FALSE)\n", " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))\n", " yres <- y - as.numeric(yhat_rlasso)\n", "\n", "\n", " # residualize treatment with Lasso\n", - " dfit_rlasso <- rlasso(W, D, post = FALSE)\n", + " dfit_rlasso <- hdm::rlasso(W, D, post = FALSE)\n", " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W))\n", " Dres <- D - as.numeric(dhat_rlasso)\n", "\n", @@ -671,7 +669,7 @@ " # Calculate single lasso estimate\n", "\n", "\n", - " yfit_rlasso <- rlasso(cbind(D, X), y, post = FALSE)\n", + " yfit_rlasso <- hdm::rlasso(cbind(D, X), y, post = FALSE)\n", " hat <- yfit_rlasso$coefficients[2]\n", "\n", " hats[i] <- hat\n", @@ -745,7 +743,7 @@ "\n", " # run a big lasso y ~ D, X\n", " DX <- cbind(D, X)\n", - " yfit_rlasso <- rlasso(DX, y, post = FALSE) # could just use this functionality\n", + " yfit_rlasso <- hdm::rlasso(DX, y, post = FALSE) # could just use this functionality\n", " coefs <- yfit_rlasso$coefficients[2:n]\n", " selected_columns <- X[, abs(coefs) > 0.0]\n", " # run OLS on y ~ D, X[chosen by lasso]\n", @@ -879,7 +877,7 @@ "\n", " # run a big lasso y ~ D, X\n", " DX <- cbind(D, X)\n", - " yfit_rlasso <- rlasso(DX, y, post = FALSE) # could just use this functionality\n", + " yfit_rlasso <- hdm::rlasso(DX, y, post = FALSE) # could just use this functionality\n", " coefs <- yfit_rlasso$coefficients[2:n]\n", " selected_columns <- X[, abs(coefs) > 0.0]\n", " # run OLS on y ~ D, X[chosen by lasso]\n", diff --git a/PM2/r_heterogenous_wage_effects.irnb b/PM2/r_heterogenous_wage_effects.irnb index a6426423..f4fe0350 100644 --- 
a/PM2/r_heterogenous_wage_effects.irnb +++ b/PM2/r_heterogenous_wage_effects.irnb @@ -211,7 +211,7 @@ "source": [ "# this cell takes 30 minutes to run\n", "index_gender <- grep(\"sex\", colnames(Zinteractions))\n", - "effects_female <- rlassoEffects(x = X, y = y, index = index_gender, post = FALSE)\n", + "effects_female <- hdm::rlassoEffects(x = X, y = y, index = index_gender, post = FALSE)\n", "result <- summary(effects_female)\n", "result$coef\n", "print(xtable(result$coef[, c(1, 2, 4)], type = \"latex\"), digits = 3)" diff --git a/PM2/r_linear_penalized_regs.irnb b/PM2/r_linear_penalized_regs.irnb index 4cc301fe..0856f597 100644 --- a/PM2/r_linear_penalized_regs.irnb +++ b/PM2/r_linear_penalized_regs.irnb @@ -498,8 +498,8 @@ }, "outputs": [], "source": [ - "fit_rlasso <- rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level\n", - "fit_rlasso_post <- rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level" + "fit_rlasso <- hdm::rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level\n", + "fit_rlasso_post <- hdm::rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level" ] }, { @@ -982,8 +982,8 @@ "fit_ridge <- cv.glmnet(X, y, family = \"gaussian\", alpha = 0, nfolds = 5)\n", "# family gaussian means that we'll be using square loss\n", "fit_elnet <- cv.glmnet(X, y, family = \"gaussian\", alpha = .5, nfolds = 5)\n", - "fit_rlasso <- rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level\n", - "fit_rlasso_post <- rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level\n", + "fit_rlasso <- hdm::rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level\n", + "fit_rlasso_post <- hdm::rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level\n", "\n", "r2_lasso_cv <- r2_score(predict(fit_lasso_cv, newx = Xpop, s = \"lambda.min\"), ypop)\n", "r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = \"lambda.min\"), ypop)\n", diff --git a/PM2/r_ml_for_wage_prediction.irnb b/PM2/r_ml_for_wage_prediction.irnb index 75c80aa0..a6bf99cb 100644 --- a/PM2/r_ml_for_wage_prediction.irnb +++ b/PM2/r_ml_for_wage_prediction.irnb @@ -729,8 +729,8 @@ }, "outputs": [], "source": [ - "fit_rlasso_flex <- rlasso(formula_flex, data_train, post = FALSE)\n", - "fit_rlasso_post_flex <- rlasso(formula_flex, data_train, post = TRUE)\n", + "fit_rlasso_flex <- hdm::rlasso(formula_flex, data_train, post = FALSE)\n", + "fit_rlasso_post_flex <- hdm::rlasso(formula_flex, data_train, post = TRUE)\n", "yhat_rlasso_flex <- predict(fit_rlasso_flex, newdata = data_test)\n", "yhat_rlasso_post_flex <- predict(fit_rlasso_post_flex, newdata = data_test)\n", "\n", @@ -767,19 +767,18 @@ "# Define function to compute lava estimator. Doing an iterative scheme with fixed\n", "# number of iteration. 
Could iterate until a convergence criterion is met.\n", "lava_predict <- function(X, Y, newX, lambda1, lambda2, iter = 5) {\n", - " require(glmnet)\n", "\n", " # Need to demean internally\n", " dy <- Y - mean(Y)\n", " dx <- scale(X, scale = FALSE)\n", "\n", - " sp1 <- glmnet(dx, dy, lambda = lambda1) # lasso step fits \"sparse part\"\n", - " de1 <- glmnet(dx, dy - predict(sp1, newx = dx), alpha = 0, lambda = lambda2)\n", + " sp1 <- glmnet::glmnet(dx, dy, lambda = lambda1) # lasso step fits \"sparse part\"\n", + " de1 <- glmnet::glmnet(dx, dy - predict(sp1, newx = dx), alpha = 0, lambda = lambda2)\n", "\n", " i <- 1\n", " while (i <= iter) {\n", - " sp1 <- glmnet(dx, dy - predict(de1, newx = dx, s = \"lambda.min\"), lambda = lambda1)\n", - " de1 <- glmnet(dx, dy - predict(sp1, newx = dx, s = \"lambda.min\"), alpha = 0, lambda = lambda2)\n", + " sp1 <- glmnet::glmnet(dx, dy - predict(de1, newx = dx, s = \"lambda.min\"), lambda = lambda1)\n", + " de1 <- glmnet::glmnet(dx, dy - predict(sp1, newx = dx, s = \"lambda.min\"), alpha = 0, lambda = lambda2)\n", " i <- i + 1\n", " }\n", "\n", @@ -824,11 +823,11 @@ " # of the default implementation in glmnet for everything. Could do better here - maybe\n", "\n", " ## Fit ridge on grid of lambda values (chosen by default using glmnet) using basic model.\n", - " ridge_mod <- glmnet(xtr_mod, ytr, alpha = 0) # alpha = 0 gives ridge\n", + " ridge_mod <- glmnet::glmnet(xtr_mod, ytr, alpha = 0) # alpha = 0 gives ridge\n", " ridge_lambda <- ridge_mod$lambda # values of penalty parameter\n", "\n", " ## Fit lasso on grid of lambda values (chosen by default using glmnet) using basic model.\n", - " lasso_mod <- glmnet(xtr_mod, ytr) # default is lasso (equivalent to alpha = 1)\n", + " lasso_mod <- glmnet::glmnet(xtr_mod, ytr) # default is lasso (equivalent to alpha = 1)\n", " lasso_lambda <- lasso_mod$lambda # values of penalty parameter\n", "\n", " ## ------------------------------------------------------------\n", @@ -1044,8 +1043,8 @@ "fit_lasso_cv_extra <- cv.glmnet(model_x_extra_train, y_train, family = \"gaussian\", alpha = 1)\n", "fit_ridge_extra <- cv.glmnet(model_x_extra_train, y_train, family = \"gaussian\", alpha = 0)\n", "fit_elnet_extra <- cv.glmnet(model_x_extra_train, y_train, family = \"gaussian\", alpha = .5)\n", - "fit_rlasso_extra <- rlasso(formula_extra, data_train, post = FALSE)\n", - "fit_rlasso_post_extra <- rlasso(formula_extra, data_train, post = TRUE)\n", + "fit_rlasso_extra <- hdm::rlasso(formula_extra, data_train, post = FALSE)\n", + "fit_rlasso_post_extra <- hdm::rlasso(formula_extra, data_train, post = TRUE)\n", "fit_lava_extra <- lava_yhat_r2(model_x_extra_train, model_x_extra_test, y_train, y_test)\n", "\n", "yhat_lasso_cv_extra <- predict(fit_lasso_cv_extra, newx = model_x_extra_test)\n", diff --git a/PM2/r_orthogonal_orig.irnb b/PM2/r_orthogonal_orig.irnb index 76911c26..2504de55 100644 --- a/PM2/r_orthogonal_orig.irnb +++ b/PM2/r_orthogonal_orig.irnb @@ -27,7 +27,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "dSvVz5Z6D14H" + "id": "dSvVz5Z6D14H", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -42,7 +45,10 @@ "metadata": { "_execution_state": "idle", "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", - "id": "fAe2EP5VCFN_" + "id": "fAe2EP5VCFN_", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -56,8 +62,8 @@ "Orthogonal <- rep(0, B)\n", "\n", "\n", - "lambdaYs <- rep(0,B)\n", - "lambdaDs <- rep(0,B)\n", + "lambdaYs <- rep(0, B)\n", + "lambdaDs <- rep(0, 
B)\n", "\n", "for (i in 1:B) {\n", " # Generate parameters\n", @@ -72,28 +78,26 @@ " Y <- D + X %*% beta + rnorm(n)\n", "\n", " # Single selection method\n", - " rlasso_result <- rlasso(Y ~ D + X) # Fit lasso regression\n", - " SX_IDs <- which(rlasso_result$coef[-c(1, 2)] != 0) # Selected covariates\n", + " rlasso_result <- hdm::rlasso(Y ~ D + X) # Fit lasso regression\n", + " sx_ids <- which(rlasso_result$coef[-c(1, 2)] != 0) # Selected covariates\n", "\n", " # Check if any Xs are selected\n", - " if (sum(SX_IDs) == 0) {\n", + " if (sum(sx_ids) == 0) {\n", " Naive[i] <- lm(Y ~ D)$coef[2] # Fit linear regression with only D if no Xs are selected\n", " } else {\n", - " Naive[i] <- lm(Y ~ D + X[, SX_IDs])$coef[2] # Fit linear regression with selected X otherwise\n", + " Naive[i] <- lm(Y ~ D + X[, sx_ids])$coef[2] # Fit linear regression with selected X otherwise\n", " }\n", "\n", " # Partialling out / Double Lasso\n", "\n", - " fitY <- rlasso(Y ~ X, post = TRUE)\n", + " fitY <- hdm::rlasso(Y ~ X, post = TRUE)\n", " resY <- fitY$res\n", - " #cat(\"lambda Y mean: \", mean(fitY$lambda))\n", "\n", - " fitD <- rlasso(D ~ X, post = TRUE)\n", + " fitD <- hdm::rlasso(D ~ X, post = TRUE)\n", " resD <- fitD$res\n", - " #cat(\"\\nlambda D mean: \", mean(fitD$lambda))\n", "\n", " Orthogonal[i] <- lm(resY ~ resD)$coef[2] # Fit linear regression for residuals\n", - "}\n" + "}" ] }, { @@ -109,25 +113,32 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "MjB3qbGEaRnl" + "id": "MjB3qbGEaRnl", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "#Specify ratio\n", - "img_width = 15\n", - "img_height = img_width/2" + "# Specify ratio\n", + "img_width <- 15\n", + "img_height <- img_width / 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "N7bdztt1CFOE" + "id": "N7bdztt1CFOE", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Create a data frame for the estimates\n", - "df <- data.frame(Method = rep(c(\"Naive\", \"Orthogonal\"), each = B), Value = c(Naive-1,Orthogonal-1))\n", + "df <- data.frame(Method = rep(c(\"Naive\", \"Orthogonal\"), each = B),\n", + " Value = c(Naive - 1, Orthogonal - 1))\n", "\n", "# Create the histogram using ggplot2\n", "hist_plot <- ggplot(df, aes(x = Value, fill = Method)) +\n", From 6159707e736de86fc08e8ffa9dd5ba53bea69e32 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 13:52:57 -0700 Subject: [PATCH 085/261] Update r_experiment_non_orthogonal.irnb --- PM2/r_experiment_non_orthogonal.irnb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PM2/r_experiment_non_orthogonal.irnb b/PM2/r_experiment_non_orthogonal.irnb index 383e4871..4cf8b10b 100644 --- a/PM2/r_experiment_non_orthogonal.irnb +++ b/PM2/r_experiment_non_orthogonal.irnb @@ -844,7 +844,7 @@ "source": [ "gen_data_non_rct <- function(n, d, p, delta, base) {\n", " X <- matrix(rnorm(n * d), nrow = n, ncol = d)\n", - " D <- X[, 1] + rnorm(n, mean = 0, sd = 1/4)\n", + " D <- X[, 1] + rnorm(n, mean = 0, sd = 1 / 4)\n", " y <- delta * D + base - X[, 1] + rnorm(n, mean = 0, sd = 1)\n", " return(list(y = y, D = D, X = X))\n", "}" From 09b3ec9c02618437d1aeb0eb0da215332cb3b419 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 13:53:40 -0700 Subject: [PATCH 086/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 
0fa3495b..7c989463 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - directory: ['PM1', 'PM2'] #, 'PM3', 'PM4', 'PM5', 'CM1', 'CM2', 'CM3', 'AC1', 'AC2', 'T'] + directory: ['PM1', 'PM2', 'PM3'] #, 'PM4', 'PM5', 'CM1', 'CM2', 'CM3', 'AC1', 'AC2', 'T'] steps: - name: Checkout repository uses: actions/checkout@v2 From 6dc20f08c876b597027db0ac2cf86a3273ea1af8 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 14:31:56 -0700 Subject: [PATCH 087/261] Linting errors --- ...functional_approximation_by_nn_and_rf.irnb | 191 +++--- PM3/r_ml_wage_prediction.irnb | 608 +++++++++++------- 2 files changed, 499 insertions(+), 300 deletions(-) diff --git a/PM3/r_functional_approximation_by_nn_and_rf.irnb b/PM3/r_functional_approximation_by_nn_and_rf.irnb index eb4b1823..74f1732f 100644 --- a/PM3/r_functional_approximation_by_nn_and_rf.irnb +++ b/PM3/r_functional_approximation_by_nn_and_rf.irnb @@ -29,7 +29,10 @@ "execution_count": null, "id": "1", "metadata": { - "id": "NULYR1oB9aWz" + "id": "NULYR1oB9aWz", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -37,12 +40,14 @@ "install.packages(\"rpart\")\n", "install.packages(\"gbm\")\n", "install.packages(\"keras\")\n", + "install.packages(\"magrittr\")\n", "\n", "\n", "library(randomForest)\n", "library(rpart)\n", "library(gbm)\n", - "library(keras)" + "library(keras)\n", + "library(magrittr)" ] }, { @@ -88,21 +93,24 @@ "start_time": "2021-03-30T21:54:42.925266", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "set.seed(1)\n", - "X_train <- matrix(runif(1000),1000,1)\n", - "Y_train <- exp(4*X_train) #Noiseless case Y=g(X)\n", - "dim(X_train)\n", + "x_train <- matrix(runif(1000), 1000, 1)\n", + "y_train <- exp(4 * x_train) # Noiseless case Y=g(X)\n", + "dim(x_train)\n", "\n", "\n", "# shallow tree\n", - "TreeModel<- rpart(Y_train~X_train, cp=.01) #cp is penalty level\n", - "pred.TM<- predict(TreeModel, newx=X_train)\n", - "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", - "points(X_train, pred.TM, col=3, pch=19)" + "TreeModel <- rpart(y_train ~ x_train, cp = .01) # cp is penalty level\n", + "pred_tm <- predict(TreeModel, newx = x_train)\n", + "plot(x_train, y_train, type = \"p\", pch = 19, xlab = \"z\", ylab = \"g(z)\")\n", + "points(x_train, pred_tm, col = 3, pch = 19)" ] }, { @@ -118,20 +126,23 @@ "start_time": "2021-03-30T21:54:43.632071", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "set.seed(1)\n", - "X_train <- matrix(runif(1000),1000,1)\n", - "Y_train <- exp(4*X_train) #Noiseless case Y=g(X)\n", - "dim(X_train)\n", + "x_train <- matrix(runif(1000), 1000, 1)\n", + "y_train <- exp(4 * x_train) # Noiseless case Y=g(X)\n", + "dim(x_train)\n", "\n", "\n", - "TreeModel<- rpart(Y_train~X_train, cp=.0005) #cp is penalty level\n", - "pred.TM<- predict(TreeModel, newx=X_train)\n", - "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", - "points(X_train, pred.TM, col=3, pch=19)" + "TreeModel <- rpart(y_train ~ x_train, cp = .0005) # cp is penalty level\n", + "pred_tm <- predict(TreeModel, newx = x_train)\n", + "plot(x_train, y_train, type = \"p\", pch = 19, xlab = \"z\", ylab = \"g(z)\")\n", + "points(x_train, pred_tm, col = 3, pch = 19)" ] }, { @@ -187,14 +198,17 @@ "start_time": "2021-03-30T21:54:43.993891", "status": 
"completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "RFmodel<- randomForest(Y_train~X_train)\n", - "pred.RF<- predict(RFmodel, newdata=X_train)\n", - "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", - "points(X_train, pred.RF, col=4, pch=19,)\n" + "RFmodel <- randomForest(y_train ~ x_train)\n", + "pred_rf <- predict(RFmodel, newdata = x_train)\n", + "plot(x_train, y_train, type = \"p\", pch = 19, xlab = \"z\", ylab = \"g(z)\")\n", + "points(x_train, pred_rf, col = 4, pch = 19)" ] }, { @@ -228,19 +242,25 @@ "start_time": "2021-03-30T21:54:45.216708", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "data_train = as.data.frame(cbind(X_train, Y_train))\n", - "BoostTreemodel<- gbm(Y_train~X_train, distribution= \"gaussian\", n.trees=100, shrinkage=.01, interaction.depth\n", - "=3)\n", + "data_train <- as.data.frame(cbind(x_train, y_train))\n", + "BoostTreemodel <- gbm(y_train ~ x_train,\n", + " distribution = \"gaussian\", n.trees = 100, shrinkage = .01,\n", + " interaction.depth = 3\n", + ")\n", + "\n", "# shrinkage is \"learning rate\"\n", "# n.trees is the number of boosting steps\n", "# interaction.depth is the max depth of each tree\n", - "pred.BT<- predict(BoostTreemodel, newdata=data_train, n.trees=100)\n", - "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", - "points(X_train, pred.BT, col=4, pch=19,)" + "pred_bt <- predict(BoostTreemodel, newdata = data_train, n.trees = 100)\n", + "plot(x_train, y_train, type = \"p\", pch = 19, xlab = \"z\", ylab = \"g(z)\")\n", + "points(x_train, pred_bt, col = 4, pch = 19)" ] }, { @@ -256,19 +276,24 @@ "start_time": "2021-03-30T21:54:46.639160", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "data_train = as.data.frame(cbind(X_train, Y_train))\n", - "BoostTreemodel<- gbm(Y_train~X_train, distribution= \"gaussian\", n.trees=1000, shrinkage=.01, interaction.depth\n", - "=3)\n", + "data_train <- as.data.frame(cbind(x_train, y_train))\n", + "BoostTreemodel <- gbm(y_train ~ x_train,\n", + " distribution = \"gaussian\", n.trees = 1000, shrinkage = .01,\n", + " interaction.depth = 3\n", + ")\n", "# shrinkage is \"learning rate\"\n", "# n.trees is the number of boosting steps\n", "# interaction.depth is the max depth of each tree\n", - "pred.BT<- predict(BoostTreemodel, newdata=data_train, n.trees=1000)\n", - "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", - "points(X_train, pred.BT, col=4, pch=19,)" + "pred_bt <- predict(BoostTreemodel, newdata = data_train, n.trees = 1000)\n", + "plot(x_train, y_train, type = \"p\", pch = 19, xlab = \"z\", ylab = \"g(z)\")\n", + "points(x_train, pred_bt, col = 4, pch = 19)" ] }, { @@ -304,19 +329,26 @@ "start_time": "2021-03-30T21:54:47.106356", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "build_model <- function() {\n", - " model <- keras_model_sequential() %>%\n", - " layer_dense(units = 200, activation = \"relu\",\n", - " input_shape = 1)%>%\n", - " layer_dense(units = 20, activation = \"relu\") %>%\n", - " layer_dense(units = 1)\n", + " require(magrittr)\n", + "\n", + " model <- keras::keras_model_sequential() %>%\n", + " keras::layer_dense(\n", + " units = 200, activation = \"relu\",\n", + " input_shape = 1\n", + " ) %>%\n", + " keras::layer_dense(units = 20, 
activation = \"relu\") %>%\n", + " keras::layer_dense(units = 1)\n", "\n", - " model %>% compile(\n", - " optimizer = optimizer_adam(lr = 0.01),\n", + " model %>% keras::compile(\n", + " optimizer = keras::optimizer_adam(lr = 0.01),\n", " loss = \"mse\",\n", " metrics = c(\"mae\"),\n", " )\n", @@ -336,7 +368,10 @@ "start_time": "2021-03-30T21:54:47.320491", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -357,17 +392,20 @@ "start_time": "2021-03-30T21:54:54.939262", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "num_epochs <- 1\n", - "model %>% fit(X_train, Y_train,\n", - " epochs = num_epochs, batch_size = 10, verbose = 0)\n", - "pred.NN <- model %>% predict(X_train)\n", - "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", - "points(X_train, pred.NN, col=4, pch=19,)\n", - "\n" + "model %>% fit(x_train, y_train,\n", + " epochs = num_epochs, batch_size = 10, verbose = 0\n", + ")\n", + "pred_nn <- model %>% predict(x_train)\n", + "plot(x_train, y_train, type = \"p\", pch = 19, xlab = \"z\", ylab = \"g(z)\")\n", + "points(x_train, pred_nn, col = 4, pch = 19, )" ] }, { @@ -383,18 +421,20 @@ "start_time": "2021-03-30T21:54:56.331780", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "num_epochs <- 100\n", - "model %>% fit(X_train, Y_train,\n", - " epochs = num_epochs, batch_size = 10, verbose = 0)\n", - "pred.NN <- model %>% predict(X_train)\n", - "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", - "points(X_train, pred.NN, col=4, pch=19,)\n", - "\n", - "\n" + "model %>% fit(x_train, y_train,\n", + " epochs = num_epochs, batch_size = 10, verbose = 0\n", + ")\n", + "pred_nn <- model %>% predict(x_train)\n", + "plot(x_train, y_train, type = \"p\", pch = 19, xlab = \"z\", ylab = \"g(z)\")\n", + "points(x_train, pred_nn, col = 4, pch = 19, )" ] }, { @@ -412,15 +452,18 @@ "execution_count": null, "id": "18", "metadata": { - "id": "_cyeRToRTORV" + "id": "_cyeRToRTORV", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Define the neural network architecture\n", "model <- keras_model_sequential() %>%\n", - " layer_dense(units = 200, activation = 'relu', input_shape = 1) %>%\n", - " layer_dense(units = 20, activation = 'relu') %>%\n", - " layer_dense(units = 1) # Output layer with 1 unit for regression task\n", + " layer_dense(units = 200, activation = \"relu\", input_shape = 1) %>%\n", + " layer_dense(units = 20, activation = \"relu\") %>%\n", + " layer_dense(units = 1) # Output layer with 1 unit for regression task\n", "\n", "# Compile the model\n", "model %>% compile(\n", @@ -437,28 +480,32 @@ "execution_count": null, "id": "19", "metadata": { - "id": "FuBqP_e7Te5Y" + "id": "FuBqP_e7Te5Y", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "num_epochs <- 100\n", "\n", "# Define early stopping based on validation set (20%) performance\n", - "early_stopping <- callback_early_stopping(monitor = 'val_loss', patience = 5) # Patience set to 5 epochs (default in skorch is 5)\n", + "# Patience set to 5 epochs (default in skorch is 5)\n", + "early_stopping <- callback_early_stopping(monitor = \"val_loss\", patience = 5)\n", "\n", "# Train the model\n", "model %>% fit(\n", - " X_train, Y_train,\n", + " x_train, y_train,\n", " epochs = num_epochs,\n", " batch_size = 10,\n", - " validation_split = 0.2, # 20% 
validation set\n", + " validation_split = 0.2, # 20% validation set\n", " verbose = 0,\n", " callbacks = list(early_stopping)\n", ")\n", "\n", - "pred.NN <- model %>% predict(X_train)\n", - "plot(X_train, Y_train, type=\"p\", pch=19, xlab=\"z\", ylab=\"g(z)\")\n", - "points(X_train, pred.NN, col=4, pch=19,)" + "pred_nn <- model %>% predict(x_train)\n", + "plot(x_train, y_train, type = \"p\", pch = 19, xlab = \"z\", ylab = \"g(z)\")\n", + "points(x_train, pred_nn, col = 4, pch = 19)" ] } ], diff --git a/PM3/r_ml_wage_prediction.irnb b/PM3/r_ml_wage_prediction.irnb index ccd197c5..ee43c561 100644 --- a/PM3/r_ml_wage_prediction.irnb +++ b/PM3/r_ml_wage_prediction.irnb @@ -41,7 +41,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "ww70bLKfEsOb" + "id": "ww70bLKfEsOb", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -65,7 +68,7 @@ "library(nnet)\n", "library(gbm)\n", "library(rpart.plot)\n", - "library(keras)\n" + "library(keras)" ] }, { @@ -115,11 +118,14 @@ "start_time": "2021-02-13T18:19:43.644436", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "file = \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv\"\n", + "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv\"\n", "data <- read.csv(file)\n", "dim(data)" ] @@ -153,11 +159,14 @@ "start_time": "2021-02-13T18:19:44.031671", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "Z <- subset(data,select=-c(lwage,wage)) # regressors\n", + "Z <- subset(data, select = -c(lwage, wage)) # regressors\n", "colnames(Z)" ] }, @@ -190,11 +199,14 @@ "start_time": "2021-02-13T18:19:44.233988", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "hist(data$wage, xlab= \"hourly wage\", main=\"Empirical wage distribution from the US survey data\", breaks= 35)\n" + "hist(data$wage, xlab = \"hourly wage\", main = \"Empirical wage distribution from the US survey data\", breaks = 35)" ] }, { @@ -306,15 +318,18 @@ "start_time": "2021-02-13T18:19:45.080930", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "set.seed(1234)\n", - "training <- sample(nrow(data), nrow(data)*(3/4), replace=FALSE)\n", + "training <- sample(nrow(data), nrow(data) * (3 / 4), replace = FALSE)\n", "\n", - "data_train <- data[training,]\n", - "data_test <- data[-training,]" + "data_train <- data[training, ]\n", + "data_test <- data[-training, ]" ] }, { @@ -346,21 +361,25 @@ "start_time": "2021-02-13T18:19:45.253820", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "X_basic <- \"sex + exp1 + shs + hsg+ scl + clg + mw + so + we + C(occ2)+ C(ind2)\"\n", - "X_flex <- \"sex + exp1 + shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)\"\n", - "formula_basic <- as.formula(paste(\"lwage\", \"~\", X_basic))\n", - "formula_flex <- as.formula(paste(\"lwage\", \"~\", X_flex))\n", + "x_basic <- \"sex + exp1 + shs + hsg+ scl + clg + mw + so + we + C(occ2)+ C(ind2)\"\n", + "x_flex <- \"sex + exp1 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we \" +\n", + " \"+ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + 
C(occ2) + C(ind2) + mw + so + we)\"\n", + "formula_basic <- as.formula(paste(\"lwage\", \"~\", x_basic))\n", + "formula_flex <- as.formula(paste(\"lwage\", \"~\", x_flex))\n", "\n", - "model_X_basic_train <- model.matrix(formula_basic,data_train)\n", - "model_X_basic_test <- model.matrix(formula_basic,data_test)\n", - "p_basic <- dim(model_X_basic_train)[2]\n", - "model_X_flex_train <- model.matrix(formula_flex,data_train)\n", - "model_X_flex_test <- model.matrix(formula_flex,data_test)\n", - "p_flex <- dim(model_X_flex_train)[2]" + "model_x_basic_train <- model.matrix(formula_basic, data_train)\n", + "model_x_basic_test <- model.matrix(formula_basic, data_test)\n", + "p_basic <- dim(model_x_basic_train)[2]\n", + "model_x_flex_train <- model.matrix(formula_flex, data_train)\n", + "model_x_flex_test <- model.matrix(formula_flex, data_test)\n", + "p_flex <- dim(model_x_flex_train)[2]" ] }, { @@ -375,12 +394,15 @@ "start_time": "2021-02-13T18:19:45.384420", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "Y_train <- data_train$lwage\n", - "Y_test <- data_test$lwage" + "y_train <- data_train$lwage\n", + "y_test <- data_test$lwage" ] }, { @@ -395,7 +417,10 @@ "start_time": "2021-02-13T18:19:45.482466", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -466,12 +491,15 @@ "start_time": "2021-02-13T18:19:45.817632", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# ols (basic model)\n", - "fit.lm.basic <- lm(formula_basic, data_train)" + "fit_lm_basic <- lm(formula_basic, data_train)" ] }, { @@ -486,13 +514,17 @@ "start_time": "2021-02-13T18:19:45.925447", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Compute the Out-Of-Sample Performance\n", - "yhat.lm.basic <- predict(fit.lm.basic, newdata=data_test)\n", - "cat(\"The mean squared error (MSE) using the basic model is equal to\" , mean((Y_test-yhat.lm.basic)^2)) # MSE OLS (basic model)" + "yhat_lm_basic <- predict(fit_lm_basic, newdata = data_test)\n", + "# MSE OLS (basic model)\n", + "cat(\"The mean squared error (MSE) using the basic model is equal to\", mean((y_test - yhat_lm_basic)^2))" ] }, { @@ -524,12 +556,15 @@ "start_time": "2021-02-13T18:19:46.162531", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "MSE.lm.basic <- summary(lm((Y_test-yhat.lm.basic)^2~1))$coef[1:2]\n", - "MSE.lm.basic" + "mse_lm_basic <- summary(lm((y_test - yhat_lm_basic)^2 ~ 1))$coef[1:2]\n", + "mse_lm_basic" ] }, { @@ -561,12 +596,16 @@ "start_time": "2021-02-13T18:19:46.356656", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "R2.lm.basic <- 1-MSE.lm.basic[1]/var(Y_test)\n", - "cat(\"The R^2 using the basic model is equal to\",R2.lm.basic) # MSE OLS (basic model)" + "r2_lm_basic <- 1 - mse_lm_basic[1] / var(y_test)\n", + "# MSE OLS (basic model)\n", + "cat(\"The R^2 using the basic model is equal to\", r2_lm_basic)" ] }, { @@ -598,18 +637,21 @@ "start_time": "2021-02-13T18:19:46.532081", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# ols (flexible model)\n", - "fit.lm.flex <- lm(formula_flex, data_train)\n", + "fit_lm_flex <- lm(formula_flex, 
data_train)\n", "# Compute the Out-Of-Sample Performance\n", - "options(warn=-1)\n", - "yhat.lm.flex <- predict(fit.lm.flex, newdata=data_test)\n", - "MSE.lm.flex <- summary(lm((Y_test-yhat.lm.flex)^2~1))$coef[1:2]\n", - "R2.lm.flex <- 1-MSE.lm.flex[1]/var(Y_test)\n", - "cat(\"The R^2 using the flexible model is equal to\",R2.lm.flex) # MSE OLS (flexible model)" + "options(warn = -1)\n", + "yhat_lm_flex <- predict(fit_lm_flex, newdata = data_test)\n", + "mse_lm_flex <- summary(lm((y_test - yhat_lm_flex)^2 ~ 1))$coef[1:2]\n", + "r2_lm_flex <- 1 - mse_lm_flex[1] / var(y_test)\n", + "cat(\"The R^2 using the flexible model is equal to\", r2_lm_flex) # MSE OLS (flexible model)" ] }, { @@ -675,22 +717,26 @@ "start_time": "2021-02-13T18:19:47.056488", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# lasso and variants\n", - "fit.rlasso <- rlasso(formula_basic, data_train, post=FALSE)\n", - "fit.rlasso.post <- rlasso(formula_basic, data_train, post=TRUE)\n", - "yhat.rlasso <- predict(fit.rlasso, newdata=data_test)\n", - "yhat.rlasso.post <- predict(fit.rlasso.post, newdata=data_test)\n", + "fit_rlasso <- hdm::rlasso(formula_basic, data_train, post = FALSE)\n", + "fit_rlasso_post <- hdm::rlasso(formula_basic, data_train, post = TRUE)\n", + "yhat_rlasso <- predict(fit_rlasso, newdata = data_test)\n", + "yhat_rlasso_post <- predict(fit_rlasso_post, newdata = data_test)\n", "\n", - "MSE.lasso <- summary(lm((Y_test-yhat.rlasso)^2~1))$coef[1:2]\n", - "MSE.lasso.post <- summary(lm((Y_test-yhat.rlasso.post)^2~1))$coef[1:2]\n", + "mse_lasso <- summary(lm((y_test - yhat_rlasso)^2 ~ 1))$coef[1:2]\n", + "mse_lasso_post <- summary(lm((y_test - yhat_rlasso_post)^2 ~ 1))$coef[1:2]\n", "\n", - "R2.lasso <- 1-MSE.lasso[1]/var(Y_test)\n", - "R2.lasso.post <- 1-MSE.lasso.post[1]/var(Y_test)\n", - "cat(\"The R^2 using the basic model is equal to\",R2.lasso,\"for lasso and\",R2.lasso.post,\"for post-lasso\") # R^2 lasso/post-lasso (basic model)" + "r2_lasso <- 1 - mse_lasso[1] / var(y_test)\n", + "r2_lasso_post <- 1 - mse_lasso_post[1] / var(y_test)\n", + "# R^2 lasso/post-lasso (basic model)\n", + "cat(\"The R^2 using the basic model is equal to\", r2_lasso, \"for lasso and\", r2_lasso_post, \"for post-lasso\")" ] }, { @@ -722,21 +768,26 @@ "start_time": "2021-02-13T18:19:47.798358", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "fit.rlasso.flex <- rlasso(formula_flex, data_train, post=FALSE)\n", - "fit.rlasso.post.flex <- rlasso(formula_flex, data_train, post=TRUE)\n", - "yhat.rlasso.flex <- predict(fit.rlasso.flex, newdata=data_test)\n", - "yhat.rlasso.post.flex <- predict(fit.rlasso.post.flex, newdata=data_test)\n", + "fit_rlasso_flex <- hdm::rlasso(formula_flex, data_train, post = FALSE)\n", + "fit_rlasso_post_flex <- hdm::rlasso(formula_flex, data_train, post = TRUE)\n", + "yhat_rlasso_flex <- predict(fit_rlasso_flex, newdata = data_test)\n", + "yhat_rlasso_post_flex <- predict(fit_rlasso_post_flex, newdata = data_test)\n", "\n", - "MSE.lasso.flex <- summary(lm((Y_test-yhat.rlasso.flex)^2~1))$coef[1:2]\n", - "MSE.lasso.post.flex <- summary(lm((Y_test-yhat.rlasso.post.flex)^2~1))$coef[1:2]\n", + "mse_lasso_flex <- summary(lm((y_test - yhat_rlasso_flex)^2 ~ 1))$coef[1:2]\n", + "mse_lasso_post_flex <- summary(lm((y_test - yhat_rlasso_post_flex)^2 ~ 1))$coef[1:2]\n", "\n", - "R2.lasso.flex <- 1-MSE.lasso.flex[1]/var(Y_test)\n", - "R2.lasso.post.flex <- 
1-MSE.lasso.post.flex[1]/var(Y_test)\n", - "cat(\"The R^2 using the flexible model is equal to\",R2.lasso.flex,\"for lasso and\",R2.lasso.post.flex,\"for post-lasso\") # R^2 lasso/post-lasso (flexible model)" + "# R^2 lasso/post-lasso (flexible model)\n", + "r2_lasso_flex <- 1 - mse_lasso_flex[1] / var(y_test)\n", + "r2_lasso_post_flex <- 1 - mse_lasso_post_flex[1] / var(y_test)\n", + "cat(\"The R^2 using the flexible model is equal to\", r2_lasso_flex,\n", + " \"for lasso and\", r2_lasso_post_flex, \"for post-lasso\")" ] }, { @@ -768,28 +819,32 @@ "start_time": "2021-02-13T18:19:51.477432", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "fit.lasso.cv <- cv.glmnet(model_X_basic_train, Y_train, family=\"gaussian\", alpha=1)\n", - "fit.ridge <- cv.glmnet(model_X_basic_train, Y_train, family=\"gaussian\", alpha=0)\n", - "fit.elnet <- cv.glmnet(model_X_basic_train, Y_train, family=\"gaussian\", alpha=.5)\n", + "fit_lasso_cv <- cv.glmnet(model_x_basic_train, y_train, family = \"gaussian\", alpha = 1)\n", + "fit_ridge <- cv.glmnet(model_x_basic_train, y_train, family = \"gaussian\", alpha = 0)\n", + "fit_elnet <- cv.glmnet(model_x_basic_train, y_train, family = \"gaussian\", alpha = .5)\n", "\n", - "yhat.lasso.cv <- predict(fit.lasso.cv, newx = model_X_basic_test)\n", - "yhat.ridge <- predict(fit.ridge, newx = model_X_basic_test)\n", - "yhat.elnet <- predict(fit.elnet, newx = model_X_basic_test)\n", + "yhat_lasso_cv <- predict(fit_lasso_cv, newx = model_x_basic_test)\n", + "yhat_ridge <- predict(fit_ridge, newx = model_x_basic_test)\n", + "yhat_elnet <- predict(fit_elnet, newx = model_x_basic_test)\n", "\n", - "MSE.lasso.cv <- summary(lm((Y_test-yhat.lasso.cv)^2~1))$coef[1:2]\n", - "MSE.ridge <- summary(lm((Y_test-yhat.ridge)^2~1))$coef[1:2]\n", - "MSE.elnet <- summary(lm((Y_test-yhat.elnet)^2~1))$coef[1:2]\n", + "mse_lasso_cv <- summary(lm((y_test - yhat_lasso_cv)^2 ~ 1))$coef[1:2]\n", + "mse_ridge <- summary(lm((y_test - yhat_ridge)^2 ~ 1))$coef[1:2]\n", + "mse_elnet <- summary(lm((y_test - yhat_elnet)^2 ~ 1))$coef[1:2]\n", "\n", - "R2.lasso.cv <- 1-MSE.lasso.cv[1]/var(Y_test)\n", - "R2.ridge <- 1-MSE.ridge[1]/var(Y_test)\n", - "R2.elnet <- 1-MSE.elnet[1]/var(Y_test)\n", + "r2_lasso_cv <- 1 - mse_lasso_cv[1] / var(y_test)\n", + "r2_ridge <- 1 - mse_ridge[1] / var(y_test)\n", + "r2_elnet <- 1 - mse_elnet[1] / var(y_test)\n", "\n", "# R^2 using cross-validation (basic model)\n", - "cat(\"R^2 using cross-validation for lasso, ridge and elastic net in the basic model:\",R2.lasso.cv,R2.ridge,R2.elnet)" + "cat(\"R^2 using cross-validation for lasso, ridge and elastic net in the basic model:\",\n", + " r2_lasso_cv, r2_ridge, r2_elnet)" ] }, { @@ -821,28 +876,32 @@ "start_time": "2021-02-13T18:19:53.854797", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "fit.lasso.cv.flex <- cv.glmnet(model_X_flex_train, Y_train, family=\"gaussian\", alpha=1)\n", - "fit.ridge.flex <- cv.glmnet(model_X_flex_train, Y_train, family=\"gaussian\", alpha=0)\n", - "fit.elnet.flex <- cv.glmnet(model_X_flex_train, Y_train, family=\"gaussian\", alpha=.5)\n", + "fit_lasso_cv_flex <- cv.glmnet(model_x_flex_train, y_train, family = \"gaussian\", alpha = 1)\n", + "fit_ridge_flex <- cv.glmnet(model_x_flex_train, y_train, family = \"gaussian\", alpha = 0)\n", + "fit_elnet_flex <- cv.glmnet(model_x_flex_train, y_train, family = \"gaussian\", alpha = .5)\n", "\n", - "yhat.lasso.cv.flex 
<- predict(fit.lasso.cv.flex , newx = model_X_flex_test)\n", - "yhat.ridge.flex <- predict(fit.ridge.flex , newx = model_X_flex_test)\n", - "yhat.elnet.flex <- predict(fit.elnet.flex , newx = model_X_flex_test)\n", + "yhat_lasso_cv_flex <- predict(fit_lasso_cv_flex, newx = model_x_flex_test)\n", + "yhat_ridge_flex <- predict(fit_ridge_flex, newx = model_x_flex_test)\n", + "yhat_elnet_flex <- predict(fit_elnet_flex, newx = model_x_flex_test)\n", "\n", - "MSE.lasso.cv.flex <- summary(lm((Y_test-yhat.lasso.cv.flex )^2~1))$coef[1:2]\n", - "MSE.ridge.flex <- summary(lm((Y_test-yhat.ridge.flex )^2~1))$coef[1:2]\n", - "MSE.elnet.flex <- summary(lm((Y_test-yhat.elnet.flex )^2~1))$coef[1:2]\n", + "mse_lasso_cv_flex <- summary(lm((y_test - yhat_lasso_cv_flex)^2 ~ 1))$coef[1:2]\n", + "mse_ridge_flex <- summary(lm((y_test - yhat_ridge_flex)^2 ~ 1))$coef[1:2]\n", + "mse_elnet_flex <- summary(lm((y_test - yhat_elnet_flex)^2 ~ 1))$coef[1:2]\n", "\n", - "R2.lasso.cv.flex <- 1-MSE.lasso.cv.flex [1]/var(Y_test)\n", - "R2.ridge.flex <- 1-MSE.ridge.flex [1]/var(Y_test)\n", - "R2.elnet.flex <- 1-MSE.elnet.flex [1]/var(Y_test)\n", + "r2_lasso_cv_flex <- 1 - mse_lasso_cv_flex[1] / var(y_test)\n", + "r2_ridge_flex <- 1 - mse_ridge_flex[1] / var(y_test)\n", + "r2_elnet_flex <- 1 - mse_elnet_flex[1] / var(y_test)\n", "\n", "# R^2 using cross-validation (flexible model)\n", - "cat(\"R^2 using cross-validation for lasso, ridge and elastic net in the flexible model:\",R2.lasso.cv.flex,R2.ridge.flex,R2.elnet.flex)" + "cat(\"R^2 using cross-validation for lasso, ridge and elastic net in the flexible model:\",\n", + " r2_lasso_cv_flex, r2_ridge_flex, r2_elnet_flex)" ] }, { @@ -942,13 +1001,17 @@ "start_time": "2021-02-13T18:20:08.391531", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# tree\n", - "fit.trees <- rpart(formula_basic, data_train, minbucket=5, cp = 0.001)\n", - "prp(fit.trees, leaf.round=1, space=2, yspace=2, split.space=2,shadow.col = \"gray\",trace = 1) # plotting the tree" + "fit_trees <- rpart(formula_basic, data_train, minbucket = 5, cp = 0.001)\n", + "# plotting the tree\n", + "prp(fit_trees, leaf.round = 1, space = 2, yspace = 2, split.space = 2, shadow.col = \"gray\", trace = 1)" ] }, { @@ -981,11 +1044,14 @@ "start_time": "2021-02-13T18:20:10.375722", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "bestcp <- fit.trees$cptable[which.min(fit.trees$cptable[,\"xerror\"]),\"CP\"]\n", + "bestcp <- fit_trees$cptable[which.min(fit_trees$cptable[, \"xerror\"]), \"CP\"]\n", "bestcp" ] }, @@ -1018,12 +1084,16 @@ "start_time": "2021-02-13T18:20:10.587972", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "fit.prunedtree <- prune(fit.trees,cp=bestcp)\n", - "prp(fit.prunedtree,leaf.round=1, space=3, yspace=3, split.space=7, shadow.col = \"gray\",trace = 1,yesno=1)" + "fit_prunedtree <- prune(fit_trees, cp = bestcp)\n", + "prp(fit_prunedtree, leaf.round = 1, space = 3, yspace = 3, split.space = 7,\n", + " shadow.col = \"gray\", trace = 1, yesno = 1)" ] }, { @@ -1055,16 +1125,19 @@ "start_time": "2021-02-13T18:20:11.384167", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "yhat.pt <- predict(fit.prunedtree,newdata=data_test)\n", - "MSE.pt <- summary(lm((Y_test-yhat.pt)^2~1))$coef[1:2]\n", - "R2.pt <- 
1-MSE.pt[1]/var(Y_test)\n", + "yhat_pt <- predict(fit_prunedtree, newdata = data_test)\n", + "mse_pt <- summary(lm((y_test - yhat_pt)^2 ~ 1))$coef[1:2]\n", + "r2_pt <- 1 - mse_pt[1] / var(y_test)\n", "\n", "# R^2 of the pruned tree\n", - "cat(\"R^2 of the pruned tree:\",R2.pt)" + "cat(\"R^2 of the pruned tree:\", r2_pt)" ] }, { @@ -1105,18 +1178,21 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "ZbLiUr0Lh4Le" + "id": "ZbLiUr0Lh4Le", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# random forest\n", - "fit.rf <- randomForest(model_X_basic_train, Y_train, ntree=2000, nodesize=20, data = data_train)\n", + "fit_rf <- randomForest(model_x_basic_train, y_train, ntree = 2000, nodesize = 20, data = data_train)\n", "\n", "## Evaluating the method\n", - "yhat.rf <- predict(fit.rf, newdata=model_X_basic_test) # prediction\n", + "yhat_rf <- predict(fit_rf, newdata = model_x_basic_test) # prediction\n", "\n", - "MSE.rf = summary(lm((Y_test-yhat.rf)^2~1))$coef[1:2]\n", - "R2.rf <- 1-MSE.rf[1]/var(Y_test)\n" + "mse_rf <- summary(lm((y_test - yhat_rf)^2 ~ 1))$coef[1:2]\n", + "r2_rf <- 1 - mse_rf[1] / var(y_test)" ] }, { @@ -1131,31 +1207,38 @@ "start_time": "2021-02-13T18:20:11.718472", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# boosting\n", - "fit.boost <- gbm(formula_basic, data=data_train, distribution= \"gaussian\", bag.fraction = .5, interaction.depth=2, n.trees=1000, shrinkage=.01)\n", - "best.boost <- gbm.perf(fit.boost, plot.it = FALSE) # cross-validation to determine when to stop\n", + "fit_boost <- gbm(formula_basic, data = data_train, distribution = \"gaussian\", bag.fraction = .5,\n", + " interaction.depth = 2, n.trees = 1000, shrinkage = .01)\n", + "best_boost <- gbm.perf(fit_boost, plot.it = FALSE) # cross-validation to determine when to stop\n", "\n", "## Evaluating the method\n", - "yhat.boost <- predict(fit.boost, newdata=data_test, n.trees=best.boost)\n", + "yhat_boost <- predict(fit_boost, newdata = data_test, n.trees = best_boost)\n", "\n", - "MSE.boost = summary(lm((Y_test-yhat.boost)^2~1))$coef[1:2]\n", - "R2.boost <- 1-MSE.boost[1]/var(Y_test)\n" + "mse_boost <- summary(lm((y_test - yhat_boost)^2 ~ 1))$coef[1:2]\n", + "r2_boost <- 1 - mse_boost[1] / var(y_test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "WkzBr2OOi9GC" + "id": "WkzBr2OOi9GC", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# printing R^2\n", - "cat(\"R^2 of the random forest and boosted trees:\", R2.rf, R2.boost)" + "cat(\"R^2 of the random forest and boosted trees:\", r2_rf, r2_boost)" ] }, { @@ -1173,15 +1256,18 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "hKNFcGgwt3gm" + "id": "hKNFcGgwt3gm", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Define the neural network architecture\n", "model <- keras_model_sequential() %>%\n", - " layer_dense(units = 50, activation = 'relu', input_shape = dim(model_X_basic_train)[2]) %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", - " layer_dense(units = 1) # Output layer with 1 unit for regression task\n", + " layer_dense(units = 50, activation = \"relu\", input_shape = dim(model_x_basic_train)[2]) %>%\n", + " layer_dense(units = 50, activation = \"relu\") %>%\n", + " layer_dense(units = 1) # Output layer with 1 unit for regression task\n", "\n", "# Compile the model\n", "model %>% compile(\n", @@ -1197,21 +1283,25 @@ "cell_type": "code", 
"execution_count": null, "metadata": { - "id": "c3guqZeeyDd3" + "id": "c3guqZeeyDd3", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "num_epochs <- 100\n", "\n", "# Define early stopping based on validation set (20%) performance\n", - "early_stopping <- callback_early_stopping(monitor = 'val_loss', patience = 5) # Patience set to 5 epochs (default in skorch is 5)\n", + "# Patience set to 5 epochs (default in skorch is 5)\n", + "early_stopping <- callback_early_stopping(monitor = \"val_loss\", patience = 5)\n", "\n", "# Train the model\n", "model %>% fit(\n", - " model_X_basic_train, Y_train,\n", + " model_x_basic_train, y_train,\n", " epochs = num_epochs,\n", " batch_size = 10,\n", - " validation_split = 0.2, # 20% validation set\n", + " validation_split = 0.2, # 20% validation set\n", " verbose = 0,\n", " callbacks = list(early_stopping)\n", ")" @@ -1221,28 +1311,34 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "oFRmau4lzDoa" + "id": "oFRmau4lzDoa", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# evaluating the performance\n", - "model %>% evaluate(model_X_basic_test, Y_test, verbose = 0)" + "model %>% evaluate(model_x_basic_test, y_test, verbose = 0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "UZP6ytgUzAlz" + "id": "UZP6ytgUzAlz", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Calculating the performance measures\n", - "yhat.nn <- model %>% predict(model_X_basic_test)\n", - "MSE.nn = summary(lm((Y_test-yhat.nn)^2~1))$coef[1:2]\n", - "R2.nn <- 1-MSE.nn[1]/var(Y_test)\n", + "yhat_nn <- model %>% predict(model_x_basic_test)\n", + "mse_nn <- summary(lm((y_test - yhat_nn)^2 ~ 1))$coef[1:2]\n", + "r2_nn <- 1 - mse_nn[1] / var(y_test)\n", "# printing R^2\n", - "cat(\"R^2 of the neural network:\",R2.nn)" + "cat(\"R^2 of the neural network:\", r2_nn)" ] }, { @@ -1291,56 +1387,60 @@ "start_time": "2021-02-13T18:21:08.655638", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "table<- matrix(0, 16, 3)\n", - "table[1,1:2] <- MSE.lm.basic\n", - "table[2,1:2] <- MSE.lm.flex\n", - "table[3,1:2] <- MSE.lasso\n", - "table[4,1:2] <- MSE.lasso.post\n", - "table[5,1:2] <- MSE.lasso.flex\n", - "table[6,1:2] <- MSE.lasso.post.flex\n", - "table[7,1:2] <- MSE.lasso.cv\n", - "table[8,1:2] <- MSE.ridge\n", - "table[9,1:2] <- MSE.elnet\n", - "table[10,1:2] <- MSE.lasso.cv.flex\n", - "table[11,1:2] <- MSE.ridge.flex\n", - "table[12,1:2] <- MSE.elnet.flex\n", - "table[13,1:2] <- MSE.rf\n", - "table[14,1:2] <- MSE.boost\n", - "table[15,1:2] <- MSE.pt\n", - "table[16,1:2] <- MSE.nn\n", + "table <- matrix(0, 16, 3)\n", + "table[1, 1:2] <- mse_lm_basic\n", + "table[2, 1:2] <- mse_lm_flex\n", + "table[3, 1:2] <- mse_lasso\n", + "table[4, 1:2] <- mse_lasso_post\n", + "table[5, 1:2] <- mse_lasso_flex\n", + "table[6, 1:2] <- mse_lasso_post_flex\n", + "table[7, 1:2] <- mse_lasso_cv\n", + "table[8, 1:2] <- mse_ridge\n", + "table[9, 1:2] <- mse_elnet\n", + "table[10, 1:2] <- mse_lasso_cv_flex\n", + "table[11, 1:2] <- mse_ridge_flex\n", + "table[12, 1:2] <- mse_elnet_flex\n", + "table[13, 1:2] <- mse_rf\n", + "table[14, 1:2] <- mse_boost\n", + "table[15, 1:2] <- mse_pt\n", + "table[16, 1:2] <- mse_nn\n", "\n", "\n", + "table[1, 3] <- r2_lm_basic\n", + "table[2, 3] <- r2_lm_flex\n", + "table[3, 3] <- r2_lasso\n", + "table[4, 3] <- r2_lasso_post\n", + "table[5, 3] <- r2_lasso_flex\n", + "table[6, 3] <- r2_lasso_post_flex\n", + 
"table[7, 3] <- r2_lasso_cv\n", + "table[8, 3] <- r2_ridge\n", + "table[9, 3] <- r2_elnet\n", + "table[10, 3] <- r2_lasso_cv_flex\n", + "table[11, 3] <- r2_ridge_flex\n", + "table[12, 3] <- r2_elnet_flex\n", + "table[13, 3] <- r2_rf\n", + "table[14, 3] <- r2_boost\n", + "table[15, 3] <- r2_pt\n", + "table[16, 3] <- r2_nn\n", "\n", - "table[1,3] <- R2.lm.basic\n", - "table[2,3] <- R2.lm.flex\n", - "table[3,3] <- R2.lasso\n", - "table[4,3] <- R2.lasso.post\n", - "table[5,3] <- R2.lasso.flex\n", - "table[6,3] <- R2.lasso.post.flex\n", - "table[7,3] <- R2.lasso.cv\n", - "table[8,3] <- R2.ridge\n", - "table[9,3] <- R2.elnet\n", - "table[10,3] <- R2.lasso.cv.flex\n", - "table[11,3] <- R2.ridge.flex\n", - "table[12,3] <- R2.elnet.flex\n", - "table[13,3] <- R2.rf\n", - "table[14,3] <- R2.boost\n", - "table[15,3] <- R2.pt\n", - "table[16,3] <- R2.nn\n", "\n", - "\n", - "\n", - "\n", - "colnames(table)<- c(\"MSE\", \"S.E. for MSE\", \"R-squared\")\n", - "rownames(table)<- c(\"Least Squares (basic)\",\"Least Squares (flexible)\", \"Lasso\", \"Post-Lasso\",\"Lasso (flexible)\",\"Post-Lasso (flexible)\",\n", - " \"Cross-Validated lasso\", \"Cross-Validated ridge\",\"Cross-Validated elnet\",\"Cross-Validated lasso (flexible)\",\"Cross-Validated ridge (flexible)\",\"Cross-Validated elnet (flexible)\",\n", - " \"Random Forest\",\"Boosted Trees\", \"Pruned Tree\", \"Neural Net (Early)\")\n", - "tab <- xtable(table, digits =3)\n", - "print(tab,type=\"latex\") # set type=\"latex\" for printing table in LaTeX\n", + "colnames(table) <- c(\"MSE\", \"S.E. for MSE\", \"R-squared\")\n", + "rownames(table) <- c(\n", + " \"Least Squares (basic)\", \"Least Squares (flexible)\", \"Lasso\", \"Post-Lasso\",\n", + " \"Lasso (flexible)\", \"Post-Lasso (flexible)\",\n", + " \"Cross-Validated lasso\", \"Cross-Validated ridge\", \"Cross-Validated elnet\",\n", + " \"Cross-Validated lasso (flexible)\", \"Cross-Validated ridge (flexible)\", \"Cross-Validated elnet (flexible)\",\n", + " \"Random Forest\", \"Boosted Trees\", \"Pruned Tree\", \"Neural Net (Early)\"\n", + ")\n", + "tab <- xtable(table, digits = 3)\n", + "print(tab, type = \"latex\") # set type=\"latex\" for printing table in LaTeX\n", "tab" ] }, @@ -1412,12 +1512,18 @@ "start_time": "2021-02-13T18:21:09.308835", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "ensemble.ols <- summary(lm(Y_test~ yhat.lm.basic + yhat.lm.flex + yhat.rlasso + yhat.rlasso.flex + yhat.rlasso.post + yhat.rlasso.post.flex + yhat.lasso.cv + yhat.lasso.cv.flex + yhat.ridge + yhat.ridge.flex + yhat.elnet + yhat.elnet.flex + yhat.pt + yhat.rf + yhat.boost + yhat.nn))\n", - "ensemble.ols\n" + "ensemble_ols <- summary(lm(y_test ~ yhat_lm_basic + yhat_lm_flex + yhat_rlasso + yhat_rlasso_flex +\n", + " yhat_rlasso_post + yhat_rlasso_post_flex + yhat_lasso_cv + yhat_lasso_cv_flex +\n", + " yhat_ridge + yhat_ridge_flex + yhat_elnet + yhat_elnet_flex +\n", + " yhat_pt + yhat_rf + yhat_boost + yhat_nn))\n", + "ensemble_ols" ] }, { @@ -1449,12 +1555,18 @@ "start_time": "2021-02-13T18:21:09.551881", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "ensemble.lasso <- summary(rlasso(Y_test~ yhat.lm.basic + yhat.lm.flex + yhat.rlasso + + yhat.rlasso.flex + yhat.rlasso.post + yhat.rlasso.post.flex + yhat.lasso.cv + yhat.lasso.cv.flex + yhat.ridge + yhat.ridge.flex + yhat.elnet + yhat.elnet.flex + yhat.pt + yhat.rf + yhat.boost + yhat.nn))\n", - "ensemble.lasso" + 
"ensemble_lasso <- summary(hdm::rlasso(y_test ~ yhat_lm_basic + yhat_lm_flex + yhat_rlasso + yhat_rlasso_flex +\n", + " yhat_rlasso_post + yhat_rlasso_post_flex + yhat_lasso_cv + yhat_lasso_cv_flex +\n", + " yhat_ridge + yhat_ridge_flex + yhat_elnet + yhat_elnet_flex +\n", + " yhat_pt + yhat_rf + yhat_boost + yhat_nn))\n", + "ensemble_lasso" ] }, { @@ -1486,25 +1598,28 @@ "start_time": "2021-02-13T18:21:09.894515", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "table<- matrix(0, 17, 2)\n", - "table[1:17,1] <- ensemble.ols$coef[1:17]\n", - "table[1:17,2] <- ensemble.lasso$coef[1:17]\n", - "\n", + "table <- matrix(0, 17, 2)\n", + "table[1:17, 1] <- ensemble_ols$coef[1:17]\n", + "table[1:17, 2] <- ensemble_lasso$coef[1:17]\n", "\n", - "colnames(table)<- c(\"Weight OLS\", \"Weight Lasso\")\n", + "colnames(table) <- c(\"Weight OLS\", \"Weight Lasso\")\n", "\n", - "\n", - "rownames(table)<- c(\"Constant\",\"Least Squares (basic)\", \"Least Squares (flexible)\", \"Lasso (basic)\",\n", - " \"Lasso (flexible)\", \"Post-Lasso (basic)\", \"Post-Lasso (flexible)\", \"LassoCV (basic)\",\n", - " \"Lasso CV (flexible)\", \"Ridge CV (basic)\", \"Ridge CV (flexible)\", \"ElNet CV (basic)\",\n", - " \"ElNet CV (flexible)\", \"Pruned Tree\", \"Random Forest\",\"Boosted Trees\", \"Neural Net\")\n", - "tab <- xtable(table, digits =3)\n", - "print(tab,type=\"latex\") # set type=\"latex\" for printing table in LaTeX\n", - "tab\n" + "rownames(table) <- c(\n", + " \"Constant\", \"Least Squares (basic)\", \"Least Squares (flexible)\", \"Lasso (basic)\",\n", + " \"Lasso (flexible)\", \"Post-Lasso (basic)\", \"Post-Lasso (flexible)\", \"LassoCV (basic)\",\n", + " \"Lasso CV (flexible)\", \"Ridge CV (basic)\", \"Ridge CV (flexible)\", \"ElNet CV (basic)\",\n", + " \"ElNet CV (flexible)\", \"Pruned Tree\", \"Random Forest\", \"Boosted Trees\", \"Neural Net\"\n", + ")\n", + "tab <- xtable(table, digits = 3)\n", + "print(tab, type = \"latex\") # set type=\"latex\" for printing table in LaTeX\n", + "tab" ] }, { @@ -1528,13 +1643,16 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "pcyQsL5xmKxR" + "id": "pcyQsL5xmKxR", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# print ensemble R^2\n", - "cat(\"R^2 of stacking with LS weights:\",ensemble.ols$adj.r.squared,\"\\n\")\n", - "cat(\"R^2 of stacking with Lasso weights:\",ensemble.lasso$adj.r.squared,\"\\n\")" + "cat(\"R^2 of stacking with LS weights:\", ensemble_ols$adj.r.squared, \"\\n\")\n", + "cat(\"R^2 of stacking with Lasso weights:\", ensemble_lasso$adj.r.squared, \"\\n\")" ] }, { @@ -1561,7 +1679,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "NPz9qeg2EPAN" + "id": "NPz9qeg2EPAN", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1574,7 +1695,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "fxz49VSXEZDC" + "id": "fxz49VSXEZDC", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1586,13 +1710,16 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "orzSZz_eEnWg" + "id": "orzSZz_eEnWg", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# convert data as h2o type\n", - "train_h = as.h2o(data_train)\n", - "test_h = as.h2o(data_test)\n", + "train_h <- as.h2o(data_train)\n", + "test_h <- as.h2o(data_test)\n", "\n", "# have a look at the data\n", "h2o.describe(train_h)" @@ -1602,23 +1729,27 @@ "cell_type": "code", 
"execution_count": null, "metadata": { - "id": "5PohiG13EqTn" + "id": "5PohiG13EqTn", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "y_name = 'lwage'\n", - "X_names = setdiff(names(data), c('lwage','wage','occ', 'ind'))\n", + "y_name <- \"lwage\"\n", + "x_names <- setdiff(names(data), c(\"lwage\", \"wage\", \"occ\", \"ind\"))\n", "\n", "# run AutoML for 10 base models and a maximal runtime of 100 seconds\n", - "aml = h2o.automl(x=X_names, y=y_name,\n", - " training_frame = train_h,\n", - " leaderboard_frame = test_h,\n", - " max_models = 10,\n", - " seed = 1,\n", - " max_runtime_secs = 100\n", - " )\n", + "aml <- h2o.automl(\n", + " x = x_names, y = y_name,\n", + " training_frame = train_h,\n", + " leaderboard_frame = test_h,\n", + " max_models = 10,\n", + " seed = 1,\n", + " max_runtime_secs = 100\n", + ")\n", "# AutoML Leaderboard\n", - "lb = aml@leaderboard\n", + "lb <- aml@leaderboard\n", "print(lb, n = nrow(lb))" ] }, @@ -1635,7 +1766,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "fIBhP8LSGpA6" + "id": "fIBhP8LSGpA6", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1655,7 +1789,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "MDYChZcXHVgf" + "id": "MDYChZcXHVgf", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1675,11 +1812,14 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "4mnpHT3wHYq9" + "id": "4mnpHT3wHYq9", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "model_ids <- as.data.frame(aml@leaderboard$model_id)[,1]\n", + "model_ids <- as.data.frame(aml@leaderboard$model_id)[, 1]\n", "# Get the \"All Models\" Stacked Ensemble model\n", "se <- h2o.getModel(grep(\"StackedEnsemble_AllModels\", model_ids, value = TRUE)[1])\n", "# Get the Stacked Ensemble metalearner model\n", @@ -1700,7 +1840,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "J6azOyskHcMu" + "id": "J6azOyskHcMu", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1722,11 +1865,14 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "c6mkVpADH-hB" + "id": "c6mkVpADH-hB", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "pred <- as.matrix(h2o.predict(aml@leader,test_h)) # make prediction using x data from the test sample\n", + "pred <- as.matrix(h2o.predict(aml@leader, test_h)) # make prediction using x data from the test sample\n", "head(pred)" ] }, @@ -1734,13 +1880,16 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "GwLL8pywIBBI" + "id": "GwLL8pywIBBI", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "y_test <- as.matrix(test_h$lwage)\n", - "R2_test <- 1-summary(lm((y_test-pred)^2~1))$coef[1]/var(y_test)\n", - "cat(\"MSE, SE, R^2:\" , summary(lm((y_test-pred)^2~1))$coef[1:2], R2_test)\n" + "r2_test <- 1 - summary(lm((y_test - pred)^2 ~ 1))$coef[1] / var(y_test)\n", + "cat(\"MSE, SE, R^2:\", summary(lm((y_test - pred)^2 ~ 1))$coef[1:2], r2_test)" ] }, { @@ -1756,11 +1905,14 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "yoJihU54Ioxs" + "id": "yoJihU54Ioxs", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "h2o.shutdown(prompt = F) # shut down the h20 automatically without prompting user" + "h2o.shutdown(prompt = FALSE) # shut down the h20 automatically without prompting user" ] } ], From dff992cfdb5059bab3b7719d1377e9fbd1352178 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 
Jul 2024 14:43:15 -0700 Subject: [PATCH 088/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 7c989463..c48ac5b3 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -27,7 +27,7 @@ jobs: - name: Install nbstripout run: | python -m pip install --upgrade pip - pip install nbstripout + pip install nbstripout tensorflow - name: Set up R uses: r-lib/actions/setup-r@v2 From efa8643c71a7e5267c27a5165a9feab925b9bca5 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 14:52:57 -0700 Subject: [PATCH 089/261] Removing the object_usage_linter --- .github/workflows/transform-R-to-Rmd.yml | 3 ++- .lintr | 3 ++- PM3/r_functional_approximation_by_nn_and_rf.irnb | 1 - 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index c48ac5b3..670bf78c 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -62,7 +62,8 @@ jobs: R -e ' library(lintr) linters <- with_defaults(line_length_linter = line_length_linter(120), - object_name_linter = object_name_linter(styles = c("snake_case", "CamelCase", "camelCase"))) + object_name_linter = object_name_linter(styles = c("snake_case", "CamelCase", "camelCase")), + object_usage_linter = NULL) rmd_files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.Rmd$", full.names = TRUE) results <- lapply(rmd_files, function(file) { lints <- lint(file, linters) diff --git a/.lintr b/.lintr index d5b122eb..340a9b5d 100644 --- a/.lintr +++ b/.lintr @@ -1,5 +1,6 @@ linters: linters_with_defaults( line_length_linter(120), - object_name_linter(styles = c("snake_case", "CamelCase", "camelCase")) + object_name_linter(styles = c("snake_case", "CamelCase", "camelCase"), + object_usage_lineter = NULL) ) diff --git a/PM3/r_functional_approximation_by_nn_and_rf.irnb b/PM3/r_functional_approximation_by_nn_and_rf.irnb index 74f1732f..51647365 100644 --- a/PM3/r_functional_approximation_by_nn_and_rf.irnb +++ b/PM3/r_functional_approximation_by_nn_and_rf.irnb @@ -337,7 +337,6 @@ "outputs": [], "source": [ "build_model <- function() {\n", - " require(magrittr)\n", "\n", " model <- keras::keras_model_sequential() %>%\n", " keras::layer_dense(\n", From 17b1a6b3c7654c4c09d176fc4ec88b140373e553 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 14:55:10 -0700 Subject: [PATCH 090/261] Update r_functional_approximation_by_nn_and_rf.irnb --- PM3/r_functional_approximation_by_nn_and_rf.irnb | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/PM3/r_functional_approximation_by_nn_and_rf.irnb b/PM3/r_functional_approximation_by_nn_and_rf.irnb index 51647365..5941ec55 100644 --- a/PM3/r_functional_approximation_by_nn_and_rf.irnb +++ b/PM3/r_functional_approximation_by_nn_and_rf.irnb @@ -40,14 +40,12 @@ "install.packages(\"rpart\")\n", "install.packages(\"gbm\")\n", "install.packages(\"keras\")\n", - "install.packages(\"magrittr\")\n", "\n", "\n", "library(randomForest)\n", "library(rpart)\n", "library(gbm)\n", - "library(keras)\n", - "library(magrittr)" + "library(keras)" ] }, { From b43542c91d615d6de56f7f1154f8f5318871c011 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 22:55:27 -0700 Subject: [PATCH 091/261] Update .lintr --- .lintr | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/.lintr b/.lintr index 340a9b5d..fbf667eb 100644 --- a/.lintr +++ b/.lintr @@ -1,6 +1,6 @@ linters: linters_with_defaults( line_length_linter(120), object_name_linter(styles = c("snake_case", "CamelCase", "camelCase"), - object_usage_lineter = NULL) + object_usage_linter = NULL) ) From 0904e08753dea6712940991f0abf2d91244c254c Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 23:02:29 -0700 Subject: [PATCH 092/261] Update .lintr --- .lintr | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.lintr b/.lintr index fbf667eb..a13323f1 100644 --- a/.lintr +++ b/.lintr @@ -1,6 +1,6 @@ linters: linters_with_defaults( line_length_linter(120), - object_name_linter(styles = c("snake_case", "CamelCase", "camelCase"), - object_usage_linter = NULL) + object_name_linter(styles = c("snake_case", "CamelCase", "camelCase")), + object_usage_linter = NULL ) From 0079c12181659b2d1f0e9ebf65bbd7ed6fc454f9 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 23:30:49 -0700 Subject: [PATCH 093/261] added traceback printing --- .github/workflows/transform-R-to-Rmd.yml | 9 +- ..._ml_for_partially_linear_model_growth.irnb | 196 +++++++++++------- 2 files changed, 125 insertions(+), 80 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 670bf78c..ad802691 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -24,7 +24,7 @@ jobs: with: python-version: '3.8' # Specify your Python version here - - name: Install nbstripout + - name: Install Python dependencies run: | python -m pip install --upgrade pip pip install nbstripout tensorflow @@ -62,8 +62,7 @@ jobs: R -e ' library(lintr) linters <- with_defaults(line_length_linter = line_length_linter(120), - object_name_linter = object_name_linter(styles = c("snake_case", "CamelCase", "camelCase")), - object_usage_linter = NULL) + object_name_linter = object_name_linter(styles = c("snake_case", "CamelCase", "camelCase"))) rmd_files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.Rmd$", full.names = TRUE) results <- lapply(rmd_files, function(file) { lints <- lint(file, linters) @@ -91,7 +90,7 @@ jobs: source(gitrfile) }, error = function(e) { - errors[[length(errors) + 1]] <<- list(gitrfile = gitrfile, message = e$message) + errors[[length(errors) + 1]] <<- list(gitrfile = gitrfile, message = e$message, traceback = traceback()) } ) } @@ -102,6 +101,8 @@ jobs: for (error in errors) { cat("Error found in file:", error$gitrfile, "\n") cat("Error message:", error$message, "\n") + cat("Traceback:\n") + print(error$traceback) } quit(status = 1, save = "no") # Exit with an error status if errors are found } diff --git a/PM4/r_debiased_ml_for_partially_linear_model_growth.irnb b/PM4/r_debiased_ml_for_partially_linear_model_growth.irnb index 26f2cc70..af73b7fa 100644 --- a/PM4/r_debiased_ml_for_partially_linear_model_growth.irnb +++ b/PM4/r_debiased_ml_for_partially_linear_model_growth.irnb @@ -39,7 +39,10 @@ "start_time": "2021-02-13T17:32:35.344160", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -62,7 +65,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "1I8mEMEM33fS" + "id": "1I8mEMEM33fS", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -77,20 +83,24 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "hiwEVN6i4FIH" + "id": 
"hiwEVN6i4FIH", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "y = as.matrix(data[,1]) # outcome: growth rate\n", - "d = as.matrix(data[,3]) # treatment: initial wealth\n", - "x = as.matrix(data[,-c(1,2,3)]) # controls: country characteristics\n", + "y <- as.matrix(data[, 1]) # outcome: growth rate\n", + "d <- as.matrix(data[, 3]) # treatment: initial wealth\n", + "x <- as.matrix(data[, -c(1, 2, 3)]) # controls: country characteristics\n", "\n", "# some summary statistics\n", - "cat(sprintf(\"\\nThe length of y is %g \\n\", length(y) ))\n", - "cat(sprintf(\"\\nThe number of features in x is %g \\n\", dim(x)[2] ))\n", + "cat(sprintf(\"\\nThe length of y is %g \\n\", length(y)))\n", + "cat(sprintf(\"\\nThe number of features in x is %g \\n\", dim(x)[2]))\n", "\n", - "lres=summary(lm(y~d +x))$coef[2,1:2]\n", - "cat(sprintf(\"\\nNaive OLS that uses all features w/o cross-fitting Y ~ D+X yields: \\ncoef (se) = %g (%g)\\n\", lres[1] , lres[2]))" + "lres <- summary(lm(y ~ d + x))$coef[2, 1:2]\n", + "cat(sprintf(\"\\nNaive OLS that uses all features w/o cross-fitting Y ~ D+X yields: \\ncoef (se) = %g (%g)\\n\",\n", + " lres[1], lres[2]))" ] }, { @@ -139,30 +149,33 @@ "start_time": "2021-02-13T17:33:08.350892", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "DML2.for.PLM <- function(x, d, y, dreg, yreg, nfold=2) {\n", - " nobs <- nrow(x) #number of observations\n", - " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices\n", - " I <- split(1:nobs, foldid) #split observation indices into folds\n", + "dml2_for_plm <- function(x, d, y, dreg, yreg, nfold = 2) {\n", + " nobs <- nrow(x) # number of observations\n", + " foldid <- rep.int(1:nfold, times = ceiling(nobs / nfold))[sample.int(nobs)] # define folds indices\n", + " I <- split(1:nobs, foldid) # split observation indices into folds\n", " ytil <- dtil <- rep(NA, nobs)\n", " cat(\"fold: \")\n", - " for(b in 1:length(I)){\n", - " dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out\n", - " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a foldt out\n", - " dhat <- predict(dfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", - " yhat <- predict(yfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", - " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold\n", - " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold\n", - " cat(b,\" \")\n", - " }\n", - " rfit <- lm(ytil ~ dtil) #estimate the main parameter by regressing one residual on the other\n", - " coef.est <- coef(rfit)[2] #extract coefficient\n", - " se <- sqrt(vcovHC(rfit)[2,2]) #record robust standard error\n", - " cat(sprintf(\"\\ncoef (se) = %g (%g)\\n\", coef.est , se)) #printing output\n", - " return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil) ) #save output and residuals\n", + " for (b in seq_along(I)) {\n", + " dfit <- dreg(x[-I[[b]], ], d[-I[[b]]]) # take a fold out\n", + " yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a foldt out\n", + " dhat <- predict(dfit, x[I[[b]], ], type = \"response\") # predict the left-out fold\n", + " yhat <- predict(yfit, x[I[[b]], ], type = \"response\") # predict the left-out fold\n", + " dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold\n", + " ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold\n", + " cat(b, \" \")\n", + " }\n", + " rfit <- lm(ytil ~ dtil) # estimate the main parameter by 
regressing one residual on the other\n", + " coef.est <- coef(rfit)[2] # extract coefficient\n", + " se <- sqrt(vcovHC(rfit)[2, 2]) # record robust standard error\n", + " cat(sprintf(\"\\ncoef (se) = %g (%g)\\n\", coef.est, se)) # printing output\n", + " return(list(coef.est = coef.est, se = se, dtil = dtil, ytil = ytil)) # save output and residuals\n", "}\n" ] }, @@ -191,35 +204,54 @@ "start_time": "2021-02-13T17:33:08.428303", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "#DML with OLS\n", + "# DML with OLS\n", "cat(sprintf(\"\\nDML with OLS w/o feature selection \\n\"))\n", - "dreg <- function(x,d){ glmnet(x, d, lambda = 0) } #ML method= OLS using glmnet; using lm gives bugs\n", - "yreg <- function(x,y){ glmnet(x, y, lambda = 0) } #ML method = OLS\n", - "DML2.OLS = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)\n", + "dreg <- function(x, d) {\n", + " glmnet(x, d, lambda = 0)\n", + "} # ML method= OLS using glmnet; using lm gives bugs\n", + "yreg <- function(x, y) {\n", + " glmnet(x, y, lambda = 0)\n", + "} # ML method = OLS\n", + "dml2_ols <- dml2_for_plm(x, d, y, dreg, yreg, nfold = 10)\n", "\n", "\n", - "#DML with Lasso:\n", + "# DML with Lasso:\n", "cat(sprintf(\"\\nDML with Lasso \\n\"))\n", - "dreg <- function(x,d){ rlasso(x,d, post=FALSE) } #ML method= lasso from hdm\n", - "yreg <- function(x,y){ rlasso(x,y, post=FALSE) } #ML method = lasso from hdm\n", - "DML2.lasso = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)\n", + "dreg <- function(x, d) {\n", + " rlasso(x, d, post = FALSE)\n", + "} # ML method= lasso from hdm\n", + "yreg <- function(x, y) {\n", + " rlasso(x, y, post = FALSE)\n", + "} # ML method = lasso from hdm\n", + "dml2_lasso <- dml2_for_plm(x, d, y, dreg, yreg, nfold = 10)\n", "\n", "\n", - "#DML with Random Forest:\n", + "# DML with Random Forest:\n", "cat(sprintf(\"\\nDML with Random Forest \\n\"))\n", - "dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest\n", - "yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest\n", - "DML2.RF = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)\n", + "dreg <- function(x, d) {\n", + " randomForest(x, d)\n", + "} # ML method=Forest\n", + "yreg <- function(x, y) {\n", + " randomForest(x, y)\n", + "} # ML method=Forest\n", + "dml2_rf <- dml2_for_plm(x, d, y, dreg, yreg, nfold = 10)\n", "\n", - "#DML MIX:\n", + "# DML MIX:\n", "cat(sprintf(\"\\nDML with Lasso for D and Random Forest for Y \\n\"))\n", - "dreg <- function(x,d){ rlasso(x,d, post=FALSE) } #ML method=Forest\n", - "yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest\n", - "DML2.mix = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10)\n" + "dreg <- function(x, d) {\n", + " rlasso(x, d, post = FALSE)\n", + "} # ML method=Forest\n", + "yreg <- function(x, y) {\n", + " randomForest(x, y)\n", + "} # ML method=Forest\n", + "dml2_mix <- dml2_for_plm(x, d, y, dreg, yreg, nfold = 10)" ] }, { @@ -243,55 +275,61 @@ "start_time": "2021-02-13T17:33:27.099704", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "prRes.D<- c( mean((DML2.OLS$dtil)^2), mean((DML2.lasso$dtil)^2), mean((DML2.RF$dtil)^2), mean((DML2.mix$dtil)^2));\n", - "prRes.Y<- c(mean((DML2.OLS$ytil)^2), mean((DML2.lasso$ytil)^2),mean((DML2.RF$ytil)^2),mean((DML2.mix$ytil)^2));\n", - "prRes<- rbind(sqrt(prRes.D), sqrt(prRes.Y));\n", - "rownames(prRes)<- c(\"RMSE D\", \"RMSE Y\");\n", - "colnames(prRes)<- c(\"OLS\", \"Lasso\", \"RF\", \"Mix\")" + "pr_res_d <- 
c(mean((dml2_ols$dtil)^2), mean((dml2_lasso$dtil)^2), mean((dml2_rf$dtil)^2), mean((dml2_mix$dtil)^2))\n", + "pr_res_y <- c(mean((dml2_ols$ytil)^2), mean((dml2_lasso$ytil)^2), mean((dml2_rf$ytil)^2), mean((dml2_mix$ytil)^2))\n", + "pr_res <- rbind(sqrt(pr_res_d), sqrt(pr_res_y))\n", + "rownames(pr_res) <- c(\"RMSE D\", \"RMSE Y\")\n", + "colnames(pr_res) <- c(\"OLS\", \"Lasso\", \"RF\", \"Mix\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "a7WC2-_6_wMl" + "id": "a7WC2-_6_wMl", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "table <- matrix(0,4,4)\n", + "table <- matrix(0, 4, 4)\n", "\n", "# Point Estimate\n", - "table[1,1] <- as.numeric(DML2.OLS$coef.est)\n", - "table[2,1] <- as.numeric(DML2.lasso$coef.est)\n", - "table[3,1] <- as.numeric(DML2.RF$coef.est)\n", - "table[4,1] <- as.numeric(DML2.mix$coef.est)\n", + "table[1, 1] <- as.numeric(dml2_ols$coef.est)\n", + "table[2, 1] <- as.numeric(dml2_lasso$coef.est)\n", + "table[3, 1] <- as.numeric(dml2_rf$coef.est)\n", + "table[4, 1] <- as.numeric(dml2_mix$coef.est)\n", "\n", "# SE\n", - "table[1,2] <- as.numeric(DML2.OLS$se)\n", - "table[2,2] <- as.numeric(DML2.lasso$se)\n", - "table[3,2] <- as.numeric(DML2.RF$se)\n", - "table[4,2] <- as.numeric(DML2.mix$se)\n", + "table[1, 2] <- as.numeric(dml2_ols$se)\n", + "table[2, 2] <- as.numeric(dml2_lasso$se)\n", + "table[3, 2] <- as.numeric(dml2_rf$se)\n", + "table[4, 2] <- as.numeric(dml2_mix$se)\n", "\n", "# RMSE Y\n", - "table[1,3] <- as.numeric(prRes[2,1])\n", - "table[2,3] <- as.numeric(prRes[2,2])\n", - "table[3,3] <- as.numeric(prRes[2,3])\n", - "table[4,3] <- as.numeric(prRes[2,4])\n", + "table[1, 3] <- as.numeric(pr_res[2, 1])\n", + "table[2, 3] <- as.numeric(pr_res[2, 2])\n", + "table[3, 3] <- as.numeric(pr_res[2, 3])\n", + "table[4, 3] <- as.numeric(pr_res[2, 4])\n", "\n", "# RMSE D\n", - "table[1,4] <- as.numeric(prRes[1,1])\n", - "table[2,4] <- as.numeric(prRes[1,2])\n", - "table[3,4] <- as.numeric(prRes[1,3])\n", - "table[4,4] <- as.numeric(prRes[1,4])\n", + "table[1, 4] <- as.numeric(pr_res[1, 1])\n", + "table[2, 4] <- as.numeric(pr_res[1, 2])\n", + "table[3, 4] <- as.numeric(pr_res[1, 3])\n", + "table[4, 4] <- as.numeric(pr_res[1, 4])\n", "\n", "\n", "\n", "# print results\n", - "colnames(table) <- c(\"Estimate\",\"Standard Error\", \"RMSE Y\", \"RMSE D\")\n", + "colnames(table) <- c(\"Estimate\", \"Standard Error\", \"RMSE Y\", \"RMSE D\")\n", "rownames(table) <- c(\"OLS\", \"Lasso\", \"RF\", \"RF/Lasso Mix\")\n", "table" ] @@ -300,23 +338,29 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "zZMbnWTE_yYd" + "id": "zZMbnWTE_yYd", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "print(table, digit=3)" + "print(table, digit = 3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "pNDCsznF_zyR" + "id": "pNDCsznF_zyR", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "tab<- xtable(table, digits=3)\n", - "print(tab, type=\"latex\")" + "tab <- xtable(table, digits = 3)\n", + "print(tab, type = \"latex\")" ] } ], From 879c83912ead0e94f650d58dc8095bff7dc02dfe Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 23:42:59 -0700 Subject: [PATCH 094/261] Update r_dml_inference_for_gun_ownership.irnb --- PM4/r_dml_inference_for_gun_ownership.irnb | 498 ++++++++++++--------- 1 file changed, 283 insertions(+), 215 deletions(-) diff --git a/PM4/r_dml_inference_for_gun_ownership.irnb b/PM4/r_dml_inference_for_gun_ownership.irnb index 
28e94869..405a0736 100644 --- a/PM4/r_dml_inference_for_gun_ownership.irnb +++ b/PM4/r_dml_inference_for_gun_ownership.irnb @@ -82,7 +82,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "nIdoZ226yN1a" + "id": "nIdoZ226yN1a", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -109,7 +112,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "WHTx8goy46e9" + "id": "WHTx8goy46e9", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -158,7 +164,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "no2XXU9F460B" + "id": "no2XXU9F460B", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -174,23 +183,23 @@ "# enter nonlinearly and flexibly.\n", "\n", "## County FE\n", - "county.vars <- select(data, starts_with('X_J'))\n", + "county_vars <- select(data, starts_with(\"X_J\"))\n", "\n", "## Time variables and population weights\n", "# Pull out time variables\n", - "time.vars <- select(data, starts_with('X_T'))\n", + "time_vars <- select(data, starts_with(\"X_T\"))\n", "\n", "# Use these to construct population weights\n", - "popweights <- rowSums(time.vars)\n", + "pop_weights <- rowSums(time_vars)\n", "\n", "# Unweighted time variables\n", - "time.vars <- time.vars/popweights\n", + "time_vars <- time_vars / pop_weights\n", "\n", "# For any columns with only zero (like the first one), just drop\n", - "time.vars <- time.vars[, colSums(time.vars != 0) > 0]\n", + "time_vars <- time_vars[, colSums(time_vars != 0) > 0]\n", "\n", "# Create time index\n", - "time.ind <- rowSums(time.vars*(seq(1:20)))" + "time_ind <- rowSums(time_vars * (seq(1:20)))" ] }, { @@ -206,78 +215,84 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "0yv3j0wJ464e" + "id": "0yv3j0wJ464e", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - " ###### Create new data frame with variables we'll use\n", + "###### Create new data frame with variables we'll use\n", "\n", "# Function to find variable names\n", - "varlist <- function (df=NULL,type=c(\"numeric\",\"factor\",\"character\"), pattern=\"\", exclude=NULL) {\n", + "var_list <- function(df = NULL, type = c(\"numeric\", \"factor\", \"character\"), pattern = \"\", exclude = NULL) {\n", " vars <- character(0)\n", " if (any(type %in% \"numeric\")) {\n", - " vars <- c(vars,names(df)[sapply(df,is.numeric)])\n", + " vars <- c(vars, names(df)[sapply(df, is.numeric)])\n", " }\n", " if (any(type %in% \"factor\")) {\n", - " vars <- c(vars,names(df)[sapply(df,is.factor)])\n", + " vars <- c(vars, names(df)[sapply(df, is.factor)])\n", " }\n", " if (any(type %in% \"character\")) {\n", - " vars <- c(vars,names(df)[sapply(df,is.character)])\n", + " vars <- c(vars, names(df)[sapply(df, is.character)])\n", " }\n", - " vars[(!vars %in% exclude) & grepl(vars,pattern=pattern)]\n", + " vars[(!vars %in% exclude) & grepl(vars, pattern = pattern)]\n", "}\n", "\n", "# census control variables\n", - "census <- NULL\n", - "census_var <- c(\"^AGE\", \"^BN\", \"^BP\", \"^BZ\", \"^ED\", \"^EL\",\"^HI\", \"^HS\", \"^INC\", \"^LF\", \"^LN\", \"^PI\", \"^PO\", \"^PP\", \"^PV\", \"^SPR\", \"^VS\")\n", + "census <- NULL\n", + "census_var <- c(\"^AGE\", \"^BN\", \"^BP\", \"^BZ\", \"^ED\", \"^EL\", \"^HI\", \"^HS\", \"^INC\", \"^LF\", \"^LN\",\n", + " \"^PI\", \"^PO\", \"^PP\", \"^PV\", \"^SPR\", \"^VS\")\n", "\n", - "for(i in 1:length(census_var)){\n", - " census <- append(census, varlist(data, pattern=census_var[i]))\n", + "for (i in seq_along(census_var)) 
{\n", + " census <- append(census, var_list(data, pattern = census_var[i]))\n", "}\n", "\n", "# other control variables\n", - "X1 <- c(\"logrobr\", \"logburg\", \"burg_missing\", \"robrate_missing\")\n", - "X2 <- c(\"newblack\", \"newfhh\", \"newmove\", \"newdens\", \"newmal\")\n", + "X1 <- c(\"logrobr\", \"logburg\", \"burg_missing\", \"robrate_missing\")\n", + "X2 <- c(\"newblack\", \"newfhh\", \"newmove\", \"newdens\", \"newmal\")\n", "\n", "# \"treatment\" variable\n", - "d <- \"logfssl\"\n", + "d <- \"logfssl\"\n", "\n", "# outcome variable\n", - "y <- \"logghomr\"\n", + "y <- \"logghomr\"\n", "\n", "# new data frame for time index\n", - "usedata <- as.data.frame(time.ind)\n", - "colnames(usedata) <- \"time.ind\"\n", - "usedata[,\"weights\"] <- popweights\n", + "usedata <- as.data.frame(time_ind)\n", + "colnames(usedata) <- \"time_ind\"\n", + "usedata[, \"weights\"] <- pop_weights\n", "\n", - "varlist <- c(y,d,X1,X2,census)\n", - "for(i in 1:length(varlist)){\n", - " usedata[, varlist[i]] <- data[,varlist[i]]\n", + "var_list <- c(y, d, X1, X2, census)\n", + "for (i in seq_along(var_list)) {\n", + " usedata[, var_list[i]] <- data[, var_list[i]]\n", "}\n", "\n", "####################### Construct county specific means,\n", "# time specific means, initial conditions\n", "\n", "# Initial conditions\n", - "varlist0 <- c(y,X1,X2,census)\n", - "for(i in 1:length(varlist0)){\n", - " usedata[, paste(varlist0[i],\"0\" , sep=\"\")] <- kronecker(usedata[time.ind == 1,varlist0[i]],\n", - " rep(1,20))\n", + "var_list0 <- c(y, X1, X2, census)\n", + "for (i in seq_along(var_list0)) {\n", + " usedata[, paste(var_list0[i], \"0\", sep = \"\")] <- kronecker(\n", + " usedata[time_ind == 1, var_list0[i]],\n", + " rep(1, 20)\n", + " )\n", "}\n", "\n", "# County means\n", - "varlistJ <- c(X1,X2,census)\n", - "county.vars <- as.matrix(county.vars)\n", - "for(i in 1:length(varlistJ)){\n", - " usedata[, paste(varlistJ[i],\"J\" , sep=\"\")] <-\n", - " county.vars%*%qr.solve(county.vars,as.matrix(usedata[,varlistJ[i]]))\n", + "var_list_j <- c(X1, X2, census)\n", + "county_vars <- as.matrix(county_vars)\n", + "for (i in seq_along(var_list_j)) {\n", + " usedata[, paste(var_list_j[i], \"J\", sep = \"\")] <-\n", + " county_vars %*% qr.solve(county_vars, as.matrix(usedata[, var_list_j[i]]))\n", "}\n", "\n", "# Time means\n", - "time.vars <- as.matrix(time.vars)\n", - "for(i in 1:length(varlistJ)){\n", - " usedata[, paste(varlistJ[i],\"T\" , sep=\"\")] <-\n", - " time.vars%*%qr.solve(time.vars,as.matrix(usedata[,varlistJ[i]]))\n", + "time_vars <- as.matrix(time_vars)\n", + "for (i in seq_along(var_list_j)) {\n", + " usedata[, paste(var_list_j[i], \"T\", sep = \"\")] <-\n", + " time_vars %*% qr.solve(time_vars, as.matrix(usedata[, var_list_j[i]]))\n", "}" ] }, @@ -337,19 +352,22 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "yX0GRnnlryxu" + "id": "yX0GRnnlryxu", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Simple regression\n", "lm0 <- lm(logghomr ~ logfssl, data = usedata)\n", "vc0 <- vcovHC(lm0)\n", - "cat(\"Baseline OLS:\",lm0$coefficients[2],\" (\",sqrt(vc0[2,2]),\")\\n\")\n", + "cat(\"Baseline OLS:\", lm0$coefficients[2], \" (\", sqrt(vc0[2, 2]), \")\\n\")\n", "# Confidence Interval with HC3 covariance\n", - "tt <- qt(c(0.025,0.975),summary(lm0)$df[2])\n", + "tt <- qt(c(0.025, 0.975), summary(lm0)$df[2])\n", "se <- sqrt(diag(vc0))\n", "ci <- coef(lm0) + se %o% tt\n", - "cat(\"2.5%: \", ci[2,1],\"97.5%: \", ci[2,2])" + "cat(\"2.5%: \", ci[2, 1], \"97.5%: \", ci[2, 
2])" ] }, { @@ -376,15 +394,18 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "ljFlAr5Isjzd" + "id": "ljFlAr5Isjzd", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Regression on baseline controls\n", - "varlist <- c(d,X1,X2,census)\n", - "lmC <- lm(paste(\"logghomr ~\",paste(varlist, collapse = \"+\")), data = usedata)\n", + "var_list <- c(d, X1, X2, census)\n", + "lmC <- lm(paste(\"logghomr ~\", paste(var_list, collapse = \"+\")), data = usedata)\n", "vcC <- vcovHC(lmC)\n", - "cat(\"OLS with Controls:\",lmC$coefficients[\"logfssl\"],\" (\",sqrt(vcC[\"logfssl\",\"logfssl\"]),\")\\n\")" + "cat(\"OLS with Controls:\", lmC$coefficients[\"logfssl\"], \" (\", sqrt(vcC[\"logfssl\", \"logfssl\"]), \")\\n\")" ] }, { @@ -402,22 +423,25 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "iOFCWtUKyFK2" + "id": "iOFCWtUKyFK2", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Regression on time and cross sectional averages\n", - "varlistX <- c(X1,X2,census)\n", - "varlistMeans <- c(d,X1,X2,census)\n", - "for(i in 1:length(varlistX)){\n", - " varlistMeans <- c(varlistMeans,paste(varlistX[i],\"J\" , sep=\"\"))\n", + "var_list_x <- c(X1, X2, census)\n", + "var_list_means <- c(d, X1, X2, census)\n", + "for (i in seq_along(var_list_x)) {\n", + " var_list_means <- c(var_list_means, paste(var_list_x[i], \"J\", sep = \"\"))\n", "}\n", - "for(i in 1:length(varlistX)){\n", - " varlistMeans <- c(varlistMeans,paste(varlistX[i],\"T\" , sep=\"\"))\n", + "for (i in seq_along(var_list_x)) {\n", + " var_list_means <- c(var_list_means, paste(var_list_x[i], \"T\", sep = \"\"))\n", "}\n", - "lmM <- lm(paste(\"logghomr ~\",paste(varlistMeans, collapse = \"+\")), data = usedata)\n", + "lmM <- lm(paste(\"logghomr ~\", paste(var_list_means, collapse = \"+\")), data = usedata)\n", "vcM <- vcovHC(lmM)\n", - "cat(\"OLS with Averages:\",lmM$coefficients[\"logfssl\"],\" (\",sqrt(vcM[\"logfssl\",\"logfssl\"]),\")\\n\")" + "cat(\"OLS with Averages:\", lmM$coefficients[\"logfssl\"], \" (\", sqrt(vcM[\"logfssl\", \"logfssl\"]), \")\\n\")" ] }, { @@ -433,14 +457,17 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "wBMWYpbBtKzy" + "id": "wBMWYpbBtKzy", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Regression on all controls\n", "lmA <- lm(logghomr ~ ., data = usedata)\n", "vcA <- vcovHC(lmA)\n", - "cat(\"OLS All:\",lmA$coefficients[\"logfssl\"],\" (\",sqrt(vcA[\"logfssl\",\"logfssl\"]),\")\\n\")" + "cat(\"OLS All:\", lmA$coefficients[\"logfssl\"], \" (\", sqrt(vcA[\"logfssl\", \"logfssl\"]), \")\\n\")" ] }, { @@ -497,7 +524,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "u8n1149MolrR" + "id": "u8n1149MolrR", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -510,109 +540,129 @@ "# Cross-fitting\n", "n <- nrow(usedata)\n", "Kf <- 5 # Number of cross-fitting folds\n", - "sampleframe <- rep(1:Kf, ceiling(n/Kf))\n", - "cvgroup <- sample(sampleframe, size=n, replace = FALSE) # Cross-fitting groups\n", + "sampleframe <- rep(1:Kf, ceiling(n / Kf))\n", + "cvgroup <- sample(sampleframe, size = n, replace = FALSE) # Cross-fitting groups\n", "\n", "# Initialize variables for cross-fit predictions\n", - "yhat.r <- matrix(NA,n,10) # Going to consider 10 learners\n", - "dhat.r <- matrix(NA,n,10)\n", + "yhat_r <- matrix(NA, n, 10) # Going to consider 10 learners\n", + "dhat_r <- matrix(NA, n, 10)\n", "\n", "# Cross-fitting loop\n", - "for(k in 1:Kf) {\n", - " 
cat(\"fold: \", k,\"\\n\")\n", + "for (k in 1:Kf) {\n", + " cat(\"fold: \", k, \"\\n\")\n", " indk <- cvgroup == k\n", "\n", - " ktrain <- usedata[!indk,]\n", - " ktest <- usedata[indk,]\n", + " ktrain <- usedata[!indk, ]\n", + " ktest <- usedata[indk, ]\n", "\n", " #### Simple regression models ####\n", "\n", " # Simple regression\n", - " yhat.r[indk,1] <- ktest$logghomr - mean(ktrain$logghomr)\n", - " dhat.r[indk,1] <- ktest$logfssl - mean(ktrain$logfssl)\n", + " yhat_r[indk, 1] <- ktest$logghomr - mean(ktrain$logghomr)\n", + " dhat_r[indk, 1] <- ktest$logfssl - mean(ktrain$logfssl)\n", "\n", " # Baseline controls\n", - " varlist <- c(X1,X2,census)\n", - " lmyk.C <- lm(paste(\"logghomr ~\",paste(varlist, collapse = \"+\")), data = ktrain)\n", - " yhat.r[indk,2] <- ktest$logghomr - predict(lmyk.C, ktest)\n", - " lmdk.C <- lm(paste(\"logfssl ~\",paste(varlist, collapse = \"+\")), data = ktrain)\n", - " dhat.r[indk,2] <- ktest$logfssl - predict(lmdk.C, ktest)\n", + " var_list <- c(X1, X2, census)\n", + " lmyk_c <- lm(paste(\"logghomr ~\", paste(var_list, collapse = \"+\")), data = ktrain)\n", + " yhat_r[indk, 2] <- ktest$logghomr - predict(lmyk_c, ktest)\n", + " lmdk_c <- lm(paste(\"logfssl ~\", paste(var_list, collapse = \"+\")), data = ktrain)\n", + " dhat_r[indk, 2] <- ktest$logfssl - predict(lmdk_c, ktest)\n", "\n", " # All controls\n", - " lmyk.A <- lm(logghomr ~ .-logfssl, data = ktrain)\n", - " yhat.r[indk,3] <- ktest$logghomr - predict(lmyk.A, ktest)\n", - " lmdk.A <- lm(logfssl ~ .-logghomr, data = ktrain)\n", - " dhat.r[indk,3] <- ktest$logfssl - predict(lmdk.A, ktest)\n", + " lmyk_a <- lm(logghomr ~ . - logfssl, data = ktrain)\n", + " yhat_r[indk, 3] <- ktest$logghomr - predict(lmyk_a, ktest)\n", + " lmdk_a <- lm(logfssl ~ . - logghomr, data = ktrain)\n", + " dhat_r[indk, 3] <- ktest$logfssl - predict(lmdk_a, ktest)\n", "\n", " #### Penalized Linear Models ####\n", "\n", " # Lasso - default CV tuning\n", - " ytrain <- as.matrix(usedata[!indk,\"logghomr\"])\n", - " dtrain <- as.matrix(usedata[!indk,\"logfssl\"])\n", - " xtrain <- as.matrix(usedata[!indk,!names(usedata) %in%\n", - " c(\"logghomr\", \"logfssl\")])\n", - " ytest <- as.matrix(usedata[indk,\"logghomr\"])\n", - " dtest <- as.matrix(usedata[indk,\"logfssl\"])\n", - " xtest <- as.matrix(usedata[indk,!names(usedata) %in%\n", - " c(\"logghomr\", \"logfssl\")])\n", + " ytrain <- as.matrix(usedata[!indk, \"logghomr\"])\n", + " dtrain <- as.matrix(usedata[!indk, \"logfssl\"])\n", + " xtrain <- as.matrix(usedata[!indk, !names(usedata) %in%\n", + " c(\"logghomr\", \"logfssl\")])\n", + " ytest <- as.matrix(usedata[indk, \"logghomr\"])\n", + " dtest <- as.matrix(usedata[indk, \"logfssl\"])\n", + " xtest <- as.matrix(usedata[indk, !names(usedata) %in%\n", + " c(\"logghomr\", \"logfssl\")])\n", "\n", - " lassoyk <- cv.glmnet(xtrain,ytrain)\n", - " yhat.r[indk,4] <- ytest - predict(lassoyk, newx = xtest, s = \"lambda.min\")\n", + " lassoyk <- cv.glmnet(xtrain, ytrain)\n", + " yhat_r[indk, 4] <- ytest - predict(lassoyk, newx = xtest, s = \"lambda.min\")\n", "\n", - " lassodk <- cv.glmnet(xtrain,dtrain)\n", - " dhat.r[indk,4] <- dtest - predict(lassodk, newx = xtest, s = \"lambda.min\")\n", + " lassodk <- cv.glmnet(xtrain, dtrain)\n", + " dhat_r[indk, 4] <- dtest - predict(lassodk, newx = xtest, s = \"lambda.min\")\n", "\n", " # Ridge\n", - " ridgeyk <- cv.glmnet(xtrain,ytrain,alpha = 0)\n", - " yhat.r[indk,5] <- ytest - predict(ridgeyk, newx = xtest, s = \"lambda.min\")\n", + " ridgeyk <- cv.glmnet(xtrain, ytrain, alpha = 0)\n", + " 
yhat_r[indk, 5] <- ytest - predict(ridgeyk, newx = xtest, s = \"lambda.min\")\n", "\n", - " ridgedk <- cv.glmnet(xtrain,dtrain, alpha = 0)\n", - " dhat.r[indk,5] <- dtest - predict(ridgedk, newx = xtest, s = \"lambda.min\")\n", + " ridgedk <- cv.glmnet(xtrain, dtrain, alpha = 0)\n", + " dhat_r[indk, 5] <- dtest - predict(ridgedk, newx = xtest, s = \"lambda.min\")\n", "\n", " # EN, .5 - no cv over alpha\n", - " enyk <- cv.glmnet(xtrain,ytrain,alpha = .5)\n", - " yhat.r[indk,6] <- ytest - predict(enyk, newx = xtest, s = \"lambda.min\")\n", + " enyk <- cv.glmnet(xtrain, ytrain, alpha = .5)\n", + " yhat_r[indk, 6] <- ytest - predict(enyk, newx = xtest, s = \"lambda.min\")\n", "\n", - " endk <- cv.glmnet(xtrain,dtrain, alpha = .5)\n", - " dhat.r[indk,6] <- dtest - predict(endk, newx = xtest, s = \"lambda.min\")\n", + " endk <- cv.glmnet(xtrain, dtrain, alpha = .5)\n", + " dhat_r[indk, 6] <- dtest - predict(endk, newx = xtest, s = \"lambda.min\")\n", "\n", " #### Flexible regression models ####\n", "\n", " # Random forest\n", - " rfyk <- randomForest(logghomr ~ .-logfssl, data = ktrain)\n", - " yhat.r[indk,7] <- ktest$logghomr - predict(rfyk, ktest)\n", - " rfdk <- randomForest(logfssl ~ .-logghomr, data = ktrain)\n", - " dhat.r[indk,7] <- ktest$logfssl - predict(rfdk, ktest)\n", + " rfyk <- randomForest(logghomr ~ . - logfssl, data = ktrain)\n", + " yhat_r[indk, 7] <- ktest$logghomr - predict(rfyk, ktest)\n", + " rfdk <- randomForest(logfssl ~ . - logghomr, data = ktrain)\n", + " dhat_r[indk, 7] <- ktest$logfssl - predict(rfdk, ktest)\n", "\n", " # Boosted tree - depth 4\n", - " xgb_train.y = xgb.DMatrix(data = as.matrix(usedata[!indk,!names(usedata) %in%\n", - " c(\"logghomr\", \"logfssl\")]),\n", - " label = as.matrix(usedata[!indk,\"logghomr\"]))\n", - " xgb_test.y = xgb.DMatrix(data = as.matrix(usedata[indk,!names(usedata) %in%\n", - " c(\"logghomr\", \"logfssl\")]),\n", - " label = as.matrix(usedata[indk,\"logghomr\"]))\n", - " xgb_train.d = xgb.DMatrix(data = as.matrix(usedata[!indk,!names(usedata) %in%\n", - " c(\"logghomr\", \"logfssl\")]),\n", - " label = as.matrix(usedata[!indk,\"logfssl\"]))\n", - " xgb_test.d = xgb.DMatrix(data = as.matrix(usedata[indk,!names(usedata) %in%\n", - " c(\"logghomr\", \"logfssl\")]),\n", - " label = as.matrix(usedata[indk,\"logfssl\"]))\n", - "\n", - " byk = xgb.cv(data = xgb_train.y,\n", - " nrounds = 1000, verbose = 0, eta = .1, max_depth = 4, nfold = 5)\n", - " best.iter = which.min(as.matrix(byk$evaluation_log[,4]))\n", - " byk = xgboost(data = xgb_train.y,\n", - " nrounds = 1000, verbose = 0, eta = .1, max_depth = 4)\n", - " yhat.r[indk,8] = ktest$logghomr - predict(byk, newdata = xgb_test.y,\n", - " iterationrange = c(1,(best.iter+1)))\n", - "\n", - " bdk = xgb.cv(data = xgb_train.d,\n", - " nrounds = 1000, verbose = 0, eta = .1, max_depth = 4, nfold = 5)\n", - " best.iter = which.min(as.matrix(bdk$evaluation_log[,4]))\n", - " bdk = xgboost(data = xgb_train.d,\n", - " nrounds = 1000, verbose = 0, eta = .1, max_depth = 4)\n", - " dhat.r[indk,8] = ktest$logfssl - predict(bdk, newdata = xgb_test.d,\n", - " iterationrange = c(1,(best.iter+1)))\n", + " xgb_train_y <- xgb.DMatrix(\n", + " data = as.matrix(usedata[!indk, !names(usedata) %in%\n", + " c(\"logghomr\", \"logfssl\")]),\n", + " label = as.matrix(usedata[!indk, \"logghomr\"])\n", + " )\n", + " xgb_test_y <- xgb.DMatrix(\n", + " data = as.matrix(usedata[indk, !names(usedata) %in%\n", + " c(\"logghomr\", \"logfssl\")]),\n", + " label = as.matrix(usedata[indk, \"logghomr\"])\n", + " )\n", + " 
xgb_train_d <- xgb.DMatrix(\n", + " data = as.matrix(usedata[!indk, !names(usedata) %in%\n", + " c(\"logghomr\", \"logfssl\")]),\n", + " label = as.matrix(usedata[!indk, \"logfssl\"])\n", + " )\n", + " xgb_test_d <- xgb.DMatrix(\n", + " data = as.matrix(usedata[indk, !names(usedata) %in%\n", + " c(\"logghomr\", \"logfssl\")]),\n", + " label = as.matrix(usedata[indk, \"logfssl\"])\n", + " )\n", + "\n", + " byk <- xgb.cv(\n", + " data = xgb_train_y,\n", + " nrounds = 1000, verbose = 0, eta = .1, max_depth = 4, nfold = 5\n", + " )\n", + " best_iter <- which.min(as.matrix(byk$evaluation_log[, 4]))\n", + " byk <- xgboost(\n", + " data = xgb_train_y,\n", + " nrounds = 1000, verbose = 0, eta = .1, max_depth = 4\n", + " )\n", + " yhat_r[indk, 8] <- ktest$logghomr - predict(byk,\n", + " newdata = xgb_test_y,\n", + " iterationrange = c(1, (best_iter + 1))\n", + " )\n", + "\n", + " bdk <- xgb.cv(\n", + " data = xgb_train_d,\n", + " nrounds = 1000, verbose = 0, eta = .1, max_depth = 4, nfold = 5\n", + " )\n", + " best_iter <- which.min(as.matrix(bdk$evaluation_log[, 4]))\n", + " bdk <- xgboost(\n", + " data = xgb_train_d,\n", + " nrounds = 1000, verbose = 0, eta = .1, max_depth = 4\n", + " )\n", + " dhat_r[indk, 8] <- ktest$logfssl - predict(bdk,\n", + " newdata = xgb_test_d,\n", + " iterationrange = c(1, (best_iter + 1))\n", + " )\n", "\n", " #### Neural Networks ####\n", "\n", @@ -622,158 +672,173 @@ " xtrainNN <- scale(xtrain, center = mean, scale = std)\n", " xtestNN <- scale(xtest, center = mean, scale = std)\n", "\n", - " xtestNN <- xtestNN[,which(!is.nan(colMeans(xtrainNN)))]\n", - " xtrainNN <- xtrainNN[,which(!is.nan(colMeans(xtrainNN)))]\n", + " xtestNN <- xtestNN[, which(!is.nan(colMeans(xtrainNN)))]\n", + " xtrainNN <- xtrainNN[, which(!is.nan(colMeans(xtrainNN)))]\n", "\n", " # DNN 50/50/50/50, .5 dropout\n", " NNmodely <- keras_model_sequential()\n", - " NNmodely %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>%\n", + " NNmodely %>%\n", + " layer_dense(units = 50, activation = \"relu\", input_shape = c(ncol(xtrainNN))) %>%\n", " layer_dropout(rate = .5) %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dense(units = 50, activation = \"relu\") %>%\n", " layer_dropout(rate = .5) %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dense(units = 50, activation = \"relu\") %>%\n", " layer_dropout(rate = .5) %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dense(units = 50, activation = \"relu\") %>%\n", " layer_dropout(rate = .5) %>%\n", " layer_dense(units = 1)\n", "\n", " NNmodely %>% compile(\n", " loss = \"mse\",\n", - " optimizer = optimizer_rmsprop())\n", + " optimizer = optimizer_rmsprop()\n", + " )\n", "\n", - " fit.NNmodely <- NNmodely %>% fit(\n", + " fit_nn_model_y <- NNmodely %>% fit(\n", " xtrainNN, ytrain,\n", " epochs = 200, batch_size = 200,\n", " validation_split = .2, verbose = 0\n", " )\n", - " yhat.r[indk,9] <- ktest$logghomr - predict(NNmodely, xtestNN)\n", + " yhat_r[indk, 9] <- ktest$logghomr - predict(NNmodely, xtestNN)\n", "\n", " NNmodeld <- keras_model_sequential()\n", - " NNmodeld %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>%\n", + " NNmodeld %>%\n", + " layer_dense(units = 50, activation = \"relu\", input_shape = c(ncol(xtrainNN))) %>%\n", " layer_dropout(rate = .5) %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dense(units = 50, activation = \"relu\") %>%\n", " layer_dropout(rate = .5) 
%>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dense(units = 50, activation = \"relu\") %>%\n", " layer_dropout(rate = .5) %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", + " layer_dense(units = 50, activation = \"relu\") %>%\n", " layer_dropout(rate = .5) %>%\n", " layer_dense(units = 1)\n", "\n", " NNmodeld %>% compile(\n", " loss = \"mse\",\n", - " optimizer = optimizer_rmsprop())\n", + " optimizer = optimizer_rmsprop()\n", + " )\n", "\n", - " fit.NNmodeld <- NNmodeld %>% fit(\n", + " fit_nn_model_d <- NNmodeld %>% fit(\n", " xtrainNN, dtrain,\n", " epochs = 200, batch_size = 200,\n", " validation_split = .2, verbose = 0\n", " )\n", - " dhat.r[indk,9] <- ktest$logfssl - predict(NNmodeld, xtestNN)\n", + " dhat_r[indk, 9] <- ktest$logfssl - predict(NNmodeld, xtestNN)\n", "\n", " # DNN 50/50/50/50, early stopping\n", " NNmodely <- keras_model_sequential()\n", - " NNmodely %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", + " NNmodely %>%\n", + " layer_dense(units = 50, activation = \"relu\", input_shape = c(ncol(xtrainNN))) %>%\n", + " layer_dense(units = 50, activation = \"relu\") %>%\n", + " layer_dense(units = 50, activation = \"relu\") %>%\n", + " layer_dense(units = 50, activation = \"relu\") %>%\n", " layer_dense(units = 1)\n", "\n", " NNmodely %>% compile(\n", " loss = \"mse\",\n", - " optimizer = optimizer_rmsprop())\n", + " optimizer = optimizer_rmsprop()\n", + " )\n", "\n", - " early.stop <- callback_early_stopping(monitor = \"val_loss\", patience = 25,\n", - " restore_best_weights = TRUE)\n", + " early_stop <- callback_early_stopping(\n", + " monitor = \"val_loss\", patience = 25,\n", + " restore_best_weights = TRUE\n", + " )\n", "\n", - " fit.NNmodely <- NNmodely %>% fit(\n", + " fit_nn_model_y <- NNmodely %>% fit(\n", " xtrainNN, ytrain,\n", " epochs = 200, batch_size = 200,\n", " validation_split = .2, verbose = 0,\n", - " callbacks = list(early.stop)\n", + " callbacks = list(early_stop)\n", " )\n", - " yhat.r[indk,10] <- ktest$logghomr - predict(NNmodely, xtestNN)\n", + " yhat_r[indk, 10] <- ktest$logghomr - predict(NNmodely, xtestNN)\n", "\n", " NNmodeld <- keras_model_sequential()\n", - " NNmodeld %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", - " layer_dense(units = 50, activation = 'relu') %>%\n", + " NNmodeld %>%\n", + " layer_dense(units = 50, activation = \"relu\", input_shape = c(ncol(xtrainNN))) %>%\n", + " layer_dense(units = 50, activation = \"relu\") %>%\n", + " layer_dense(units = 50, activation = \"relu\") %>%\n", + " layer_dense(units = 50, activation = \"relu\") %>%\n", " layer_dense(units = 1)\n", "\n", " NNmodeld %>% compile(\n", " loss = \"mse\",\n", - " optimizer = optimizer_rmsprop())\n", + " optimizer = optimizer_rmsprop()\n", + " )\n", "\n", - " early.stop <- callback_early_stopping(monitor = \"val_loss\", patience = 25,\n", - " restore_best_weights = TRUE)\n", + " early_stop <- callback_early_stopping(\n", + " monitor = \"val_loss\", patience = 25,\n", + " restore_best_weights = TRUE\n", + " )\n", "\n", - " fit.NNmodeld <- NNmodeld %>% fit(\n", + " fit_nn_model_d <- NNmodeld %>% fit(\n", " xtrainNN, dtrain,\n", " epochs = 200, batch_size = 
200,\n", " validation_split = .2, verbose = 0,\n", - " callbacks = list(early.stop)\n", + " callbacks = list(early_stop)\n", " )\n", - " dhat.r[indk,10] <- ktest$logfssl - predict(NNmodeld, xtestNN)\n", - "\n", + " dhat_r[indk, 10] <- ktest$logfssl - predict(NNmodeld, xtestNN)\n", "}\n", "\n", "################################################################################\n", "# Predictions done, now DML\n", "\n", - "RMSE.y <- sqrt(colMeans(yhat.r^2))\n", - "RMSE.d <- sqrt(colMeans(dhat.r^2))\n", + "rmse_y <- sqrt(colMeans(yhat_r^2))\n", + "rmse_d <- sqrt(colMeans(dhat_r^2))\n", "\n", "# dml coefficient estimates\n", - "b.dml <- rep(NA,10)\n", - "s.dml <- rep(NA,10)\n", - "for(k in 1:10){\n", - " lm.k <- lm(yhat.r[,k] ~ dhat.r[,k]-1)\n", - " v.k <- vcovHC(lm.k)\n", - " b.dml[k] <- lm.k$coefficients\n", - " s.dml[k] <- sqrt(v.k)\n", + "b_dml <- rep(NA, 10)\n", + "s_dml <- rep(NA, 10)\n", + "for (k in 1:10) {\n", + " lm_k <- lm(yhat_r[, k] ~ dhat_r[, k] - 1)\n", + " v_k <- vcovHC(lm_k)\n", + " b_dml[k] <- lm_k$coefficients\n", + " s_dml[k] <- sqrt(v_k)\n", "}\n", "\n", "# \"best\" coefficient estimate\n", - "lm.k <- lm(yhat.r[,which.min(RMSE.y)] ~ dhat.r[,which.min(RMSE.d)]-1)\n", - "v.k <- vcovHC(lm.k)\n", - "b.dml[11] <- lm.k$coefficients\n", - "s.dml[11] <- sqrt(v.k)\n", + "lm_k <- lm(yhat_r[, which.min(rmse_y)] ~ dhat_r[, which.min(rmse_d)] - 1)\n", + "v_k <- vcovHC(lm_k)\n", + "b_dml[11] <- lm_k$coefficients\n", + "s_dml[11] <- sqrt(v_k)\n", "\n", "# ls model average\n", - "yhat <- usedata$logghomr - yhat.r\n", - "dhat <- usedata$logfssl - dhat.r\n", - "\n", - "ma.y <- lm(usedata$logghomr ~ yhat-1)\n", - "ma.d <- lm(usedata$logfssl ~ dhat-1)\n", - "weights.y <- ma.y$coefficients\n", - "weights.d <- ma.d$coefficients\n", - "lm.k <- lm(ma.y$residuals ~ ma.d$residuals-1)\n", - "v.k <- vcovHC(lm.k)\n", - "b.dml[12] <- lm.k$coefficients\n", - "s.dml[12] <- sqrt(v.k)\n", + "yhat <- usedata$logghomr - yhat_r\n", + "dhat <- usedata$logfssl - dhat_r\n", + "\n", + "ma_y <- lm(usedata$logghomr ~ yhat - 1)\n", + "ma_d <- lm(usedata$logfssl ~ dhat - 1)\n", + "weights_y <- ma_y$coefficients\n", + "weights_d <- ma_d$coefficients\n", + "lm_k <- lm(ma_y$residuals ~ ma_d$residuals - 1)\n", + "v_k <- vcovHC(lm_k)\n", + "b_dml[12] <- lm_k$coefficients\n", + "s_dml[12] <- sqrt(v_k)\n", "\n", "## Display results\n", "table1 <- matrix(0, 10, 2)\n", - "table1[,1] <- RMSE.y\n", - "table1[,2] <- RMSE.d\n", - "colnames(table1)<- c(\"RMSE Y\",\"RMSE D\")\n", - "rownames(table1)<- c(\"OLS - No Controls\", \"OLS - Basic\", \"OLS - All\",\n", - " \"Lasso (CV)\", \"Ridge (CV)\", \"Elastic Net (.5,CV)\",\n", - " \"Random Forest\",\"Boosted trees - depth 4\",\n", - " \"DNN - 50/50/50/50, dropout\", \"DNN - 50/50/50/50, early stopping\")\n", - "tab1<- xtable(table1, digits =c(0,4,4))\n", + "table1[, 1] <- rmse_y\n", + "table1[, 2] <- rmse_d\n", + "colnames(table1) <- c(\"RMSE Y\", \"RMSE D\")\n", + "rownames(table1) <- c(\n", + " \"OLS - No Controls\", \"OLS - Basic\", \"OLS - All\",\n", + " \"Lasso (CV)\", \"Ridge (CV)\", \"Elastic Net (.5,CV)\",\n", + " \"Random Forest\", \"Boosted trees - depth 4\",\n", + " \"DNN - 50/50/50/50, dropout\", \"DNN - 50/50/50/50, early stopping\"\n", + ")\n", + "tab1 <- xtable(table1, digits = c(0, 4, 4))\n", "tab1\n", "\n", "table2 <- matrix(0, 12, 2)\n", - "table2[,1] <- b.dml\n", - "table2[,2] <- s.dml\n", - "colnames(table2)<- c(\"Point Estimate\",\"Std. 
Error\")\n", - "rownames(table2)<- c(\"OLS - No Controls\", \"OLS - Basic\", \"OLS - All\",\n", - " \"Lasso (CV)\", \"Ridge (CV)\", \"Elastic Net (.5,CV)\",\n", - " \"Random Forest\",\"Boosted trees - depth 4\",\n", - " \"DNN - 50/50/50/50, dropout\", \"DNN - 50/50/50/50, early stopping\",\n", - " \"Best\",\"Least Squares Model Average\")\n", - "tab2<- xtable(table2, digits =c(0,4,4))\n", + "table2[, 1] <- b_dml\n", + "table2[, 2] <- s_dml\n", + "colnames(table2) <- c(\"Point Estimate\", \"Std. Error\")\n", + "rownames(table2) <- c(\n", + " \"OLS - No Controls\", \"OLS - Basic\", \"OLS - All\",\n", + " \"Lasso (CV)\", \"Ridge (CV)\", \"Elastic Net (.5,CV)\",\n", + " \"Random Forest\", \"Boosted trees - depth 4\",\n", + " \"DNN - 50/50/50/50, dropout\", \"DNN - 50/50/50/50, early stopping\",\n", + " \"Best\", \"Least Squares Model Average\"\n", + ")\n", + "tab2 <- xtable(table2, digits = c(0, 4, 4))\n", "tab2" ] }, @@ -781,12 +846,15 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "FjJjD8gRURmc" + "id": "FjJjD8gRURmc", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "print(xtable(table1,type=\"latex\"))\n", - "print(xtable(table2,type=\"latex\"))" + "print(xtable(table1, type = \"latex\"))\n", + "print(xtable(table2, type = \"latex\"))" ] } ], From de7bf9be7db80dfb22afb5146d0f6613a21ca77d Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 15 Jul 2024 23:44:37 -0700 Subject: [PATCH 095/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index ad802691..bd27a98f 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - directory: ['PM1', 'PM2', 'PM3'] #, 'PM4', 'PM5', 'CM1', 'CM2', 'CM3', 'AC1', 'AC2', 'T'] + directory: ['PM1', 'PM2', 'PM3', 'PM4'], 'PM5', 'CM1', 'CM2', 'CM3', 'AC1', 'AC2', 'T'] steps: - name: Checkout repository uses: actions/checkout@v2 @@ -62,7 +62,8 @@ jobs: R -e ' library(lintr) linters <- with_defaults(line_length_linter = line_length_linter(120), - object_name_linter = object_name_linter(styles = c("snake_case", "CamelCase", "camelCase"))) + object_name_linter = object_name_linter(styles = c("snake_case", "CamelCase", "camelCase")), + object_usage_linter = NULL) rmd_files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.Rmd$", full.names = TRUE) results <- lapply(rmd_files, function(file) { lints <- lint(file, linters) From 57dbf784235953097df2cbf7688e1585f2171fc9 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 10:40:29 +0300 Subject: [PATCH 096/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index bd27a98f..374057f9 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - directory: ['PM1', 'PM2', 'PM3', 'PM4'], 'PM5', 'CM1', 'CM2', 'CM3', 'AC1', 'AC2', 'T'] + directory: ['PM1', 'PM2', 'PM3', 'PM4'] #, 'PM5', 'CM1', 'CM2', 'CM3', 'AC1', 'AC2', 'T'] steps: - name: Checkout repository uses: actions/checkout@v2 From 3547a6cf38bb2f094eb6c2d5ca96658d114b588a Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 
11:05:31 +0300 Subject: [PATCH 097/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 374057f9..5475bb53 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -91,7 +91,7 @@ jobs: source(gitrfile) }, error = function(e) { - errors[[length(errors) + 1]] <<- list(gitrfile = gitrfile, message = e$message, traceback = traceback()) + errors[[length(errors) + 1]] <<- list(gitrfile = gitrfile, message = e$message, traceback = capture.output(traceback())) } ) } From eda58da961d93bea533ab70df57200bc14dce4fc Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 11:53:28 +0300 Subject: [PATCH 098/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 5475bb53..356bec48 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -91,7 +91,11 @@ jobs: source(gitrfile) }, error = function(e) { - errors[[length(errors) + 1]] <<- list(gitrfile = gitrfile, message = e$message, traceback = capture.output(traceback())) + traceback_info <- capture.output({ + cat("Traceback:\n") + traceback() + }) + errors[[length(errors) + 1]] <<- list(gitrfile = gitrfile, message = e$message, traceback = traceback_info) } ) } @@ -102,8 +106,7 @@ jobs: for (error in errors) { cat("Error found in file:", error$gitrfile, "\n") cat("Error message:", error$message, "\n") - cat("Traceback:\n") - print(error$traceback) + cat(paste(error$traceback, collapse = "\n")) } quit(status = 1, save = "no") # Exit with an error status if errors are found } From 8d407ea4e330557726913f188b5cf8f04d40dace Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 12:33:26 +0300 Subject: [PATCH 099/261] Update r_ml_wage_prediction.irnb --- PM3/r_ml_wage_prediction.irnb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PM3/r_ml_wage_prediction.irnb b/PM3/r_ml_wage_prediction.irnb index ee43c561..3b591fe5 100644 --- a/PM3/r_ml_wage_prediction.irnb +++ b/PM3/r_ml_wage_prediction.irnb @@ -369,8 +369,8 @@ "outputs": [], "source": [ "x_basic <- \"sex + exp1 + shs + hsg+ scl + clg + mw + so + we + C(occ2)+ C(ind2)\"\n", - "x_flex <- \"sex + exp1 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we \" +\n", - " \"+ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\"\n", + "x_flex <- paste(\"sex + exp1 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we \",\n", + " \"+ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\")\n", "formula_basic <- as.formula(paste(\"lwage\", \"~\", x_basic))\n", "formula_flex <- as.formula(paste(\"lwage\", \"~\", x_flex))\n", "\n", From ca0919aaf8307d60a0502aad47431a55d8e0e405 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 04:33:30 -0700 Subject: [PATCH 100/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 356bec48..2989e516 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -29,6 +29,11 @@ jobs: 
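Patches 097 and 098 above iterate on how to report a traceback when `source()` fails inside `tryCatch()`. As an editorial aside (none of the following is part of any commit in this series), `traceback()` called from inside a `tryCatch()` error handler usually has little to print, since it reports the last *uncaught* error; one way to capture the live call stack is to record `sys.calls()` from a calling handler. The helper name below is made up for illustration:

```r
# Sketch only: capture the call stack at the moment the error is signalled,
# before tryCatch() unwinds it.
source_with_trace <- function(gitrfile) {
  trace_lines <- character(0)
  tryCatch(
    withCallingHandlers(
      source(gitrfile),
      error = function(e) {
        # the stack is still live inside a calling handler
        trace_lines <<- capture.output(print(sys.calls()))
      }
    ),
    error = function(e) {
      list(gitrfile = gitrfile, message = conditionMessage(e), traceback = trace_lines)
    }
  )
}
```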
python -m pip install --upgrade pip pip install nbstripout tensorflow + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libcurl4-openssl-dev + - name: Set up R uses: r-lib/actions/setup-r@v2 From 30b9901c8efa20ebcf853bbe4bd5bca24ed52df2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jul 2024 11:47:49 +0000 Subject: [PATCH 101/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in PM3 --- ..._functional_approximation_by_nn_and_rf.Rmd | 128 +++--- PM3/r_ml_wage_prediction.Rmd | 392 +++++++++--------- 2 files changed, 274 insertions(+), 246 deletions(-) diff --git a/PM3/r_functional_approximation_by_nn_and_rf.Rmd b/PM3/r_functional_approximation_by_nn_and_rf.Rmd index 0f46a695..be988103 100644 --- a/PM3/r_functional_approximation_by_nn_and_rf.Rmd +++ b/PM3/r_functional_approximation_by_nn_and_rf.Rmd @@ -32,29 +32,29 @@ Specifics on the penalty can be found [here](https://cran.r-project.org/web/pack ```{r} set.seed(1) -X_train <- matrix(runif(1000),1000,1) -Y_train <- exp(4*X_train) #Noiseless case Y=g(X) -dim(X_train) +x_train <- matrix(runif(1000), 1000, 1) +y_train <- exp(4 * x_train) # Noiseless case Y=g(X) +dim(x_train) # shallow tree -TreeModel<- rpart(Y_train~X_train, cp=.01) #cp is penalty level -pred.TM<- predict(TreeModel, newx=X_train) -plot(X_train, Y_train, type="p", pch=19, xlab="z", ylab="g(z)") -points(X_train, pred.TM, col=3, pch=19) +TreeModel <- rpart(y_train ~ x_train, cp = .01) # cp is penalty level +pred_tm <- predict(TreeModel, newx = x_train) +plot(x_train, y_train, type = "p", pch = 19, xlab = "z", ylab = "g(z)") +points(x_train, pred_tm, col = 3, pch = 19) ``` ```{r} set.seed(1) -X_train <- matrix(runif(1000),1000,1) -Y_train <- exp(4*X_train) #Noiseless case Y=g(X) -dim(X_train) +x_train <- matrix(runif(1000), 1000, 1) +y_train <- exp(4 * x_train) # Noiseless case Y=g(X) +dim(x_train) -TreeModel<- rpart(Y_train~X_train, cp=.0005) #cp is penalty level -pred.TM<- predict(TreeModel, newx=X_train) -plot(X_train, Y_train, type="p", pch=19, xlab="z", ylab="g(z)") -points(X_train, pred.TM, col=3, pch=19) +TreeModel <- rpart(y_train ~ x_train, cp = .0005) # cp is penalty level +pred_tm <- predict(TreeModel, newx = x_train) +plot(x_train, y_train, type = "p", pch = 19, xlab = "z", ylab = "g(z)") +points(x_train, pred_tm, col = 3, pch = 19) ``` # Functional Approximation by RF @@ -66,50 +66,58 @@ $$ can be easily approximated by a tree-based method (Random Forest) and a neural network (2 Layered Neural Network) ```{r} -RFmodel<- randomForest(Y_train~X_train) -pred.RF<- predict(RFmodel, newdata=X_train) -plot(X_train, Y_train, type="p", pch=19, xlab="z", ylab="g(z)") -points(X_train, pred.RF, col=4, pch=19,) +RFmodel <- randomForest(y_train ~ x_train) +pred_rf <- predict(RFmodel, newdata = x_train) +plot(x_train, y_train, type = "p", pch = 19, xlab = "z", ylab = "g(z)") +points(x_train, pred_rf, col = 4, pch = 19) ``` # Boosted Trees ```{r} -data_train = as.data.frame(cbind(X_train, Y_train)) -BoostTreemodel<- gbm(Y_train~X_train, distribution= "gaussian", n.trees=100, shrinkage=.01, interaction.depth -=3) +data_train <- as.data.frame(cbind(x_train, y_train)) +BoostTreemodel <- gbm(y_train ~ x_train, + distribution = "gaussian", n.trees = 100, shrinkage = .01, + interaction.depth = 3 +) + # shrinkage is "learning rate" # n.trees is the number of boosting steps # interaction.depth is the max depth of each tree -pred.BT<- predict(BoostTreemodel, newdata=data_train, n.trees=100) 
-plot(X_train, Y_train, type="p", pch=19, xlab="z", ylab="g(z)") -points(X_train, pred.BT, col=4, pch=19,) +pred_bt <- predict(BoostTreemodel, newdata = data_train, n.trees = 100) +plot(x_train, y_train, type = "p", pch = 19, xlab = "z", ylab = "g(z)") +points(x_train, pred_bt, col = 4, pch = 19) ``` ```{r} -data_train = as.data.frame(cbind(X_train, Y_train)) -BoostTreemodel<- gbm(Y_train~X_train, distribution= "gaussian", n.trees=1000, shrinkage=.01, interaction.depth -=3) +data_train <- as.data.frame(cbind(x_train, y_train)) +BoostTreemodel <- gbm(y_train ~ x_train, + distribution = "gaussian", n.trees = 1000, shrinkage = .01, + interaction.depth = 3 +) # shrinkage is "learning rate" # n.trees is the number of boosting steps # interaction.depth is the max depth of each tree -pred.BT<- predict(BoostTreemodel, newdata=data_train, n.trees=1000) -plot(X_train, Y_train, type="p", pch=19, xlab="z", ylab="g(z)") -points(X_train, pred.BT, col=4, pch=19,) +pred_bt <- predict(BoostTreemodel, newdata = data_train, n.trees = 1000) +plot(x_train, y_train, type = "p", pch = 19, xlab = "z", ylab = "g(z)") +points(x_train, pred_bt, col = 4, pch = 19) ``` # Same Example with a Neural Network ```{r} build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 200, activation = "relu", - input_shape = 1)%>% - layer_dense(units = 20, activation = "relu") %>% - layer_dense(units = 1) - - model %>% compile( - optimizer = optimizer_adam(lr = 0.01), + + model <- keras::keras_model_sequential() %>% + keras::layer_dense( + units = 200, activation = "relu", + input_shape = 1 + ) %>% + keras::layer_dense(units = 20, activation = "relu") %>% + keras::layer_dense(units = 1) + + model %>% keras::compile( + optimizer = keras::optimizer_adam(lr = 0.01), loss = "mse", metrics = c("mae"), ) @@ -123,23 +131,22 @@ summary(model) ```{r} num_epochs <- 1 -model %>% fit(X_train, Y_train, - epochs = num_epochs, batch_size = 10, verbose = 0) -pred.NN <- model %>% predict(X_train) -plot(X_train, Y_train, type="p", pch=19, xlab="z", ylab="g(z)") -points(X_train, pred.NN, col=4, pch=19,) - +model %>% fit(x_train, y_train, + epochs = num_epochs, batch_size = 10, verbose = 0 +) +pred_nn <- model %>% predict(x_train) +plot(x_train, y_train, type = "p", pch = 19, xlab = "z", ylab = "g(z)") +points(x_train, pred_nn, col = 4, pch = 19, ) ``` ```{r} num_epochs <- 100 -model %>% fit(X_train, Y_train, - epochs = num_epochs, batch_size = 10, verbose = 0) -pred.NN <- model %>% predict(X_train) -plot(X_train, Y_train, type="p", pch=19, xlab="z", ylab="g(z)") -points(X_train, pred.NN, col=4, pch=19,) - - +model %>% fit(x_train, y_train, + epochs = num_epochs, batch_size = 10, verbose = 0 +) +pred_nn <- model %>% predict(x_train) +plot(x_train, y_train, type = "p", pch = 19, xlab = "z", ylab = "g(z)") +points(x_train, pred_nn, col = 4, pch = 19, ) ``` ### Using Early Stopping @@ -147,9 +154,9 @@ points(X_train, pred.NN, col=4, pch=19,) ```{r} # Define the neural network architecture model <- keras_model_sequential() %>% - layer_dense(units = 200, activation = 'relu', input_shape = 1) %>% - layer_dense(units = 20, activation = 'relu') %>% - layer_dense(units = 1) # Output layer with 1 unit for regression task + layer_dense(units = 200, activation = "relu", input_shape = 1) %>% + layer_dense(units = 20, activation = "relu") %>% + layer_dense(units = 1) # Output layer with 1 unit for regression task # Compile the model model %>% compile( @@ -165,20 +172,21 @@ summary(model) num_epochs <- 100 # Define early stopping based 
on validation set (20%) performance -early_stopping <- callback_early_stopping(monitor = 'val_loss', patience = 5) # Patience set to 5 epochs (default in skorch is 5) +# Patience set to 5 epochs (default in skorch is 5) +early_stopping <- callback_early_stopping(monitor = "val_loss", patience = 5) # Train the model model %>% fit( - X_train, Y_train, + x_train, y_train, epochs = num_epochs, batch_size = 10, - validation_split = 0.2, # 20% validation set + validation_split = 0.2, # 20% validation set verbose = 0, callbacks = list(early_stopping) ) -pred.NN <- model %>% predict(X_train) -plot(X_train, Y_train, type="p", pch=19, xlab="z", ylab="g(z)") -points(X_train, pred.NN, col=4, pch=19,) +pred_nn <- model %>% predict(x_train) +plot(x_train, y_train, type = "p", pch = 19, xlab = "z", ylab = "g(z)") +points(x_train, pred_nn, col = 4, pch = 19) ``` diff --git a/PM3/r_ml_wage_prediction.Rmd b/PM3/r_ml_wage_prediction.Rmd index 75065749..1cf232e3 100644 --- a/PM3/r_ml_wage_prediction.Rmd +++ b/PM3/r_ml_wage_prediction.Rmd @@ -38,7 +38,7 @@ Again, we consider data from the U.S. March Supplement of the Current Population The preproccessed sample consists of $5150$ never-married individuals. ```{r} -file = "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv" +file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv" data <- read.csv(file) dim(data) ``` @@ -46,14 +46,14 @@ dim(data) The outcomes $Y_i$'s are hourly (log) wages of never-married workers living in the U.S. The raw regressors $Z_i$'s consist of a variety of characteristics, including experience, education and industry and occupation indicators. ```{r} -Z <- subset(data,select=-c(lwage,wage)) # regressors +Z <- subset(data, select = -c(lwage, wage)) # regressors colnames(Z) ``` The following figure shows the weekly wage distribution from the US survey data. ```{r} -hist(data$wage, xlab= "hourly wage", main="Empirical wage distribution from the US survey data", breaks= 35) +hist(data$wage, xlab = "hourly wage", main = "Empirical wage distribution from the US survey data", breaks = 35) ``` Wages show a high degree of skewness. Hence, wages are transformed in almost all studies by @@ -80,31 +80,32 @@ To evaluate the out-of-sample performance, we split the data first. 
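(Editorial aside, not part of the committed notebook: the chunks below repeatedly compute the out-of-sample MSE, its standard error, and the R² through `summary(lm((y_test - yhat)^2 ~ 1))$coef[1:2]`. Regressing the squared errors on a constant returns their mean and the standard error of that mean, so the same quantities can be wrapped in a small helper; the function name here is illustrative only.)

```r
# Sketch: out-of-sample MSE, its standard error, and R^2 for a vector of predictions.
oos_metrics <- function(y_test, yhat) {
  sq_err <- (y_test - yhat)^2
  mse <- mean(sq_err)                          # coefficient of lm(sq_err ~ 1)
  se_mse <- sd(sq_err) / sqrt(length(sq_err))  # its standard error
  r2 <- 1 - mse / var(y_test)                  # out-of-sample R^2
  c(MSE = mse, SE = se_mse, R2 = r2)
}
```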
```{r} set.seed(1234) -training <- sample(nrow(data), nrow(data)*(3/4), replace=FALSE) +training <- sample(nrow(data), nrow(data) * (3 / 4), replace = FALSE) -data_train <- data[training,] -data_test <- data[-training,] +data_train <- data[training, ] +data_test <- data[-training, ] ``` We construct the two different model matrices $X_{basic}$ and $X_{flex}$ for both the training and the test sample: ```{r} -X_basic <- "sex + exp1 + shs + hsg+ scl + clg + mw + so + we + C(occ2)+ C(ind2)" -X_flex <- "sex + exp1 + shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+C(occ2)+C(ind2)+mw+so+we)" -formula_basic <- as.formula(paste("lwage", "~", X_basic)) -formula_flex <- as.formula(paste("lwage", "~", X_flex)) +x_basic <- "sex + exp1 + shs + hsg+ scl + clg + mw + so + we + C(occ2)+ C(ind2)" +x_flex <- paste("sex + exp1 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we ", + "+ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)") +formula_basic <- as.formula(paste("lwage", "~", x_basic)) +formula_flex <- as.formula(paste("lwage", "~", x_flex)) -model_X_basic_train <- model.matrix(formula_basic,data_train) -model_X_basic_test <- model.matrix(formula_basic,data_test) -p_basic <- dim(model_X_basic_train)[2] -model_X_flex_train <- model.matrix(formula_flex,data_train) -model_X_flex_test <- model.matrix(formula_flex,data_test) -p_flex <- dim(model_X_flex_train)[2] +model_x_basic_train <- model.matrix(formula_basic, data_train) +model_x_basic_test <- model.matrix(formula_basic, data_test) +p_basic <- dim(model_x_basic_train)[2] +model_x_flex_train <- model.matrix(formula_flex, data_train) +model_x_flex_test <- model.matrix(formula_flex, data_test) +p_flex <- dim(model_x_flex_train)[2] ``` ```{r} -Y_train <- data_train$lwage -Y_test <- data_test$lwage +y_train <- data_train$lwage +y_test <- data_test$lwage ``` ```{r} @@ -120,40 +121,42 @@ We fit the basic model to our training data by running an ols regression and com ```{r} # ols (basic model) -fit.lm.basic <- lm(formula_basic, data_train) +fit_lm_basic <- lm(formula_basic, data_train) ``` ```{r} # Compute the Out-Of-Sample Performance -yhat.lm.basic <- predict(fit.lm.basic, newdata=data_test) -cat("The mean squared error (MSE) using the basic model is equal to" , mean((Y_test-yhat.lm.basic)^2)) # MSE OLS (basic model) +yhat_lm_basic <- predict(fit_lm_basic, newdata = data_test) +# MSE OLS (basic model) +cat("The mean squared error (MSE) using the basic model is equal to", mean((y_test - yhat_lm_basic)^2)) ``` To determine the out-of-sample $MSE$ and the standard error in one step, we can use the function *lm*: ```{r} -MSE.lm.basic <- summary(lm((Y_test-yhat.lm.basic)^2~1))$coef[1:2] -MSE.lm.basic +mse_lm_basic <- summary(lm((y_test - yhat_lm_basic)^2 ~ 1))$coef[1:2] +mse_lm_basic ``` We also compute the out-of-sample $R^2$: ```{r} -R2.lm.basic <- 1-MSE.lm.basic[1]/var(Y_test) -cat("The R^2 using the basic model is equal to",R2.lm.basic) # MSE OLS (basic model) +r2_lm_basic <- 1 - mse_lm_basic[1] / var(y_test) +# MSE OLS (basic model) +cat("The R^2 using the basic model is equal to", r2_lm_basic) ``` We repeat the same procedure for the flexible model. 
```{r} # ols (flexible model) -fit.lm.flex <- lm(formula_flex, data_train) +fit_lm_flex <- lm(formula_flex, data_train) # Compute the Out-Of-Sample Performance -options(warn=-1) -yhat.lm.flex <- predict(fit.lm.flex, newdata=data_test) -MSE.lm.flex <- summary(lm((Y_test-yhat.lm.flex)^2~1))$coef[1:2] -R2.lm.flex <- 1-MSE.lm.flex[1]/var(Y_test) -cat("The R^2 using the flexible model is equal to",R2.lm.flex) # MSE OLS (flexible model) +options(warn = -1) +yhat_lm_flex <- predict(fit_lm_flex, newdata = data_test) +mse_lm_flex <- summary(lm((y_test - yhat_lm_flex)^2 ~ 1))$coef[1:2] +r2_lm_flex <- 1 - mse_lm_flex[1] / var(y_test) +cat("The R^2 using the flexible model is equal to", r2_lm_flex) # MSE OLS (flexible model) ``` We observe that ols regression works better for the basic model with smaller $p/n$ ratio. We now proceed by running lasso regressions and related penalized methods. @@ -164,79 +167,84 @@ Considering the basic model, we run a lasso/post-lasso regression first and then ```{r} # lasso and variants -fit.rlasso <- rlasso(formula_basic, data_train, post=FALSE) -fit.rlasso.post <- rlasso(formula_basic, data_train, post=TRUE) -yhat.rlasso <- predict(fit.rlasso, newdata=data_test) -yhat.rlasso.post <- predict(fit.rlasso.post, newdata=data_test) +fit_rlasso <- hdm::rlasso(formula_basic, data_train, post = FALSE) +fit_rlasso_post <- hdm::rlasso(formula_basic, data_train, post = TRUE) +yhat_rlasso <- predict(fit_rlasso, newdata = data_test) +yhat_rlasso_post <- predict(fit_rlasso_post, newdata = data_test) -MSE.lasso <- summary(lm((Y_test-yhat.rlasso)^2~1))$coef[1:2] -MSE.lasso.post <- summary(lm((Y_test-yhat.rlasso.post)^2~1))$coef[1:2] +mse_lasso <- summary(lm((y_test - yhat_rlasso)^2 ~ 1))$coef[1:2] +mse_lasso_post <- summary(lm((y_test - yhat_rlasso_post)^2 ~ 1))$coef[1:2] -R2.lasso <- 1-MSE.lasso[1]/var(Y_test) -R2.lasso.post <- 1-MSE.lasso.post[1]/var(Y_test) -cat("The R^2 using the basic model is equal to",R2.lasso,"for lasso and",R2.lasso.post,"for post-lasso") # R^2 lasso/post-lasso (basic model) +r2_lasso <- 1 - mse_lasso[1] / var(y_test) +r2_lasso_post <- 1 - mse_lasso_post[1] / var(y_test) +# R^2 lasso/post-lasso (basic model) +cat("The R^2 using the basic model is equal to", r2_lasso, "for lasso and", r2_lasso_post, "for post-lasso") ``` Now, we repeat the same procedure for the flexible model. 
```{r} -fit.rlasso.flex <- rlasso(formula_flex, data_train, post=FALSE) -fit.rlasso.post.flex <- rlasso(formula_flex, data_train, post=TRUE) -yhat.rlasso.flex <- predict(fit.rlasso.flex, newdata=data_test) -yhat.rlasso.post.flex <- predict(fit.rlasso.post.flex, newdata=data_test) +fit_rlasso_flex <- hdm::rlasso(formula_flex, data_train, post = FALSE) +fit_rlasso_post_flex <- hdm::rlasso(formula_flex, data_train, post = TRUE) +yhat_rlasso_flex <- predict(fit_rlasso_flex, newdata = data_test) +yhat_rlasso_post_flex <- predict(fit_rlasso_post_flex, newdata = data_test) -MSE.lasso.flex <- summary(lm((Y_test-yhat.rlasso.flex)^2~1))$coef[1:2] -MSE.lasso.post.flex <- summary(lm((Y_test-yhat.rlasso.post.flex)^2~1))$coef[1:2] +mse_lasso_flex <- summary(lm((y_test - yhat_rlasso_flex)^2 ~ 1))$coef[1:2] +mse_lasso_post_flex <- summary(lm((y_test - yhat_rlasso_post_flex)^2 ~ 1))$coef[1:2] -R2.lasso.flex <- 1-MSE.lasso.flex[1]/var(Y_test) -R2.lasso.post.flex <- 1-MSE.lasso.post.flex[1]/var(Y_test) -cat("The R^2 using the flexible model is equal to",R2.lasso.flex,"for lasso and",R2.lasso.post.flex,"for post-lasso") # R^2 lasso/post-lasso (flexible model) +# R^2 lasso/post-lasso (flexible model) +r2_lasso_flex <- 1 - mse_lasso_flex[1] / var(y_test) +r2_lasso_post_flex <- 1 - mse_lasso_post_flex[1] / var(y_test) +cat("The R^2 using the flexible model is equal to", r2_lasso_flex, + "for lasso and", r2_lasso_post_flex, "for post-lasso") ``` In contrast to a theoretical based choice of the tuning parameter $\lambda$ in the lasso regression, we can also use cross-validation to determine the penalty level by applying the package *glmnet* and the function cv.glmnet. In this context, we also run a ridge and a elastic net regression by adjusting the parameter *alpha*. ```{r} -fit.lasso.cv <- cv.glmnet(model_X_basic_train, Y_train, family="gaussian", alpha=1) -fit.ridge <- cv.glmnet(model_X_basic_train, Y_train, family="gaussian", alpha=0) -fit.elnet <- cv.glmnet(model_X_basic_train, Y_train, family="gaussian", alpha=.5) +fit_lasso_cv <- cv.glmnet(model_x_basic_train, y_train, family = "gaussian", alpha = 1) +fit_ridge <- cv.glmnet(model_x_basic_train, y_train, family = "gaussian", alpha = 0) +fit_elnet <- cv.glmnet(model_x_basic_train, y_train, family = "gaussian", alpha = .5) -yhat.lasso.cv <- predict(fit.lasso.cv, newx = model_X_basic_test) -yhat.ridge <- predict(fit.ridge, newx = model_X_basic_test) -yhat.elnet <- predict(fit.elnet, newx = model_X_basic_test) +yhat_lasso_cv <- predict(fit_lasso_cv, newx = model_x_basic_test) +yhat_ridge <- predict(fit_ridge, newx = model_x_basic_test) +yhat_elnet <- predict(fit_elnet, newx = model_x_basic_test) -MSE.lasso.cv <- summary(lm((Y_test-yhat.lasso.cv)^2~1))$coef[1:2] -MSE.ridge <- summary(lm((Y_test-yhat.ridge)^2~1))$coef[1:2] -MSE.elnet <- summary(lm((Y_test-yhat.elnet)^2~1))$coef[1:2] +mse_lasso_cv <- summary(lm((y_test - yhat_lasso_cv)^2 ~ 1))$coef[1:2] +mse_ridge <- summary(lm((y_test - yhat_ridge)^2 ~ 1))$coef[1:2] +mse_elnet <- summary(lm((y_test - yhat_elnet)^2 ~ 1))$coef[1:2] -R2.lasso.cv <- 1-MSE.lasso.cv[1]/var(Y_test) -R2.ridge <- 1-MSE.ridge[1]/var(Y_test) -R2.elnet <- 1-MSE.elnet[1]/var(Y_test) +r2_lasso_cv <- 1 - mse_lasso_cv[1] / var(y_test) +r2_ridge <- 1 - mse_ridge[1] / var(y_test) +r2_elnet <- 1 - mse_elnet[1] / var(y_test) # R^2 using cross-validation (basic model) -cat("R^2 using cross-validation for lasso, ridge and elastic net in the basic model:",R2.lasso.cv,R2.ridge,R2.elnet) +cat("R^2 using cross-validation for lasso, ridge and elastic net in 
the basic model:", + r2_lasso_cv, r2_ridge, r2_elnet) ``` Note that the following calculations for the flexible model need some computation time. ```{r} -fit.lasso.cv.flex <- cv.glmnet(model_X_flex_train, Y_train, family="gaussian", alpha=1) -fit.ridge.flex <- cv.glmnet(model_X_flex_train, Y_train, family="gaussian", alpha=0) -fit.elnet.flex <- cv.glmnet(model_X_flex_train, Y_train, family="gaussian", alpha=.5) +fit_lasso_cv_flex <- cv.glmnet(model_x_flex_train, y_train, family = "gaussian", alpha = 1) +fit_ridge_flex <- cv.glmnet(model_x_flex_train, y_train, family = "gaussian", alpha = 0) +fit_elnet_flex <- cv.glmnet(model_x_flex_train, y_train, family = "gaussian", alpha = .5) -yhat.lasso.cv.flex <- predict(fit.lasso.cv.flex , newx = model_X_flex_test) -yhat.ridge.flex <- predict(fit.ridge.flex , newx = model_X_flex_test) -yhat.elnet.flex <- predict(fit.elnet.flex , newx = model_X_flex_test) +yhat_lasso_cv_flex <- predict(fit_lasso_cv_flex, newx = model_x_flex_test) +yhat_ridge_flex <- predict(fit_ridge_flex, newx = model_x_flex_test) +yhat_elnet_flex <- predict(fit_elnet_flex, newx = model_x_flex_test) -MSE.lasso.cv.flex <- summary(lm((Y_test-yhat.lasso.cv.flex )^2~1))$coef[1:2] -MSE.ridge.flex <- summary(lm((Y_test-yhat.ridge.flex )^2~1))$coef[1:2] -MSE.elnet.flex <- summary(lm((Y_test-yhat.elnet.flex )^2~1))$coef[1:2] +mse_lasso_cv_flex <- summary(lm((y_test - yhat_lasso_cv_flex)^2 ~ 1))$coef[1:2] +mse_ridge_flex <- summary(lm((y_test - yhat_ridge_flex)^2 ~ 1))$coef[1:2] +mse_elnet_flex <- summary(lm((y_test - yhat_elnet_flex)^2 ~ 1))$coef[1:2] -R2.lasso.cv.flex <- 1-MSE.lasso.cv.flex [1]/var(Y_test) -R2.ridge.flex <- 1-MSE.ridge.flex [1]/var(Y_test) -R2.elnet.flex <- 1-MSE.elnet.flex [1]/var(Y_test) +r2_lasso_cv_flex <- 1 - mse_lasso_cv_flex[1] / var(y_test) +r2_ridge_flex <- 1 - mse_ridge_flex[1] / var(y_test) +r2_elnet_flex <- 1 - mse_elnet_flex[1] / var(y_test) # R^2 using cross-validation (flexible model) -cat("R^2 using cross-validation for lasso, ridge and elastic net in the flexible model:",R2.lasso.cv.flex,R2.ridge.flex,R2.elnet.flex) +cat("R^2 using cross-validation for lasso, ridge and elastic net in the flexible model:", + r2_lasso_cv_flex, r2_ridge_flex, r2_elnet_flex) ``` The performance of the lasso regression with cross-validated penalty is quite similar to the performance of lasso using a theoretical based choice of the tuning parameter. @@ -251,34 +259,36 @@ We fit a regression tree to the training data using the basic model. The variabl ```{r} # tree -fit.trees <- rpart(formula_basic, data_train, minbucket=5, cp = 0.001) -prp(fit.trees, leaf.round=1, space=2, yspace=2, split.space=2,shadow.col = "gray",trace = 1) # plotting the tree +fit_trees <- rpart(formula_basic, data_train, minbucket = 5, cp = 0.001) +# plotting the tree +prp(fit_trees, leaf.round = 1, space = 2, yspace = 2, split.space = 2, shadow.col = "gray", trace = 1) ``` An important method to improve predictive performance is called "Pruning the Tree". This means the process of cutting down the branches of a tree. We apply pruning to the complex tree above to reduce the depth. Initially, we determine the optimal complexity of the regression tree. ```{r} -bestcp <- fit.trees$cptable[which.min(fit.trees$cptable[,"xerror"]),"CP"] +bestcp <- fit_trees$cptable[which.min(fit_trees$cptable[, "xerror"]), "CP"] bestcp ``` Now, we can prune the tree and visualize the prediction rule. 
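(Editorial aside, not part of the committed notebook: the pruning step above picks the `cp` value that minimizes the cross-validated error in `cptable`. A common, slightly more conservative alternative is the one-standard-error rule, sketched below against the notebook's `fit_trees` object; `cp_1se` and `fit_pruned_1se` are illustrative names only.)

```r
# Sketch: one-standard-error rule for choosing cp from rpart's cptable.
cp_tab <- fit_trees$cptable
min_row <- which.min(cp_tab[, "xerror"])
threshold <- cp_tab[min_row, "xerror"] + cp_tab[min_row, "xstd"]
# cptable rows run from simplest to most complex, so take the first tree whose
# cross-validated error is within one standard error of the minimum
cp_1se <- cp_tab[which(cp_tab[, "xerror"] <= threshold)[1], "CP"]
fit_pruned_1se <- prune(fit_trees, cp = cp_1se)
```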
```{r} -fit.prunedtree <- prune(fit.trees,cp=bestcp) -prp(fit.prunedtree,leaf.round=1, space=3, yspace=3, split.space=7, shadow.col = "gray",trace = 1,yesno=1) +fit_prunedtree <- prune(fit_trees, cp = bestcp) +prp(fit_prunedtree, leaf.round = 1, space = 3, yspace = 3, split.space = 7, + shadow.col = "gray", trace = 1, yesno = 1) ``` Finally, we calculate the mean-squared error and the $R^2$ on the test sample to evaluate the out-of-sample performance of the pruned tree. ```{r} -yhat.pt <- predict(fit.prunedtree,newdata=data_test) -MSE.pt <- summary(lm((Y_test-yhat.pt)^2~1))$coef[1:2] -R2.pt <- 1-MSE.pt[1]/var(Y_test) +yhat_pt <- predict(fit_prunedtree, newdata = data_test) +mse_pt <- summary(lm((y_test - yhat_pt)^2 ~ 1))$coef[1:2] +r2_pt <- 1 - mse_pt[1] / var(y_test) # R^2 of the pruned tree -cat("R^2 of the pruned tree:",R2.pt) +cat("R^2 of the pruned tree:", r2_pt) ``` ## Random Forest and Boosted Trees @@ -287,30 +297,31 @@ In the next step, we apply the more advanced tree-based methods random forest an ```{r} # random forest -fit.rf <- randomForest(model_X_basic_train, Y_train, ntree=2000, nodesize=20, data = data_train) +fit_rf <- randomForest(model_x_basic_train, y_train, ntree = 2000, nodesize = 20, data = data_train) ## Evaluating the method -yhat.rf <- predict(fit.rf, newdata=model_X_basic_test) # prediction +yhat_rf <- predict(fit_rf, newdata = model_x_basic_test) # prediction -MSE.rf = summary(lm((Y_test-yhat.rf)^2~1))$coef[1:2] -R2.rf <- 1-MSE.rf[1]/var(Y_test) +mse_rf <- summary(lm((y_test - yhat_rf)^2 ~ 1))$coef[1:2] +r2_rf <- 1 - mse_rf[1] / var(y_test) ``` ```{r} # boosting -fit.boost <- gbm(formula_basic, data=data_train, distribution= "gaussian", bag.fraction = .5, interaction.depth=2, n.trees=1000, shrinkage=.01) -best.boost <- gbm.perf(fit.boost, plot.it = FALSE) # cross-validation to determine when to stop +fit_boost <- gbm(formula_basic, data = data_train, distribution = "gaussian", bag.fraction = .5, + interaction.depth = 2, n.trees = 1000, shrinkage = .01) +best_boost <- gbm.perf(fit_boost, plot.it = FALSE) # cross-validation to determine when to stop ## Evaluating the method -yhat.boost <- predict(fit.boost, newdata=data_test, n.trees=best.boost) +yhat_boost <- predict(fit_boost, newdata = data_test, n.trees = best_boost) -MSE.boost = summary(lm((Y_test-yhat.boost)^2~1))$coef[1:2] -R2.boost <- 1-MSE.boost[1]/var(Y_test) +mse_boost <- summary(lm((y_test - yhat_boost)^2 ~ 1))$coef[1:2] +r2_boost <- 1 - mse_boost[1] / var(y_test) ``` ```{r} # printing R^2 -cat("R^2 of the random forest and boosted trees:", R2.rf, R2.boost) +cat("R^2 of the random forest and boosted trees:", r2_rf, r2_boost) ``` ## NNets @@ -320,9 +331,9 @@ First, we need to determine the structure of our network. 
We are using the R pac ```{r} # Define the neural network architecture model <- keras_model_sequential() %>% - layer_dense(units = 50, activation = 'relu', input_shape = dim(model_X_basic_train)[2]) %>% - layer_dense(units = 50, activation = 'relu') %>% - layer_dense(units = 1) # Output layer with 1 unit for regression task + layer_dense(units = 50, activation = "relu", input_shape = dim(model_x_basic_train)[2]) %>% + layer_dense(units = 50, activation = "relu") %>% + layer_dense(units = 1) # Output layer with 1 unit for regression task # Compile the model model %>% compile( @@ -338,14 +349,15 @@ summary(model) num_epochs <- 100 # Define early stopping based on validation set (20%) performance -early_stopping <- callback_early_stopping(monitor = 'val_loss', patience = 5) # Patience set to 5 epochs (default in skorch is 5) +# Patience set to 5 epochs (default in skorch is 5) +early_stopping <- callback_early_stopping(monitor = "val_loss", patience = 5) # Train the model model %>% fit( - model_X_basic_train, Y_train, + model_x_basic_train, y_train, epochs = num_epochs, batch_size = 10, - validation_split = 0.2, # 20% validation set + validation_split = 0.2, # 20% validation set verbose = 0, callbacks = list(early_stopping) ) @@ -353,16 +365,16 @@ model %>% fit( ```{r} # evaluating the performance -model %>% evaluate(model_X_basic_test, Y_test, verbose = 0) +model %>% evaluate(model_x_basic_test, y_test, verbose = 0) ``` ```{r} # Calculating the performance measures -yhat.nn <- model %>% predict(model_X_basic_test) -MSE.nn = summary(lm((Y_test-yhat.nn)^2~1))$coef[1:2] -R2.nn <- 1-MSE.nn[1]/var(Y_test) +yhat_nn <- model %>% predict(model_x_basic_test) +mse_nn <- summary(lm((y_test - yhat_nn)^2 ~ 1))$coef[1:2] +r2_nn <- 1 - mse_nn[1] / var(y_test) # printing R^2 -cat("R^2 of the neural network:",R2.nn) +cat("R^2 of the neural network:", r2_nn) ``` To conclude, let us have a look at our results. @@ -370,52 +382,53 @@ To conclude, let us have a look at our results. ## Results ```{r} -table<- matrix(0, 16, 3) -table[1,1:2] <- MSE.lm.basic -table[2,1:2] <- MSE.lm.flex -table[3,1:2] <- MSE.lasso -table[4,1:2] <- MSE.lasso.post -table[5,1:2] <- MSE.lasso.flex -table[6,1:2] <- MSE.lasso.post.flex -table[7,1:2] <- MSE.lasso.cv -table[8,1:2] <- MSE.ridge -table[9,1:2] <- MSE.elnet -table[10,1:2] <- MSE.lasso.cv.flex -table[11,1:2] <- MSE.ridge.flex -table[12,1:2] <- MSE.elnet.flex -table[13,1:2] <- MSE.rf -table[14,1:2] <- MSE.boost -table[15,1:2] <- MSE.pt -table[16,1:2] <- MSE.nn - - - -table[1,3] <- R2.lm.basic -table[2,3] <- R2.lm.flex -table[3,3] <- R2.lasso -table[4,3] <- R2.lasso.post -table[5,3] <- R2.lasso.flex -table[6,3] <- R2.lasso.post.flex -table[7,3] <- R2.lasso.cv -table[8,3] <- R2.ridge -table[9,3] <- R2.elnet -table[10,3] <- R2.lasso.cv.flex -table[11,3] <- R2.ridge.flex -table[12,3] <- R2.elnet.flex -table[13,3] <- R2.rf -table[14,3] <- R2.boost -table[15,3] <- R2.pt -table[16,3] <- R2.nn - - - - -colnames(table)<- c("MSE", "S.E. 
for MSE", "R-squared") -rownames(table)<- c("Least Squares (basic)","Least Squares (flexible)", "Lasso", "Post-Lasso","Lasso (flexible)","Post-Lasso (flexible)", - "Cross-Validated lasso", "Cross-Validated ridge","Cross-Validated elnet","Cross-Validated lasso (flexible)","Cross-Validated ridge (flexible)","Cross-Validated elnet (flexible)", - "Random Forest","Boosted Trees", "Pruned Tree", "Neural Net (Early)") -tab <- xtable(table, digits =3) -print(tab,type="latex") # set type="latex" for printing table in LaTeX +table <- matrix(0, 16, 3) +table[1, 1:2] <- mse_lm_basic +table[2, 1:2] <- mse_lm_flex +table[3, 1:2] <- mse_lasso +table[4, 1:2] <- mse_lasso_post +table[5, 1:2] <- mse_lasso_flex +table[6, 1:2] <- mse_lasso_post_flex +table[7, 1:2] <- mse_lasso_cv +table[8, 1:2] <- mse_ridge +table[9, 1:2] <- mse_elnet +table[10, 1:2] <- mse_lasso_cv_flex +table[11, 1:2] <- mse_ridge_flex +table[12, 1:2] <- mse_elnet_flex +table[13, 1:2] <- mse_rf +table[14, 1:2] <- mse_boost +table[15, 1:2] <- mse_pt +table[16, 1:2] <- mse_nn + + +table[1, 3] <- r2_lm_basic +table[2, 3] <- r2_lm_flex +table[3, 3] <- r2_lasso +table[4, 3] <- r2_lasso_post +table[5, 3] <- r2_lasso_flex +table[6, 3] <- r2_lasso_post_flex +table[7, 3] <- r2_lasso_cv +table[8, 3] <- r2_ridge +table[9, 3] <- r2_elnet +table[10, 3] <- r2_lasso_cv_flex +table[11, 3] <- r2_ridge_flex +table[12, 3] <- r2_elnet_flex +table[13, 3] <- r2_rf +table[14, 3] <- r2_boost +table[15, 3] <- r2_pt +table[16, 3] <- r2_nn + + +colnames(table) <- c("MSE", "S.E. for MSE", "R-squared") +rownames(table) <- c( + "Least Squares (basic)", "Least Squares (flexible)", "Lasso", "Post-Lasso", + "Lasso (flexible)", "Post-Lasso (flexible)", + "Cross-Validated lasso", "Cross-Validated ridge", "Cross-Validated elnet", + "Cross-Validated lasso (flexible)", "Cross-Validated ridge (flexible)", "Cross-Validated elnet (flexible)", + "Random Forest", "Boosted Trees", "Pruned Tree", "Neural Net (Early)" +) +tab <- xtable(table, digits = 3) +print(tab, type = "latex") # set type="latex" for printing table in LaTeX tab ``` @@ -431,34 +444,40 @@ where the $f_k$'s denote our prediction rules from the table above and the $\alp We first estimate the weights without penalization. ```{r} -ensemble.ols <- summary(lm(Y_test~ yhat.lm.basic + yhat.lm.flex + yhat.rlasso + yhat.rlasso.flex + yhat.rlasso.post + yhat.rlasso.post.flex + yhat.lasso.cv + yhat.lasso.cv.flex + yhat.ridge + yhat.ridge.flex + yhat.elnet + yhat.elnet.flex + yhat.pt + yhat.rf + yhat.boost + yhat.nn)) -ensemble.ols +ensemble_ols <- summary(lm(y_test ~ yhat_lm_basic + yhat_lm_flex + yhat_rlasso + yhat_rlasso_flex + + yhat_rlasso_post + yhat_rlasso_post_flex + yhat_lasso_cv + yhat_lasso_cv_flex + + yhat_ridge + yhat_ridge_flex + yhat_elnet + yhat_elnet_flex + + yhat_pt + yhat_rf + yhat_boost + yhat_nn)) +ensemble_ols ``` Alternatively, we can determine the weights via lasso regression. 
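(Editorial aside restating what the surrounding chunks estimate; the display itself is not in the committed notebook.) With $f_1,\dots,f_K$ the individual prediction rules from the table above, the OLS and lasso stacking weights computed on the test sample solve

$$
\hat{\alpha}^{ols} = \arg\min_{\alpha_0,\alpha}\ \sum_{i}\Big(y_i - \alpha_0 - \sum_{k=1}^{K}\alpha_k f_k(x_i)\Big)^2,
\qquad
\hat{\alpha}^{lasso} = \arg\min_{\alpha_0,\alpha}\ \sum_{i}\Big(y_i - \alpha_0 - \sum_{k=1}^{K}\alpha_k f_k(x_i)\Big)^2 + \lambda \sum_{k=1}^{K}|\alpha_k|,
$$

where $\lambda$ is the data-driven penalty used by `hdm::rlasso`.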
```{r} -ensemble.lasso <- summary(rlasso(Y_test~ yhat.lm.basic + yhat.lm.flex + yhat.rlasso + + yhat.rlasso.flex + yhat.rlasso.post + yhat.rlasso.post.flex + yhat.lasso.cv + yhat.lasso.cv.flex + yhat.ridge + yhat.ridge.flex + yhat.elnet + yhat.elnet.flex + yhat.pt + yhat.rf + yhat.boost + yhat.nn)) -ensemble.lasso +ensemble_lasso <- summary(hdm::rlasso(y_test ~ yhat_lm_basic + yhat_lm_flex + yhat_rlasso + yhat_rlasso_flex + + yhat_rlasso_post + yhat_rlasso_post_flex + yhat_lasso_cv + yhat_lasso_cv_flex + + yhat_ridge + yhat_ridge_flex + yhat_elnet + yhat_elnet_flex + + yhat_pt + yhat_rf + yhat_boost + yhat_nn)) +ensemble_lasso ``` The estimated weights are shown in the following table. ```{r} -table<- matrix(0, 17, 2) -table[1:17,1] <- ensemble.ols$coef[1:17] -table[1:17,2] <- ensemble.lasso$coef[1:17] - - -colnames(table)<- c("Weight OLS", "Weight Lasso") +table <- matrix(0, 17, 2) +table[1:17, 1] <- ensemble_ols$coef[1:17] +table[1:17, 2] <- ensemble_lasso$coef[1:17] +colnames(table) <- c("Weight OLS", "Weight Lasso") -rownames(table)<- c("Constant","Least Squares (basic)", "Least Squares (flexible)", "Lasso (basic)", - "Lasso (flexible)", "Post-Lasso (basic)", "Post-Lasso (flexible)", "LassoCV (basic)", - "Lasso CV (flexible)", "Ridge CV (basic)", "Ridge CV (flexible)", "ElNet CV (basic)", - "ElNet CV (flexible)", "Pruned Tree", "Random Forest","Boosted Trees", "Neural Net") -tab <- xtable(table, digits =3) -print(tab,type="latex") # set type="latex" for printing table in LaTeX +rownames(table) <- c( + "Constant", "Least Squares (basic)", "Least Squares (flexible)", "Lasso (basic)", + "Lasso (flexible)", "Post-Lasso (basic)", "Post-Lasso (flexible)", "LassoCV (basic)", + "Lasso CV (flexible)", "Ridge CV (basic)", "Ridge CV (flexible)", "ElNet CV (basic)", + "ElNet CV (flexible)", "Pruned Tree", "Random Forest", "Boosted Trees", "Neural Net" +) +tab <- xtable(table, digits = 3) +print(tab, type = "latex") # set type="latex" for printing table in LaTeX tab ``` @@ -466,8 +485,8 @@ We note the superior $R^2$ performance of the ensembles. Though for more unbiase ```{r} # print ensemble R^2 -cat("R^2 of stacking with LS weights:",ensemble.ols$adj.r.squared,"\n") -cat("R^2 of stacking with Lasso weights:",ensemble.lasso$adj.r.squared,"\n") +cat("R^2 of stacking with LS weights:", ensemble_ols$adj.r.squared, "\n") +cat("R^2 of stacking with Lasso weights:", ensemble_lasso$adj.r.squared, "\n") ``` # Automatic Machine Learning with H20 AutoML @@ -489,27 +508,28 @@ h2o.init() ```{r} # convert data as h2o type -train_h = as.h2o(data_train) -test_h = as.h2o(data_test) +train_h <- as.h2o(data_train) +test_h <- as.h2o(data_test) # have a look at the data h2o.describe(train_h) ``` ```{r} -y_name = 'lwage' -X_names = setdiff(names(data), c('lwage','wage','occ', 'ind')) +y_name <- "lwage" +x_names <- setdiff(names(data), c("lwage", "wage", "occ", "ind")) # run AutoML for 10 base models and a maximal runtime of 100 seconds -aml = h2o.automl(x=X_names, y=y_name, - training_frame = train_h, - leaderboard_frame = test_h, - max_models = 10, - seed = 1, - max_runtime_secs = 100 - ) +aml <- h2o.automl( + x = x_names, y = y_name, + training_frame = train_h, + leaderboard_frame = test_h, + max_models = 10, + seed = 1, + max_runtime_secs = 100 +) # AutoML Leaderboard -lb = aml@leaderboard +lb <- aml@leaderboard print(lb, n = nrow(lb)) ``` @@ -528,7 +548,7 @@ aml@leader This is in line with our previous results. To understand how the ensemble works, let's take a peek inside the Stacked Ensemble "All Models" model. 
The "All Models" ensemble is an ensemble of all of the individual models in the AutoML run. This is often the top performing model on the leaderboard. ```{r} -model_ids <- as.data.frame(aml@leaderboard$model_id)[,1] +model_ids <- as.data.frame(aml@leaderboard$model_id)[, 1] # Get the "All Models" Stacked Ensemble model se <- h2o.getModel(grep("StackedEnsemble_AllModels", model_ids, value = TRUE)[1]) # Get the Stacked Ensemble metalearner model @@ -547,19 +567,19 @@ h2o.varimp_plot(metalearner) We can also generate predictions on a test sample using the leader model object. ```{r} -pred <- as.matrix(h2o.predict(aml@leader,test_h)) # make prediction using x data from the test sample +pred <- as.matrix(h2o.predict(aml@leader, test_h)) # make prediction using x data from the test sample head(pred) ``` ```{r} y_test <- as.matrix(test_h$lwage) -R2_test <- 1-summary(lm((y_test-pred)^2~1))$coef[1]/var(y_test) -cat("MSE, SE, R^2:" , summary(lm((y_test-pred)^2~1))$coef[1:2], R2_test) +r2_test <- 1 - summary(lm((y_test - pred)^2 ~ 1))$coef[1] / var(y_test) +cat("MSE, SE, R^2:", summary(lm((y_test - pred)^2 ~ 1))$coef[1:2], r2_test) ``` We observe both a similar MSE and $R^2$ relative to the better performing models in our previous results. ```{r} -h2o.shutdown(prompt = F) # shut down the h20 automatically without prompting user +h2o.shutdown(prompt = FALSE) # shut down the h20 automatically without prompting user ``` From d7242bc0615f81828e95cb1ed8233d2183cb45c4 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 04:57:39 -0700 Subject: [PATCH 102/261] Update r-dml-401k.irnb --- PM4/r-dml-401k.irnb | 948 +++++++++++++++++++++++++++----------------- 1 file changed, 583 insertions(+), 365 deletions(-) diff --git a/PM4/r-dml-401k.irnb b/PM4/r-dml-401k.irnb index a67f68de..f39a59b2 100644 --- a/PM4/r-dml-401k.irnb +++ b/PM4/r-dml-401k.irnb @@ -46,7 +46,10 @@ "execution_count": null, "id": "2", "metadata": { - "id": "KmAkbDiVE7wm" + "id": "KmAkbDiVE7wm", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -105,7 +108,10 @@ "start_time": "2022-04-19T09:06:48.819963", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -145,7 +151,10 @@ "start_time": "2022-04-19T09:06:49.312540", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -201,11 +210,15 @@ "start_time": "2022-04-19T09:06:49.711690", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "hist_e401 = ggplot(data, aes(x = e401, fill = factor(e401))) + geom_bar()\n", + "hist_e401 <- ggplot(data, aes(x = e401, fill = factor(e401))) +\n", + " geom_bar()\n", "hist_e401" ] }, @@ -240,13 +253,17 @@ "start_time": "2022-04-19T09:06:50.359520", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "dens_net_tfa = ggplot(data, aes(x = net_tfa, color = factor(e401), fill = factor(e401)) ) +\n", - " geom_density() + xlim(c(-20000, 150000)) +\n", - " facet_wrap(.~e401)\n", + "dens_net_tfa <- ggplot(data, aes(x = net_tfa, color = factor(e401), fill = factor(e401))) +\n", + " geom_density() +\n", + " xlim(c(-20000, 150000)) +\n", + " facet_wrap(. 
~ e401)\n", "\n", "dens_net_tfa" ] @@ -282,13 +299,16 @@ "start_time": "2022-04-19T09:06:50.945506", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "e1 <- data[data$e401==1,]\n", - "e0 <- data[data$e401==0,]\n", - "round(mean(e1$net_tfa)-mean(e0$net_tfa),0)" + "e1 <- data[data$e401 == 1, ]\n", + "e0 <- data[data$e401 == 0, ]\n", + "round(mean(e1$net_tfa) - mean(e0$net_tfa), 0)" ] }, { @@ -322,13 +342,16 @@ "start_time": "2022-04-19T09:06:51.015311", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "p1 <- data[data$p401==1,]\n", - "p0 <- data[data$p401==0,]\n", - "round(mean(p1$net_tfa)-mean(p0$net_tfa),0)" + "p1 <- data[data$p401 == 1, ]\n", + "p0 <- data[data$p401 == 0, ]\n", + "round(mean(p1$net_tfa) - mean(p0$net_tfa), 0)" ] }, { @@ -354,23 +377,28 @@ "execution_count": null, "id": "17", "metadata": { - "id": "1hBrSMQGzZBR" + "id": "1hBrSMQGzZBR", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# outcome variable\n", - "y <- data[,'net_tfa']\n", + "y <- data[, \"net_tfa\"]\n", "# treatment variable\n", - "D <- data[,'e401']\n", - "D2 <- data[,\"p401\"]\n", - "D3 <- data[,\"a401\"]\n", - "\n", - "columns_to_drop <- c('e401', 'p401', 'a401', 'tw', 'tfa', 'net_tfa', 'tfa_he',\n", - " 'hval', 'hmort', 'hequity',\n", - " 'nifa', 'net_nifa', 'net_n401', 'ira',\n", - " 'dum91', 'icat', 'ecat', 'zhat',\n", - " 'i1', 'i2', 'i3', 'i4', 'i5', 'i6', 'i7',\n", - " 'a1', 'a2', 'a3', 'a4', 'a5')\n", + "D <- data[, \"e401\"]\n", + "D2 <- data[, \"p401\"]\n", + "D3 <- data[, \"a401\"]\n", + "\n", + "columns_to_drop <- c(\n", + " \"e401\", \"p401\", \"a401\", \"tw\", \"tfa\", \"net_tfa\", \"tfa_he\",\n", + " \"hval\", \"hmort\", \"hequity\",\n", + " \"nifa\", \"net_nifa\", \"net_n401\", \"ira\",\n", + " \"dum91\", \"icat\", \"ecat\", \"zhat\",\n", + " \"i1\", \"i2\", \"i3\", \"i4\", \"i5\", \"i6\", \"i7\",\n", + " \"a1\", \"a2\", \"a3\", \"a4\", \"a5\"\n", + ")\n", "\n", "# covariates\n", "X <- data[, !(names(data) %in% columns_to_drop)]" @@ -381,13 +409,17 @@ "execution_count": null, "id": "18", "metadata": { - "id": "DD0Hwcb6z4u5" + "id": "DD0Hwcb6z4u5", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Constructing the controls\n", - "X_formula = \"~ 0 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown\"\n", - "X = as.data.table(model.frame(X_formula, pension))\n", + "x_formula <- paste(\"~ 0 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) \",\n", + " \"+ poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown\")\n", + "X <- as.data.table(model.frame(x_formula, X))\n", "head(X)" ] }, @@ -418,10 +450,10 @@ "id": "vEAeB2ih0r8B" }, "source": [ - "\\begin{eqnarray}\n", + "\\begin{align}\n", " & Y = D\\theta_0 + g_0(X) + \\zeta, & E[\\zeta \\mid D,X]= 0,\\\\\n", " & D = m_0(X) + V, & E[V \\mid X] = 0.\n", - "\\end{eqnarray}" + "\\end{align}" ] }, { @@ -447,56 +479,57 @@ "execution_count": null, "id": "23", "metadata": { - "id": "tqFlcClUNr9Z" + "id": "tqFlcClUNr9Z", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "DML2.for.PLM <- function(x, d, y, dreg, yreg, nfold=3, method = \"regression\") {\n", - " nobs <- nrow(x) #number of observations\n", - " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices\n", - " I <- 
split(1:nobs, foldid) #split observation indices into folds\n", + "dml2_for_plm <- function(x, d, y, dreg, yreg, nfold = 3, method = \"regression\") {\n", + " nobs <- nrow(x) # number of observations\n", + " foldid <- rep.int(1:nfold, times = ceiling(nobs / nfold))[sample.int(nobs)] # define folds indices\n", + " I <- split(1:nobs, foldid) # split observation indices into folds\n", " ytil <- dtil <- rep(NA, nobs)\n", " cat(\"fold: \")\n", - " for(b in 1:length(I)){\n", - "\n", + " for (b in seq_along(I)) {\n", " if (method == \"regression\") {\n", - " dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out\n", - " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a foldt out\n", - " dhat <- predict(dfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", - " yhat <- predict(yfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", - " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold\n", - " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold\n", + " dfit <- dreg(x[-I[[b]], ], d[-I[[b]]]) # take a fold out\n", + " yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a foldt out\n", + " dhat <- predict(dfit, x[I[[b]], ], type = \"response\") # predict the left-out fold\n", + " yhat <- predict(yfit, x[I[[b]], ], type = \"response\") # predict the left-out fold\n", + " dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold\n", + " ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold\n", " } else if (method == \"randomforest\") {\n", - " dfit <- dreg(x[-I[[b]],], as.factor(d)[-I[[b]]]) #take a fold out\n", - " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a fold out\n", - " dhat <- predict(dfit, x[I[[b]],], type=\"prob\")[,2] #predict the left-out fold\n", - " yhat <- predict(yfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", - " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold\n", - " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold\n", + " dfit <- dreg(x[-I[[b]], ], as.factor(d)[-I[[b]]]) # take a fold out\n", + " yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a fold out\n", + " dhat <- predict(dfit, x[I[[b]], ], type = \"prob\")[, 2] # predict the left-out fold\n", + " yhat <- predict(yfit, x[I[[b]], ], type = \"response\") # predict the left-out fold\n", + " dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold\n", + " ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold\n", " } else if (method == \"decisiontrees\") {\n", - " dfit <- dreg(x[-I[[b]],], as.factor(d)[-I[[b]]]) #take a fold out\n", - " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a fold out\n", - " dhat <- predict(dfit, x[I[[b]],])[,2] #predict the left-out fold\n", - " yhat <- predict(yfit, x[I[[b]],]) #predict the left-out fold\n", - " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold\n", - " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold\n", + " dfit <- dreg(x[-I[[b]], ], as.factor(d)[-I[[b]]]) # take a fold out\n", + " yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a fold out\n", + " dhat <- predict(dfit, x[I[[b]], ])[, 2] # predict the left-out fold\n", + " yhat <- predict(yfit, x[I[[b]], ]) # predict the left-out fold\n", + " dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold\n", + " ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold\n", " } else if (method == \"boostedtrees\") {\n", - " dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) 
#take a fold out\n", - " yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a fold out\n", - " dhat <- predict(dfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", - " yhat <- predict(yfit, x[I[[b]],], type=\"response\") #predict the left-out fold\n", - " dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold\n", - " ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold\n", + " dfit <- dreg(x[-I[[b]], ], d[-I[[b]]]) # take a fold out\n", + " yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a fold out\n", + " dhat <- predict(dfit, x[I[[b]], ], type = \"response\") # predict the left-out fold\n", + " yhat <- predict(yfit, x[I[[b]], ], type = \"response\") # predict the left-out fold\n", + " dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold\n", + " ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold\n", " }\n", - " cat(b,\" \")\n", - "\n", + " cat(b, \" \")\n", " }\n", - " rfit <- lm(ytil ~ dtil) #estimate the main parameter by regressing one residual on the other\n", - " coef.est <- coef(rfit)[2] #extract coefficient\n", - " se <- sqrt(vcovHC(rfit)[2,2]) #record robust standard error\n", - " cat(sprintf(\"\\ncoef (se) = %g (%g)\\n\", coef.est , se)) #printing output\n", - " return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil) ) #save output and residuals\n", - "}\n" + " rfit <- lm(ytil ~ dtil) # estimate the main parameter by regressing one residual on the other\n", + " coef_est <- coef(rfit)[2] # extract coefficient\n", + " se <- sqrt(vcovHC(rfit)[2, 2]) # record robust standard error\n", + " cat(sprintf(\"\\ncoef (se) = %g (%g)\\n\", coef_est, se)) # printing output\n", + " return(list(coef_est = coef_est, se = se, dtil = dtil, ytil = ytil)) # save output and residuals\n", + "}" ] }, { @@ -504,7 +537,10 @@ "execution_count": null, "id": "24", "metadata": { - "id": "sS0P4CVySjDP" + "id": "sS0P4CVySjDP", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -516,7 +552,7 @@ " upper = point + 1.96 * stderr, # upper end of 95% confidence interval\n", " `rmse y` = sqrt(mean(resy^2)), # RMSE of model that predicts outcome y\n", " `rmse D` = sqrt(mean(resD^2)), # RMSE of model that predicts treatment D\n", - " `accuracy D` = mean(abs(resD) < 0.5)# binary classification accuracy of model for D\n", + " `accuracy D` = mean(abs(resD) < 0.5) # binary classification accuracy of model for D\n", " )\n", " rownames(data) <- name\n", " return(data)\n", @@ -538,7 +574,10 @@ "execution_count": null, "id": "26", "metadata": { - "id": "LOVuR5QO1bkB" + "id": "LOVuR5QO1bkB", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -546,14 +585,19 @@ "set.seed(123)\n", "cat(sprintf(\"\\nDML with Lasso CV \\n\"))\n", "\n", - "dreg.lasso.cv <- function(x,d){ cv.glmnet(x, d, family=\"gaussian\", alpha=1, nfolds=5)}\n", - "yreg.lasso.cv <- function(x,y){ cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", + "dreg_lasso_cv <- function(x, d) {\n", + " cv.glmnet(x, d, family = \"gaussian\", alpha = 1, nfolds = 5)\n", + "}\n", + "yreg_lasso_cv <- function(x, y) {\n", + " cv.glmnet(x, y, family = \"gaussian\", alpha = 1, nfolds = 5)\n", + "}\n", "\n", - "DML2.results <- DML2.for.PLM(as.matrix(X), D, y, dreg.lasso.cv, yreg.lasso.cv, nfold=5)\n", + "dml2_results <- dml2_for_plm(as.matrix(X), D, y, dreg_lasso_cv, yreg_lasso_cv, nfold = 5)\n", "\n", - "sum.lasso.cv <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'LassoCV')\n", + 
"sum_lasso_cv <- summaryPLR(dml2_results$coef_est, dml2_results$se, dml2_results$dtil,\n", + " dml2_results$ytil, name = \"LassoCV\")\n", "tableplr <- data.frame()\n", - "tableplr <- rbind(sum.lasso.cv)\n", + "tableplr <- rbind(sum_lasso_cv)\n", "tableplr" ] }, @@ -562,13 +606,16 @@ "execution_count": null, "id": "27", "metadata": { - "id": "KatOw36Z0ghO" + "id": "KatOw36Z0ghO", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Because residuals are output, reconstruct fitted values for use in ensemble\n", - "dhat.lasso <- D - DML2.results$dtil\n", - "yhat.lasso <- y - DML2.results$ytil" + "dhat_lasso <- D - dml2_results$dtil\n", + "yhat_lasso <- y - dml2_results$ytil" ] }, { @@ -588,7 +635,10 @@ "execution_count": null, "id": "29", "metadata": { - "id": "b9Nvp5ZlSuwB" + "id": "b9Nvp5ZlSuwB", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -596,12 +646,17 @@ "set.seed(123)\n", "cat(sprintf(\"\\nDML with Lasso/Logistic \\n\"))\n", "\n", - "dreg.logistic.cv <- function(x,d){cv.glmnet(x, d, family=\"binomial\", alpha=0, nfolds=5)}\n", - "yreg.lasso.cv <- function(x,y){cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", - "\n", - "DML2.results <- DML2.for.PLM(as.matrix(X), D, y, dreg.logistic.cv, yreg.lasso.cv, nfold=5)\n", - "sum.lasso_logistic.cv <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'LassoCV/LogisticCV')\n", - "tableplr <- rbind(tableplr, sum.lasso_logistic.cv)\n", + "dreg_logistic_cv <- function(x, d) {\n", + " cv.glmnet(x, d, family = \"binomial\", alpha = 0, nfolds = 5)\n", + "}\n", + "yreg_lasso_cv <- function(x, y) {\n", + " cv.glmnet(x, y, family = \"gaussian\", alpha = 1, nfolds = 5)\n", + "}\n", + "\n", + "dml2_results <- dml2_for_plm(as.matrix(X), D, y, dreg_logistic_cv, yreg_lasso_cv, nfold = 5)\n", + "sum_lasso_logistic_cv <- summaryPLR(dml2_results$coef_est, dml2_results$se, dml2_results$dtil,\n", + " dml2_results$ytil, name = \"LassoCV/LogisticCV\")\n", + "tableplr <- rbind(tableplr, sum_lasso_logistic_cv)\n", "tableplr" ] }, @@ -610,13 +665,16 @@ "execution_count": null, "id": "30", "metadata": { - "id": "hJqMdcZV05lr" + "id": "hJqMdcZV05lr", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Because residuals are output, reconstruct fitted values for use in ensemble\n", - "dhat.lasso_logistic <- D - DML2.results$dtil\n", - "yhat.lasso_logistic <- y - DML2.results$ytil" + "dhat_lasso_logistic <- D - dml2_results$dtil\n", + "yhat_lasso_logistic <- y - dml2_results$ytil" ] }, { @@ -634,7 +692,10 @@ "execution_count": null, "id": "32", "metadata": { - "id": "nt0oTHTfSwMr" + "id": "nt0oTHTfSwMr", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -642,12 +703,17 @@ "set.seed(123)\n", "cat(sprintf(\"\\nDML with Random Forest \\n\"))\n", "\n", - "dreg.rf <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest\n", - "yreg.rf <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest\n", - "\n", - "DML2.results <- DML2.for.PLM(as.matrix(X), D, y, dreg.rf, yreg.rf, nfold=5, method = \"randomforest\")\n", - "sum.rf <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Random Forest')\n", - "tableplr <- rbind(tableplr, sum.rf)\n", + "dreg_rf <- function(x, d) {\n", + " randomForest(x, d, ntree = 1000, nodesize = 10)\n", + "} # ML method=Forest\n", + "yreg_rf <- function(x, y) {\n", + " randomForest(x, y, ntree = 1000, nodesize = 10)\n", + "} # ML 
method=Forest\n", + "\n", + "dml2_results <- dml2_for_plm(as.matrix(X), D, y, dreg_rf, yreg_rf, nfold = 5, method = \"randomforest\")\n", + "sum_rf <- summaryPLR(dml2_results$coef_est, dml2_results$se, dml2_results$dtil,\n", + " dml2_results$ytil, name = \"Random Forest\")\n", + "tableplr <- rbind(tableplr, sum_rf)\n", "tableplr" ] }, @@ -656,13 +722,16 @@ "execution_count": null, "id": "33", "metadata": { - "id": "TG476dPX1BI_" + "id": "TG476dPX1BI_", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Because residuals are output, reconstruct fitted values for use in ensemble\n", - "dhat.rf <- D - DML2.results$dtil\n", - "yhat.rf <- y - DML2.results$ytil" + "dhat_rf <- D - dml2_results$dtil\n", + "dhat_rf <- y - dml2_results$ytil" ] }, { @@ -680,7 +749,10 @@ "execution_count": null, "id": "35", "metadata": { - "id": "3Nu4daQRSyRb" + "id": "3Nu4daQRSyRb", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -688,12 +760,18 @@ "set.seed(123)\n", "cat(sprintf(\"\\nDML with Decision Trees \\n\"))\n", "\n", - "dreg.tr <- function(x,d){rpart(as.formula(\"D~.\"), cbind(data.frame(D=d),x), method = \"class\", minbucket=10, cp = 0.001)}\n", - "yreg.tr <- function(x,y){rpart(as.formula(\"y~.\"), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)}\n", - "\n", - "DML2.results <- DML2.for.PLM(X, D, y, dreg.tr, yreg.tr, nfold=5, method = \"decisiontrees\") # decision tree takes in X as dataframe, not matrix/array\n", - "sum.tr <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Decision Trees')\n", - "tableplr <- rbind(tableplr, sum.tr)\n", + "dreg_tr <- function(x, d) {\n", + " rpart(as.formula(\"D~.\"), cbind(data.frame(D = d), x), method = \"class\", minbucket = 10, cp = 0.001)\n", + "}\n", + "dreg_tr <- function(x, y) {\n", + " rpart(as.formula(\"y~.\"), cbind(data.frame(y = y), x), minbucket = 10, cp = 0.001)\n", + "}\n", + "\n", + "# decision tree takes in X as dataframe, not matrix/array\n", + "dml2_results <- dml2_for_plm(X, D, y, dreg_tr, dreg_tr, nfold = 5, method = \"decisiontrees\")\n", + "sum_tr <- summaryPLR(dml2_results$coef_est, dml2_results$se, dml2_results$dtil,\n", + " dml2_results$ytil, name = \"Decision Trees\")\n", + "tableplr <- rbind(tableplr, sum_tr)\n", "tableplr" ] }, @@ -702,13 +780,16 @@ "execution_count": null, "id": "36", "metadata": { - "id": "RnCGwVbN1KJJ" + "id": "RnCGwVbN1KJJ", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Because residuals are output, reconstruct fitted values for use in ensemble\n", - "dhat.tr <- D - DML2.results$dtil\n", - "yhat.tr <- y - DML2.results$ytil" + "dhat_tr <- D - dml2_results$dtil\n", + "yhat_tr <- y - dml2_results$ytil" ] }, { @@ -737,7 +818,10 @@ "execution_count": null, "id": "39", "metadata": { - "id": "Ekg5qeEOSxep" + "id": "Ekg5qeEOSxep", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -746,14 +830,21 @@ "cat(sprintf(\"\\nDML with Boosted Trees \\n\"))\n", "\n", "# NB: early stopping cannot easily be implemented with gbm\n", - "## set n.trees = best, where best <- gbm.perf(dreg.boost, plot.it = FALSE)\n", - "dreg.boost <- function(x,d){gbm(as.formula(\"D~.\"), cbind(data.frame(D=d),x), distribution= \"bernoulli\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", - "yreg.boost <- function(x,y){gbm(as.formula(\"y~.\"), cbind(data.frame(y=y),x), distribution= \"gaussian\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", + "## set n.trees = best, where best <- gbm.perf(dreg_boost, plot.it = 
FALSE)\n",
+    "dreg_boost <- function(x, d) {\n",
+    "  gbm(as.formula(\"D~.\"), cbind(data.frame(D = d), x), distribution = \"bernoulli\",\n",
+    "      interaction.depth = 2, n.trees = 100, shrinkage = .1)\n",
+    "}\n",
+    "yreg_boost <- function(x, y) {\n",
+    "  gbm(as.formula(\"y~.\"), cbind(data.frame(y = y), x), distribution = \"gaussian\",\n",
+    "      interaction.depth = 2, n.trees = 100, shrinkage = .1)\n",
+    "}\n",
    "\n",
    "# passing these through regression as type=\"response\", and D should not be factor!\n",
-    "DML2.results = DML2.for.PLM(X, D, y, dreg.boost, yreg.boost, nfold=5, method = \"boostedtrees\")\n",
-    "sum.boost <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Boosted Trees')\n",
-    "tableplr <- rbind(tableplr, sum.boost)\n",
+    "dml2_results <- dml2_for_plm(X, D, y, dreg_boost, yreg_boost, nfold = 5, method = \"boostedtrees\")\n",
+    "sum_boost <- summaryPLR(dml2_results$coef_est, dml2_results$se, dml2_results$dtil,\n",
+    "                        dml2_results$ytil, name = \"Boosted Trees\")\n",
+    "tableplr <- rbind(tableplr, sum_boost)\n",
    "tableplr"
    ]
   },
@@ -762,13 +853,16 @@
    "execution_count": null,
    "id": "40",
    "metadata": {
-    "id": "WSyqSd5Z1hne"
+    "id": "WSyqSd5Z1hne",
+    "vscode": {
+     "languageId": "r"
+    }
   },
   "outputs": [],
   "source": [
    "# Because residuals are output, reconstruct fitted values for use in ensemble\n",
-    "dhat.boost <- D - DML2.results$dtil\n",
-    "yhat.boost <- y - DML2.results$ytil"
+    "dhat_boost <- D - dml2_results$dtil\n",
+    "yhat_boost <- y - dml2_results$ytil"
   ]
  },
 {
@@ -796,14 +890,18 @@
    "execution_count": null,
    "id": "43",
    "metadata": {
-    "id": "gDrZqZXR12hA"
+    "id": "gDrZqZXR12hA",
+    "vscode": {
+     "languageId": "r"
+    }
   },
   "outputs": [],
   "source": [
    "# Best fit is boosted trees for both D and Y\n",
    "\n",
-    "sum.best <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Best')\n",
-    "tableplr <- rbind(tableplr, sum.best)\n",
+    "sum_best <- summaryPLR(dml2_results$coef_est, dml2_results$se, dml2_results$dtil,\n",
+    "                       dml2_results$ytil, name = \"Best\")\n",
+    "tableplr <- rbind(tableplr, sum_best)\n",
    "tableplr"
    ]
   },
@@ -822,22 +920,25 @@
    "execution_count": null,
    "id": "45",
    "metadata": {
-    "id": "Pkg7pw5h2N0z"
+    "id": "Pkg7pw5h2N0z",
+    "vscode": {
+     "languageId": "r"
+    }
   },
   "outputs": [],
   "source": [
    "# Least squares model average\n",
    "\n",
-    "ma.dtil <- lm(D~dhat.lasso+dhat.lasso_logistic+dhat.rf+dhat.tr+dhat.boost)$residuals\n",
-    "ma.ytil <- lm(y~yhat.lasso+yhat.lasso_logistic+yhat.rf+yhat.tr+yhat.boost)$residuals\n",
+    "ma_dtil <- lm(D ~ dhat_lasso + dhat_lasso_logistic + dhat_rf + dhat_tr + dhat_boost)$residuals\n",
+    "ma_ytil <- lm(y ~ yhat_lasso + yhat_lasso_logistic + yhat_rf + yhat_tr + yhat_boost)$residuals\n",
    "\n",
-    "rfit <- lm(ma.ytil ~ ma.dtil) #estimate the main parameter by regressing one residual on the other\n",
-    "coef.est <- coef(rfit)[2] #extract coefficient\n",
-    "se <- sqrt(vcovHC(rfit)[2,2]) #record robust standard error\n",
+    "rfit <- lm(ma_ytil ~ ma_dtil) # estimate the main parameter by regressing one residual on the other\n",
+    "coef_est <- coef(rfit)[2] # extract coefficient\n",
+    "se <- sqrt(vcovHC(rfit)[2, 2]) # record robust standard error\n",
    "\n",
-    "sum.ma <- summaryPLR(coef.est, se, ma.dtil, ma.ytil, name = 'Model Average')\n",
+    "sum.ma <- summaryPLR(coef_est, se, ma_dtil, ma_ytil, name = \"Model Average\")\n",
    "tableplr <- rbind(tableplr, sum.ma)\n",
-    "tableplr\n"
+    "tableplr"
   ]
  },
 {
@@ -891,10 +992,10 @@
    "tags": []
   },
   "source": [
-    " 
\\begin{eqnarray}\\label{eq: HetPL1}\n", + " \\begin{align}\n", " & Y = g_0(D, X) + U, & \\quad E[U \\mid X, D]= 0,\\\\\n", " & D = m_0(X) + V, & \\quad E[V\\mid X] = 0.\n", - "\\end{eqnarray}" + "\\end{align}" ] }, { @@ -921,90 +1022,92 @@ "execution_count": null, "id": "50", "metadata": { - "id": "-hCmnqC-N0nS" + "id": "-hCmnqC-N0nS", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "DML2.for.IRM <- function(x, d, y, dreg, yreg0, yreg1, trimming=0.01, nfold=5, method=\"regression\") {\n", + "dml2_for_irm <- function(x, d, y, dreg, yreg0, yreg1, trimming = 0.01, nfold = 5, method = \"regression\") {\n", " yhat0 <- rep(0, length(y))\n", " yhat1 <- rep(0, length(y))\n", " Dhat <- rep(0, length(d))\n", "\n", - " nobs <- nrow(x) #number of observations\n", - " foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices\n", - " I <- split(1:nobs, foldid) #split observation indices into folds\n", + " nobs <- nrow(x) # number of observations\n", + " foldid <- rep.int(1:nfold, times = ceiling(nobs / nfold))[sample.int(nobs)] # define folds indices\n", + " I <- split(1:nobs, foldid) # split observation indices into folds\n", " ytil <- dtil <- rep(NA, nobs)\n", "\n", " cat(\"fold: \")\n", - " for(b in 1:length(I)){\n", - "\n", + " for (b in seq_along(I)) {\n", " # define helpful variables\n", - " Dnotb = d[-I[[b]]]\n", - " Xb = X[I[[b]],]\n", - " Xnotb = X[-I[[b]],]\n", + " Dnotb <- d[-I[[b]]]\n", + " Xb <- X[I[[b]], ]\n", + " Xnotb <- X[-I[[b]], ]\n", "\n", " # training dfs subsetted on the -I[[b]] fold\n", - " XD0 = X[-I[[b]],][d[-I[[b]]]==0]\n", - " yD0 = y[-I[[b]]][d[-I[[b]]]==0]\n", - " XD1 = X[-I[[b]],][d[-I[[b]]]==1]\n", - " yD1 = y[-I[[b]]][d[-I[[b]]]==1]\n", + " XD0 <- X[-I[[b]], ][d[-I[[b]]] == 0]\n", + " yD0 <- y[-I[[b]]][d[-I[[b]]] == 0]\n", + " XD1 <- X[-I[[b]], ][d[-I[[b]]] == 1]\n", + " yD1 <- y[-I[[b]]][d[-I[[b]]] == 1]\n", "\n", " if (method == \"regression\") {\n", - " yfit0 <- yreg0(as.matrix(XD0), yD0)\n", - " yfit1 <- yreg1(as.matrix(XD1), yD1)\n", - " yhat0[I[[b]]] <- predict(yfit0, as.matrix(Xb)) # default is type = \"response\" for glmnet family gaussian\n", - " yhat1[I[[b]]] <- predict(yfit1, as.matrix(Xb))\n", + " yfit0 <- yreg0(as.matrix(XD0), yD0)\n", + " yfit1 <- yreg1(as.matrix(XD1), yD1)\n", + " yhat0[I[[b]]] <- predict(yfit0, as.matrix(Xb)) # default is type = \"response\" for glmnet family gaussian\n", + " yhat1[I[[b]]] <- predict(yfit1, as.matrix(Xb))\n", " } else if (method == \"randomforest\") {\n", - " yfit0 <- yreg0(XD0, yD0)\n", - " yfit1 <- yreg1(XD1, yD1)\n", - " yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = \"response\" for rf\n", - " yhat1[I[[b]]] <- predict(yfit1, Xb)\n", + " yfit0 <- yreg0(XD0, yD0)\n", + " yfit1 <- yreg1(XD1, yD1)\n", + " yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = \"response\" for rf\n", + " yhat1[I[[b]]] <- predict(yfit1, Xb)\n", " } else if (method == \"decisiontrees\") {\n", - " yfit0 <- yreg0(XD0, yD0)\n", - " yfit1 <- yreg1(XD1, yD1)\n", - " yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = \"vector\" for decision\n", - " yhat1[I[[b]]] <- predict(yfit1, Xb)\n", + " yfit0 <- yreg0(XD0, yD0)\n", + " yfit1 <- yreg1(XD1, yD1)\n", + " yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = \"vector\" for decision\n", + " yhat1[I[[b]]] <- predict(yfit1, Xb)\n", " } else if (method == \"boostedtrees\") {\n", - " yfit0 <- yreg0(as.data.frame(XD0), yD0)\n", - " yfit1 <- yreg1(as.data.frame(XD1), yD1)\n", - " yhat0[I[[b]]] <- predict(yfit0, Xb) # default is 
type = \"response\" for boosted\n", - " yhat1[I[[b]]] <- predict(yfit1, Xb)\n", + " yfit0 <- yreg0(as.data.frame(XD0), yD0)\n", + " yfit1 <- yreg1(as.data.frame(XD1), yD1)\n", + " yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = \"response\" for boosted\n", + " yhat1[I[[b]]] <- predict(yfit1, Xb)\n", " }\n", "\n", " # propensity scores:\n", - " if (method == \"regression\"){\n", - " Dfit_b <- dreg(as.matrix(Xnotb), Dnotb)\n", - " Dhat_b <- predict(Dfit_b, as.matrix(Xb), type=\"response\") # default is type=\"link\" for family binomial!\n", + " if (method == \"regression\") {\n", + " dfit_b <- dreg(as.matrix(Xnotb), Dnotb)\n", + " dhat_b <- predict(dfit_b, as.matrix(Xb), type = \"response\") # default is type=\"link\" for family binomial!\n", " } else if (method == \"randomforest\") {\n", - " Dfit_b <- dreg(Xnotb, as.factor(Dnotb))\n", - " Dhat_b <- predict(Dfit_b, Xb, type = \"prob\")[,2]\n", + " dfit_b <- dreg(Xnotb, as.factor(Dnotb))\n", + " dhat_b <- predict(dfit_b, Xb, type = \"prob\")[, 2]\n", " } else if (method == \"decisiontrees\") {\n", - " Dfit_b <- dreg(Xnotb, Dnotb)\n", - " Dhat_b <- predict(Dfit_b, Xb)[,2]\n", + " dfit_b <- dreg(Xnotb, Dnotb)\n", + " dhat_b <- predict(dfit_b, Xb)[, 2]\n", " } else if (method == \"boostedtrees\") {\n", - " Dfit_b <- dreg(as.data.frame(Xnotb), Dnotb)\n", - " Dhat_b <- predict(Dfit_b, Xb, type=\"response\")\n", + " dfit_b <- dreg(as.data.frame(Xnotb), Dnotb)\n", + " dhat_b <- predict(dfit_b, Xb, type = \"response\")\n", " }\n", - " Dhat_b <- pmax(pmin(Dhat_b, 1 - trimming), trimming) # trimming so scores are between [trimming, (1-trimming)]\n", - " Dhat[I[[b]]] <- Dhat_b\n", + " dhat_b <- pmax(pmin(dhat_b, 1 - trimming), trimming) # trimming so scores are between [trimming, (1-trimming)]\n", + " Dhat[I[[b]]] <- dhat_b\n", "\n", - "\n", - " cat(b,\" \")\n", + " cat(b, \" \")\n", " }\n", "\n", " # Prediction of treatment and outcome for observed instrument\n", " yhat <- yhat0 * (1 - D) + yhat1 * D\n", " # residuals\n", - " ytil <- y-yhat\n", - " dtil <- D-Dhat\n", + " ytil <- y - yhat\n", + " dtil <- D - Dhat\n", " # doubly robust quantity for every sample\n", - " drhat <- yhat1 - yhat0 + (y-yhat)* (D/Dhat - (1 - D)/(1 - Dhat))\n", - " coef.est <- mean(drhat)\n", + " drhat <- yhat1 - yhat0 + (y - yhat) * (D / Dhat - (1 - D) / (1 - Dhat))\n", + " coef_est <- mean(drhat)\n", " vari <- var(drhat)\n", - " se <- sqrt(vari/nrow(X))\n", - " cat(\"point\", coef.est)\n", + " se <- sqrt(vari / nrow(X))\n", + " cat(\"point\", coef_est)\n", " cat(\"se\", se)\n", - " return(list(coef.est = coef.est, se = se, ytil = ytil, dtil = dtil, drhat = drhat, yhat0 = yhat0, yhat1 = yhat1, dhat = Dhat, yhat = yhat))\n", + " return(list(coef_est = coef_est, se = se, ytil = ytil, dtil = dtil, drhat = drhat,\n", + " yhat0 = yhat0, yhat1 = yhat1, dhat = Dhat, yhat = yhat))\n", "}" ] }, @@ -1013,18 +1116,22 @@ "execution_count": null, "id": "51", "metadata": { - "id": "bCj1D8_MSg09" + "id": "bCj1D8_MSg09", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "summaryIRM <- function(coef.est, se, ytil, dtil, drhat, name) {\n", - " summary_data <- data.frame(estimate = coef.est, # point estimate\n", - " se = se, # standard error\n", - " lower = coef.est - 1.96 * se, # lower end of 95% confidence interval\n", - " upper = coef.est + 1.96 * se, # upper end of 95% confidence interval\n", - " rmse_y = sqrt(mean(ytil^2)), # res of model that predicts outcome y\n", - " rmse_D = sqrt(mean(dtil^2)), # res of model that predicts treatment D\n", - " accuracy_D = 
mean(abs(dtil) < 0.5) # binary classification accuracy of model for D\n", + "summaryIRM <- function(coef_est, se, ytil, dtil, drhat, name) {\n", + " summary_data <- data.frame(\n", + " estimate = coef_est, # point estimate\n", + " se = se, # standard error\n", + " lower = coef_est - 1.96 * se, # lower end of 95% confidence interval\n", + " upper = coef_est + 1.96 * se, # upper end of 95% confidence interval\n", + " rmse_y = sqrt(mean(ytil^2)), # res of model that predicts outcome y\n", + " rmse_D = sqrt(mean(dtil^2)), # res of model that predicts treatment D\n", + " accuracy_D = mean(abs(dtil) < 0.5) # binary classification accuracy of model for D\n", " )\n", " row.names(summary_data) <- name\n", " return(summary_data)\n", @@ -1046,7 +1153,10 @@ "execution_count": null, "id": "53", "metadata": { - "id": "AUiHMoNTvo-m" + "id": "AUiHMoNTvo-m", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1054,20 +1164,28 @@ "set.seed(123)\n", "cat(sprintf(\"\\nDML with LassoCV/Logistic \\n\"))\n", "\n", - "dreg.lasso.cv <- function(x,d){cv.glmnet(x, d, family=\"binomial\", alpha=0, nfolds=5)}\n", - "yreg0.lasso.cv <- function(x,y){cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", - "yreg1.lasso.cv <- function(x,y){cv.glmnet(x, y, family=\"gaussian\", alpha=1, nfolds=5)}\n", - "\n", - "DML2.results <- DML2.for.IRM(X, D, y, dreg.lasso.cv, yreg0.lasso.cv, yreg1.lasso.cv, nfold=5) # more folds seems to help stabilize finite sample performance\n", - "sum.lasso.cv <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'LassoCVLogistic')\n", + "dreg_lasso_cv <- function(x, d) {\n", + " cv.glmnet(x, d, family = \"binomial\", alpha = 0, nfolds = 5)\n", + "}\n", + "yreg0_lasso_cv <- function(x, y) {\n", + " cv.glmnet(x, y, family = \"gaussian\", alpha = 1, nfolds = 5)\n", + "}\n", + "yreg1_lasso_cv <- function(x, y) {\n", + " cv.glmnet(x, y, family = \"gaussian\", alpha = 1, nfolds = 5)\n", + "}\n", + "\n", + "# more folds seems to help stabilize finite sample performance\n", + "dml2_results <- dml2_for_irm(X, D, y, dreg_lasso_cv, yreg0_lasso_cv, yreg1_lasso_cv, nfold = 5)\n", + "sum_lasso_cv <- summaryIRM(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil,\n", + " dml2_results$drhat, name = \"LassoCVLogistic\")\n", "tableirm <- data.frame()\n", - "tableirm <- rbind(sum.lasso.cv)\n", + "tableirm <- rbind(sum_lasso_cv)\n", "tableirm\n", "\n", - "yhat0.lasso <- DML2.results$yhat0\n", - "yhat1.lasso <- DML2.results$yhat1\n", - "dhat.lasso <- DML2.results$dhat\n", - "yhat.lasso <- DML2.results$yhat" + "yhat0_lasso <- dml2_results$yhat0\n", + "yhat1_lasso <- dml2_results$yhat1\n", + "dhat_lasso <- dml2_results$dhat\n", + "yhat_lasso <- dml2_results$yhat" ] }, { @@ -1075,7 +1193,10 @@ "execution_count": null, "id": "54", "metadata": { - "id": "JPABXLYyvyqy" + "id": "JPABXLYyvyqy", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1083,20 +1204,27 @@ "set.seed(123)\n", "cat(sprintf(\"\\nDML with Random Forest \\n\"))\n", "\n", - "dreg.rf <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest\n", - "yreg0.rf <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest\n", - "yreg1.rf <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest\n", - "\n", - "\n", - "DML2.results <- DML2.for.IRM(as.matrix(X), D, y, dreg.rf, yreg0.rf, yreg1.rf, nfold=5, method = \"randomforest\")\n", - "sum.rf <- 
summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'Random Forest')\n", - "tableirm <- rbind(tableirm, sum.rf)\n", + "dreg_rf <- function(x, d) {\n", + " randomForest(x, d, ntree = 1000, nodesize = 10)\n", + "} # ML method=Forest\n", + "yreg0_rf <- function(x, y) {\n", + " randomForest(x, y, ntree = 1000, nodesize = 10)\n", + "} # ML method=Forest\n", + "yreg1_rf <- function(x, y) {\n", + " randomForest(x, y, ntree = 1000, nodesize = 10)\n", + "} # ML method=Forest\n", + "\n", + "\n", + "dml2_results <- dml2_for_irm(as.matrix(X), D, y, dreg_rf, yreg0_rf, yreg1_rf, nfold = 5, method = \"randomforest\")\n", + "sum_rf <- summaryIRM(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil,\n", + " dml2_results$drhat, name = \"Random Forest\")\n", + "tableirm <- rbind(tableirm, sum_rf)\n", "tableirm\n", "\n", - "yhat0.rf <- DML2.results$yhat0\n", - "yhat1.rf <- DML2.results$yhat1\n", - "dhat.rf <- DML2.results$dhat\n", - "yhat.rf <- DML2.results$yhat" + "yhat0_rf <- dml2_results$yhat0\n", + "yhat1_rf <- dml2_results$yhat1\n", + "dhat_rf <- dml2_results$dhat\n", + "dhat_rf <- dml2_results$yhat" ] }, { @@ -1104,7 +1232,10 @@ "execution_count": null, "id": "55", "metadata": { - "id": "SukZCfEbvyzC" + "id": "SukZCfEbvyzC", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1112,19 +1243,26 @@ "set.seed(123)\n", "cat(sprintf(\"\\nDML with Decision Trees \\n\"))\n", "\n", - "dreg.tr <- function(x,d){rpart(as.formula(\"D~.\"), cbind(data.frame(D=d),x), method = \"class\", minbucket=10, cp = 0.001)}\n", - "yreg0.tr <- function(x,y){rpart(as.formula(\"y~.\"), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)}\n", - "yreg1.tr <- function(x,y){rpart(as.formula(\"y~.\"), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)}\n", - "\n", - "DML2.results <- DML2.for.IRM(X, D, y, dreg.tr, yreg0.tr, yreg1.tr, nfold=5, method = \"decisiontrees\")\n", - "sum.tr <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'Decision Trees')\n", - "tableirm <- rbind(tableirm, sum.tr)\n", + "dreg_tr <- function(x, d) {\n", + " rpart(as.formula(\"D~.\"), cbind(data.frame(D = d), x), method = \"class\", minbucket = 10, cp = 0.001)\n", + "}\n", + "yreg0_tr <- function(x, y) {\n", + " rpart(as.formula(\"y~.\"), cbind(data.frame(y = y), x), minbucket = 10, cp = 0.001)\n", + "}\n", + "yreg1_tr <- function(x, y) {\n", + " rpart(as.formula(\"y~.\"), cbind(data.frame(y = y), x), minbucket = 10, cp = 0.001)\n", + "}\n", + "\n", + "dml2_results <- dml2_for_irm(X, D, y, dreg_tr, yreg0_tr, yreg1_tr, nfold = 5, method = \"decisiontrees\")\n", + "sum_tr <- summaryIRM(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil,\n", + " dml2_results$drhat, name = \"Decision Trees\")\n", + "tableirm <- rbind(tableirm, sum_tr)\n", "tableirm\n", "\n", - "yhat0.tr <- DML2.results$yhat0\n", - "yhat1.tr <- DML2.results$yhat1\n", - "dhat.tr <- DML2.results$dhat\n", - "yhat.tr <- DML2.results$yhat" + "yhat0_tr <- dml2_results$yhat0\n", + "yhat1_tr <- dml2_results$yhat1\n", + "dhat_tr <- dml2_results$dhat\n", + "yhat_tr <- dml2_results$yhat" ] }, { @@ -1132,7 +1270,10 @@ "execution_count": null, "id": "56", "metadata": { - "id": "bTfgiCabvy6f" + "id": "bTfgiCabvy6f", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1141,21 +1282,31 @@ "cat(sprintf(\"\\nDML with Boosted Trees \\n\"))\n", "\n", "# NB: early stopping cannot easily be 
implemented with gbm\n", - "## set n.trees = best, where best <- gbm.perf(dreg.boost, plot.it = FALSE)\n", - "dreg.boost <- function(x,d){gbm(as.formula(\"D~.\"), cbind(data.frame(D=d),x), distribution= \"bernoulli\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", - "yreg0.boost <- function(x,y){gbm(as.formula(\"y~.\"), cbind(data.frame(y=y),x), distribution= \"gaussian\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", - "yreg1.boost <- function(x,y){gbm(as.formula(\"y~.\"), cbind(data.frame(y=y),x), distribution= \"gaussian\", interaction.depth=2, n.trees=100, shrinkage=.1)}\n", + "## set n.trees = best, where best <- gbm.perf(dreg_boost, plot.it = FALSE)\n", + "dreg_boost <- function(x, d) {\n", + " gbm(as.formula(\"D~.\"), cbind(data.frame(D = d), x), distribution = \"bernoulli\",\n", + " interaction.depth = 2, n.trees = 100, shrinkage = .1)\n", + "}\n", + "yreg0_boost <- function(x, y) {\n", + " gbm(as.formula(\"y~.\"), cbind(data.frame(y = y), x), distribution = \"gaussian\",\n", + " interaction.depth = 2, n.trees = 100, shrinkage = .1)\n", + "}\n", + "yreg1_boost <- function(x, y) {\n", + " gbm(as.formula(\"y~.\"), cbind(data.frame(y = y), x), distribution = \"gaussian\",\n", + " interaction.depth = 2, n.trees = 100, shrinkage = .1)\n", + "}\n", "\n", "# passing these through regression as type=\"response\", and D should not be factor!\n", - "DML2.results = DML2.for.IRM(X, D, y, dreg.boost, yreg0.boost, yreg1.boost, nfold=5, method = \"boostedtrees\")\n", - "sum.boost <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'Boosted Trees')\n", - "tableirm <- rbind(tableirm, sum.boost)\n", + "dml2_results <- dml2_for_irm(X, D, y, dreg_boost, yreg0_boost, yreg1_boost, nfold = 5, method = \"boostedtrees\")\n", + "sum_boost <- summaryIRM(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil,\n", + " dml2_results$drhat, name = \"Boosted Trees\")\n", + "tableirm <- rbind(tableirm, sum_boost)\n", "tableirm\n", "\n", - "yhat0.boost <- DML2.results$yhat0\n", - "yhat1.boost <- DML2.results$yhat1\n", - "dhat.boost <- DML2.results$dhat\n", - "yhat.boost <- DML2.results$yhat" + "yhat0_boost <- dml2_results$yhat0\n", + "yhat1_boost <- dml2_results$yhat1\n", + "dhat_boost <- dml2_results$dhat\n", + "yhat_boost <- dml2_results$yhat" ] }, { @@ -1163,7 +1314,10 @@ "execution_count": null, "id": "57", "metadata": { - "id": "7rxqwK-R4Z2q" + "id": "7rxqwK-R4Z2q", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1174,16 +1328,16 @@ "# Here, the best performance for Y is the random forest and for D the boosted tree\n", "\n", "# residuals\n", - "ytil <- y-yhat.rf\n", - "dtil <- D-dhat.boost\n", + "ytil <- y - dhat_rf\n", + "dtil <- D - dhat_boost\n", "# doubly robust quantity for every sample\n", - "drhat <- yhat1.rf - yhat0.rf + (y-yhat.rf)* (D/dhat.boost - (1 - D)/(1 - dhat.boost))\n", - "coef.est <- mean(drhat)\n", + "drhat <- yhat1_rf - yhat0_rf + (y - dhat_rf) * (D / dhat_boost - (1 - D) / (1 - dhat_boost))\n", + "coef_est <- mean(drhat)\n", "vari <- var(drhat)\n", - "se <- sqrt(vari/nrow(X))\n", + "se <- sqrt(vari / nrow(X))\n", "\n", - "sum.best <- summaryIRM(coef.est, se, ytil, dtil, drhat, name = 'Best')\n", - "tableirm <- rbind(tableirm, sum.best)\n", + "sum_best <- summaryIRM(coef_est, se, ytil, dtil, drhat, name = \"Best\")\n", + "tableirm <- rbind(tableirm, sum_best)\n", "tableirm" ] }, @@ -1192,38 +1346,41 @@ "execution_count": null, "id": "58", "metadata": { - "id": 
"0-c3NI0fCfqg" + "id": "0-c3NI0fCfqg", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Least squares model average\n", "# We'll look at weights that do best job for Y overall. Could also use different weights for Y0 and Y1\n", "\n", - "ma.dw <- lm(D~dhat.lasso+dhat.rf+dhat.tr+dhat.boost)$coef\n", - "ma.yw <- lm(y~yhat.lasso+yhat.rf+yhat.tr+yhat.boost)$coef\n", + "ma_dw <- lm(D ~ dhat_lasso + dhat_rf + dhat_tr + dhat_boost)$coef\n", + "ma_yw <- lm(y ~ yhat_lasso + dhat_rf + yhat_tr + yhat_boost)$coef\n", "\n", - "Dhats <- cbind(as.matrix(rep(1,nrow(X))),dhat.lasso,dhat.rf,dhat.tr,dhat.boost)\n", - "Y0s <- cbind(as.matrix(rep(1,nrow(X))),yhat0.lasso,yhat0.rf,yhat0.tr,yhat0.boost)\n", - "Y1s <- cbind(as.matrix(rep(1,nrow(X))),yhat1.lasso,yhat1.rf,yhat1.tr,yhat1.boost)\n", + "Dhats <- cbind(as.matrix(rep(1, nrow(X))), dhat_lasso, dhat_rf, dhat_tr, dhat_boost)\n", + "Y0s <- cbind(as.matrix(rep(1, nrow(X))), yhat0_lasso, yhat0_rf, yhat0_tr, yhat0_boost)\n", + "Y1s <- cbind(as.matrix(rep(1, nrow(X))), yhat1_lasso, yhat1_rf, yhat1_tr, yhat1_boost)\n", "\n", - "dhat <- Dhats%*%as.matrix(ma.dw)\n", - "yhat0 <- Y0s%*%as.matrix(ma.yw)\n", - "yhat1 <- Y1s%*%as.matrix(ma.yw)\n", + "dhat <- Dhats %*% as.matrix(ma_dw)\n", + "yhat0 <- Y0s %*% as.matrix(ma_yw)\n", + "yhat1 <- Y1s %*% as.matrix(ma_yw)\n", "\n", "# Prediction of treatment and outcome for observed instrument\n", "yhat <- yhat0 * (1 - D) + yhat1 * D\n", "# residuals\n", - "ytil <- y-yhat\n", - "dtil <- D-dhat\n", + "ytil <- y - yhat\n", + "dtil <- D - dhat\n", "# doubly robust quantity for every sample\n", - "drhat <- yhat1 - yhat0 + (y-yhat)* (D/dhat - (1 - D)/(1 - dhat))\n", - "coef.est <- mean(drhat)\n", + "drhat <- yhat1 - yhat0 + (y - yhat) * (D / dhat - (1 - D) / (1 - dhat))\n", + "coef_est <- mean(drhat)\n", "vari <- var(drhat)\n", - "se <- sqrt(vari/nrow(X))\n", + "se <- sqrt(vari / nrow(X))\n", "\n", - "sum.ma <- summaryIRM(coef.est, se, ytil, dtil, drhat, name = 'Model Average')\n", + "sum.ma <- summaryIRM(coef_est, se, ytil, dtil, drhat, name = \"Model Average\")\n", "tableirm <- rbind(tableirm, sum.ma)\n", - "tableirm\n" + "tableirm" ] }, { @@ -1279,7 +1436,10 @@ "start_time": "2022-04-19T09:06:51.130347", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1311,22 +1471,22 @@ "start_time": "2022-04-19T09:07:11.994203", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Constructing the data (as DoubleMLData)\n", - "formula_flex = \"net_tfa ~ e401 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + marr + twoearn + db + pira + hown\"\n", - "model_flex = as.data.table(model.frame(formula_flex, pension))\n", - "x_cols = colnames(model_flex)[-c(1,2)]\n", - "data_ml = DoubleMLData$new(model_flex, y_col = \"net_tfa\", d_cols = \"e401\", x_cols=x_cols)\n", + "formula_flex <- paste(\"net_tfa ~ e401 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) \",\n", + " \"+ poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + marr + twoearn + db + pira + hown\")\n", + "model_flex <- as.data.table(model.frame(formula_flex, pension))\n", + "x_cols <- colnames(model_flex)[-c(1, 2)]\n", + "data_ml <- DoubleMLData$new(model_flex, y_col = \"net_tfa\", d_cols = \"e401\", x_cols = x_cols)\n", "\n", - "\n", - "p <- dim(model_flex)[2]-2\n", - "p\n", - "\n", - "# complex model with two-way interactions\n", - "#data_interactions = 
fetch_401k(polynomial_features = TRUE, instrument = FALSE)\n" + "p <- dim(model_flex)[2] - 2\n", + "p" ] }, { @@ -1370,17 +1530,20 @@ "start_time": "2022-04-19T09:07:12.173534", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Estimating the PLR\n", "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", - "lasso <- lrn(\"regr.cv_glmnet\",nfolds = 5, s = \"lambda.min\")\n", + "lasso <- lrn(\"regr.cv_glmnet\", nfolds = 5, s = \"lambda.min\")\n", "lasso_class <- lrn(\"classif.cv_glmnet\", nfolds = 5, s = \"lambda.min\")\n", "\n", - "dml_plr <- DoubleMLPLR$new(data_ml, ml_l = lasso, ml_m = lasso_class, n_folds=5)\n", - "dml_plr$fit(store_predictions=TRUE)\n", + "dml_plr <- DoubleMLPLR$new(data_ml, ml_l = lasso, ml_m = lasso_class, n_folds = 5)\n", + "dml_plr$fit(store_predictions = TRUE)\n", "dml_plr$summary()\n", "lasso_plr <- dml_plr$coef\n", "lasso_std_plr <- dml_plr$se" @@ -1417,7 +1580,10 @@ "start_time": "2022-04-19T09:07:34.681248", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1439,7 +1605,10 @@ "start_time": "2022-04-19T09:07:34.730771", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1447,8 +1616,8 @@ "y <- as.matrix(pension$net_tfa) # true observations\n", "theta <- as.numeric(dml_plr$coef) # estimated regression coefficient\n", "d <- as.matrix(pension$e401)\n", - "predictions_y <- as.matrix(d*theta)+g_hat # predictions for y\n", - "lasso_y_rmse <- sqrt(mean((y-predictions_y)^2))\n", + "predictions_y <- as.matrix(d * theta) + g_hat # predictions for y\n", + "lasso_y_rmse <- sqrt(mean((y - predictions_y)^2))\n", "lasso_y_rmse" ] }, @@ -1465,13 +1634,16 @@ "start_time": "2022-04-19T09:07:34.785388", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# cross-fitted RMSE: treatment\n", "d <- as.matrix(pension$e401)\n", - "lasso_d_rmse <- sqrt(mean((d-m_hat)^2))\n", + "lasso_d_rmse <- sqrt(mean((d - m_hat)^2))\n", "lasso_d_rmse\n", "\n", "# cross-fitted ce: treatment\n", @@ -1509,17 +1681,20 @@ "start_time": "2022-04-19T09:07:34.863255", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Random Forest\n", "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", "randomForest <- lrn(\"regr.ranger\")\n", - "randomForest_class <- lrn(\"classif.ranger\")\n", + "random_forest_class <- lrn(\"classif.ranger\")\n", "\n", - "dml_plr <- DoubleMLPLR$new(data_ml, ml_l = randomForest, ml_m = randomForest_class, n_folds=5)\n", - "dml_plr$fit(store_predictions=TRUE) # set store_predictions=TRUE to evaluate the model\n", + "dml_plr <- DoubleMLPLR$new(data_ml, ml_l = randomForest, ml_m = random_forest_class, n_folds = 5)\n", + "dml_plr$fit(store_predictions = TRUE) # set store_predictions=TRUE to evaluate the model\n", "dml_plr$summary()\n", "forest_plr <- dml_plr$coef\n", "forest_std_plr <- dml_plr$se" @@ -1556,7 +1731,10 @@ "start_time": "2022-04-19T09:08:35.944307", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1564,12 +1742,12 @@ "g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o\n", "m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o\n", "theta <- as.numeric(dml_plr$coef) # estimated regression coefficient\n", - "predictions_y <- 
as.matrix(d*theta)+g_hat # predictions for y\n", - "forest_y_rmse <- sqrt(mean((y-predictions_y)^2))\n", + "predictions_y <- as.matrix(d * theta) + g_hat # predictions for y\n", + "forest_y_rmse <- sqrt(mean((y - predictions_y)^2))\n", "forest_y_rmse\n", "\n", "# cross-fitted RMSE: treatment\n", - "forest_d_rmse <- sqrt(mean((d-m_hat)^2))\n", + "forest_d_rmse <- sqrt(mean((d - m_hat)^2))\n", "forest_d_rmse\n", "\n", "# cross-fitted ce: treatment\n", @@ -1589,7 +1767,10 @@ "start_time": "2022-04-19T09:08:36.049899", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1599,8 +1780,8 @@ "trees <- lrn(\"regr.rpart\")\n", "trees_class <- lrn(\"classif.rpart\")\n", "\n", - "dml_plr <- DoubleMLPLR$new(data_ml, ml_l = trees, ml_m = trees_class, n_folds=5)\n", - "dml_plr$fit(store_predictions=TRUE)\n", + "dml_plr <- DoubleMLPLR$new(data_ml, ml_l = trees, ml_m = trees_class, n_folds = 5)\n", + "dml_plr$fit(store_predictions = TRUE)\n", "dml_plr$summary()\n", "tree_plr <- dml_plr$coef\n", "tree_std_plr <- dml_plr$se\n", @@ -1609,12 +1790,12 @@ "g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o\n", "m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o\n", "theta <- as.numeric(dml_plr$coef) # estimated regression coefficient\n", - "predictions_y <- as.matrix(d*theta)+g_hat # predictions for y\n", - "tree_y_rmse <- sqrt(mean((y-predictions_y)^2))\n", + "predictions_y <- as.matrix(d * theta) + g_hat # predictions for y\n", + "tree_y_rmse <- sqrt(mean((y - predictions_y)^2))\n", "tree_y_rmse\n", "\n", "# cross-fitted RMSE: treatment\n", - "tree_d_rmse <- sqrt(mean((d-m_hat)^2))\n", + "tree_d_rmse <- sqrt(mean((d - m_hat)^2))\n", "tree_d_rmse\n", "\n", "# cross-fitted ce: treatment\n", @@ -1634,7 +1815,10 @@ "start_time": "2022-04-19T09:08:37.283721", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1659,16 +1843,19 @@ "start_time": "2022-04-19T09:09:58.089896", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Boosting\n", - "boost<- lrn(\"regr.glmboost\")\n", + "boost <- lrn(\"regr.glmboost\")\n", "boost_class <- lrn(\"classif.glmboost\")\n", "\n", - "dml_plr <- DoubleMLPLR$new(data_ml, ml_l = boost, ml_m = boost_class, n_folds=5)\n", - "dml_plr$fit(store_predictions=TRUE)\n", + "dml_plr <- DoubleMLPLR$new(data_ml, ml_l = boost, ml_m = boost_class, n_folds = 5)\n", + "dml_plr$fit(store_predictions = TRUE)\n", "dml_plr$summary()\n", "boost_plr <- dml_plr$coef\n", "boost_std_plr <- dml_plr$se\n", @@ -1677,12 +1864,12 @@ "g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o\n", "m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o\n", "theta <- as.numeric(dml_plr$coef) # estimated regression coefficient\n", - "predictions_y <- as.matrix(d*theta)+g_hat # predictions for y\n", - "boost_y_rmse <- sqrt(mean((y-predictions_y)^2))\n", + "predictions_y <- as.matrix(d * theta) + g_hat # predictions for y\n", + "boost_y_rmse <- sqrt(mean((y - predictions_y)^2))\n", "boost_y_rmse\n", "\n", "# cross-fitted RMSE: treatment\n", - "boost_d_rmse <- sqrt(mean((d-m_hat)^2))\n", + "boost_d_rmse <- sqrt(mean((d - m_hat)^2))\n", "boost_d_rmse\n", "\n", "# cross-fitted ce: treatment\n", @@ -1720,18 +1907,21 @@ "start_time": "2022-04-19T09:10:00.534612", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": 
[], "source": [ "table <- matrix(0, 4, 4)\n", - "table[1,1:4] <- c(lasso_plr,forest_plr,tree_plr,boost_plr)\n", - "table[2,1:4] <- c(lasso_std_plr,forest_std_plr,tree_std_plr,boost_std_plr)\n", - "table[3,1:4] <- c(lasso_y_rmse,forest_y_rmse,tree_y_rmse,boost_y_rmse)\n", - "table[4,1:4] <- c(lasso_d_rmse,forest_d_rmse,tree_d_rmse,boost_d_rmse)\n", - "rownames(table) <- c(\"Estimate\",\"Std.Error\",\"RMSE Y\",\"RMSE D\")\n", - "colnames(table) <- c(\"Lasso\",\"Random Forest\",\"Trees\",\"Boosting\")\n", - "tab<- xtable(table, digits = 2)\n", + "table[1, 1:4] <- c(lasso_plr, forest_plr, tree_plr, boost_plr)\n", + "table[2, 1:4] <- c(lasso_std_plr, forest_std_plr, tree_std_plr, boost_std_plr)\n", + "table[3, 1:4] <- c(lasso_y_rmse, forest_y_rmse, tree_y_rmse, boost_y_rmse)\n", + "table[4, 1:4] <- c(lasso_d_rmse, forest_d_rmse, tree_d_rmse, boost_d_rmse)\n", + "rownames(table) <- c(\"Estimate\", \"Std.Error\", \"RMSE Y\", \"RMSE D\")\n", + "colnames(table) <- c(\"Lasso\", \"Random Forest\", \"Trees\", \"Boosting\")\n", + "tab <- xtable(table, digits = 2)\n", "tab" ] }, @@ -1766,7 +1956,10 @@ "start_time": "2022-04-19T09:10:00.656349", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1796,15 +1989,20 @@ "start_time": "2022-04-19T09:10:00.813519", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", - "dml_irm = DoubleMLIRM$new(data_ml, ml_g = lasso,\n", - " ml_m = lasso_class,\n", - " trimming_threshold = 0.01, n_folds=5)\n", - "dml_irm$fit(store_predictions=TRUE)\n", + "dml_irm <- DoubleMLIRM$new(data_ml,\n", + " ml_g = lasso,\n", + " ml_m = lasso_class,\n", + " trimming_threshold = 0.01, n_folds = 5\n", + ")\n", + "dml_irm$fit(store_predictions = TRUE)\n", "dml_irm$summary()\n", "lasso_irm <- dml_irm$coef\n", "lasso_std_irm <- dml_irm$se\n", @@ -1814,8 +2012,8 @@ "dml_irm$params_names()\n", "g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X)\n", "g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X)\n", - "g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0\n", - "m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o\n" + "g_hat <- d * g1_hat + (1 - d) * g0_hat # predictions of g_0\n", + "m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o" ] }, { @@ -1831,18 +2029,21 @@ "start_time": "2022-04-19T09:10:18.948743", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# cross-fitted RMSE: outcome\n", "y <- as.matrix(pension$net_tfa) # true observations\n", "d <- as.matrix(pension$e401)\n", - "lasso_y_irm <- sqrt(mean((y-g_hat)^2))\n", + "lasso_y_irm <- sqrt(mean((y - g_hat)^2))\n", "lasso_y_irm\n", "\n", "# cross-fitted RMSE: treatment\n", - "lasso_d_irm <- sqrt(mean((d-m_hat)^2))\n", + "lasso_d_irm <- sqrt(mean((d - m_hat)^2))\n", "lasso_d_irm\n", "\n", "# cross-fitted ce: treatment\n", @@ -1862,16 +2063,21 @@ "start_time": "2022-04-19T09:10:19.015521", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "##### forest #####\n", "\n", - "dml_irm = DoubleMLIRM$new(data_ml, ml_g = randomForest,\n", - " ml_m = randomForest_class,\n", - " trimming_threshold = 0.01, n_folds=5)\n", - "dml_irm$fit(store_predictions=TRUE)\n", + "dml_irm <- DoubleMLIRM$new(data_ml,\n", + " ml_g = randomForest,\n", + " 
ml_m = random_forest_class,\n", + " trimming_threshold = 0.01, n_folds = 5\n", + ")\n", + "dml_irm$fit(store_predictions = TRUE)\n", "dml_irm$summary()\n", "forest_irm <- dml_irm$coef\n", "forest_std_irm <- dml_plr$se\n", @@ -1879,17 +2085,17 @@ "# predictions\n", "g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X)\n", "g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X)\n", - "g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0\n", + "g_hat <- d * g1_hat + (1 - d) * g0_hat # predictions of g_0\n", "m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_0\n", "\n", "# cross-fitted RMSE: outcome\n", "y <- as.matrix(pension$net_tfa) # true observations\n", "d <- as.matrix(pension$e401)\n", - "forest_y_irm <- sqrt(mean((y-g_hat)^2))\n", + "forest_y_irm <- sqrt(mean((y - g_hat)^2))\n", "forest_y_irm\n", "\n", "# cross-fitted RMSE: treatment\n", - "forest_d_irm <- sqrt(mean((d-m_hat)^2))\n", + "forest_d_irm <- sqrt(mean((d - m_hat)^2))\n", "forest_d_irm\n", "\n", "# cross-fitted ce: treatment\n", @@ -1897,9 +2103,11 @@ "\n", "##### trees #####\n", "\n", - "dml_irm <- DoubleMLIRM$new(data_ml, ml_g = trees, ml_m = trees_class,\n", - " trimming_threshold = 0.01, n_folds=5)\n", - "dml_irm$fit(store_predictions=TRUE)\n", + "dml_irm <- DoubleMLIRM$new(data_ml,\n", + " ml_g = trees, ml_m = trees_class,\n", + " trimming_threshold = 0.01, n_folds = 5\n", + ")\n", + "dml_irm$fit(store_predictions = TRUE)\n", "dml_irm$summary()\n", "tree_irm <- dml_irm$coef\n", "tree_std_irm <- dml_irm$se\n", @@ -1907,17 +2115,17 @@ "# predictions\n", "g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X)\n", "g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X)\n", - "g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0\n", + "g_hat <- d * g1_hat + (1 - d) * g0_hat # predictions of g_0\n", "m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o\n", "\n", "# cross-fitted RMSE: outcome\n", "y <- as.matrix(pension$net_tfa) # true observations\n", "d <- as.matrix(pension$e401)\n", - "tree_y_irm <- sqrt(mean((y-g_hat)^2))\n", + "tree_y_irm <- sqrt(mean((y - g_hat)^2))\n", "tree_y_irm\n", "\n", "# cross-fitted RMSE: treatment\n", - "tree_d_irm <- sqrt(mean((d-m_hat)^2))\n", + "tree_d_irm <- sqrt(mean((d - m_hat)^2))\n", "tree_d_irm\n", "\n", "# cross-fitted ce: treatment\n", @@ -1926,9 +2134,11 @@ "\n", "##### boosting #####\n", "\n", - "dml_irm <- DoubleMLIRM$new(data_ml, ml_g = boost, ml_m = boost_class,\n", - " trimming_threshold = 0.01, n_folds=5)\n", - "dml_irm$fit(store_predictions=TRUE)\n", + "dml_irm <- DoubleMLIRM$new(data_ml,\n", + " ml_g = boost, ml_m = boost_class,\n", + " trimming_threshold = 0.01, n_folds = 5\n", + ")\n", + "dml_irm$fit(store_predictions = TRUE)\n", "dml_irm$summary()\n", "boost_irm <- dml_irm$coef\n", "boost_std_irm <- dml_irm$se\n", @@ -1936,17 +2146,17 @@ "# predictions\n", "g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X)\n", "g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X)\n", - "g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0\n", + "g_hat <- d * g1_hat + (1 - d) * g0_hat # predictions of g_0\n", "m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o\n", "\n", "# cross-fitted RMSE: outcome\n", "y <- as.matrix(pension$net_tfa) # true observations\n", "d <- as.matrix(pension$e401)\n", - "boost_y_irm <- sqrt(mean((y-g_hat)^2))\n", + "boost_y_irm <- sqrt(mean((y - g_hat)^2))\n", "boost_y_irm\n", "\n", "# 
cross-fitted RMSE: treatment\n", - "boost_d_irm <- sqrt(mean((d-m_hat)^2))\n", + "boost_d_irm <- sqrt(mean((d - m_hat)^2))\n", "boost_d_irm\n", "\n", "# cross-fitted ce: treatment\n", @@ -1966,18 +2176,21 @@ "start_time": "2022-04-19T09:11:17.801787", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "table <- matrix(0, 4, 4)\n", - "table[1,1:4] <- c(lasso_irm,forest_irm,tree_irm,boost_irm)\n", - "table[2,1:4] <- c(lasso_std_irm,forest_std_irm,tree_std_irm,boost_std_irm)\n", - "table[3,1:4] <- c(lasso_y_irm,forest_y_irm,tree_y_irm,boost_y_irm)\n", - "table[4,1:4] <- c(lasso_d_irm,forest_d_irm,tree_d_irm,boost_d_irm)\n", - "rownames(table) <- c(\"Estimate\",\"Std.Error\",\"RMSE Y\",\"RMSE D\")\n", - "colnames(table) <- c(\"Lasso\",\"Random Forest\",\"Trees\",\"Boosting\")\n", - "tab<- xtable(table, digits = 2)\n", + "table[1, 1:4] <- c(lasso_irm, forest_irm, tree_irm, boost_irm)\n", + "table[2, 1:4] <- c(lasso_std_irm, forest_std_irm, tree_std_irm, boost_std_irm)\n", + "table[3, 1:4] <- c(lasso_y_irm, forest_y_irm, tree_y_irm, boost_y_irm)\n", + "table[4, 1:4] <- c(lasso_d_irm, forest_d_irm, tree_d_irm, boost_d_irm)\n", + "rownames(table) <- c(\"Estimate\", \"Std.Error\", \"RMSE Y\", \"RMSE D\")\n", + "colnames(table) <- c(\"Lasso\", \"Random Forest\", \"Trees\", \"Boosting\")\n", + "tab <- xtable(table, digits = 2)\n", "tab" ] }, @@ -2012,15 +2225,20 @@ "start_time": "2022-04-19T09:11:17.907974", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "lgr::get_logger(\"mlr3\")$set_threshold(\"warn\")\n", - "dml_irm = DoubleMLIRM$new(data_ml, ml_g = randomForest,\n", - " ml_m = lasso_class,\n", - " trimming_threshold = 0.01, n_folds=5)\n", - "dml_irm$fit(store_predictions=TRUE)\n", + "dml_irm <- DoubleMLIRM$new(data_ml,\n", + " ml_g = randomForest,\n", + " ml_m = lasso_class,\n", + " trimming_threshold = 0.01, n_folds = 5\n", + ")\n", + "dml_irm$fit(store_predictions = TRUE)\n", "dml_irm$summary()\n", "best_irm <- dml_irm$coef\n", "best_std_irm <- dml_irm$se" From c03ebf1e4636fbcaed120189fa1398091d232102 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 05:00:51 -0700 Subject: [PATCH 103/261] Update r_ml_wage_prediction.irnb --- PM3/r_ml_wage_prediction.irnb | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/PM3/r_ml_wage_prediction.irnb b/PM3/r_ml_wage_prediction.irnb index 3b591fe5..5a918407 100644 --- a/PM3/r_ml_wage_prediction.irnb +++ b/PM3/r_ml_wage_prediction.irnb @@ -1675,6 +1675,16 @@ "We can use AutoML as a benchmark and compare it to the methods that we used previously where we applied one machine learning method after the other." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "On a unix system, installation of the package h2o also requires the RCurl package, which requires the libcurl library to have been installed. If the installation of the h2o package fails, try installing the libcurl package first and repeating the cell. 
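One way to confirm that the missing system library is the culprit (an added aside; this check is not part of the original notebook) is to try installing RCurl on its own first, since its error message usually points directly at the missing dependency:\n",
    "\n",
    "```r\n",
    "# If this fails with a message mentioning curl-config or libcurl,\n",
    "# install the system library shown below and then re-run this cell.\n",
    "install.packages(\"RCurl\")\n",
    "```\n",
    "\n",
    "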
You can install the libcurl package for instance by running:\n", + "```bash\n", + "sudo apt-get install -y libcurl4-openssl-dev\n", + "```" + ] + }, { "cell_type": "code", "execution_count": null, From 340c73b37712d138dc92491aa3f357696e3855d7 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 05:03:58 -0700 Subject: [PATCH 104/261] Update r-identification-analysis-of-401-k-example-w-dags.irnb --- ...tion-analysis-of-401-k-example-w-dags.irnb | 94 ++++++++++++------- 1 file changed, 62 insertions(+), 32 deletions(-) diff --git a/PM4/r-identification-analysis-of-401-k-example-w-dags.irnb b/PM4/r-identification-analysis-of-401-k-example-w-dags.irnb index b9e05adc..cdfeb179 100644 --- a/PM4/r-identification-analysis-of-401-k-example-w-dags.irnb +++ b/PM4/r-identification-analysis-of-401-k-example-w-dags.irnb @@ -31,15 +31,18 @@ "start_time": "2021-04-20T21:06:30.048170", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "#install and load package\n", + "# install and load package\n", "install.packages(\"dagitty\")\n", "install.packages(\"ggdag\")\n", "library(dagitty)\n", - "library(ggdag)\n" + "library(ggdag)" ] }, { @@ -109,13 +112,16 @@ "start_time": "2021-04-20T21:06:55.532808", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "#generate a DAGs and plot them\n", + "# generate a DAGs and plot them\n", "\n", - "G1 = dagitty('dag{\n", + "G1 <- dagitty('dag{\n", "Y [outcome,pos=\"4, 0\"]\n", "D [exposure,pos=\"0, 0\"]\n", "X [confounder, pos=\"2,-2\"]\n", @@ -127,7 +133,7 @@ "X -> Y}')\n", "\n", "\n", - "ggdag(G1)+ theme_dag()" + "ggdag(G1) + theme_dag()" ] }, { @@ -160,11 +166,14 @@ "start_time": "2021-04-20T21:06:56.529499", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "adjustmentSets( G1, \"D\", \"Y\",effect=\"total\" )" + "adjustmentSets(G1, \"D\", \"Y\", effect = \"total\")" ] }, { @@ -215,13 +224,16 @@ "start_time": "2021-04-20T21:06:56.637212", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "#generate a couple of DAGs and plot them\n", + "# generate a couple of DAGs and plot them\n", "\n", - "G2 = dagitty('dag{\n", + "G2 <- dagitty('dag{\n", "Y [outcome,pos=\"4, 0\"]\n", "D [exposure,pos=\"0, 0\"]\n", "X [confounder, pos=\"2,-2\"]\n", @@ -233,7 +245,7 @@ "X -> Y}')\n", "\n", "\n", - "ggdag(G2)+ theme_dag()" + "ggdag(G2) + theme_dag()" ] }, { @@ -248,11 +260,14 @@ "start_time": "2021-04-20T21:06:57.038336", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "adjustmentSets( G2, \"D\", \"Y\", effect=\"total\" )\n" + "adjustmentSets(G2, \"D\", \"Y\", effect = \"total\")" ] }, { @@ -303,11 +318,14 @@ "start_time": "2021-04-20T21:06:57.174395", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "G3 = dagitty('dag{\n", + "G3 <- dagitty('dag{\n", "Y [outcome,pos=\"4, 0\"]\n", "D [exposure,pos=\"0, 0\"]\n", "X [confounder, pos=\"2,-2\"]\n", @@ -321,9 +339,9 @@ "A -> D\n", "X -> Y}')\n", "\n", - "adjustmentSets( G3, \"D\", \"Y\", effect=\"total\" )\n", + "adjustmentSets(G3, \"D\", \"Y\", effect = \"total\")\n", "\n", - "ggdag(G3)+ theme_dag()" + "ggdag(G3) + theme_dag()" ] }, { @@ -357,11 +375,14 @@ "start_time": "2021-04-20T21:06:57.615940", 
"status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "G4 = dagitty('dag{\n", + "G4 <- dagitty('dag{\n", "Y [outcome,pos=\"4, 0\"]\n", "D [exposure,pos=\"0, 0\"]\n", "X [confounder, pos=\"2,-2\"]\n", @@ -377,7 +398,7 @@ "X -> Y}')\n", "\n", "\n", - "ggdag(G4)+ theme_dag()" + "ggdag(G4) + theme_dag()" ] }, { @@ -392,11 +413,14 @@ "start_time": "2021-04-20T21:06:58.042805", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "adjustmentSets( G4, \"D\", \"Y\",effect=\"total\" )" + "adjustmentSets(G4, \"D\", \"Y\", effect = \"total\")" ] }, { @@ -462,11 +486,14 @@ "start_time": "2021-04-20T21:06:58.210967", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "G5 = dagitty('dag{\n", + "G5 <- dagitty('dag{\n", "Y [outcome,pos=\"4, 0\"]\n", "D [exposure,pos=\"0, 0\"]\n", "X [confounder, pos=\"2,-2\"]\n", @@ -484,9 +511,9 @@ "X -> M\n", "X -> Y}')\n", "\n", - "print( adjustmentSets( G5, \"D\", \"Y\",effect=\"total\" ) )\n", + "print(adjustmentSets(G5, \"D\", \"Y\", effect = \"total\"))\n", "\n", - "ggdag(G5)+ theme_dag()" + "ggdag(G5) + theme_dag()" ] }, { @@ -518,11 +545,14 @@ "start_time": "2021-04-20T21:06:58.654264", "status": "completed" }, - "tags": [] + "tags": [], + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "G6 = dagitty('dag{\n", + "G6 <- dagitty('dag{\n", "Y [outcome,pos=\"4, 0\"]\n", "D [exposure,pos=\"0, 0\"]\n", "X [confounder, pos=\"2,-2\"]\n", @@ -541,9 +571,9 @@ "X -> M\n", "X -> Y}')\n", "\n", - "print( adjustmentSets( G6, \"D\", \"Y\" ),effect=\"total\" )\n", + "print(adjustmentSets(G6, \"D\", \"Y\"), effect = \"total\")\n", "\n", - "ggdag(G6)+ theme_dag()" + "ggdag(G6) + theme_dag()" ] }, { From 87a61b7444dacd89d858f51531da931d3fc67f12 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 05:16:34 -0700 Subject: [PATCH 105/261] Moved the R version of Bert to in-progress --- PM5/Autoencoders.irnb | 141 ++- ...leML_and_Feature_Engineering_with_BERT.Rmd | 873 ------------------ ...eML_and_Feature_Engineering_with_BERT.irnb | 312 ++++--- 3 files changed, 296 insertions(+), 1030 deletions(-) delete mode 100644 PM5/DoubleML_and_Feature_Engineering_with_BERT.Rmd rename PM5/{ => in_progress}/DoubleML_and_Feature_Engineering_with_BERT.irnb (90%) diff --git a/PM5/Autoencoders.irnb b/PM5/Autoencoders.irnb index 845b5f96..5268b626 100644 --- a/PM5/Autoencoders.irnb +++ b/PM5/Autoencoders.irnb @@ -16,11 +16,14 @@ "3. Train a non-linear auto-encoder that uses a deep neural network\n", "\n", "### Overview\n", - "As explained in the text, autoencoders are a way of discovering *latent, low-dimensional structure* in a dataset. In particular, a random data vector $X \\in \\mathbb{R}^d$ can be said to have low-dimensional structure if we can find some functions $f: \\mathbb{R}^d \\to \\mathbb{R}^k$ and $g: \\mathbb{R}^k \\to \\mathbb{R}^d$, with $k \\ll d$, such that $$g(f(X)) \\approx X.$$\n", + "As explained in the text, autoencoders are a way of discovering *latent, low-dimensional structure* in a dataset. 
In particular, a random data vector $X \\in \\mathbb{R}^d$ can be said to have low-dimensional structure if we can find some functions $f: \\mathbb{R}^d \\to \\mathbb{R}^k$ and $g: \\mathbb{R}^k \\to \\mathbb{R}^d$, with $k \\ll d$, such that \n", + "$$g(f(X)) \\approx X.$$\n", + "\n", "In other words, $f(X)$ is a parsimonious, $k$-dimensional representation of $X$ that contains all of the information necessary to approximately reconstruct the full vector $X$. Traditionally, $f(X)$ is called an *encoding* of $X$.\n", "\n", "It turns out that this is meaningless unless we restrict what kinds of functions $f$ and $g$ are allowed to be, because it's possible to write down some (completely ugly) one-to-one function $\\mathbb{R}^d \\to \\mathbb{R}^1$ for any $d$. This gives rise to the notion of *autoencoders* where, given some sets of reasonable functions $F$ and $G$, we aim to minimize\n", - "$$\\mathbb{E}[\\mathrm{loss}(X, f(g(X))]$$ over functions $f \\in F$ and $g \\in G$. As usual, this is done by minimizing the sample analog.\n", + "$$\\mathbb{E}[\\mathrm{loss}(X, f(g(X))]$$\n", + "over functions $f \\in F$ and $g \\in G$. As usual, this is done by minimizing the sample analog.\n", "\n", "\n" ] @@ -45,7 +48,10 @@ "execution_count": null, "id": "2", "metadata": { - "id": "nf4aybFuwTft" + "id": "nf4aybFuwTft", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -57,7 +63,10 @@ "execution_count": null, "id": "3", "metadata": { - "id": "ID08-PSOeKRf" + "id": "ID08-PSOeKRf", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -75,7 +84,10 @@ "execution_count": null, "id": "4", "metadata": { - "id": "Z_ZpuBEBfCeH" + "id": "Z_ZpuBEBfCeH", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -96,8 +108,11 @@ "n_features <- dim(design_matrix)[2]\n", "\n", "# Print the dataset details\n", - "cat(sprintf(\"Labeled Faces in the Wild Dataset:\\n Number of examples: %d\\n Number of features: %d\\n Image height: %d\\n Image width: %d\",\n", - " n_examples, n_features, height, width))\n" + "cat(sprintf(\n", + " past(\"Labeled Faces in the Wild Dataset:\\n Number of examples: %d\\n \",\n", + " \"Number of features: %d\\n Image height: %d\\n Image width: %d\"),\n", + " n_examples, n_features, height, width\n", + "))" ] }, { @@ -105,7 +120,10 @@ "execution_count": null, "id": "5", "metadata": { - "id": "PX_E23v-5yZY" + "id": "PX_E23v-5yZY", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -116,29 +134,29 @@ "# Find indices where the label is 'Arnold Schwarzenegger'\n", "# faces$target uses python style indexing that starts at 0 rather than R style\n", "# indexing that starts at 1, so we subtract 1 so the indexing lines up\n", - "arnold_labels <- which(faces$target_names == \"Arnold Schwarzenegger\")-1\n", + "arnold_labels <- which(faces$target_names == \"Arnold Schwarzenegger\") - 1\n", "# Get indices of all images corresponding to Arnold\n", "arnold_pics <- which(faces$target %in% arnold_labels)\n", "\n", - "plot_faces <- function(images, n_row=2, n_col=3, width, height) {\n", - " par(mfrow=c(n_row, n_col), mar=c(0.5, 0.5, 0.5, 0.5))\n", + "plot_faces <- function(images, n_row = 2, n_col = 3, width, height) {\n", + " par(mfrow = c(n_row, n_col), mar = c(0.5, 0.5, 0.5, 0.5))\n", " for (i in seq_len(n_row * n_col)) {\n", " if (i <= length(images)) {\n", " # image needs to be transposed for and then flipped for correct orientation\n", " # using R \"image\"\n", " tmp <- t(images[[i]])\n", - " tmp <- tmp[,ncol(tmp):1]\n", - " 
image(tmp, col=gray.colors(256), axes=FALSE, xlab=\"\", ylab=\"\")\n", + " tmp <- tmp[, rev(seq_len(ncol(tmp)))]\n", + " image(tmp, col = gray.colors(256), axes = FALSE, xlab = \"\", ylab = \"\")\n", " }\n", " }\n", "}\n", "\n", "# Ensure arnold_images contains the right amount of data and is not NULL\n", - "arnold_images <- lapply(arnold_pics[1:min(6, length(arnold_pics))], function(idx) {\n", + "arnold_images <- lapply(arnold_pics[seq_len(min(6, length(arnold_pics)))], function(idx) {\n", " faces$images[idx, , ]\n", "})\n", "\n", - "plot_faces(arnold_images, n_row = 2, n_col = 3, height = 62, width = 47)\n" + "plot_faces(arnold_images, n_row = 2, n_col = 3, height = 62, width = 47)" ] }, { @@ -146,7 +164,10 @@ "execution_count": null, "id": "6", "metadata": { - "id": "imSXA7-jsGKl" + "id": "imSXA7-jsGKl", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -156,7 +177,7 @@ "pca <- prcomp(design_matrix, rank. = 128, retx = TRUE, center = TRUE, scale. = FALSE)\n", "\n", "# Extract the principal components (eigenfaces)\n", - "eigenfaces <- pca$rotation\n" + "eigenfaces <- pca$rotation" ] }, { @@ -164,18 +185,23 @@ "execution_count": null, "id": "7", "metadata": { - "id": "sLi4k8t3DrHe" + "id": "sLi4k8t3DrHe", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# 2. Plot the first 6 \"eigenfaces,\" the six images whose linear span best explains the variation in our dataset\n", "pca_images <- lapply(1:6, function(idx) {\n", - " array_reshape(eigenfaces[,idx], c(height,width))\n", + " array_reshape(eigenfaces[, idx], c(height, width))\n", "})\n", "\n", "plot_faces(pca_images, height = height, width = width)\n", - "# we check the first six eigenvectors/projection axes, reshaped (the eigenvectors that captured the highest variation in our dataset of images)\n", - "# here, eigenvector1 orthog to eigenvector2 and all the others => decorrelation (there's no way to reconstruct eigenvector1 using a linear combination of all the other eigenvectors)" + "# we check the first six eigenvectors/projection axes, reshaped (the eigenvectors that\n", + "# captured the highest variation in our dataset of images)\n", + "# here, eigenvector1 orthog to eigenvector2 and all the others => decorrelation\n", + "# (there's no way to reconstruct eigenvector1 using a linear combination of all the other eigenvectors)" ] }, { @@ -183,14 +209,17 @@ "execution_count": null, "id": "8", "metadata": { - "id": "Gmj2lpTfCXKC" + "id": "Gmj2lpTfCXKC", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "reconstruct <- function(image_vector, n_components, eigenfaces) {\n", " components <- eigenfaces[, 1:n_components, drop = FALSE]\n", " compimage <- components %*% (t(components) %*% image_vector)\n", - " return(array_reshape(compimage, c(height,width)))\n", + " return(array_reshape(compimage, c(height, width)))\n", "}\n", "\n", "# Select an Arnold image for reconstruction\n", @@ -202,7 +231,7 @@ "})\n", "\n", "# Plot the reconstructed faces\n", - "plot_faces(reconstructions, height = height, width = width)\n" + "plot_faces(reconstructions, height = height, width = width)" ] }, { @@ -210,7 +239,10 @@ "execution_count": null, "id": "9", "metadata": { - "id": "eoZ_BsXYDE7P" + "id": "eoZ_BsXYDE7P", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -223,19 +255,23 @@ "execution_count": null, "id": "10", "metadata": { - "id": "urlMaifVJCDc" + "id": "urlMaifVJCDc", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "library(keras)\n", + "\n", 
"encoding_dimension <- 64\n", "input_image <- layer_input(shape = n_features)\n", - "encoded <- layer_dense(units = encoding_dimension, activation = 'linear')(input_image)\n", - "decoded <- layer_dense(units = n_features, activation = 'linear')(encoded)\n", + "encoded <- layer_dense(units = encoding_dimension, activation = \"linear\")(input_image)\n", + "decoded <- layer_dense(units = n_features, activation = \"linear\")(encoded)\n", "autoencoder <- keras_model(inputs = input_image, outputs = decoded)\n", "autoencoder %>% compile(\n", - " optimizer = 'adam',\n", - " loss = 'mse'\n", + " optimizer = \"adam\",\n", + " loss = \"mse\"\n", ")\n", "autoencoder %>% fit(\n", " design_matrix,\n", @@ -252,7 +288,10 @@ "execution_count": null, "id": "11", "metadata": { - "id": "5OTUbWg8NcIE" + "id": "5OTUbWg8NcIE", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -271,17 +310,21 @@ "execution_count": null, "id": "12", "metadata": { - "id": "90nSf8Y8yIsl" + "id": "90nSf8Y8yIsl", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "# Compute neural reconstruction\n", - "face_vector_flat <- as.numeric(face_vector)\n", - "reconstruction <- predict(autoencoder, matrix(face_vector_flat, nrow = 1))\n", "library(ggplot2)\n", "library(gridExtra)\n", "library(reshape2)\n", "\n", + "# Compute neural reconstruction\n", + "face_vector_flat <- as.numeric(face_vector)\n", + "reconstruction <- predict(autoencoder, matrix(face_vector_flat, nrow = 1))\n", + "\n", "# Do visual comparison\n", "image_height <- 62\n", "image_width <- 47\n", @@ -297,7 +340,7 @@ "img1 <- as.numeric(reconstructions[[4]]) / 255\n", "img2 <- as.numeric(reconstruction) / 255\n", "mse <- mean((img1 - img2)^2)\n", - "mse\n" + "mse" ] }, { @@ -317,30 +360,33 @@ "execution_count": null, "id": "14", "metadata": { - "id": "KHPoFiS9fuhr" + "id": "KHPoFiS9fuhr", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "# Use a nonlinear neural network\n", - "\n", "library(tensorflow)\n", + "\n", + "# Use a nonlinear neural network\n", "n_features <- 2914\n", "encoding_dimension <- 64\n", "\n", "input_image <- layer_input(shape = n_features)\n", "encoded <- input_image %>%\n", - " layer_dense(units = encoding_dimension, activation = 'relu') %>%\n", - " layer_dense(units = encoding_dimension, activation = 'relu')\n", + " layer_dense(units = encoding_dimension, activation = \"relu\") %>%\n", + " layer_dense(units = encoding_dimension, activation = \"relu\")\n", "\n", "decoded <- encoded %>%\n", - " layer_dense(units = encoding_dimension, activation = 'relu') %>%\n", - " layer_dense(units = n_features, activation = 'relu')\n", + " layer_dense(units = encoding_dimension, activation = \"relu\") %>%\n", + " layer_dense(units = n_features, activation = \"relu\")\n", "\n", "autoencoder <- keras_model(inputs = input_image, outputs = decoded)\n", "\n", "autoencoder %>% compile(\n", - " optimizer = 'adam',\n", - " loss = 'mse'\n", + " optimizer = \"adam\",\n", + " loss = \"mse\"\n", ")\n", "autoencoder %>% fit(\n", " design_matrix,\n", @@ -355,14 +401,15 @@ "reconstruction <- predict(autoencoder, matrix(face_vector, nrow = 1))\n", "\n", "# Do visual comparison\n", - "plot_faces(list(reconstructions[[4]],t(matrix(reconstruction, nrow = image_width, ncol = image_height))), n_row = 1, n_col = 2, width = image_width, height = image_height)\n", + "plot_faces(list(reconstructions[[4]], t(matrix(reconstruction, nrow = image_width, ncol = image_height))),\n", + " n_row = 1, n_col = 2, width = image_width, height = 
image_height)\n", "\n", "# Do numeric comparison\n", "# We also normalize the black/white gradient to take values in [0,1] (divide by 255)\n", "img1 <- as.numeric(reconstructions[[4]]) / 255\n", "img2 <- as.numeric(reconstruction) / 255\n", "mse <- mean((img1 - img2)^2)\n", - "mse\n" + "mse" ] } ], diff --git a/PM5/DoubleML_and_Feature_Engineering_with_BERT.Rmd b/PM5/DoubleML_and_Feature_Engineering_with_BERT.Rmd deleted file mode 100644 index edf2972a..00000000 --- a/PM5/DoubleML_and_Feature_Engineering_with_BERT.Rmd +++ /dev/null @@ -1,873 +0,0 @@ ---- -title: An R Markdown document converted from "PM5/DoubleML_and_Feature_Engineering_with_BERT.irnb" -output: html_document ---- - -# BERT - -**Bidirectional Encoder Representations from Transformers.** - -_ | _ -- | - -![alt](https://pytorch.org/assets/images/bert1.png) | ![alt](https://pytorch.org/assets/images/bert2.png) - - -### **Overview** - -BERT was released together with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin *et al.* The model is based on the Transformer architecture introduced in [Attention Is All You Need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani *et al.* and has led to significant improvements in a wide range of natural language tasks. - -At the highest level, BERT maps from a block of text to a numeric vector which summarizes the relevant information in the text. - -What is remarkable is that numeric summary is sufficiently informative that, for example, the numeric summary of a paragraph followed by a reading comprehension question contains all the information necessary to satisfactorily answer the question. - -#### **Transfer Learning** - -BERT is a great example of a paradigm called *transfer learning*, which has proved very effective in recent years. In the first step, a network is trained on an unsupervised task using massive amounts of data. In the case of BERT, it was trained to predict missing words and to detect when pairs of sentences are presented in reversed order using all of Wikipedia. This was initially done by Google, using intense computational resources. - -Once this network has been trained, it is then used to perform many other supervised tasks using only limited data and computational resources: for example, sentiment classification in tweets or quesiton answering. The network is re-trained to perform these other tasks in such a way that only the final, output parts of the network are allowed to adjust by very much, so that most of the "information'' originally learned the network is preserved. This process is called *fine tuning*. - -##Getting to know BERT - -BERT, and many of its variants, are made avialable to the public by the open source [Huggingface Transformers](https://huggingface.co/transformers/) project. This is an amazing resource, giving researchers and practitioners easy-to-use access to this technology. - -In order to use BERT for modeling, we simply need to download the pre-trained neural network and fine tune it on our dataset, which is illustrated below. 
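As a preview of the workflow illustrated in the following cells, the pre-trained network is simply pulled from the Hugging Face hub and then fine tuned on our own data. This is a minimal sketch, assuming the `transformers` Python package is visible to `reticulate`; the notebook sets this environment up explicitly below.

```{r}
# Minimal sketch: download the pre-trained tokenizer and encoder once,
# then reuse them (and later fine tune them) on our own dataset.
library(reticulate)
transformers <- import("transformers")
tokenizer <- transformers$BertTokenizer$from_pretrained("bert-base-uncased")
bert <- transformers$TFBertModel$from_pretrained("bert-base-uncased")
```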
- -```{r} -install.packages("remotes") -remotes::install_github("rstudio/tensorflow") -install.packages("dplyr") -install.packages("DBI") -install.packages("ggplot2") -install.packages("reticulate") -install.packages("readr") -install.packages("stringr") -install.packages("tidyr") -install.packages("purrr") -install.packages("glmnet") -install.packages("caret") -install.packages("keras") -``` - -##Getting to know BERT - -BERT, and many of its variants, are made avialable to the public by the open source [Huggingface Transformers](https://huggingface.co/transformers/) project. This is an amazing resource, giving researchers and practitioners easy-to-use access to this technology. - -In order to use BERT for modeling, we simply need to download the pre-trained neural network and fine tune it on our dataset, which is illustrated below. - -```{r} -library(reticulate) -library(ggplot2) -library(DBI) -library(dplyr) -theme_set(theme_bw()) -``` - -```{r} -use_python("/usr/bin/python3", required = TRUE) # Adjust the path as needed -``` - -```{r} -py_run_string(' -import tensorflow as tf -import numpy as np -import pandas as pd -from transformers import BertTokenizer, TFBertModel -import warnings -warnings.simplefilter("ignore") -') -``` - -```{r} -# Check GPU availability -# py_run_string(' -# device_name = tf.test.gpu_device_name() -# if device_name != "/device:GPU:0": -# raise SystemError("GPU device not found") -# print("Found GPU at:", device_name) -# ') -``` - -```{r} -ssq <- function(x) sum(x * x) - -get_r2 <- function(y, yhat) { - resids <- yhat - y - flucs <- y - mean(y) - rss <- ssq(resids) - tss <- ssq(flucs) - cat(sprintf("RSS: %f, TSS + MEAN^2: %f, TSS: %f, R^2: %f", rss, tss + mean(y)^2, tss, 1 - rss/tss)) -} -``` - -```{r} -py_run_string(' -tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") -bert = TFBertModel.from_pretrained("bert-base-uncased") -') -``` - -### Tokenization - -The first step in using BERT (or any similar text embedding tool) is to *tokenize* the data. This step standardizes blocks of text, so that meaningless differences in text presentation don't affect the behavior of our algorithm. - -Typically the text is transformed into a sequence of 'tokens,' each of which corresponds to a numeric code. - -```{r} -py_run_string(' -s = "What happens to this string?" -tensors = tokenizer.encode_plus(s, add_special_tokens = True, return_tensors = "tf") -output = bert(tensors) -') -``` - -```{r} -# Let's try it out! -s <- "What happens to this string?" -py_run_string -input_ids <- py$tensors$input_ids -attention_mask <- py$tensors$attention_mask -token_type_ids <- py$tensors$token_type_ids - -print(sprintf('Original String: "%s"', s)) -print("Numeric encoding: ") -print(list( - input_ids = input_ids, - attention_mask = attention_mask, - token_type_ids = token_type_ids -)) -# What does this mean? -py_run_string('tokens = tokenizer.convert_ids_to_tokens(tensors["input_ids"].numpy().flatten().tolist())') -tokens <- py$tokens -print("Actual tokens:") -print(tokens) -``` - -### BERT in a nutshell - -Once we have our numeric tokens, we can simply plug them into the BERT network and get a numeric vector summary. Note that in applications, the BERT summary will be "fine tuned" to a particular task, which hasn't happened yet. - -```{r} -# Load the reticulate library -library(reticulate) - -input_text <- "What happens to this string?" 
- - -cat(sprintf("Input: \"%s\"\n\n", input_text)) - -py_run_string(sprintf(' -tensors_tf = tokenizer("%s", return_tensors="tf") -output = bert(tensors_tf) -', input_text)) - -output <- py$output - -py_run_string(' -from pprint import pformat -output_type = str(type(output["pooler_output"])) -output_shape = output["pooler_output"].shape -output_preview = pformat(output["pooler_output"].numpy()) -') - -output_type <- py$output_type -output_shape <- py$output_shape -output_preview <- py$output_preview - -cat(sprintf( -"Output type: %s\n\nOutput shape: %s\n\nOutput preview: %s\n", -output_type, -paste(output_shape, collapse=", "), -output_preview -)) -``` - -# A practical introduction to BERT - -In the next part of the notebook, we are going to explore how a tool like BERT may be useful for causal inference. - -In particular, we are going to apply BERT to a subset of data from the Amazon marketplace consisting of roughly 10,000 listings for products in the toy category. Each product comes with a text description, a price, and a number of times reviewed (which we'll use as a proxy for demand / market share). - -For more information on the dataset, checkout the [Dataset README](https://github.com/CausalAIBook/MetricsMLNotebooks/blob/main/data/amazon_toys.md). - -**For thought**: -What are some issues you may anticipate when using number of reviews as a proxy for demand or market share? - -### Getting to know the data - -First, we'll download and clean up the data, and do some preliminary inspection. - -```{r} -library(readr) -library(stringr) -library(tidyr) -library(purrr) -data_url <- "https://github.com/CausalAIBook/MetricsMLNotebooks/raw/main/data/amazon_toys.csv" -data <- read_csv(data_url, show_col_types = FALSE) -problems(data) - -data <- data %>% - mutate( - number_of_reviews = as.numeric(str_replace_all(number_of_reviews, ",", "")) - ) -``` - -```{r} -data <- data %>% - mutate( - number_of_reviews = as.numeric(str_replace_all(number_of_reviews, "\\D+", "")), - price = as.numeric(str_extract(price, "\\d+\\.?\\d*")) - ) %>% - filter(number_of_reviews > 0) %>% - mutate( - ln_p = log(price), - ln_q = log(number_of_reviews / sum(number_of_reviews)), - text = str_c(product_name, manufacturer, product_description, sep = " | ") - ) %>% - select(text, ln_p, ln_q, amazon_category_and_sub_category) %>% - drop_na() -print(head(data)) -data$text_num_words <- str_split(data$text, "\\s+") %>% map_int(length) -print(quantile(data$text_num_words, 0.99, na.rm = TRUE)) -``` - -```{r} -ggplot(data, aes(x = text_num_words)) + - geom_density() + - labs(title = "Density Plot of Text Lengths in Words") -``` - -Let's make a two-way scatter plot of prices and (proxied) market shares. - -```{r} -p1 <- ggplot(data, aes(x = ln_p, y = ln_q)) + - geom_point() + - geom_smooth(method = "lm", color = "red") + - labs(title = "Scatter Plot with Regression Line") -print(p1) -``` - -```{r} -p2 <- ggplot(data, aes(x = ln_p, y = ln_q)) + - geom_smooth(method = "lm", color = "red") + - labs(title = "Regression Line Only") -print(p2) -``` - -```{r} -model <- lm(ln_q ~ ln_p, data = data) -elasticity <- coef(model)["ln_p"] -se <- summary(model)$coefficients["ln_p", "Std. Error"] -r_squared_adj <- summary(model)$adj.r.squared -cat(sprintf("Elasticity: %f, SE: %f, R2: %f\n\n", elasticity, se, r_squared_adj)) -conf_intervals <- confint(model, c("(Intercept)", "ln_p"), level = 0.95) -print(conf_intervals) -``` - -Let's begin with a simple prediction task. 
We will discover how well can we explain the price of these products using their textual descriptions. - -```{r} -install.packages("caTools") -install.packages("base") -library(caTools) -``` - -```{r} -library(caTools) -set.seed(124) -split <- sample.split(Y = data$ln_p, SplitRatio = 0.8) -train_main <- data[split, ] -holdout <- data[!split, ] -split_main <- sample.split(Y = train_main$ln_p, SplitRatio = 0.75) -train <- train_main[split_main, ] -val <- train_main[!split_main, ] -``` - -```{r} -library(reticulate) -use_python("/usr/bin/python3", required = TRUE) -py_run_string('import tensorflow as tf') - -py$train_texts <- train$text -train_tensors <- py_run_string(" -tensors = tokenizer( - list(train_texts), - padding=True, - truncation=True, - max_length=128, - return_tensors='tf' -)") -train_tensors <- py$tensors - -py$val_texts <- val$text -val_tensors <- py_run_string(" -val_tensors = tokenizer( - list(val_texts), - padding=True, - truncation=True, - max_length=128, - return_tensors='tf' -)") -val_tensors <- py$val_tensors - -py$holdout_texts <- holdout$text -tensors_holdout <- py_run_string(" -tensors_holdout = tokenizer( - list(holdout_texts), - padding=True, - truncation=True, - max_length=128, - return_tensors='tf' -)") -tensors_holdout <- py$tensors_holdout -ln_p <- train$ln_p -ln_q <- train$ln_q -val_ln_p <- val$ln_p -val_ln_q <- val$ln_q -``` - -```{r} -ln_p <- train$ln_p -ln_q <- train$ln_q -val_ln_p <- val$ln_p -val_ln_q <- val$ln_q -``` - -# Using BERT as Feature Extractor - -```{r} -library(reticulate) -#Sys.setenv(RETICULATE_PYTHON = "/usr/bin/python") -library(keras) -#install_keras() -``` - -```{r} -library(caTools) -library(dplyr) -library(readr) -library(reticulate) -library(keras) -library(caret) -library(glmnet) -library(stringr) - -use_python("/usr/bin/python3", required = TRUE) -py_run_string('import tensorflow as tf') -py_run_string('from transformers import BertTokenizer, TFBertModel') -py_run_string(' -tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") -bert_model = TFBertModel.from_pretrained("bert-base-uncased") -') - -py$train_texts <- train$text -train_tensors <- py_run_string(" -tensors = tokenizer( - list(train_texts), - padding=True, - truncation=True, - max_length=128, - return_tensors='tf' -)") -train_tensors <- py$tensors - -py$val_texts <- val$text -val_tensors <- py_run_string(" -val_tensors = tokenizer( - list(val_texts), - padding=True, - truncation=True, - max_length=128, - return_tensors='tf' -)") -val_tensors <- py$val_tensors - -py$holdout_texts <- holdout$text -tensors_holdout <- py_run_string(" -tensors_holdout = tokenizer( - list(holdout_texts), - padding=True, - truncation=True, - max_length=128, - return_tensors='tf' -)") -tensors_holdout <- py$tensors_holdout - -ln_p <- train$ln_p -val_ln_p <- val$ln_p -holdout_ln_p <- holdout$ln_p - -py_run_string(' -import tensorflow as tf -from transformers import TFBertModel - -# Define the input layers -input_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name="input_ids") -token_type_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name="token_type_ids") -attention_mask = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name="attention_mask") - -# Load the pre-trained BERT model -bert_model = TFBertModel.from_pretrained("bert-base-uncased") -outputs = bert_model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) - -# Define the embedding model -embedding_model = tf.keras.models.Model(inputs=[input_ids, token_type_ids, 
attention_mask], outputs=outputs.last_hidden_state[:, 0, :]) -') - -py_run_string(' -import numpy as np -embeddings = embedding_model.predict({ - "input_ids": tf.convert_to_tensor(tensors["input_ids"]), - "token_type_ids": tf.convert_to_tensor(tensors["token_type_ids"]), - "attention_mask": tf.convert_to_tensor(tensors["attention_mask"]) -}) -') - -embeddings <- py$embeddings - -py$ln_p <- ln_p -py_run_string(' -from sklearn.linear_model import LassoCV -from sklearn.model_selection import KFold -from sklearn.preprocessing import StandardScaler -from sklearn.pipeline import make_pipeline - -lcv = make_pipeline(StandardScaler(), LassoCV(cv=KFold(n_splits=5, shuffle=True, random_state=123), random_state=123)) -lcv.fit(embeddings, ln_p) -') - -py_run_string(' -embeddings_val = embedding_model.predict({ - "input_ids": tf.convert_to_tensor(val_tensors["input_ids"]), - "token_type_ids": tf.convert_to_tensor(val_tensors["token_type_ids"]), - "attention_mask": tf.convert_to_tensor(val_tensors["attention_mask"]) -}) -val_predictions = lcv.predict(embeddings_val) -') - -val_predictions <- py$val_predictions - -r2_val <- caret::R2(val_predictions, val_ln_p) - -py_run_string(' -embeddings_holdout = embedding_model.predict({ - "input_ids": tf.convert_to_tensor(tensors_holdout["input_ids"]), - "token_type_ids": tf.convert_to_tensor(tensors_holdout["token_type_ids"]), - "attention_mask": tf.convert_to_tensor(tensors_holdout["attention_mask"]) -}) -holdout_predictions = lcv.predict(embeddings_holdout) -') - -holdout_predictions <- py$holdout_predictions - -r2_holdout <- caret::R2(holdout_predictions, holdout_ln_p) - -print(r2_val) -print(r2_holdout) -ln_p_hat_holdout <- holdout_predictions -``` - -# Linear Probing: Training Only Final Layer after BERT - -```{r} -### Now let's prepare our model - -from tensorflow.keras import Model, Input -from tensorflow.keras.layers import Dense, Dropout, Concatenate -import tensorflow_addons as tfa -from tensorflow.keras import regularizers - -tf.keras.utils.set_random_seed(123) - -input_ids = Input(shape=(128,), dtype=tf.int32) -token_type_ids = Input(shape=(128,), dtype=tf.int32) -attention_mask = Input(shape=(128,), dtype=tf.int32) - -# # First we compute the text embedding -Z = bert(input_ids, token_type_ids, attention_mask) - -for layer in bert.layers: - layer.trainable=False - for w in layer.weights: w._trainable=False - -# # We want the "pooled / summary" embedding, not individual word embeddings -Z = Z[1] - -# # Then we do a regular regression -# Z = Dropout(0.2)(Z) -ln_p_hat = Dense(1, activation='linear', - kernel_regularizer=regularizers.L2(1e-3))(Z) - -PricePredictionNetwork = Model([ - input_ids, - token_type_ids, - attention_mask, - ], ln_p_hat) -PricePredictionNetwork.compile( - optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), - loss=tf.keras.losses.MeanSquaredError(), - metrics=tfa.metrics.RSquare(), -) -PricePredictionNetwork.summary() -``` - -```{r} -from livelossplot import PlotLossesKeras - -tf.keras.utils.set_random_seed(123) -earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True) -modelcheckpoint = tf.keras.callbacks.ModelCheckpoint("/content/gdrive/MyDrive/pweights.hdf5", monitor='val_loss', save_best_only=True, save_weights_only=True) - -PricePredictionNetwork.fit( - x= [tensors['input_ids'], - tensors['token_type_ids'], - tensors['attention_mask'],], - y=ln_p, - validation_data = ( - [val_tensors['input_ids'], - val_tensors['token_type_ids'], - val_tensors['attention_mask']], val_ln_p - 
), - epochs=5, - callbacks = [earlystopping, modelcheckpoint, - PlotLossesKeras(groups = {'train_loss': ['loss'], 'train_rsq':['r_square'], 'val_loss': ['val_loss'], 'val_rsq': ['val_r_square']})], - batch_size=16, - shuffle=True) -``` - -# Fine Tuning starting from the Linear Probing Trained Weights - -Now we train the whole network, initializing the weights based on the result of the linear probing phase in the previous section. - -```{r} -### Now let's prepare our model - -from tensorflow.keras import Model, Input -from tensorflow.keras.layers import Dense, Dropout, Concatenate -import tensorflow_addons as tfa -from tensorflow.keras import regularizers - -tf.keras.utils.set_random_seed(123) - -input_ids = Input(shape=(128,), dtype=tf.int32) -token_type_ids = Input(shape=(128,), dtype=tf.int32) -attention_mask = Input(shape=(128,), dtype=tf.int32) - -# # First we compute the text embedding -Z = bert(input_ids, token_type_ids, attention_mask) - -for layer in bert.layers: - layer.trainable=True - for w in layer.weights: w._trainable=True - -# # We want the "pooled / summary" embedding, not individual word embeddings -Z = Z[1] - -# # Then we do a regularized linear regression -ln_p_hat = Dense(1, activation='linear', - kernel_regularizer=regularizers.L2(1e-3))(Z) - -PricePredictionNetwork = Model([ - input_ids, - token_type_ids, - attention_mask, - ], ln_p_hat) -PricePredictionNetwork.compile( - optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5), - loss=tf.keras.losses.MeanSquaredError(), - metrics=tfa.metrics.RSquare(), -) -PricePredictionNetwork.summary() -``` - -```{r} -PricePredictionNetwork.load_weights("/content/gdrive/MyDrive/pweights.hdf5") -``` - -```{r} -from livelossplot import PlotLossesKeras - -tf.keras.utils.set_random_seed(123) - -earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True) -modelcheckpoint = tf.keras.callbacks.ModelCheckpoint("/content/gdrive/MyDrive/pweights.hdf5", monitor='val_loss', save_best_only=True, save_weights_only=True) - -PricePredictionNetwork.fit( - x= [tensors['input_ids'], - tensors['token_type_ids'], - tensors['attention_mask'],], - y=ln_p, - validation_data = ( - [val_tensors['input_ids'], - val_tensors['token_type_ids'], - val_tensors['attention_mask']], val_ln_p - ), - epochs=10, - callbacks = [earlystopping, modelcheckpoint, - PlotLossesKeras(groups = {'train_loss': ['loss'], 'train_rsq':['r_square'], 'val_loss': ['val_loss'], 'val_rsq': ['val_r_square']})], - batch_size=16, - shuffle=True) -``` - -```{r} -PricePredictionNetwork.load_weights("/content/gdrive/MyDrive/pweights.hdf5") -``` - -```{r} -# Compute predictions -ln_p_hat_holdout = PricePredictionNetwork.predict([ - tensors_holdout['input_ids'], - tensors_holdout['token_type_ids'], - tensors_holdout['attention_mask'], - ]) -``` - -```{r} -print('Neural Net R^2, Price Prediction:') -get_r2(holdout['ln_p'], ln_p_hat_holdout) -``` - -```{r} -import matplotlib.pyplot as plt -plt.hist(ln_p_hat_holdout) -plt.show() -``` - -Now, let's go one step further and construct a DML estimator of the average price elasticity. In particular, we will model market share $q_i$ as -$$\ln q_i = \alpha + \beta \ln p_i + \psi(d_i) + \epsilon_i,$$ where $d_i$ denotes the description of product $i$ and $\psi$ is the composition of text embedding and a linear layer. 
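Under this model, the DML estimate of $\beta$ reduces to a residual-on-residual regression on the holdout sample. The sketch below is illustrative only and assumes `ln_p_hat_holdout` and `ln_q_hat_holdout` hold the holdout predictions from the price and quantity networks; the same computation is carried out in the cells that follow.

```{r}
# Partialling-out step of DML: residualize prices and shares on the text-based
# predictions, then regress one residual on the other to estimate the elasticity.
r_p <- holdout$ln_p - as.numeric(ln_p_hat_holdout)  # price residuals
r_q <- holdout$ln_q - as.numeric(ln_q_hat_holdout)  # market-share residuals

beta_hat <- mean(r_p * r_q) / mean(r_p^2)  # elasticity estimate
se_hat <- sqrt(mean((r_p * r_q)^2) / (mean(r_p^2)^2) / length(r_p))  # standard error

c(estimate = beta_hat, std_error = se_hat)
```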
- -```{r} -## Build the quantity prediction network - -tf.keras.utils.set_random_seed(123) - -# Initialize new BERT model from original -bert2 = TFBertModel.from_pretrained("bert-base-uncased") - -# for layer in bert2.layers: -# layer.trainable=False -# for w in layer.weights: w._trainable=False - -# Define inputs -input_ids = Input(shape=(128,), dtype=tf.int32) -token_type_ids = Input(shape=(128,), dtype=tf.int32) -attention_mask = Input(shape=(128,), dtype=tf.int32) - -# First we compute the text embedding -Z = bert2(input_ids, token_type_ids, attention_mask) - -# We want the "pooled / summary" embedding, not individual word embeddings -Z = Z[1] - -ln_q_hat = Dense(1, activation='linear', kernel_regularizer=regularizers.L2(1e-3))(Z) - -# Compile model and optimization routine -QuantityPredictionNetwork = Model([ - input_ids, - token_type_ids, - attention_mask, - ], ln_q_hat) -QuantityPredictionNetwork.compile( - optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5), - loss=tf.keras.losses.MeanSquaredError(), - metrics=tfa.metrics.RSquare(), -) -QuantityPredictionNetwork.summary() -``` - -```{r} -## Fit the quantity prediction network in the main sample -tf.keras.utils.set_random_seed(123) - -earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True) -modelcheckpoint = tf.keras.callbacks.ModelCheckpoint("/content/gdrive/MyDrive/qweights.hdf5", monitor='val_loss', save_best_only=True, save_weights_only=True) - -QuantityPredictionNetwork.fit( - [ - tensors['input_ids'], - tensors['token_type_ids'], - tensors['attention_mask'], - ], - ln_q, - validation_data = ( - [val_tensors['input_ids'], - val_tensors['token_type_ids'], - val_tensors['attention_mask']], val_ln_q - ), - epochs=10, - callbacks = [earlystopping, modelcheckpoint, - PlotLossesKeras(groups = {'train_loss': ['loss'], 'train_rsq':['r_square'], 'val_loss': ['val_loss'], 'val_rsq': ['val_r_square']})], - batch_size=16, - shuffle=True) -``` - -```{r} -QuantityPredictionNetwork.load_weights("/content/gdrive/MyDrive/qweights.hdf5") -``` - -```{r} -## Predict in the holdout sample, residualize and regress - -ln_q_hat_holdout = QuantityPredictionNetwork.predict([ - tensors_holdout['input_ids'], - tensors_holdout['token_type_ids'], - tensors_holdout['attention_mask'], - ]) -``` - -```{r} -print('Neural Net R^2, Quantity Prediction:') -get_r2(holdout['ln_q'], ln_q_hat_holdout) -``` - -```{r} -# Compute residuals -r_p = holdout["ln_p"] - ln_p_hat_holdout.reshape((-1,)) -r_q = holdout["ln_q"] - ln_q_hat_holdout.reshape((-1,)) - -# Regress to obtain elasticity estimate -beta = np.mean(r_p * r_q) / np.mean(r_p * r_p) - -# standard error on elastiticy estimate -se = np.sqrt(np.mean( (r_p* r_q)**2)/(np.mean(r_p*r_p)**2)/holdout["ln_p"].size) - -print('Elasticity of Demand with Respect to Price: {}'.format(beta)) -print('Standard Error: {}'.format(se)) -``` - -# Heterogeneous Elasticities within Major Product Categories - -We now look at the major product categories that have many products and we investigate whether the "within group" price elasticities - -```{r} -holdout['category'] = holdout['amazon_category_and_sub_category'].str.split('>').apply(lambda x: x[0]) -``` - -```{r} -# Elasticity within the main product categories -sql.run(""" - SELECT category, COUNT(*) - FROM holdout - GROUP BY 1 - HAVING COUNT(*)>=100 - ORDER BY 2 desc -""") -``` - -```{r} -main_cats = sql.run(""" - SELECT category - FROM holdout - GROUP BY 1 - HAVING COUNT(*)>=100 -""")['category'] - -dfs = [] -for cat in 
main_cats: - r_p = holdout[holdout['category'] == cat]["ln_p"] - ln_p_hat_holdout.reshape((-1,))[holdout['category'] == cat] - r_q = holdout[holdout['category'] == cat]["ln_q"] - ln_q_hat_holdout.reshape((-1,))[holdout['category'] == cat] - # Regress to obtain elasticity estimate - beta = np.mean(r_p * r_q) / np.mean(r_p * r_p) - - # standard error on elastiticy estimate - se = np.sqrt(np.mean( (r_p* r_q)**2)/(np.mean(r_p*r_p)**2)/holdout["ln_p"].size) - - df = pd.DataFrame({'point': beta, 'se': se, 'lower': beta - 1.96 * se, 'upper': beta + 1.96 * se}, index=[0]) - df['category'] = cat - df['N'] = holdout[holdout['category'] == cat].shape[0] - dfs.append(df) - -df = pd.concat(dfs) -df -``` - -## Clustering Products - -In this final part of the notebook, we'll illustrate how the BERT text embeddings can be used to cluster products based on their descriptions. - -Intiuitively, our neural network has now learned which aspects of the text description are relevant to predict prices and market shares. -We can therefore use the embeddings produced by our network to cluster products, and we might expect that the clusters reflect market-relevant information. - -In the following block of cells, we compute embeddings using our learned models and cluster them using $k$-means clustering with $k=10$. Finally, we will explore how the estimated price elasticity differs across clusters. - -### Overview of **$k$-means clustering** -The $k$-means clustering algorithm seeks to divide $n$ data vectors into $k$ groups, each of which contain points that are "close together." - -In particular, let $C_1, \ldots, C_k$ be a partitioning of the data into $k$ disjoint, nonempty subsets (clusters), and define -$$\bar{C_i}=\frac{1}{\#C_i}\sum_{x \in C_i} x$$ -to be the *centroid* of the cluster $C_i$. The $k$-means clustering score $\mathrm{sc}(C_1 \ldots C_k)$ is defined to be -$$\mathrm{sc}(C_1 \ldots C_k) = \sum_{i=1}^k \sum_{x \in C_i} \left(x - \bar{C_i}\right)^2.$$ - -The $k$-means clustering is then defined to be any partitioning $C^*_1 \ldots C^*_k$ that minimizes the score $\mathrm{sc}(-)$. - -```{r} -## STEP 1: Compute embeddings - -input_ids = Input(shape=(128,), dtype=tf.int32) -token_type_ids = Input(shape=(128,), dtype=tf.int32) -attention_mask = Input(shape=(128,), dtype=tf.int32) - -Y1 = bert(input_ids, token_type_ids, attention_mask)[1] -Y2 = bert2(input_ids, token_type_ids, attention_mask)[1] -Y = Concatenate()([Y1,Y2]) - -embedding_model = Model([input_ids, token_type_ids, attention_mask], Y) - -embeddings = embedding_model.predict([tensors_holdout['input_ids'], - tensors_holdout['token_type_ids'], - tensors_holdout['attention_mask']]) -``` - -### Dimension reduction and the **Johnson-Lindenstrauss transform** - -Our learned embeddings have dimension in the $1000$s, and $k$-means clustering is often an expensive operation. To improve the situation, we will use a neat trick that is used extensively in machine learning applications: the *Johnson-Lindenstrauss transform*. - -This trick involves finding a low-dimensional linear projection of the embeddings that approximately preserves pairwise distances. - -In fact, Johnson and Lindenstrauss proved a much more interesting statement: a Gaussian random matrix will *almost always* approximately preserve pairwise distances. 
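In R, the same two steps can be sketched directly: multiply the embedding matrix by a scaled Gaussian random matrix and then run `kmeans` on the projected points. This is only an illustration with a hand-picked target dimension, and it assumes `embeddings` is the n x d matrix of holdout embeddings computed above; the scikit-learn `GaussianRandomProjection(eps=.25)` call used below instead chooses the dimension from the Johnson-Lindenstrauss bound.

```{r}
# Illustrative Gaussian random projection + k-means on the holdout embeddings.
set.seed(123)
d <- ncol(embeddings)
k_dim <- 200  # illustrative target dimension (the JL bound would pin this down)
proj <- matrix(rnorm(d * k_dim), nrow = d) / sqrt(k_dim)  # Gaussian projection matrix
embeddings_lowdim <- embeddings %*% proj  # pairwise distances approximately preserved

km <- kmeans(embeddings_lowdim, centers = 10, nstart = 10)
cluster_ids <- km$cluster  # cluster label for each product
table(cluster_ids)
```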
- - -```{r} -# STEP 2 Make low-dimensional projections -from sklearn.random_projection import GaussianRandomProjection - -jl = GaussianRandomProjection(eps=.25) -embeddings_lowdim = jl.fit_transform(embeddings) -``` - -```{r} -# STEP 3 Compute clusters -from sklearn.cluster import KMeans - -k_means = KMeans(n_clusters=10) -k_means.fit(embeddings_lowdim) -cluster_ids = k_means.labels_ -``` - -```{r} -# STEP 4 Regress within each cluster - -betas = np.zeros(10) -ses = np.zeros(10) - -r_p = holdout["ln_p"] - ln_p_hat_holdout.reshape((-1,)) -r_q = holdout["ln_q"] - ln_q_hat_holdout.reshape((-1,)) - -for c in range(10): - - r_p_c = r_p[cluster_ids == c] - r_q_c = r_q[cluster_ids == c] - - # Regress to obtain elasticity estimate - betas[c] = np.mean(r_p_c * r_q_c) / np.mean(r_p_c * r_p_c) - - # standard error on elastiticy estimate - ses[c] = np.sqrt(np.mean( (r_p_c * r_q_c)**2)/(np.mean(r_p_c*r_p_c)**2)/r_p_c.size) -``` - -```{r} -# STEP 5 Plot -from matplotlib import pyplot as plt - -plt.bar(range(10), betas, yerr = 1.96 * ses) -``` - diff --git a/PM5/DoubleML_and_Feature_Engineering_with_BERT.irnb b/PM5/in_progress/DoubleML_and_Feature_Engineering_with_BERT.irnb similarity index 90% rename from PM5/DoubleML_and_Feature_Engineering_with_BERT.irnb rename to PM5/in_progress/DoubleML_and_Feature_Engineering_with_BERT.irnb index 9bd6d4ed..6bbcfc72 100644 --- a/PM5/DoubleML_and_Feature_Engineering_with_BERT.irnb +++ b/PM5/in_progress/DoubleML_and_Feature_Engineering_with_BERT.irnb @@ -11,8 +11,6 @@ "\n", "**Bidirectional Encoder Representations from Transformers.**\n", "\n", - "_ | _\n", - "- | -\n", "![alt](https://pytorch.org/assets/images/bert1.png) | ![alt](https://pytorch.org/assets/images/bert2.png)\n", "\n", "\n", @@ -54,7 +52,10 @@ "base_uri": "https://localhost:8080/" }, "id": "9rooQWVdri1m", - "outputId": "e8d7baf0-1420-4405-d5d5-c63466fbbcdd" + "outputId": "e8d7baf0-1420-4405-d5d5-c63466fbbcdd", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -96,7 +97,10 @@ "base_uri": "https://localhost:8080/" }, "id": "ppJlcoIatlAw", - "outputId": "3a5cbcea-04c9-4c67-ccb1-1b45e691ccf8" + "outputId": "3a5cbcea-04c9-4c67-ccb1-1b45e691ccf8", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -112,11 +116,14 @@ "execution_count": null, "id": "5", "metadata": { - "id": "GmOhRKEG4jEy" + "id": "GmOhRKEG4jEy", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "use_python(\"/usr/bin/python3\", required = TRUE) # Adjust the path as needed" + "use_python(\"/usr/bin/python3\", required = TRUE) # Adjust the path as needed" ] }, { @@ -124,7 +131,10 @@ "execution_count": null, "id": "6", "metadata": { - "id": "bUEb1TDIs4TK" + "id": "bUEb1TDIs4TK", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -138,30 +148,15 @@ "')" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "7", - "metadata": { - "id": "A7HTpjkA4u54" - }, - "outputs": [], - "source": [ - "# Check GPU availability\n", - "# py_run_string('\n", - "# device_name = tf.test.gpu_device_name()\n", - "# if device_name != \"/device:GPU:0\":\n", - "# raise SystemError(\"GPU device not found\")\n", - "# print(\"Found GPU at:\", device_name)\n", - "# ')" - ] - }, { "cell_type": "code", "execution_count": null, "id": "8", "metadata": { - "id": "hZaltj7Fv5Gh" + "id": "hZaltj7Fv5Gh", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -172,7 +167,7 @@ " flucs <- y - mean(y)\n", " rss <- ssq(resids)\n", " tss <- ssq(flucs)\n", - " cat(sprintf(\"RSS: %f, 
TSS + MEAN^2: %f, TSS: %f, R^2: %f\", rss, tss + mean(y)^2, tss, 1 - rss/tss))\n", + " cat(sprintf(\"RSS: %f, TSS + MEAN^2: %f, TSS: %f, R^2: %f\", rss, tss + mean(y)^2, tss, 1 - rss / tss))\n", "}" ] }, @@ -181,7 +176,10 @@ "execution_count": null, "id": "9", "metadata": { - "id": "CB3ur5xF41o-" + "id": "CB3ur5xF41o-", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -210,7 +208,10 @@ "execution_count": null, "id": "11", "metadata": { - "id": "cER5mL4fMSCr" + "id": "cER5mL4fMSCr", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -231,7 +232,10 @@ "height": 329 }, "id": "KVETer7w5euE", - "outputId": "3b7b87f7-69f4-4727-bccf-3bd8d6162e00" + "outputId": "3b7b87f7-69f4-4727-bccf-3bd8d6162e00", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -277,7 +281,10 @@ "base_uri": "https://localhost:8080/" }, "id": "Q1ODAgBMa3Zg", - "outputId": "7ccd4481-6489-4830-c86b-a3c40cb1c37c" + "outputId": "7ccd4481-6489-4830-c86b-a3c40cb1c37c", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -308,11 +315,11 @@ "output_preview <- py$output_preview\n", "\n", "cat(sprintf(\n", - "\"Output type: %s\\n\\nOutput shape: %s\\n\\nOutput preview: %s\\n\",\n", - "output_type,\n", - "paste(output_shape, collapse=\", \"),\n", - "output_preview\n", - "))\n" + " \"Output type: %s\\n\\nOutput shape: %s\\n\\nOutput preview: %s\\n\",\n", + " output_type,\n", + " paste(output_shape, collapse = \", \"),\n", + " output_preview\n", + "))" ] }, { @@ -338,16 +345,6 @@ "First, we'll download and clean up the data, and do some preliminary inspection." ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "16", - "metadata": { - "id": "_d5eA3xyzdtb" - }, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, @@ -358,7 +355,10 @@ "height": 247 }, "id": "5kzXygwH0BKw", - "outputId": "f368dd28-317f-4815-d05b-a0c2ce7f05d6" + "outputId": "f368dd28-317f-4815-d05b-a0c2ce7f05d6", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -366,6 +366,7 @@ "library(stringr)\n", "library(tidyr)\n", "library(purrr)\n", + "\n", "data_url <- \"https://github.com/CausalAIBook/MetricsMLNotebooks/raw/main/data/amazon_toys.csv\"\n", "data <- read_csv(data_url, show_col_types = FALSE)\n", "problems(data)\n", @@ -373,7 +374,7 @@ "data <- data %>%\n", " mutate(\n", " number_of_reviews = as.numeric(str_replace_all(number_of_reviews, \",\", \"\"))\n", - " )\n" + " )" ] }, { @@ -385,7 +386,10 @@ "base_uri": "https://localhost:8080/" }, "id": "1Su5vOGhD3Df", - "outputId": "577bbbd9-1a7d-4f7e-d9bd-decd46830dc0" + "outputId": "577bbbd9-1a7d-4f7e-d9bd-decd46830dc0", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -417,7 +421,10 @@ "height": 437 }, "id": "lovFEHaWp4lC", - "outputId": "2c191aaa-d7ed-4b62-ef28-c513fabfab05" + "outputId": "2c191aaa-d7ed-4b62-ef28-c513fabfab05", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -446,7 +453,10 @@ "height": 454 }, "id": "dNujhir1q_0N", - "outputId": "06ba2da1-4ffc-4dc4-d762-f0bd6cdc039c" + "outputId": "06ba2da1-4ffc-4dc4-d762-f0bd6cdc039c", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -467,7 +477,10 @@ "height": 454 }, "id": "t7axTjsnq8qI", - "outputId": "84a6aa7b-41cd-43f6-e371-2c46d15c86eb" + "outputId": "84a6aa7b-41cd-43f6-e371-2c46d15c86eb", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -486,7 +499,10 @@ "base_uri": "https://localhost:8080/" }, "id": "aZPOUT5SvR4o", - "outputId": 
"c5186cae-c1da-4c35-91b0-08f0b9986919" + "outputId": "c5186cae-c1da-4c35-91b0-08f0b9986919", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -518,7 +534,10 @@ "base_uri": "https://localhost:8080/" }, "id": "pGt-G-qpciGd", - "outputId": "dfdf62d9-66b7-4787-b00c-b019f9064cf4" + "outputId": "dfdf62d9-66b7-4787-b00c-b019f9064cf4", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -532,7 +551,10 @@ "execution_count": null, "id": "26", "metadata": { - "id": "EPX_-K9CPpuO" + "id": "EPX_-K9CPpuO", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -551,13 +573,17 @@ "execution_count": null, "id": "27", "metadata": { - "id": "IGFzO9sZPvAJ" + "id": "IGFzO9sZPvAJ", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "library(reticulate)\n", + "\n", "use_python(\"/usr/bin/python3\", required = TRUE)\n", - "py_run_string('import tensorflow as tf')\n", + "py_run_string(\"import tensorflow as tf\")\n", "\n", "py$train_texts <- train$text\n", "train_tensors <- py_run_string(\"\n", @@ -602,7 +628,10 @@ "execution_count": null, "id": "28", "metadata": { - "id": "XQ4DMJQ0drZm" + "id": "XQ4DMJQ0drZm", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -622,21 +651,6 @@ "# Using BERT as Feature Extractor" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "30", - "metadata": { - "id": "ytZhy46hdDdr" - }, - "outputs": [], - "source": [ - "library(reticulate)\n", - "#Sys.setenv(RETICULATE_PYTHON = \"/usr/bin/python\")\n", - "library(keras)\n", - "#install_keras()" - ] - }, { "cell_type": "code", "execution_count": null, @@ -647,10 +661,15 @@ "height": 835 }, "id": "TsqfQ3OH-HR3", - "outputId": "a4107a7d-ce9e-4d09-a63e-1bab3174bf76" + "outputId": "a4107a7d-ce9e-4d09-a63e-1bab3174bf76", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ + "library(reticulate)\n", + "library(keras)\n", "library(caTools)\n", "library(dplyr)\n", "library(readr)\n", @@ -661,8 +680,8 @@ "library(stringr)\n", "\n", "use_python(\"/usr/bin/python3\", required = TRUE)\n", - "py_run_string('import tensorflow as tf')\n", - "py_run_string('from transformers import BertTokenizer, TFBertModel')\n", + "py_run_string(\"import tensorflow as tf\")\n", + "py_run_string(\"from transformers import BertTokenizer, TFBertModel\")\n", "py_run_string('\n", "tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n", "bert_model = TFBertModel.from_pretrained(\"bert-base-uncased\")\n", @@ -719,7 +738,8 @@ "outputs = bert_model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)\n", "\n", "# Define the embedding model\n", - "embedding_model = tf.keras.models.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=outputs.last_hidden_state[:, 0, :])\n", + "embedding_model = tf.keras.models.Model(inputs=[input_ids, token_type_ids, attention_mask],\n", + " outputs=outputs.last_hidden_state[:, 0, :])\n", "')\n", "\n", "py_run_string('\n", @@ -734,7 +754,7 @@ "embeddings <- py$embeddings\n", "\n", "py$ln_p <- ln_p\n", - "py_run_string('\n", + "py_run_string(\"\n", "from sklearn.linear_model import LassoCV\n", "from sklearn.model_selection import KFold\n", "from sklearn.preprocessing import StandardScaler\n", @@ -742,7 +762,7 @@ "\n", "lcv = make_pipeline(StandardScaler(), LassoCV(cv=KFold(n_splits=5, shuffle=True, random_state=123), random_state=123))\n", "lcv.fit(embeddings, ln_p)\n", - "')\n", + "\")\n", "\n", "py_run_string('\n", "embeddings_val = embedding_model.predict({\n", @@ 
-790,7 +810,10 @@ "execution_count": null, "id": "33", "metadata": { - "id": "Ck1xqRIrmx8I" + "id": "Ck1xqRIrmx8I", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -840,7 +863,10 @@ "execution_count": null, "id": "34", "metadata": { - "id": "XhTREb3NcZhH" + "id": "XhTREb3NcZhH", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -884,7 +910,10 @@ "execution_count": null, "id": "36", "metadata": { - "id": "NzWCkTY87luH" + "id": "NzWCkTY87luH", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -933,11 +962,14 @@ "execution_count": null, "id": "37", "metadata": { - "id": "PWauCl0T7nUo" + "id": "PWauCl0T7nUo", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "PricePredictionNetwork.load_weights(\"/content/gdrive/MyDrive/pweights.hdf5\")" + "PricePredictionNetwork.load_weights(\"/content/gdrive/MyDrive/pweights.hdf5\")\n" ] }, { @@ -945,7 +977,10 @@ "execution_count": null, "id": "38", "metadata": { - "id": "iDSBZWAe8nhE" + "id": "iDSBZWAe8nhE", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -978,11 +1013,14 @@ "execution_count": null, "id": "39", "metadata": { - "id": "wchpbXoqBAJu" + "id": "wchpbXoqBAJu", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "PricePredictionNetwork.load_weights(\"/content/gdrive/MyDrive/pweights.hdf5\")" + "PricePredictionNetwork.load_weights(\"/content/gdrive/MyDrive/pweights.hdf5\")\n" ] }, { @@ -990,7 +1028,10 @@ "execution_count": null, "id": "40", "metadata": { - "id": "jpUmDHYfkJEZ" + "id": "jpUmDHYfkJEZ", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1007,12 +1048,15 @@ "execution_count": null, "id": "41", "metadata": { - "id": "g_XK81hpkQMN" + "id": "g_XK81hpkQMN", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "print('Neural Net R^2, Price Prediction:')\n", - "get_r2(holdout['ln_p'], ln_p_hat_holdout)" + "print(\"Neural Net R^2, Price Prediction:\")\n", + "get_r2(holdout[\"ln_p\"], ln_p_hat_holdout)\n" ] }, { @@ -1020,7 +1064,10 @@ "execution_count": null, "id": "42", "metadata": { - "id": "GR4QP4DJPQk0" + "id": "GR4QP4DJPQk0", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1034,7 +1081,10 @@ "execution_count": null, "id": "43", "metadata": { - "id": "RAGwE4peL1Me" + "id": "RAGwE4peL1Me", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [] @@ -1055,7 +1105,10 @@ "execution_count": null, "id": "45", "metadata": { - "id": "Qiteu6FaoctV" + "id": "Qiteu6FaoctV", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1102,7 +1155,10 @@ "execution_count": null, "id": "46", "metadata": { - "id": "aaxHV0gGMqpw" + "id": "aaxHV0gGMqpw", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1136,11 +1192,14 @@ "execution_count": null, "id": "47", "metadata": { - "id": "TfyQV3lw-xf2" + "id": "TfyQV3lw-xf2", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "QuantityPredictionNetwork.load_weights(\"/content/gdrive/MyDrive/qweights.hdf5\")" + "QuantityPredictionNetwork.load_weights(\"/content/gdrive/MyDrive/qweights.hdf5\")\n" ] }, { @@ -1148,7 +1207,10 @@ "execution_count": null, "id": "48", "metadata": { - "id": "YADpNj0jMygZ" + "id": "YADpNj0jMygZ", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1166,12 +1228,15 @@ "execution_count": null, "id": "49", "metadata": { - "id": "jh4criU1hGIP" + "id": "jh4criU1hGIP", + "vscode": { + "languageId": "r" + } }, "outputs": [], 
"source": [ - "print('Neural Net R^2, Quantity Prediction:')\n", - "get_r2(holdout['ln_q'], ln_q_hat_holdout)" + "print(\"Neural Net R^2, Quantity Prediction:\")\n", + "get_r2(holdout[\"ln_q\"], ln_q_hat_holdout)\n" ] }, { @@ -1179,7 +1244,10 @@ "execution_count": null, "id": "50", "metadata": { - "id": "ir-_yAfkPM6f" + "id": "ir-_yAfkPM6f", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1214,7 +1282,10 @@ "execution_count": null, "id": "52", "metadata": { - "id": "XRUZEXqc8HPG" + "id": "XRUZEXqc8HPG", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1226,7 +1297,10 @@ "execution_count": null, "id": "53", "metadata": { - "id": "ymWJv4Ej7lt9" + "id": "ymWJv4Ej7lt9", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1245,7 +1319,10 @@ "execution_count": null, "id": "54", "metadata": { - "id": "E3ncPtwt8nJi" + "id": "E3ncPtwt8nJi", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1307,7 +1384,10 @@ "execution_count": null, "id": "56", "metadata": { - "id": "Mc7I00JPK6wJ" + "id": "Mc7I00JPK6wJ", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1350,7 +1430,10 @@ "execution_count": null, "id": "58", "metadata": { - "id": "afGiLR7v6ecJ" + "id": "afGiLR7v6ecJ", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1366,7 +1449,10 @@ "execution_count": null, "id": "59", "metadata": { - "id": "9Tl9AM3J6j3X" + "id": "9Tl9AM3J6j3X", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1383,7 +1469,10 @@ "execution_count": null, "id": "60", "metadata": { - "id": "0l7De-Do6mD0" + "id": "0l7De-Do6mD0", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -1412,7 +1501,10 @@ "execution_count": null, "id": "61", "metadata": { - "id": "oXoe98f06njT" + "id": "oXoe98f06njT", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ From 8a5ae9999fedac8433449055ad87f9e1ead8e220 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 05:17:32 -0700 Subject: [PATCH 106/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 2989e516..844101d8 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - directory: ['PM1', 'PM2', 'PM3', 'PM4'] #, 'PM5', 'CM1', 'CM2', 'CM3', 'AC1', 'AC2', 'T'] + directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5'] #, 'CM1', 'CM2', 'CM3', 'AC1', 'AC2', 'T'] steps: - name: Checkout repository uses: actions/checkout@v2 From 1444d489a5e995dc1d85601966899151a564a487 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 05:48:20 -0700 Subject: [PATCH 107/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 844101d8..5d87b58c 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -28,6 +28,11 @@ jobs: run: | python -m pip install --upgrade pip pip install nbstripout tensorflow + + - name: Install dependencies + if: "matrix.folder == 'PM5'" + run: | + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Install system dependencies run: | From 
491c4324e61a3cb6e3823c265716823b5a2f1830 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 06:04:46 -0700 Subject: [PATCH 108/261] Linting CM1 --- .github/workflows/transform-R-to-Rmd.yml | 4 +- CM1/r-rct-penn-precision-adj.irnb | 162 ++++++++----- CM1/r-rct-vaccines.irnb | 282 +++++++++++++---------- CM1/r-sim-precision-adj.irnb | 98 ++++---- 4 files changed, 314 insertions(+), 232 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 5d87b58c..57100721 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5'] #, 'CM1', 'CM2', 'CM3', 'AC1', 'AC2', 'T'] + directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5', 'CM1'] #, 'CM2', 'CM3', 'AC1', 'AC2', 'T'] steps: - name: Checkout repository uses: actions/checkout@v2 @@ -30,7 +30,7 @@ jobs: pip install nbstripout tensorflow - name: Install dependencies - if: "matrix.folder == 'PM5'" + if: "matrix.directory == 'PM5'" run: | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi diff --git a/CM1/r-rct-penn-precision-adj.irnb b/CM1/r-rct-penn-precision-adj.irnb index 30c98e05..43be2452 100644 --- a/CM1/r-rct-penn-precision-adj.irnb +++ b/CM1/r-rct-penn-precision-adj.irnb @@ -13,7 +13,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "w7B1iDaqa2ZI" + "id": "w7B1iDaqa2ZI", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -43,17 +46,20 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "_whbk2z4elkY" + "id": "_whbk2z4elkY", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "## loading the data\n", - "file = \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/penn_jae.dat\"\n", - "Penn <- as.data.frame(read.table(file, header=T))\n", + "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/penn_jae.dat\"\n", + "Penn <- as.data.frame(read.table(file, header = TRUE))\n", "\n", "n <- dim(Penn)[1]\n", "p_1 <- dim(Penn)[2]\n", - "Penn<- subset(Penn, tg==4 | tg==0)\n", + "Penn <- subset(Penn, tg == 4 | tg == 0)\n", "attach(Penn)" ] }, @@ -61,11 +67,14 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "iTghGG4Kelkk" + "id": "iTghGG4Kelkk", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "T4<- (tg==4)\n", + "T4 <- (tg == 4)\n", "summary(T4)" ] }, @@ -73,7 +82,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "ZbWuqKExelkl" + "id": "ZbWuqKExelkl", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -162,15 +174,20 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "GYOMYN_WRNTL" + "id": "GYOMYN_WRNTL", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "data <- model.matrix(T4 ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 + agelt35 + agegt54 + durable + lusd + husd)^2)\n", + "data <- model.matrix(T4 ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 +\n", + " agelt35 + agegt54 + durable + lusd + husd)^2)\n", "\n", "# individual t-tests\n", - "m <- lm(T4 ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 + agelt35 + agegt54 + durable + lusd + husd)^2, data = as.data.frame(data))\n", - "coeftest(m, vcov = vcovHC(m, type=\"HC1\"))" + "m <- lm(T4 ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 
+\n", + " agelt35 + agegt54 + durable + lusd + husd)^2, data = as.data.frame(data))\n", + "coeftest(m, vcov = vcovHC(m, type = \"HC1\"))" ] }, { @@ -200,7 +217,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "T4AmK55IiiV1" + "id": "T4AmK55IiiV1", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -219,9 +239,9 @@ " return(sig_beta)\n", "}\n", "\n", - "p_values <- as.vector(coeftest(m, vcov = vcovHC(m, type=\"HC1\"))[,4])\n", + "p_values <- as.vector(coeftest(m, vcov = vcovHC(m, type = \"HC1\"))[, 4])\n", "significant_indices <- holm_bonferroni(p_values, alpha = 0.05)\n", - "print(paste(\"Significant Coefficients (Indices): \", significant_indices))\n" + "print(paste(\"Significant Coefficients (Indices): \", significant_indices))" ] }, { @@ -237,11 +257,14 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "iBMiaj3jAZuo" + "id": "iBMiaj3jAZuo", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "p_values <- as.vector(coeftest(m, vcov = vcovHC(m, type=\"HC1\"))[,4])\n", + "p_values <- as.vector(coeftest(m, vcov = vcovHC(m, type = \"HC1\"))[, 4])\n", "holm_reject <- p.adjust(sort(p_values), \"holm\") <= 0.05\n", "holm_reject" ] @@ -270,31 +293,31 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "43W-vaIzelk1" + "id": "43W-vaIzelk1", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# model specifications\n", "\n", - "\n", "# no adjustment (2-sample approach)\n", - "formula_cl <- log(inuidur1)~T4\n", + "formula_cl <- log(inuidur1) ~ T4\n", "\n", "# adding controls\n", - "formula_cra <- log(inuidur1)~T4+ (female+black+othrace+factor(dep)+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)^2\n", + "formula_cra <- log(inuidur1) ~ T4 + (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 +\n", + " agelt35 + agegt54 + durable + lusd + husd)^2\n", "# Omitted dummies: q1, nondurable, muld\n", "\n", + "ols_cl <- lm(formula_cl)\n", + "ols_cra <- lm(formula_cra)\n", "\n", - "ols.cl <- lm(formula_cl)\n", - "ols.cra <- lm(formula_cra)\n", - "\n", - "\n", - "ols.cl = coeftest(ols.cl, vcov = vcovHC(ols.cl, type=\"HC1\"))\n", - "ols.cra = coeftest(ols.cra, vcov = vcovHC(ols.cra, type=\"HC1\"))\n", + "ols_cl <- coeftest(ols_cl, vcov = vcovHC(ols_cl, type = \"HC1\"))\n", + "ols_cra <- coeftest(ols_cra, vcov = vcovHC(ols_cra, type = \"HC1\"))\n", "\n", - "print(ols.cl)\n", - "print(ols.cra)\n", - "\n" + "print(ols_cl)\n", + "print(ols_cra)" ] }, { @@ -310,23 +333,28 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "SGdP0kQ3elk2" + "id": "SGdP0kQ3elk2", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ + "# interactive regression model;\n", "\n", - "#interactive regression model;\n", - "\n", - "X = model.matrix (~ (female+black+othrace+factor(dep)+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)^2)[,-1]\n", + "X <- model.matrix(~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 +\n", + " agelt35 + agegt54 + durable + lusd + husd)^2)[, -1]\n", "dim(X)\n", - "demean<- function(x){ x - mean(x)}\n", - "X = apply(X, 2, demean)\n", "\n", - "ols.ira = lm(log(inuidur1) ~ T4*X)\n", - "ols.ira= coeftest(ols.ira, vcov = vcovHC(ols.ira, type=\"HC1\"))\n", - "print(ols.ira)\n", + "demean <- function(x) {\n", + " x - mean(x)\n", + "}\n", "\n", - "\n" + "X <- apply(X, 2, demean)\n", + "\n", + "ols_ira <- lm(log(inuidur1) ~ T4 * X)\n", + "ols_ira <- coeftest(ols_ira, vcov = vcovHC(ols_ira, type = \"HC1\"))\n", + "print(ols_ira)" ] }, { @@ 
-342,18 +370,20 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "O9AZ49XNelk3" + "id": "O9AZ49XNelk3", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "T4 = demean(T4)\n", - "\n", - "DX = model.matrix(~T4*X)[,-1]\n", + "T4 <- demean(T4)\n", "\n", - "rlasso.ira = summary(rlassoEffects(DX, log(inuidur1), index = 1))\n", + "DX <- model.matrix(~ T4 * X)[, -1]\n", "\n", + "rlasso_ira <- summary(rlassoEffects(DX, log(inuidur1), index = 1))\n", "\n", - "print(rlasso.ira)\n" + "print(rlasso_ira)" ] }, { @@ -369,40 +399,46 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "UGh_LJouellB" + "id": "UGh_LJouellB", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "str(ols.ira)\n", - "ols.ira[2,1]" + "str(ols_ira)\n", + "ols_ira[2, 1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "wvxXEMUQellC" + "id": "wvxXEMUQellC", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "table<- matrix(0, 2, 4)\n", - "table[1,1]<- ols.cl[2,1]\n", - "table[1,2]<- ols.cra[2,1]\n", - "table[1,3]<- ols.ira[2,1]\n", - "table[1,4]<- rlasso.ira[[1]][1]\n", + "table <- matrix(0, 2, 4)\n", + "table[1, 1] <- ols_cl[2, 1]\n", + "table[1, 2] <- ols_cra[2, 1]\n", + "table[1, 3] <- ols_ira[2, 1]\n", + "table[1, 4] <- rlasso_ira[[1]][1]\n", "\n", - "table[2,1]<- ols.cl[2,2]\n", - "table[2,2]<- ols.cra[2,2]\n", - "table[2,3]<- ols.ira[2,2]\n", - "table[2,4]<- rlasso.ira[[1]][2]\n", + "table[2, 1] <- ols_cl[2, 2]\n", + "table[2, 2] <- ols_cra[2, 2]\n", + "table[2, 3] <- ols_ira[2, 2]\n", + "table[2, 4] <- rlasso_ira[[1]][2]\n", "\n", "\n", - "colnames(table)<- c(\"CL\",\"CRA\",\"IRA\", \"IRA w Lasso\")\n", - "rownames(table)<- c(\"estimate\", \"standard error\")\n", - "tab<- xtable(table, digits=5)\n", + "colnames(table) <- c(\"CL\", \"CRA\", \"IRA\", \"IRA w Lasso\")\n", + "rownames(table) <- c(\"estimate\", \"standard error\")\n", + "tab <- xtable(table, digits = 5)\n", "tab\n", "\n", - "print(tab, type=\"latex\", digits=5)" + "print(tab, type = \"latex\", digits = 5)" ] }, { diff --git a/CM1/r-rct-vaccines.irnb b/CM1/r-rct-vaccines.irnb index 73bcf929..e06d814b 100644 --- a/CM1/r-rct-vaccines.irnb +++ b/CM1/r-rct-vaccines.irnb @@ -13,7 +13,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "fW54aax9mE2G" + "id": "fW54aax9mE2G", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -57,59 +60,69 @@ "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", - "id": "SE8nvAWberNc" + "id": "SE8nvAWberNc", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "NV = 200745 # number of vaccinated (treated)\n", - "NU = 201229 # number of unvaccinated (control)\n", - "RV= 33/NV # average outcome for vaccinated\n", - "RU =115/NU # average outcome for unvaccinated\n", - "VE = (RU - RV)/RU; # vaccine efficacy\n", + "NV <- 200745 # number of vaccinated (treated)\n", + "NU <- 201229 # number of unvaccinated (control)\n", + "RV <- 33 / NV # average outcome for vaccinated\n", + "RU <- 115 / NU # average outcome for unvaccinated\n", + "VE <- (RU - RV) / RU\n", + "# vaccine efficacy\n", "\n", "# incidence per 100000\n", - "Incidence.RV=RV*100000\n", - "Incidence.RU=RU*100000\n", + "incidence_rv <- RV * 100000\n", + "incidence_ru <- RU * 100000\n", "\n", - "print(paste(\"Incidence per 100000 among treated:\", round(Incidence.RV,4)))\n", + "print(paste(\"Incidence per 100000 among treated:\", 
round(incidence_rv, 4)))\n", "\n", - "print(paste(\"Incidence per 100000 among controlled:\", round(Incidence.RU,4)))\n", + "print(paste(\"Incidence per 100000 among controlled:\", round(incidence_ru, 4)))\n", "\n", "# treatment effect - estimated reduction in incidence per 100000 people\n", - "delta.hat = 100000*(RV-RU)\n", + "delta_hat <- 100000 * (RV - RU)\n", "\n", - "print(paste(\"Estimated ATE of occurances per 100,000 is\", round(delta.hat,4)))\n", + "print(paste(\"Estimated ATE of occurances per 100,000 is\", round(delta_hat, 4)))\n", "\n", "# variance, standard deviation and confidence interval of ATE using that outcomes are Bernoulli\n", - "Var.RV = RV*(1-RV)/NV\n", - "Var.RU = RU*(1-RU)/NU\n", - "Var.delta.hat = 100000^2*(Var.RV + Var.RU)\n", - "Std.delta.hat = sqrt(Var.delta.hat)\n", + "var_rv <- RV * (1 - RV) / NV\n", + "var_ru <- RU * (1 - RU) / NU\n", + "var_delta_hat <- 100000^2 * (var_rv + var_ru)\n", + "std_delta_hat <- sqrt(var_delta_hat)\n", "\n", - "print(paste(\"Standard deviation for ATE is\", round(Std.delta.hat,4)))\n", + "print(paste(\"Standard deviation for ATE is\", round(std_delta_hat, 4)))\n", "\n", - "CI.delta = c(delta.hat -1.96*sqrt(Var.delta.hat),\n", - " delta.hat +1.96*sqrt(Var.delta.hat))\n", + "ci_delta <- c(\n", + " delta_hat - 1.96 * sqrt(var_delta_hat),\n", + " delta_hat + 1.96 * sqrt(var_delta_hat)\n", + ")\n", "\n", - "print(paste(\"95% confidence interval of ATE is [\", round(CI.delta[1],4), \",\",\n", - " round(CI.delta[2],4), \"]\" ))\n", + "print(paste(\n", + " \"95% confidence interval of ATE is [\", round(ci_delta[1], 4), \",\",\n", + " round(ci_delta[2], 4), \"]\"\n", + "))\n", "\n", - "print(paste(\"Overall VE is\", round(VE,4) ))\n", + "print(paste(\"Overall VE is\", round(VE, 4)))\n", "\n", "# we use an approximate bootstrap to find the confidence interval of vaccine efficacy\n", "# via Monte Carlo draws\n", "set.seed(1)\n", - "B = 10000 # number of bootstraps\n", - "RVs = RV + rnorm(B)*sqrt(Var.RV)\n", - "RUs = RU + rnorm(B)*sqrt(Var.RU)\n", - "VEs= (RUs - RVs)/RUs\n", + "B <- 10000 # number of bootstraps\n", + "RVs <- RV + rnorm(B) * sqrt(var_rv)\n", + "RUs <- RU + rnorm(B) * sqrt(var_ru)\n", + "VEs <- (RUs - RVs) / RUs\n", "\n", - "plot(density(VEs), col=2, main=\"Approximate Distribution of VE estimates\")\n", + "plot(density(VEs), col = 2, main = \"Approximate Distribution of VE estimates\")\n", "\n", - "CI.VE = quantile(VEs, c(.025, .975)) # use the empirical quantiles from the bootstraps\n", + "ci_ve <- quantile(VEs, c(.025, .975)) # use the empirical quantiles from the bootstraps\n", "\n", - "print(paste(\"95% confidence interval of VE is [\", round(CI.VE[1],4), \",\",\n", - " round(CI.VE[2],4), \"]\"))" + "print(paste(\n", + " \"95% confidence interval of VE is [\", round(ci_ve[1], 4), \",\",\n", + " round(ci_ve[2], 4), \"]\"\n", + "))" ] }, { @@ -138,59 +151,73 @@ "metadata": { "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a", - "id": "mdrjpK4XerNl" + "id": "mdrjpK4XerNl", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "NV = 19965; # number vaccinated\n", - "NU = 20172; # number unvaccinated\n", - "RV = 9/NV; # average outcome for vaccinated\n", - "RU = 169/NU; # average outcome for unvaccinated\n", - "VE = (RU - RV)/RU; # vaccine efficacy\n", + "NV <- 19965\n", + "# number vaccinated\n", + "NU <- 20172\n", + "# number unvaccinated\n", + "RV <- 9 / NV\n", + "# average outcome for vaccinated\n", + "RU <- 169 / NU\n", + "# average outcome for 
unvaccinated\n", + "VE <- (RU - RV) / RU\n", + "# vaccine efficacy\n", "\n", "# incidence per 100000\n", - "Incidence.RV=RV*100000\n", - "Incidence.RU=RU*100000\n", + "incidence_rv <- RV * 100000\n", + "incidence_ru <- RU * 100000\n", "\n", - "print(paste(\"Incidence per 100000 among vaccinated:\", round(Incidence.RV,4)))\n", + "print(paste(\"Incidence per 100000 among vaccinated:\", round(incidence_rv, 4)))\n", "\n", - "print(paste(\"Incidence per 100000 among unvaccinated:\", round(Incidence.RU,4)))\n", + "print(paste(\"Incidence per 100000 among unvaccinated:\", round(incidence_ru, 4)))\n", "\n", "# treatment effect - estimated reduction in incidence per 100000 people\n", - "delta.hat = 100000*(RV-RU)\n", + "delta_hat <- 100000 * (RV - RU)\n", "\n", - "print(paste(\"Estimated ATE of occurances per 100,000 is\", round(delta.hat,4)))\n", + "print(paste(\"Estimated ATE of occurances per 100,000 is\", round(delta_hat, 4)))\n", "\n", "# variance, standard deviation and confidence interval of ATE using that outcomes are Bernoulli\n", - "Var.RV = RV*(1-RV)/NV\n", - "Var.RU = RU*(1-RU)/NU\n", - "Var.delta.hat = 100000^2*(Var.RV + Var.RU)\n", - "Std.delta.hat = sqrt(Var.delta.hat)\n", + "var_rv <- RV * (1 - RV) / NV\n", + "var_ru <- RU * (1 - RU) / NU\n", + "var_delta_hat <- 100000^2 * (var_rv + var_ru)\n", + "std_delta_hat <- sqrt(var_delta_hat)\n", "\n", - "print(paste(\"Standard deviation for ATE is\", round(Std.delta.hat,4)))\n", + "print(paste(\"Standard deviation for ATE is\", round(std_delta_hat, 4)))\n", "\n", - "CI.delta = c(delta.hat -1.96*sqrt(Var.delta.hat),\n", - " delta.hat +1.96*sqrt(Var.delta.hat))\n", + "ci_delta <- c(\n", + " delta_hat - 1.96 * sqrt(var_delta_hat),\n", + " delta_hat + 1.96 * sqrt(var_delta_hat)\n", + ")\n", "\n", - "print(paste(\"95% confidence interval of ATE is [\", round(CI.delta[1],4), \",\",\n", - " round(CI.delta[2],4), \"]\" ))\n", + "print(paste(\n", + " \"95% confidence interval of ATE is [\", round(ci_delta[1], 4), \",\",\n", + " round(ci_delta[2], 4), \"]\"\n", + "))\n", "\n", - "print(paste(\"Overall VE is\", round(VE,4) ))\n", + "print(paste(\"Overall VE is\", round(VE, 4)))\n", "\n", "# we use an approximate bootstrap to find the VE confidence interval\n", "# using Monte Carlo draws as before\n", "set.seed(1)\n", - "B = 10000\n", - "RVs = RV + rnorm(B)*sqrt(Var.RV)\n", - "RUs = RU + rnorm(B)*sqrt(Var.RU)\n", - "VEs= (RUs - RVs)/RUs\n", + "B <- 10000\n", + "RVs <- RV + rnorm(B) * sqrt(var_rv)\n", + "RUs <- RU + rnorm(B) * sqrt(var_ru)\n", + "VEs <- (RUs - RVs) / RUs\n", "\n", - "plot(density(VEs), col=2, main=\"Approximate Distribution of VE estimates\")\n", + "plot(density(VEs), col = 2, main = \"Approximate Distribution of VE estimates\")\n", "\n", - "CI.VE = quantile(VEs, c(.025, .975))\n", + "ci_ve <- quantile(VEs, c(.025, .975))\n", "\n", - "print(paste(\"95% confidence interval of VE is [\", round(CI.VE[1],4), \",\",\n", - " round(CI.VE[2],4), \"]\" ))" + "print(paste(\n", + " \"95% confidence interval of VE is [\", round(ci_ve[1], 4), \",\",\n", + " round(ci_ve[2], 4), \"]\"\n", + "))" ] }, { @@ -206,42 +233,48 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "kOcfliFTerNo" + "id": "kOcfliFTerNo", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Here we calculate the overall effectiveness of the vaccine for the two groups that are 65 or older\n", - "NV = 3239+805;\n", - "NU = 3255+812;\n", - "RV = 1/NV;\n", - "RU = (14+5)/NU;\n", - "VE = (RU - RV)/RU;\n", + "NV <- 3239 + 805\n", + "NU <- 3255 + 
812\n", + "RV <- 1 / NV\n", + "RU <- (14 + 5) / NU\n", + "VE <- (RU - RV) / RU\n", + "print(paste(\"Overall VE is\", round(VE, 4)))\n", "\n", - "print(paste(\"Overall VE is\", round(VE,4)) )\n", - "\n", - "Var.RV = RV*(1-RV)/NV\n", - "Var.RU = RU*(1-RU)/NU\n", + "var_rv <- RV * (1 - RV) / NV\n", + "var_ru <- RU * (1 - RU) / NU\n", "\n", "# As before, we use an approximate bootstrap to find the confidence intervals\n", "# using Monte Carlo draws\n", "\n", "set.seed(1)\n", - "B = 10000\n", - " RVs = RV + rnorm(B)*sqrt(Var.RV)+ 10^(-10)\n", - " RUs = RU + rnorm(B)*sqrt(Var.RU)+ 10^(-10)\n", - " VEs= (RUs - RVs)/RUs\n", + "B <- 10000\n", + "RVs <- RV + rnorm(B) * sqrt(var_rv) + 10^(-10)\n", + "RUs <- RU + rnorm(B) * sqrt(var_ru) + 10^(-10)\n", + "VEs <- (RUs - RVs) / RUs\n", "\n", - "plot(density(VEs), col=2, main=\"Approximate Distribution of VE estimates\")\n", + "plot(density(VEs), col = 2, main = \"Approximate Distribution of VE estimates\")\n", "\n", - "CI.VE = quantile(VEs, c(.025, .975))\n", + "ci_ve <- quantile(VEs, c(.025, .975))\n", "\n", - "print(paste(\"two-sided 95 % confidence interval is [\", CI.VE[1], \",\",\n", - " CI.VE[2], \"]\" ))\n", + "print(paste(\n", + " \"two-sided 95 % confidence interval is [\", ci_ve[1], \",\",\n", + " ci_ve[2], \"]\"\n", + "))\n", "\n", - "OneSidedCI.VE = quantile(VEs, c(.05))\n", + "one_sided_ci_ve <- quantile(VEs, c(.05))\n", "\n", - "print(paste(\"one-sided 95 % confidence interval is [\", OneSidedCI.VE[1], \",\",\n", - " 1, \"]\" ))" + "print(paste(\n", + " \"one-sided 95 % confidence interval is [\", one_sided_ci_ve[1], \",\",\n", + " 1, \"]\"\n", + "))" ] }, { @@ -257,34 +290,38 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "3WTthWWeerNr" + "id": "3WTthWWeerNr", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "NV = 3239+805;\n", - "NU = 3255+812;\n", - "RV = 1/NV;\n", - "RU = (14+5)/NU;\n", - "VE = (RU - RV)/RU;\n", - "\n", - "print(paste(\"Overall VE is\", VE) )\n", + "NV <- 3239 + 805\n", + "NU <- 3255 + 812\n", + "RV <- 1 / NV\n", + "RU <- (14 + 5) / NU\n", + "VE <- (RU - RV) / RU\n", + "print(paste(\"Overall VE is\", VE))\n", "\n", "set.seed(1)\n", - "B = 10000 #number of simulation draw\n", - " RVs = rbinom(100000, size= NV, prob = RV)\n", - " RUs = rbinom(100000, size= NU, prob = RU)\n", - " VEs= (RUs - RVs)/RUs\n", + "B <- 10000 # number of simulation draw\n", + "RVs <- rbinom(100000, size = NV, prob = RV)\n", + "RUs <- rbinom(100000, size = NU, prob = RU)\n", + "VEs <- (RUs - RVs) / RUs\n", "\n", - "plot(density(VEs), col=2, main=\"Approximate Distribution of VE estimates\")\n", + "plot(density(VEs), col = 2, main = \"Approximate Distribution of VE estimates\")\n", "\n", - "CI.VE = quantile(VEs, c(.025, .975))\n", + "ci_ve <- quantile(VEs, c(.025, .975))\n", "\n", - "print(paste(\"two-sided 95 % confidence interval is [\", CI.VE[1], \",\",\n", - " CI.VE[2], \"]\" ))\n", + "print(paste(\n", + " \"two-sided 95 % confidence interval is [\", ci_ve[1], \",\",\n", + " ci_ve[2], \"]\"\n", + "))\n", "\n", - "OneSidedCI.VE = quantile(VEs, c(.05))\n", + "one_sided_ci_ve <- quantile(VEs, c(.05))\n", "\n", - "print(paste(\"one-sided 95 % confidence interval is [\", OneSidedCI.VE[1], \",\", 1, \"]\" ))" + "print(paste(\"one-sided 95 % confidence interval is [\", one_sided_ci_ve[1], \",\", 1, \"]\"))" ] }, { @@ -302,20 +339,22 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "XwFMp7vLerNz" + "id": "XwFMp7vLerNz", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ 
"# Exact CI exploiting Bernoulli outcome using the Cornfield Procedure\n", - "NV = 19965;\n", - "NU = 20172;\n", - "RV = 9/NV;\n", - "RU = 169/NU;\n", - "VE = (RU - RV)/RU;\n", - "\n", - "# 1- Cornfieldexact.CI(9, NV, 169, NU, conf = 0.95, interval = c(1e-08, 1e+08))\n", - "1-riskscoreci(9,NV,169,NU,0.95)$conf.int[2]\n", - "1-riskscoreci(9,NV,169,NU,0.95)$conf.int[1]" + "NV <- 19965\n", + "NU <- 20172\n", + "RV <- 9 / NV\n", + "RU <- 169 / NU\n", + "VE <- (RU - RV) / RU\n", + "\n", + "1 - riskscoreci(9, NV, 169, NU, 0.95)$conf.int[2]\n", + "1 - riskscoreci(9, NV, 169, NU, 0.95)$conf.int[1]" ] }, { @@ -340,21 +379,22 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "iP0ZCUw8erN3" + "id": "iP0ZCUw8erN3", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# Exact CI exploiting Bernoulli outcome for the two groups that are 65 or older\n", - "NV = 3239+805;\n", - "NU = 3255+812;\n", - "RV = 1/NV;\n", - "RU = (14+5)/NU;\n", - "VE = (RU - RV)/RU;\n", - "\n", - "# 1- Cornfieldexact.CI(1, NV, 19, NU, conf = 0.95, interval = c(1e-08, 1e+08))\n", - "\n", - "1-riskscoreci(1,NV,19,NU,0.95)$conf.int[2]\n", - "1-riskscoreci(1,NV,19,NU,0.95)$conf.int[1]\n" + "NV <- 3239 + 805\n", + "NU <- 3255 + 812\n", + "RV <- 1 / NV\n", + "RU <- (14 + 5) / NU\n", + "VE <- (RU - RV) / RU\n", + "\n", + "1 - riskscoreci(1, NV, 19, NU, 0.95)$conf.int[2]\n", + "1 - riskscoreci(1, NV, 19, NU, 0.95)$conf.int[1]" ] } ], diff --git a/CM1/r-sim-precision-adj.irnb b/CM1/r-sim-precision-adj.irnb index 2f50dd80..5b7c4fd4 100644 --- a/CM1/r-sim-precision-adj.irnb +++ b/CM1/r-sim-precision-adj.irnb @@ -13,7 +13,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "VzLzwihLjfEJ" + "id": "VzLzwihLjfEJ", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -54,20 +57,23 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "GXWON7gHd8zj" + "id": "GXWON7gHd8zj", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# generate the simulated dataset\n", - "set.seed(123) # set MC seed\n", - "n = 1000 # sample size\n", - "Z = rnorm(n) # generate Z\n", - "Y0 = -Z + rnorm(n) # conditional average baseline response is -Z\n", - "Y1 = Z + rnorm(n) # conditional average treatment effect is +Z\n", - "D = (runif(n)<.2) # treatment indicator; only 20% get treated\n", - "Y = Y1*D + Y0*(1-D) # observed Y\n", - "D = D - mean(D) # demean D\n", - "Z = Z-mean(Z) # demean Z" + "set.seed(123) # set MC seed\n", + "n <- 1000 # sample size\n", + "Z <- rnorm(n) # generate Z\n", + "Y0 <- -Z + rnorm(n) # conditional average baseline response is -Z\n", + "Y1 <- Z + rnorm(n) # conditional average treatment effect is +Z\n", + "D <- (runif(n) < .2) # treatment indicator; only 20% get treated\n", + "Y <- Y1 * D + Y0 * (1 - D) # observed Y\n", + "D <- D - mean(D) # demean D\n", + "Z <- Z - mean(Z) # demean Z" ] }, { @@ -93,19 +99,22 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "mGfqEgHLd8zs" + "id": "mGfqEgHLd8zs", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "# implement each of the models on the simulated data\n", - "CL = lm(Y ~ D)\n", - "CRA = lm(Y ~ D+ Z) #classical\n", - "IRA = lm(Y ~ D+ Z+ Z*D) #interactive approach\n", + "CL <- lm(Y ~ D)\n", + "CRA <- lm(Y ~ D + Z) # classical\n", + "IRA <- lm(Y ~ D + Z + Z * D) # interactive approach\n", "\n", "# we are interested in the coefficients on variable \"D\".\n", - "coeftest(CL, vcov = vcovHC(CL, type=\"HC1\"))\n", - "coeftest(CRA, vcov = vcovHC(CRA, 
type=\"HC1\"))\n", - "coeftest(IRA, vcov = vcovHC(IRA, type=\"HC1\"))" + "coeftest(CL, vcov = vcovHC(CL, type = \"HC1\"))\n", + "coeftest(CRA, vcov = vcovHC(CRA, type = \"HC1\"))\n", + "coeftest(IRA, vcov = vcovHC(IRA, type = \"HC1\"))" ] }, { @@ -125,7 +134,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "rOTRDgBld8zw" + "id": "rOTRDgBld8zw", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -149,30 +161,33 @@ "metadata": { "_execution_state": "idle", "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", - "id": "bmtL0a9Nd8z2" + "id": "bmtL0a9Nd8z2", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "set.seed(123)\n", - "n = 1000\n", - "B= 1000\n", + "n <- 1000\n", + "B <- 1000\n", "\n", - "CLs = rep(0, B)\n", - "CRAs = rep(0, B)\n", - "IRAs = rep(0, B)\n", + "CLs <- rep(0, B)\n", + "CRAs <- rep(0, B)\n", + "IRAs <- rep(0, B)\n", "\n", - "for ( i in 1:B){\n", - " Z = rnorm(n)\n", - " Y0 = -Z + rnorm(n)\n", - " Y1 = Z + rnorm(n)\n", - " Z = Z - mean(Z)\n", - " D = (runif(n)<.1)\n", - " D = D- mean(D)\n", - " Y = Y1*D + Y0*(1-D)\n", - " CLs[i]= lm(Y ~ D)$coef[2]\n", - " CRAs[i] = lm(Y ~ D+ Z)$coef[2]\n", - " IRAs[i] = lm(Y ~ D+ Z+ Z*D)$coef[2]\n", - " }\n", + "for (i in 1:B) {\n", + " Z <- rnorm(n)\n", + " Y0 <- -Z + rnorm(n)\n", + " Y1 <- Z + rnorm(n)\n", + " Z <- Z - mean(Z)\n", + " D <- (runif(n) < .1)\n", + " D <- D - mean(D)\n", + " Y <- Y1 * D + Y0 * (1 - D)\n", + " CLs[i] <- lm(Y ~ D)$coef[2]\n", + " CRAs[i] <- lm(Y ~ D + Z)$coef[2]\n", + " IRAs[i] <- lm(Y ~ D + Z + Z * D)$coef[2]\n", + "}\n", "\n", "print(\"Standard deviations for estimators\")\n", "\n", @@ -180,15 +195,6 @@ "sqrt(mean(CRAs^2))\n", "sqrt(mean(IRAs^2))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NTFe4jwBlVNo" - }, - "outputs": [], - "source": [] } ], "metadata": { From a74e8030745f158df0b800037f0b7f6e09abc0b3 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 06:13:38 -0700 Subject: [PATCH 109/261] Linting CM1 --- .github/workflows/transform-R-to-Rmd.yml | 2 +- CM2/r-colliderbias-hollywood.irnb | 59 +++++++++++++++--------- 2 files changed, 37 insertions(+), 24 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 57100721..580ba7dc 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5', 'CM1'] #, 'CM2', 'CM3', 'AC1', 'AC2', 'T'] + directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5', 'CM1', 'CM2'] #, 'CM3', 'AC1', 'AC2', 'T'] steps: - name: Checkout repository uses: actions/checkout@v2 diff --git a/CM2/r-colliderbias-hollywood.irnb b/CM2/r-colliderbias-hollywood.irnb index f1648997..04be3a5a 100644 --- a/CM2/r-colliderbias-hollywood.irnb +++ b/CM2/r-colliderbias-hollywood.irnb @@ -24,7 +24,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "HqSBY2MwlFy2" + "id": "HqSBY2MwlFy2", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -36,12 +39,15 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "eITRXDGJlFy5" + "id": "eITRXDGJlFy5", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "g <- dagitty( \"dag{ T -> C <- B }\" )\n", - "plot(g)" + "causal_graph <- dagitty(\"dag{ Talent -> Congen <- Beauty }\")\n", + "plot(causal_graph)" ] }, { @@ -50,21 +56,24 @@ "metadata": { "_cell_guid": 
"b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", - "id": "XrV8UFAOlFy5" + "id": "XrV8UFAOlFy5", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "#collider bias\n", - "n=1000000\n", - "T = rnorm(n) #talent\n", - "B = rnorm(n) #beaty\n", - "C = T+B + rnorm(n) #congeniality\n", - "T.H= subset(T, C>0) # condition on C>0\n", - "B.H= subset(B, C>0) # condition on C>0\n", + "# collider bias\n", + "n <- 1000000\n", + "Talent <- rnorm(n) # talent\n", + "Beauty <- rnorm(n) # beauty\n", + "Congen <- Talent + Beauty + rnorm(n) # congeniality\n", + "TalentH <- subset(Talent, Congen > 0) # condition on Congen>0\n", + "BeautyH <- subset(Beaty, Congen > 0) # condition on Congen>0\n", "\n", - "summary(lm(T~ B)) #regression of T on B\n", - "summary(lm(T~ B +C)) #regression of T on B and C\n", - "summary(lm(T.H~ B.H)) #regression of T on B, conditional on C>0." + "summary(lm(Talent ~ Beauty)) # regression of Talent on Beauty\n", + "summary(lm(Talent ~ Beauty + Congen)) # regression of Talent on Beauty and Congen\n", + "summary(lm(TalentH ~ BeautyH)) # regression of Talent on Beauty, conditional on Congen>0." ] }, { @@ -82,23 +91,27 @@ "metadata": { "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a", - "id": "B4vPm1JRlFy6" + "id": "B4vPm1JRlFy6", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "## If we want to infer causal effec of B on T,\n", + "## If we want to infer causal effect of Beauty on Talent,\n", "## we can apply the command to figure out\n", "## variables we should condition on:\n", "\n", - "adjustmentSets( g, \"T\", \"B\" )\n", + "adjustmentSets(causal_graph, \"Talent\", \"Beauty\")\n", "\n", "## empty set -- we should not condition on the additional\n", - "## variable C.\n", + "## variable Congen.\n", "\n", - "## Generate data where C = .5T + .5B\n", - "set.seed( 123); d <- simulateSEM( g, .5 )\n", - "confint( lm( T ~ B, d ) )[\"B\",] # includes 0\n", - "confint( lm( T ~ B + C, d ) )[\"B\",] # does not include 0\n" + "## Generate data where Congen = .5*Talent + .5*Beauty\n", + "set.seed(123)\n", + "data <- simulateSEM(causal_graph, .5)\n", + "confint(lm(Talent ~ Beauty, data))[\"Beauty\", ] # includes 0\n", + "confint(lm(Talent ~ Beauty + Congen, data))[\"Beauty\", ] # does not include 0" ] } ], From df2d12d4699cbae0a3700fd69e5c5bf7c25f6f9c Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 06:18:10 -0700 Subject: [PATCH 110/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 580ba7dc..469622a7 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -33,6 +33,7 @@ jobs: if: "matrix.directory == 'PM5'" run: | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + shell: bash - name: Install system dependencies run: | From 7e5d8a4bc0632f4f9046325a2fac183af8f93595 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 06:22:03 -0700 Subject: [PATCH 111/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 469622a7..36e2e5e6 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml 
@@ -22,13 +22,13 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.8' # Specify your Python version here + python-version: '3.10' # Specify your Python version here - name: Install Python dependencies run: | python -m pip install --upgrade pip pip install nbstripout tensorflow - + - name: Install dependencies if: "matrix.directory == 'PM5'" run: | From 67a07777c8c6f1830249f6e89eec2322353f6315 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 06:29:15 -0700 Subject: [PATCH 112/261] Linting errors --- CM2/r-colliderbias-hollywood.irnb | 4 +- CM3/r-dagitty.irnb | 149 +++++++++++++++++++----------- 2 files changed, 95 insertions(+), 58 deletions(-) diff --git a/CM2/r-colliderbias-hollywood.irnb b/CM2/r-colliderbias-hollywood.irnb index 04be3a5a..4422f20e 100644 --- a/CM2/r-colliderbias-hollywood.irnb +++ b/CM2/r-colliderbias-hollywood.irnb @@ -17,7 +17,7 @@ "source": [ "Here is a simple mnemonic example to illustate the collider or M-bias.\n", "\n", - "Here the idea is that people who get to Hollywood have to have high congenility = talent + beauty. Funnily enough this induces a negative correlation between talents and looks, when we condition on the set of actors or celebrities. This simple example explains an anecdotal observation that \"talent and beaty are negatively correlated\" for celebrities. " + "Here the idea is that people who get to Hollywood have to have high congenility = talent + beauty. Funnily enough this induces a negative correlation between talents and looks, when we condition on the set of actors or celebrities. This simple example explains an anecdotal observation that \"talent and beauty are negatively correlated\" for celebrities. " ] }, { @@ -69,7 +69,7 @@ "Beauty <- rnorm(n) # beauty\n", "Congen <- Talent + Beauty + rnorm(n) # congeniality\n", "TalentH <- subset(Talent, Congen > 0) # condition on Congen>0\n", - "BeautyH <- subset(Beaty, Congen > 0) # condition on Congen>0\n", + "BeautyH <- subset(Beauty, Congen > 0) # condition on Congen>0\n", "\n", "summary(lm(Talent ~ Beauty)) # regression of Talent on Beauty\n", "summary(lm(Talent ~ Beauty + Congen)) # regression of Talent on Beauty and Congen\n", diff --git a/CM3/r-dagitty.irnb b/CM3/r-dagitty.irnb index 12e7fa72..875876ea 100644 --- a/CM3/r-dagitty.irnb +++ b/CM3/r-dagitty.irnb @@ -13,13 +13,16 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "9TqXQzIlOelc" + "id": "9TqXQzIlOelc", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "system('sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable')\n", - "system('sudo apt-get update')\n", - "system('sudo apt-get install libglpk-dev libgmp-dev libxml2-dev')" + "system(\"sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable\")\n", + "system(\"sudo apt-get update\")\n", + "system(\"sudo apt-get install libglpk-dev libgmp-dev libxml2-dev\")" ] }, { @@ -41,15 +44,18 @@ "base_uri": "https://localhost:8080/" }, "id": "md3VArZXk2-G", - "outputId": "2b1a6a60-18a4-4de3-cec6-de65a2516641" + "outputId": "2b1a6a60-18a4-4de3-cec6-de65a2516641", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "#install and load package\n", + "# install and load package\n", "install.packages(\"dagitty\")\n", "install.packages(\"ggdag\")\n", "library(dagitty)\n", - "library(ggdag)\n" + "library(ggdag)" ] }, { @@ -79,13 +85,16 @@ "height": 437 }, "id": "1fgBFtRxk2-K", - "outputId": "88af1fad-7fbe-46a6-986c-637462e8e86d" + "outputId": "88af1fad-7fbe-46a6-986c-637462e8e86d", + 
"vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "#generate a couple of DAGs and plot them\n", + "# generate a couple of DAGs and plot them\n", "\n", - "G = dagitty('dag{\n", + "G <- dagitty('dag{\n", "Z1 [pos=\"-2,-1.5\"]\n", "X1 [pos=\"-2,0\"]\n", "Z2 [pos=\"1.5,-1.5\"]\n", @@ -108,7 +117,7 @@ "}')\n", "\n", "\n", - "ggdag(G)+ theme_dag()" + "ggdag(G) + theme_dag()" ] }, { @@ -128,15 +137,17 @@ "base_uri": "https://localhost:8080/" }, "id": "PDE3ROjfk2-P", - "outputId": "0890ac94-4658-4ec7-b984-e7d0b2913483" + "outputId": "0890ac94-4658-4ec7-b984-e7d0b2913483", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "print(parents(G, \"X2\"))\n", "print(children(G, \"X2\"))\n", "print(ancestors(G, \"X2\"))\n", - "print(descendants(G, \"X2\"))\n", - "\n" + "print(descendants(G, \"X2\"))" ] }, { @@ -158,7 +169,10 @@ "height": 114 }, "id": "5JhG_60wk2-R", - "outputId": "10ada2ca-f201-4a3a-ff84-89b2948775d6" + "outputId": "10ada2ca-f201-4a3a-ff84-89b2948775d6", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -182,11 +196,14 @@ "base_uri": "https://localhost:8080/" }, "id": "B5LnSoCik2-T", - "outputId": "a03a58e3-5ae1-4d22-beea-ff8f97f6c64b" + "outputId": "a03a58e3-5ae1-4d22-beea-ff8f97f6c64b", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "print( impliedConditionalIndependencies(G) )" + "print(impliedConditionalIndependencies(G))" ] }, { @@ -206,11 +223,14 @@ "base_uri": "https://localhost:8080/" }, "id": "dXKGvXgTk2-V", - "outputId": "daefd9dd-eba8-4057-b2d8-e5347ee30c79" + "outputId": "daefd9dd-eba8-4057-b2d8-e5347ee30c79", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "print( adjustmentSets( G, \"D\", \"Y\" ) )" + "print(adjustmentSets(G, \"D\", \"Y\"))" ] }, { @@ -231,11 +251,14 @@ "height": 437 }, "id": "Zv3rbjEuk2-W", - "outputId": "ce658fd8-b475-4deb-c809-602de1686b60" + "outputId": "ce658fd8-b475-4deb-c809-602de1686b60", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "SWIG = dagitty('dag{\n", + "SWIG <- dagitty('dag{\n", "Z1 [pos=\"-2,-1.5\"]\n", "X1 [pos=\"-2,0\"]\n", "Z2 [pos=\"1.5,-1.5\"]\n", @@ -258,7 +281,7 @@ "d-> Md\n", "}')\n", "\n", - "ggdag(SWIG)+ theme_dag()" + "ggdag(SWIG) + theme_dag()" ] }, { @@ -279,11 +302,14 @@ "base_uri": "https://localhost:8080/" }, "id": "KwJgFzoqk2-X", - "outputId": "b66ee434-d948-4c7e-cf45-0fbc6843c6cc" + "outputId": "b66ee434-d948-4c7e-cf45-0fbc6843c6cc", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "print( impliedConditionalIndependencies(SWIG)[5:8] )\n" + "print(impliedConditionalIndependencies(SWIG)[5:8])" ] }, { @@ -312,19 +338,21 @@ "base_uri": "https://localhost:8080/" }, "id": "OcBfUurBk2-X", - "outputId": "b2a8b5c2-3407-48e4-e68e-61f19ac9e598" + "outputId": "b2a8b5c2-3407-48e4-e68e-61f19ac9e598", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "for( n in names(G) ){\n", - " for( m in children(G,n) ){\n", - " a <- adjustmentSets( G, n, m )\n", - " if( length(a) > 0 ){\n", - " cat(\"The effect \",n,\"->\",m,\n", - " \" is identifiable by controlling for:\\n\",sep=\"\")\n", - " print( a, prefix=\" * \" )\n", - " }\n", + "for (n in names(G)) {\n", + " for (m in children(G, n)) {\n", + " a <- adjustmentSets(G, n, m)\n", + " if (length(a) > 0) {\n", + " cat(\"The effect \", n, \"->\", m, \" is identifiable by controlling for:\\n\", sep = \"\")\n", + " print(a, prefix = \" * \")\n", " }\n", + " }\n", "}" ] }, @@ -346,13 +374,15 @@ "height": 437 }, "id": "8vNiL5HWk2-Y", - 
"outputId": "fcd29ac8-675b-4fb1-8a95-8e62ccba8d44" + "outputId": "fcd29ac8-675b-4fb1-8a95-8e62ccba8d44", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "P=equivalenceClass(G)\n", - "plot(P)\n", - "#equivalentDAGs(G,10)" + "P <- equivalenceClass(G)\n", + "plot(P)" ] }, { @@ -377,21 +407,23 @@ "height": 437 }, "id": "pBHDnH7Fk2-Z", - "outputId": "ea6c8fd5-4e90-4ce6-e84b-449d9c49d8b5" + "outputId": "ea6c8fd5-4e90-4ce6-e84b-449d9c49d8b5", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "G3<- dagitty('dag{\n", + "G3 <- dagitty(\"dag{\n", "D -> Y\n", "X -> D\n", "X -> Y\n", "}\n", - "')\n", + "\")\n", "\n", - "ggdag(G3)+ theme_dag()\n", + "ggdag(G3) + theme_dag()\n", "\n", - "print(impliedConditionalIndependencies(G3))\n", - "\n" + "print(impliedConditionalIndependencies(G3))" ] }, { @@ -403,14 +435,16 @@ "height": 1000 }, "id": "1cw47mOEk2-Z", - "outputId": "9e4f3af6-1f2b-4eac-ab68-841aaebaa87d" + "outputId": "9e4f3af6-1f2b-4eac-ab68-841aaebaa87d", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "P=equivalenceClass(G3)\n", + "P <- equivalenceClass(G3)\n", "plot(P)\n", - "equivalentDAGs(G3,10)\n", - "\n" + "equivalentDAGs(G3, 10)" ] }, { @@ -436,16 +470,17 @@ "height": 1000 }, "id": "MZ_4jxNdk2-a", - "outputId": "d91485d7-311d-4ae2-ea60-2e7f87a2fb96" + "outputId": "d91485d7-311d-4ae2-ea60-2e7f87a2fb96", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ "set.seed(1)\n", "x <- simulateSEM(G)\n", "head(x)\n", - "#cov(x)\n", - "localTests(G, data = x, type = c(\"cis\"))\n", - "\n" + "localTests(G, data = x, type = c(\"cis\"))" ] }, { @@ -471,15 +506,17 @@ "height": 1000 }, "id": "C0ND4GbEk2-a", - "outputId": "37236927-3ece-458f-fcd4-820422609d7b" + "outputId": "37236927-3ece-458f-fcd4-820422609d7b", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "x.R = x\n", - "x.R$D = (x$D+ x$Y)/2\n", + "xR <- x\n", + "xR$D <- (x$D + x$Y) / 2\n", "\n", - "localTests(G, data = x.R, type = c(\"cis\"))\n", - "\n" + "localTests(G, data = xR, type = c(\"cis\"))" ] } ], From 5b7d810874325ec0d4da847aaf14b0033edabd5f Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 06:34:49 -0700 Subject: [PATCH 113/261] Linting errors --- .github/workflows/transform-R-to-Rmd.yml | 2 +- CM3/r-dosearch.irnb | 199 +++++++++++++---------- 2 files changed, 116 insertions(+), 85 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 36e2e5e6..45ec033e 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5', 'CM1', 'CM2'] #, 'CM3', 'AC1', 'AC2', 'T'] + directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5', 'CM1', 'CM2', 'CM3'] #, 'AC1', 'AC2', 'T'] steps: - name: Checkout repository uses: actions/checkout@v2 diff --git a/CM3/r-dosearch.irnb b/CM3/r-dosearch.irnb index 6125719e..3e3c90ee 100644 --- a/CM3/r-dosearch.irnb +++ b/CM3/r-dosearch.irnb @@ -24,7 +24,10 @@ "base_uri": "https://localhost:8080/" }, "id": "_67DDyL8kzXV", - "outputId": "8c8ac5d4-b0b1-4a10-ed14-ddb50956add0" + "outputId": "8c8ac5d4-b0b1-4a10-ed14-ddb50956add0", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -63,17 +66,22 @@ "height": 35 }, "id": "0vvfKbSzkzXZ", - "outputId": "2c8dd4d2-ba1b-42ea-fb02-a326b6cc62ec" + "outputId": "2c8dd4d2-ba1b-42ea-fb02-a326b6cc62ec", + "vscode": { + "languageId": "r" + } }, "outputs": [], 
"source": [ - "data <- \"p(y,d,x)\" #data structure\n", + "data <- \"p(y, d, x)\" # data structure\n", "\n", - "query <- \"p(y | do(d),x)\" #query -- target parameter\n", + "query <- \"p(y | do(d), x)\" # query -- target parameter\n", "\n", - "graph <- \"x -> y\n", + "graph <- \"\n", + " x -> y\n", " x -> d\n", - " d -> y\"\n", + " d -> y\n", + "\"\n", "\n", "dosearch(data, query, graph)" ] @@ -99,20 +107,24 @@ "height": 35 }, "id": "tXw8HnV2kzXZ", - "outputId": "29481c74-cc2e-4042-f39d-5612862bfb68" + "outputId": "29481c74-cc2e-4042-f39d-5612862bfb68", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "data <- \"p(y,d,x)\"\n", + "data <- \"p(y, d, x)\"\n", "\n", "query <- \"p(y | do(d))\"\n", "\n", - "graph <- \"x -> y\n", + "graph <- \"\n", + " x -> y\n", " x -> d\n", - " d -> y\"\n", - "\n", + " d -> y\n", + "\"\n", "\n", - "dosearch(data, query, graph)\n" + "dosearch(data, query, graph)" ] }, { @@ -146,17 +158,22 @@ "height": 35 }, "id": "cjis082KkzXa", - "outputId": "c803c856-acab-4cc7-9e20-f6e5c790ffc1" + "outputId": "c803c856-acab-4cc7-9e20-f6e5c790ffc1", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "data <- \"p(y,d)\"\n", + "data <- \"p(y, d)\"\n", "\n", "query <- \"p(y | do(d))\"\n", "\n", - "graph <- \"x -> y\n", + "graph <- \"\n", + " x -> y\n", " x -> d\n", - " d -> y\"\n", + " d -> y\n", + "\"\n", "\n", "dosearch(data, query, graph)" ] @@ -181,28 +198,31 @@ "height": 35 }, "id": "J4mz88VZkzXa", - "outputId": "30f73953-79de-448d-ddc2-aa470a97840e" + "outputId": "30f73953-79de-448d-ddc2-aa470a97840e", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ + "data <- \"p(y, d, x2)\" # observed only (Y, D, X_2)\n", "\n", - "data <- \"p(y,d,x2)\" #observed only (Y, D, X_2)\n", - "\n", - "query<- \"p(y|do(d))\" #target parameter\n", + "query <- \"p(y | do(d))\" # target parameter\n", "\n", - "graph<- \"z1 -> x1\n", - "z1 -> x2\n", - "z2 -> x2\n", - "z2 -> x3\n", - "x2 -> d\n", - "x2 -> y\n", - "x3 -> y\n", - "x1 -> d\n", - "d -> m\n", - "m -> y\n", + "graph <- \"\n", + " z1 -> x1\n", + " z1 -> x2\n", + " z2 -> x2\n", + " z2 -> x3\n", + " x2 -> d\n", + " x2 -> y\n", + " x3 -> y\n", + " x1 -> d\n", + " d -> m\n", + " m -> y\n", "\"\n", "\n", - "dosearch(data, query, graph)\n" + "dosearch(data, query, graph)" ] }, { @@ -222,30 +242,33 @@ "base_uri": "https://localhost:8080/" }, "id": "wGIVqPAIkzXb", - "outputId": "0402b81b-ca16-4274-df93-33c0c2ae2327" + "outputId": "0402b81b-ca16-4274-df93-33c0c2ae2327", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ + "data <- \"p(y, d, x2, x3)\"\n", "\n", - "data <- \"p(y,d,x2,x3)\"\n", - "\n", - "conditional.query<- \"p(y|do(d),x2, x3)\" #can ID conditional average effect?\n", - "query<- \"p(y|do(d))\" #can ID unconditional effect?\n", + "conditional_query <- \"p(y | do(d), x2, x3)\" # can ID conditional average effect?\n", + "query <- \"p(y | do(d))\" # can ID unconditional effect?\n", "\n", - "graph<- \"z1 -> x1\n", - "z1 -> x2\n", - "z2 -> x2\n", - "z2 -> x3\n", - "x2 -> d\n", - "x2 -> y\n", - "x3 -> y\n", - "x1 -> d\n", - "d -> m\n", - "m -> y\n", + "graph <- \"\n", + " z1 -> x1\n", + " z1 -> x2\n", + " z2 -> x2\n", + " z2 -> x3\n", + " x2 -> d\n", + " x2 -> y\n", + " x3 -> y\n", + " x1 -> d\n", + " d -> m\n", + " m -> y\n", "\"\n", "\n", - "print(dosearch(data, conditional.query, graph))\n", - "print(dosearch(data, query, graph))\n" + "print(dosearch(data, conditional_query, graph))\n", + "print(dosearch(data, query, graph))" ] }, { @@ -287,30 +310,34 @@ "base_uri": 
"https://localhost:8080/" }, "id": "eaPrMHWTkzXb", - "outputId": "4cc25d7c-75ba-44a6-93df-41359b55495b" + "outputId": "4cc25d7c-75ba-44a6-93df-41359b55495b", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "data <- \"p(y,d, m)\"\n", + "data <- \"p(y, d, m)\"\n", "\n", - "query.dm<- \"p(m|do(d))\"\n", - "query.md<- \"p(y|do(m))\"\n", - "query<- \"p(y|do(d))\"\n", + "query_dm <- \"p(m | do(d))\"\n", + "query_md <- \"p(y | do(m))\"\n", + "query <- \"p(y | do(d))\"\n", "\n", - "graph<- \"z1 -> x1\n", - "z1 -> x2\n", - "z2 -> x2\n", - "z2 -> x3\n", - "x2 -> d\n", - "x2 -> y\n", - "x3 -> y\n", - "x1 -> d\n", - "d -> m\n", - "m -> y\n", + "graph <- \"\n", + " z1 -> x1\n", + " z1 -> x2\n", + " z2 -> x2\n", + " z2 -> x3\n", + " x2 -> d\n", + " x2 -> y\n", + " x3 -> y\n", + " x1 -> d\n", + " d -> m\n", + " m -> y\n", "\"\n", - "print(dosearch(data, query.dm, graph))\n", - "print(dosearch(data, query.md, graph))\n", - "print(dosearch(data, query, graph))\n" + "print(dosearch(data, query_dm, graph))\n", + "print(dosearch(data, query_md, graph))\n", + "print(dosearch(data, query, graph))" ] }, { @@ -351,30 +378,34 @@ "base_uri": "https://localhost:8080/" }, "id": "x-ACaOv_kzXc", - "outputId": "e5ba4476-2eb2-4806-fa2d-71c73dac986d" + "outputId": "e5ba4476-2eb2-4806-fa2d-71c73dac986d", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ - "data <- \"p(y,m)\n", - " p(m,d)\"\n", + "data <- \"p(y, m)\n", + " p(m, d)\"\n", "\n", - "query.dm<- \"p(m|do(d))\"\n", - "query.md<- \"p(y|do(m))\"\n", - "query<- \"p(y|do(d))\"\n", + "query_dm <- \"p(m | do(d))\"\n", + "query_md <- \"p(y | do(m))\"\n", + "query <- \"p(y | do(d))\"\n", "\n", - "graph<- \"z1 -> x1\n", - "z1 -> x2\n", - "z2 -> x2\n", - "z2 -> x3\n", - "x2 -> d\n", - "x2 -> y\n", - "x3 -> y\n", - "x1 -> d\n", - "d -> m\n", - "m -> y\n", + "graph <- \"\n", + " z1 -> x1\n", + " z1 -> x2\n", + " z2 -> x2\n", + " z2 -> x3\n", + " x2 -> d\n", + " x2 -> y\n", + " x3 -> y\n", + " x1 -> d\n", + " d -> m\n", + " m -> y\n", "\"\n", - "print(dosearch(data, query.dm, graph))\n", - "print(dosearch(data, query.md, graph))\n", + "print(dosearch(data, query_dm, graph))\n", + "print(dosearch(data, query_md, graph))\n", "print(dosearch(data, query, graph))" ] } From 05efce23d24895794d898793c9c6c56a957160f2 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 06:38:26 -0700 Subject: [PATCH 114/261] Update r-colliderbias-hollywood.irnb --- CM2/r-colliderbias-hollywood.irnb | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/CM2/r-colliderbias-hollywood.irnb b/CM2/r-colliderbias-hollywood.irnb index 4422f20e..9b4e1920 100644 --- a/CM2/r-colliderbias-hollywood.irnb +++ b/CM2/r-colliderbias-hollywood.irnb @@ -105,13 +105,24 @@ "adjustmentSets(causal_graph, \"Talent\", \"Beauty\")\n", "\n", "## empty set -- we should not condition on the additional\n", - "## variable Congen.\n", - "\n", + "## variable Congen." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ "## Generate data where Congen = .5*Talent + .5*Beauty\n", "set.seed(123)\n", "data <- simulateSEM(causal_graph, .5)\n", - "confint(lm(Talent ~ Beauty, data))[\"Beauty\", ] # includes 0\n", - "confint(lm(Talent ~ Beauty + Congen, data))[\"Beauty\", ] # does not include 0" + "confint(lm(Talent ~ Beauty, data))[\"Beauty\", ]\n", + "confint(lm(Talent ~ Beauty + Congen, data))[\"Beauty\", ]" ] } ], From a7cea63fcac21cfe9cb13c3d5d431375e64eb712 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jul 2024 13:45:02 +0000 Subject: [PATCH 115/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in CM2 --- CM2/r-colliderbias-hollywood.Rmd | 45 +++++++++++++++++--------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/CM2/r-colliderbias-hollywood.Rmd b/CM2/r-colliderbias-hollywood.Rmd index 5bf2e323..ad659b8d 100644 --- a/CM2/r-colliderbias-hollywood.Rmd +++ b/CM2/r-colliderbias-hollywood.Rmd @@ -7,7 +7,7 @@ output: html_document Here is a simple mnemonic example to illustate the collider or M-bias. -Here the idea is that people who get to Hollywood have to have high congenility = talent + beauty. Funnily enough this induces a negative correlation between talents and looks, when we condition on the set of actors or celebrities. This simple example explains an anecdotal observation that "talent and beaty are negatively correlated" for celebrities. +Here the idea is that people who get to Hollywood have to have high congenility = talent + beauty. Funnily enough this induces a negative correlation between talents and looks, when we condition on the set of actors or celebrities. This simple example explains an anecdotal observation that "talent and beauty are negatively correlated" for celebrities. ```{r} install.packages("dagitty") @@ -15,39 +15,42 @@ library(dagitty) ``` ```{r} -g <- dagitty( "dag{ T -> C <- B }" ) -plot(g) +causal_graph <- dagitty("dag{ Talent -> Congen <- Beauty }") +plot(causal_graph) ``` ```{r} -#collider bias -n=1000000 -T = rnorm(n) #talent -B = rnorm(n) #beaty -C = T+B + rnorm(n) #congeniality -T.H= subset(T, C>0) # condition on C>0 -B.H= subset(B, C>0) # condition on C>0 - -summary(lm(T~ B)) #regression of T on B -summary(lm(T~ B +C)) #regression of T on B and C -summary(lm(T.H~ B.H)) #regression of T on B, conditional on C>0. +# collider bias +n <- 1000000 +Talent <- rnorm(n) # talent +Beauty <- rnorm(n) # beauty +Congen <- Talent + Beauty + rnorm(n) # congeniality +TalentH <- subset(Talent, Congen > 0) # condition on Congen>0 +BeautyH <- subset(Beauty, Congen > 0) # condition on Congen>0 + +summary(lm(Talent ~ Beauty)) # regression of Talent on Beauty +summary(lm(Talent ~ Beauty + Congen)) # regression of Talent on Beauty and Congen +summary(lm(TalentH ~ BeautyH)) # regression of Talent on Beauty, conditional on Congen>0. ``` We can also use package Dagitty to illustrate collider bias, also known as M-bias. ```{r} -## If we want to infer causal effec of B on T, +## If we want to infer causal effect of Beauty on Talent, ## we can apply the command to figure out ## variables we should condition on: -adjustmentSets( g, "T", "B" ) +adjustmentSets(causal_graph, "Talent", "Beauty") ## empty set -- we should not condition on the additional -## variable C. +## variable Congen. 
+``` -## Generate data where C = .5T + .5B -set.seed( 123); d <- simulateSEM( g, .5 ) -confint( lm( T ~ B, d ) )["B",] # includes 0 -confint( lm( T ~ B + C, d ) )["B",] # does not include 0 +```{r} +## Generate data where Congen = .5*Talent + .5*Beauty +set.seed(123) +data <- simulateSEM(causal_graph, .5) +confint(lm(Talent ~ Beauty, data))["Beauty", ] +confint(lm(Talent ~ Beauty + Congen, data))["Beauty", ] ``` From da46ae73ba30221c121c220f86638540fb6917aa Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jul 2024 13:47:38 +0000 Subject: [PATCH 116/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in CM1 --- CM1/r-rct-penn-precision-adj.Rmd | 98 ++++++------ CM1/r-rct-vaccines.Rmd | 247 +++++++++++++++++-------------- CM1/r-sim-precision-adj.Rmd | 68 ++++----- 3 files changed, 216 insertions(+), 197 deletions(-) diff --git a/CM1/r-rct-penn-precision-adj.Rmd b/CM1/r-rct-penn-precision-adj.Rmd index d34b6524..4eef845c 100644 --- a/CM1/r-rct-penn-precision-adj.Rmd +++ b/CM1/r-rct-penn-precision-adj.Rmd @@ -23,17 +23,17 @@ In this lab, we analyze the Pennsylvania re-employment bonus experiment, which w ```{r} ## loading the data -file = "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/penn_jae.dat" -Penn <- as.data.frame(read.table(file, header=T)) +file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/penn_jae.dat" +Penn <- as.data.frame(read.table(file, header = TRUE)) n <- dim(Penn)[1] p_1 <- dim(Penn)[2] -Penn<- subset(Penn, tg==4 | tg==0) +Penn <- subset(Penn, tg == 4 | tg == 0) attach(Penn) ``` ```{r} -T4<- (tg==4) +T4 <- (tg == 4) summary(T4) ``` @@ -85,11 +85,13 @@ We first look at the coefficients individually with a $t$-test, and then we adju The regression below is done using "type='HC1'" which computes the correct Eicker-Huber-White standard errors, instead of the classical non-robust formula based on homoscedasticity. ```{r} -data <- model.matrix(T4 ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 + agelt35 + agegt54 + durable + lusd + husd)^2) +data <- model.matrix(T4 ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 + + agelt35 + agegt54 + durable + lusd + husd)^2) # individual t-tests -m <- lm(T4 ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 + agelt35 + agegt54 + durable + lusd + husd)^2, data = as.data.frame(data)) -coeftest(m, vcov = vcovHC(m, type="HC1")) +m <- lm(T4 ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 + + agelt35 + agegt54 + durable + lusd + husd)^2, data = as.data.frame(data)) +coeftest(m, vcov = vcovHC(m, type = "HC1")) ``` -Background - -The target function is Conditional Average Treatment Effect, defined as - -$$ g(x)=E [ Y(1) - Y(0) |X=x], $$ - -where $Y(1)$ and $Y(0)$ are potential outcomes in treated and control group. In our case, $Y(1)$ is the potential Net Financial Assets if a subject is eligible for 401(k), and $Y(0)$ is the potential Net Financial Assets if a subject is ineligible. $X$ is a covariate of interest, in this case, income. -$ g(x)$ shows expected effect of eligibility on NET TFA for a subject whose income level is $x$. - - - -If eligibility indicator is independent of $Y(1), Y(0)$, given pre-401-k assignment characteristics $Z$, the function can expressed in terms of observed data (as opposed to hypothetical, or potential outcomes). 
Observed data consists of realized NET TFA $Y = D Y(1) + (1-D) Y(0)$, eligibility indicator $D$, and covariates $Z$ which includes $X$, income. The expression for $g(x)$ is - -$$ g(x) = E [ Y (\eta_0) \mid X=x], $$ -where the transformed outcome variable is - -$$Y (\eta) = \dfrac{D}{s(Z)} \left( Y - \mu(1,Z) \right) - \dfrac{1-D}{1-s(Z)} \left( Y - \mu(0,Z) \right) + \mu(1,Z) - \mu(0,Z),$$ - -the probability of eligibility is - -$$s_0(z) = Pr (D=1 \mid Z=z),$$ - -the expected net financial asset given $D =d \in \{1,0\}$ and $Z=z$ is - -$$ \mu(d,z) = E[ Y \mid Z=z, D=d]. $$ - -Our goal is to estimate $g(x)$. - - -In step 1, we estimate the unknown functions $s_0(z), \mu(1,z), \mu(0,z)$ and plug them into $Y (\eta)$. - - -In step 2, we approximate the function $g(x)$ by a linear combination of basis functions: - -$$ g(x) = p(x)' \beta_0, $$ - - -where $p(x)$ is a vector of polynomials or splines and - -$$ \beta_0 = (E p(X) p(X))^{-1} E p(X) Y (\eta_0) $$ - -is the best linear predictor. We report - -$$ -\widehat{g}(x) = p(x)' \widehat{\beta}, -$$ - -where $\widehat{\beta}$ is the ordinary least squares estimate of $\beta_0$ defined on the random sample $(X_i, D_i, Y_i)_{i=1}^N$ - -$$ - \widehat{\beta} :=\left( \dfrac{1}{N} \sum_{i=1}^N p(X_i) p(X_i)' \right)^{-1} \dfrac{1}{N} \sum_{i=1}^N p(X_i)Y_i(\widehat{\eta}) -$$ - - - - - - - - - -```{r _uuid="051d70d956493feee0c6d64651c6a088724dca2a", _execution_state="idle"} -## load packages -rm(list=ls()) -library(foreign) -library(quantreg) -library(splines) -library(lattice) -#library(mnormt); -library(Hmisc) -library(fda); -library(hdm) -library(randomForest) -library(ranger) -library(sandwich) -``` - -```{r} -## 401k dataset -data(pension) -pension$net_tfa<-pension$net_tfa/10000 -## covariate of interest -- log income -- -pension$inc = log(pension$inc) -#pension$inc[is.na(pension$inc)]<-0 -pension<-pension[!is.na(pension$inc) & pension$inc!=-Inf & pension$inc !=Inf,] - - -## outcome variable -- total net financial assets -Y=pension$net_tfa -## binary treatment -- indicator of 401(k) eligibility -D=pension$e401 - - -X=pension$inc -## target parameter is CATE = E[ Y(1) - Y(0) | X] - - -## raw covariates so that Y(1) and Y(0) are independent of D given Z -Z = pension[,c("age","inc","fsize","educ","male","db","marr","twoearn","pira","hown","hval","hequity","hmort", - "nohs","hs","smcol")] - - -y_name <- "net_tfa"; -d_name <- "e401"; -form_z <- "(poly(age, 6) + poly(inc, 8) + poly(educ, 4) + poly(fsize,2) + as.factor(marr) + as.factor(twoearn) + as.factor(db) + as.factor(pira) + as.factor(hown))^2"; - - - -cat(sprintf("\n sample size is %g \n", length(Y) )) -cat(sprintf("\n num raw covariates z is %g \n", dim(Z)[2] )) -``` - -In Step 1, we estimate three functions: - -1. probability of treatment assignment $s_0(z)$ - -2.-3. regression functions $\mu_0(1,z)$ and $\mu_0(0,z)$. - -We use the cross-fitting procedure with $K=2$ holds. For definition of cross-fitting with $K$ folds, check the sample splitting in ```DML2.for.PLM``` function defined in https://www.kaggle.com/victorchernozhukov/debiased-ml-for-partially-linear-model-in-r - -For each function, we try random forest. 
- - - -First Stage: estimate $\mu_0(1,z)$ and $\mu_0(0,z)$ and $s_0(z)$ by lasso - -```{r} - -first_stage_lasso<-function(data,d_name,y_name, form_z, seed=1) { - - # Sample size - N<-dim(data)[1] - # Estimated regression function in control group - mu0.hat<-rep(1,N) - # Estimated regression function in treated group - mu1.hat<-rep(1,N) - # Propensity score - s.hat<-rep(1,N) - seed=1 - ## define sample splitting - set.seed(seed) - inds.train=sample(1:N,floor(N/2)) - inds.eval=setdiff(1:N,inds.train) - - print ("Estimate treatment probability, first half") - ## conditional probability of 401 k eligibility (i.e., propensity score) based on random forest - fitted.lasso.pscore<-rlassologit(as.formula(paste0(d_name,"~",form_z )),data=data[inds.train,]) - - s.hat[inds.eval]<-predict(fitted.lasso.pscore,data[inds.eval,],type="response") - print ("Estimate treatment probability, second half") - fitted.lasso.pscore<-rlassologit(as.formula(paste0(d_name,"~",form_z )),data=data[inds.eval,]) - s.hat[inds.train]<-predict( fitted.lasso.pscore,data[inds.train,],type="response") - - - - - - data1<-data - data1[,d_name]<-1 - - data0<-data - data0[,d_name]<-0 - - print ("Estimate expectation function, first half") - fitted.lasso.mu<-rlasso(as.formula(paste0(y_name,"~",d_name,"+(",form_z,")" )),data=data[inds.train,]) - mu1.hat[inds.eval]<-predict( fitted.lasso.mu,data1[inds.eval,]) - mu0.hat[inds.eval]<-predict( fitted.lasso.mu,data0[inds.eval,]) - - print ("Estimate expectation function, second half") - fitted.lasso.mu<-rlasso(as.formula(paste0(y_name,"~",d_name,"+(",form_z,")" )),data=data[inds.eval,]) - mu1.hat[inds.train]<-predict( fitted.lasso.mu,data1[inds.train,]) - mu0.hat[inds.train]<-predict( fitted.lasso.mu,data0[inds.train,]) - - return (list(mu1.hat=mu1.hat, - mu0.hat=mu0.hat, - s.hat=s.hat)) - -} -``` - -First Stage: estimate $\mu_0(1,z)$ and $\mu_0(0,z)$ and $s_0(z)$ by random forest - -```{r} -first_stage_rf<-function(Y,D,Z,seed=1) { - - # Sample size - N<-length(D) - # Estimated regression function in control group - mu0.hat<-rep(1,N) - # Estimated regression function in treated group - mu1.hat<-rep(1,N) - # Propensity score - s.hat<-rep(1,N) - - - ## define sample splitting - set.seed(seed) - inds.train=sample(1:N,floor(N/2)) - inds.eval=setdiff(1:N,inds.train) - - print ("Estimate treatment probability, first half") - ## conditional probability of 401 k eligibility (i.e., propensity score) based on random forest - D.f<-as.factor(as.character(D)) - fitted.rf.pscore<-randomForest(Z,D.f,subset=inds.train) - s.hat[inds.eval]<-predict(fitted.rf.pscore,Z[inds.eval,],type="prob")[,2] - print ("Estimate treatment probability, second half") - fitted.rf<-randomForest(Z,D.f,subset=inds.eval) - s.hat[inds.train]<-predict(fitted.rf.pscore,Z[inds.train,],type="prob")[,2] - - ## conditional expected net financial assets (i.e., regression function) based on random forest - - covariates<-cbind(Z,D) - - covariates1<-cbind(Z,D=rep(1,N)) - covariates0<-cbind(Z,D=rep(0,N)) - - print ("Estimate expectation function, first half") - fitted.rf.mu<-randomForest(cbind(Z,D),Y,subset=inds.train) - mu1.hat[inds.eval]<-predict( fitted.rf.mu,covariates1[inds.eval,]) - mu0.hat[inds.eval]<-predict( fitted.rf.mu,covariates0[inds.eval,]) - - print ("Estimate expectation function, second half") - fitted.rf.mu<-randomForest(cbind(Z,D),Y,subset=inds.eval) - mu1.hat[inds.train]<-predict( fitted.rf.mu,covariates1[inds.train,]) - mu0.hat[inds.train]<-predict( fitted.rf.mu,covariates0[inds.train,]) - - return (list(mu1.hat=mu1.hat, 
- mu0.hat=mu0.hat, - s.hat=s.hat)) - -} -``` - - -In Step 2, we approximate $Y(\eta_0)$ by a vector of basis functions. There are two use cases: -**** -2.A. Group Average Treatment Effect, described above - - -2.B. Average Treatment Effect conditional on income value. There are three smoothing options: - -1. splines offered in ```least_squares_splines``` - -2. orthogonal polynomials with the highest degree chosen by cross-validation ```least_squares_series``` - -3. standard polynomials with the highest degree input by user ```least_squares_series_old``` - - -The default option is option 3. - - -2.A. The simplest use case of Conditional Average Treatment Effect is GATE, or Group Average Treatment Effect. Partition the support of income as - -$$ - \infty = \ell_0 < \ell_1 < \ell_2 \dots \ell_K = \infty $$ - -define intervals $I_k = [ \ell_{k-1}, \ell_{k})$. Let $X$ be income covariate. For $X$, define a group indicator - -$$ G_k(X) = 1[X \in I_k], $$ - -and the vector of basis functions - -$$ p(X) = (G_1(X), G_2(X), \dots, G_K(X)) $$ - -Then, the Best Linear Predictor $\beta_0$ vector shows the average treatment effect for each group. - -```{r} -## estimate first stage functions by random forest -## may take a while -fs.hat.rf = first_stage_rf(Y,D,Z) -``` - -```{r} -X=pension$inc -fs.hat<-fs.hat.rf -min_cutoff=0.01 -# regression function -mu1.hat<-fs.hat[["mu1.hat"]] -mu0.hat<-fs.hat[["mu0.hat"]] -# propensity score -s.hat<-fs.hat[["s.hat"]] -s.hat<-sapply(s.hat,max,min_cutoff) -### Construct Orthogonal Signal - - -RobustSignal<-(Y - mu1.hat)*D/s.hat - (Y - mu0.hat)*(1-D)/(1-s.hat) + mu1.hat - mu0.hat -``` - -```{r} -qtmax <- function(C, S=10000, alpha) - {; - p <- nrow(C); - tmaxs <- apply(abs(matrix(rnorm(p*S), nrow = p, ncol = S)), 2, max); - return(quantile(tmaxs, 1-alpha)); - }; - -# This function computes the square root of a symmetric matrix using the spectral decomposition; - - -group_average_treatment_effect<-function(X,Y,max_grid=5,alpha=0.05, B=10000) { - - grid<-quantile(X,probs=c((0:max_grid)/max_grid)) - X.raw<-matrix(NA, nrow=length(Y),ncol=length(grid)-1) - - for (k in 2:((length(grid)))) { - X.raw[,k-1]<-sapply(X, function (x) ifelse (x>=grid[k-1] & x=grid[k-1] & x<=grid[k],1,0) ) - - ols.fit<- lm(Y~X.raw-1) - coefs <- coef(ols.fit) - vars <- names(coefs) - HCV.coefs <- vcovHC(ols.fit, type = 'HC') - coefs.se <- sqrt(diag(HCV.coefs)) # White std errors - ## this is an identity matrix - ## qtmax is simplified - C.coefs <- (diag(1/sqrt(diag(HCV.coefs)))) %*% HCV.coefs %*% (diag(1/sqrt(diag(HCV.coefs)))); - - - tes <- coefs - tes.se <- coefs.se - tes.cor <- C.coefs - crit.val <- qtmax(tes.cor,B,alpha); - - tes.ucb <- tes + crit.val * tes.se; - tes.lcb <- tes - crit.val * tes.se; - - tes.uci <- tes + qnorm(1-alpha/2) * tes.se; - tes.lci <- tes + qnorm(alpha/2) * tes.se; - - - return(list(beta.hat=coefs, ghat.lower.point=tes.lci, ghat.upper.point=tes.uci, - ghat.lower=tes.lcb, ghat.upper= tes.ucb, crit.val=crit.val )) -} -``` - -```{r} -res<-group_average_treatment_effect(X=X,Y=RobustSignal) -``` - -```{r} -## this code is taken from L1 14.382 taught at MIT -## author: Mert Demirer -options(repr.plot.width=10, repr.plot.height=8) - -tes<-res$beta.hat -tes.lci<-res$ghat.lower.point -tes.uci<-res$ghat.upper.point - -tes.lcb<-res$ghat.lower -tes.ucb<-res$ghat.upper -tes.lev<-c('0%-20%', '20%-40%','40%-60%','60%-80%','80%-100%') - -plot( c(1,5), las = 2, xlim =c(0.6, 5.4), ylim = c(.05, 2.09), type="n",xlab="Income group", - ylab="Average Effect on NET TFA (per 10 K)", main="Group 
Average Treatment Effects on NET TFA", xaxt="n"); -axis(1, at=1:5, labels=tes.lev); -for (i in 1:5) -{; - rect(i-0.2, tes.lci[i], i+0.2, tes.uci[i], col = NA, border = "red", lwd = 3); - rect(i-0.2, tes.lcb[i], i+0.2, tes.ucb[i], col = NA, border = 4, lwd = 3 ); - segments(i-0.2, tes[i], i+0.2, tes[i], lwd = 5 ); -}; -abline(h=0); - -legend(2.5, 2.0, c('Regression Estimate', '95% Simultaneous Confidence Interval', '95% Pointwise Confidence Interval'), col = c(1,4,2), lwd = c(4,3,3), horiz = F, bty = 'n', cex=0.8); - -dev.off() -``` - -```{r} -least_squares_splines<-function(X,Y,max_knot=9,norder,nderiv,...) { - ## Create technical regressors - cv.bsp<-rep(0,max_knot-1) - for (knot in 2:max_knot) { - breaks<- quantile(X, c(0:knot)/knot) - formula.bsp <- Y ~ bsplineS(X, breaks =breaks, norder = norder, nderiv = nderiv)[ ,-1] - fit <- lm(formula.bsp); - cv.bsp[knot-1] <- sum( (fit$res / (1 - hatvalues(fit)) )^2); - } - ## Number of knots chosen by cross-validation - cv_knot<-which.min(cv.bsp)+1 - breaks<- quantile(X, c(0:cv_knot)/cv_knot) - formula.bsp <- Y ~ bsplineS(X, breaks =breaks, norder = norder, nderiv = 0)[ ,-1] - fit <- lm(formula.bsp); - - - return(list(cv_knot=cv_knot,fit=fit)) -} - - -least_squares_series<-function(X, Y,max_degree,...) { - - cv.pol<-rep(0,max_degree) - for (degree in 1:max_degree) { - formula.pol <- Y ~ poly(X, degree) - fit <- lm(formula.pol ); - cv.pol[degree] <- sum( (fit$res / (1 - hatvalues(fit)) )^2); - } - ## Number of knots chosen by cross-validation - cv_degree<-which.min(cv.pol) - ## Estimate coefficients - formula.pol <- Y ~ poly(X, cv_degree) - fit <- lm(formula.pol); - - - return(list(fit=fit,cv_degree=cv_degree)) -} -``` - -```{r} -msqrt <- function(C) - {; - C.eig <- eigen(C); - return(C.eig$vectors %*% diag(sqrt(C.eig$values)) %*% solve(C.eig$vectors)); - }; - - -tboot<-function(regressors_grid, Omega.hat ,alpha, B=10000) { - - - numerator_grid<-regressors_grid%*%msqrt( Omega.hat) - denominator_grid<-sqrt(diag(regressors_grid%*% Omega.hat%*%t(regressors_grid))) - - norm_numerator_grid<-numerator_grid - for (k in 1:dim(numerator_grid)[1]) { - norm_numerator_grid[k,]<-numerator_grid[k,]/denominator_grid[k] - } - - tmaxs <- apply(abs( norm_numerator_grid%*% matrix(rnorm(dim(numerator_grid)[2]*B), nrow = dim(numerator_grid)[2], ncol = B)), 2, max) - return(quantile(tmaxs, 1-alpha)) - -} -``` - -```{r} -second_stage<-function(fs.hat,Y,D,X,max_degree=3,norder=4,nderiv=0,ss_method="poly",min_cutoff=0.01,alpha=0.05,eps=0.1,...) 
{ - - X_grid = seq(min(X),max(X),eps) - mu1.hat<-fs.hat[["mu1.hat"]] - mu0.hat<-fs.hat[["mu0.hat"]] - # propensity score - s.hat<-fs.hat[["s.hat"]] - s.hat<-sapply(s.hat,max,min_cutoff) - ### Construct Orthogonal Signal - - RobustSignal<-(Y - mu1.hat)*D/s.hat - (Y - mu0.hat)*(1-D)/(1-s.hat) + mu1.hat - mu0.hat - - - - # Estimate the target function using least squares series - if (ss_method == "ortho_poly") { - res<-least_squares_series(X=X,Y=RobustSignal,eps=0.1,max_degree=max_degree) - fit<-res$fit - cv_degree<-res$cv_degree - regressors_grid<-cbind( rep(1,length(X_grid)), poly(X_grid,cv_degree)) - - } - if (ss_method == "splines") { - - res<-least_squares_splines(X=X,Y=RobustSignal,eps=0.1,norder=norder,nderiv=nderiv) - fit<-res$fit - cv_knot<-res$cv_knot - breaks<- quantile(X, c(0:cv_knot)/cv_knot) - regressors_grid<-cbind( rep(1,length(X_grid)), bsplineS(X_grid, breaks =breaks, norder = norder, nderiv = nderiv)[ ,-1]) - degree=cv_knot - - - } - - - g.hat<-regressors_grid%*%coef(fit) - - - HCV.coefs <- vcovHC(fit, type = 'HC') - #Omega.hat<-white_vcov(regressors,Y,b.hat=coef(fit)) - standard_error<-sqrt(diag(regressors_grid%*% HCV.coefs%*%t(regressors_grid))) - ### Lower Pointwise CI - ghat.lower.point<-g.hat+qnorm(alpha/2)*standard_error - ### Upper Pointwise CI - ghat.upper.point<-g.hat+qnorm(1-alpha/2)*standard_error - - max_tstat<-tboot(regressors_grid=regressors_grid, Omega.hat=HCV.coefs,alpha=alpha) - - - ## Lower Uniform CI - ghat.lower<-g.hat-max_tstat*standard_error - ## Upper Uniform CI - ghat.upper<-g.hat+max_tstat*standard_error - return(list(ghat.lower=ghat.lower,g.hat=g.hat, ghat.upper=ghat.upper,fit=fit,ghat.lower.point=ghat.lower.point, - ghat.upper.point=ghat.upper.point,X_grid=X_grid)) - - - -} -``` - -```{r} -make_plot<-function(res,lowy,highy,degree,ss_method = "series",uniform=TRUE,...) 
{ - - - title=paste0("Effect of 401(k) on Net TFA, ", ss_method) - X_grid=res$X_grid - len = length(X_grid) - - - if (uniform) { - group <-c(rep("UCI",len), rep("PCI",len), rep("Estimate",len),rep("PCIL",len),rep("UCIL",len)) - group_type<- c(rep("CI",len), rep("CI",len), rep("Estimate",len),rep("CI",len),rep("CI",len)) - group_ci_type<-c(rep("Uniform",len), rep("Point",len), rep("Uniform",len),rep("Point",len),rep("Uniform",len)) - - df<-data.frame(income=rep(X_grid,5), outcome = c(res$ghat.lower,res$ghat.lower.point,res$g.hat,res$ghat.upper.point,res$ghat.upper),group=group, group_col = group_type,group_line =group_ci_type ) - p<-ggplot(data=df)+ - aes(x=exp(income),y=outcome,colour=group )+ - theme_bw()+ - xlab("Income")+ - ylab("Net TFA, (thousand dollars)")+ - scale_colour_manual(values=c("black","blue","blue","blue","blue"))+ - theme(plot.title = element_text(hjust = 0.5),text=element_text(size=20, family="serif"))+ - theme(legend.title=element_blank())+ - theme(legend.position="none")+ - ylim(low=lowy,high=highy)+ - geom_line(aes(linetype = group_line),size=1.5)+ - scale_linetype_manual(values=c("dashed","solid"))+ - ggtitle(title) - } - - if (!uniform) { - group <-c( rep("PCI",len), rep("Estimate",len),rep("PCIL",len)) - group_type<- c(rep("CI",len), rep("Estimate",len),rep("CI",len)) - group_ci_type<-c(rep("Point",len), rep("Uniform",len),rep("Point",len)) - - df<-data.frame(income=rep(X_grid,3), outcome = c(res$ghat.lower.point,res$g.hat,res$ghat.upper.point),group=group, group_col = group_type,group_line =group_ci_type ) - - p<-ggplot(data=df)+ - aes(x=exp(income),y=outcome,colour=group )+ - theme_bw()+ - xlab("Income")+ - ylab("Net TFA, (thousand dollars)")+ - scale_colour_manual(values=c("black","blue","blue","blue","blue"))+ - theme(plot.title = element_text(hjust = 0.5),text=element_text(size=20, family="serif"))+ - theme(legend.title=element_blank())+ - theme(legend.position="none")+ - ylim(low=lowy,high=highy)+ - geom_line(aes(linetype = group_line),size=1.5)+ - scale_linetype_manual(values=c("dashed","solid"))+ - ggtitle(title) - - } - - - - return(p) -} -``` - -```{r} -res_ortho_rf_splines=second_stage(fs.hat=fs.hat.rf,X=X,D=D,Y=Y,ss_method="splines",max_degree=3) -``` - -```{r} -res_ortho_rf_ortho_poly=second_stage(fs.hat=fs.hat.rf,X=X,D=D,Y=Y,ss_method="ortho_poly",max_degree=3) -``` - -#plot findings: - --- black solid line shows estimated function $p(x)' \widehat{\beta}$ - --- blue dashed lines show pointwise confidence bands for this function - -```{r} -p<-make_plot(res_ortho_rf_ortho_poly,ss_method="ortho_poly",uniform=FALSE, lowy=-10,highy=20) -options(repr.plot.width=15, repr.plot.height=10) -print(p) -``` - -plot findings: - --- black solid line shows estimated function $p(x)' \widehat{\beta}$ - --- blue dashed lines show pointwise confidence bands for this function. I.e., for each fixed point $x_0$, i.e., $x_0=1$, they cover $p(x_0)'\beta_0$ with probability 0.95 - --- blue solid lines show uniform confidence bands for this function. 
I.e., they cover the whole function $x \rightarrow p(x)'\beta_0$ with probability 0.95 on some compact range - -```{r} -p<-make_plot(res_ortho_rf_ortho_poly,ss_method="ortho polynomials",uniform=TRUE,lowy=-10,highy=25) -options(repr.plot.width=15, repr.plot.height=10) -print(p) -``` - -```{r} -p<-make_plot(res_ortho_rf_splines,ss_method="splines",uniform=FALSE, lowy=-15,highy=10) -options(repr.plot.width=15, repr.plot.height=10) -print(p) -``` - -```{r} -p<-make_plot(res_ortho_rf_splines,ss_method="splines",uniform=TRUE,lowy=-20,highy=20) -options(repr.plot.width=15, repr.plot.height=10) -print(p) -``` - -```{r} - -``` From c0048e7185e1aec8de77b1017dcf1f1ab494a81c Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 10:00:08 -0700 Subject: [PATCH 122/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 32eb20b6..0db20e87 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5', 'CM1', 'CM2', 'CM3', 'AC1'] #, 'AC2', 'T'] + directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5', 'CM1', 'CM2', 'CM3', 'AC1', 'AC2', 'T'] steps: - name: Checkout repository uses: actions/checkout@v2 From c46ebaa3c0f64c95a38caff061fd1619df61f6d9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jul 2024 17:06:34 +0000 Subject: [PATCH 123/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in CM2 --- CM2/r-colliderbias-hollywood.Rmd | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CM2/r-colliderbias-hollywood.Rmd b/CM2/r-colliderbias-hollywood.Rmd index ad659b8d..674a968e 100644 --- a/CM2/r-colliderbias-hollywood.Rmd +++ b/CM2/r-colliderbias-hollywood.Rmd @@ -11,6 +11,9 @@ Here the idea is that people who get to Hollywood have to have high congenility ```{r} install.packages("dagitty") +``` + +```{r} library(dagitty) ``` From 918c643a9fef0df7a696062ea45df05237a5339a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jul 2024 17:10:04 +0000 Subject: [PATCH 124/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in CM1 --- CM1/r-rct-penn-precision-adj.Rmd | 3 +++ CM1/r-rct-vaccines.Rmd | 3 +++ CM1/r-sim-precision-adj.Rmd | 3 +++ 3 files changed, 9 insertions(+) diff --git a/CM1/r-rct-penn-precision-adj.Rmd b/CM1/r-rct-penn-precision-adj.Rmd index 4eef845c..5a2dc0fa 100644 --- a/CM1/r-rct-penn-precision-adj.Rmd +++ b/CM1/r-rct-penn-precision-adj.Rmd @@ -10,6 +10,9 @@ install.packages("sandwich") install.packages("lmtest") install.packages("xtable") install.packages("hdm") +``` + +```{r} library(sandwich) library(lmtest) library(xtable) diff --git a/CM1/r-rct-vaccines.Rmd b/CM1/r-rct-vaccines.Rmd index 0145ef96..43c78d85 100644 --- a/CM1/r-rct-vaccines.Rmd +++ b/CM1/r-rct-vaccines.Rmd @@ -7,6 +7,9 @@ This notebook contains some RCT examples for teaching. 
```{r} install.packages("PropCIs") # Exact CI exploiting Bernoulli outcome using the Cornfield Procedure +``` + +```{r} library(PropCIs) ``` diff --git a/CM1/r-sim-precision-adj.Rmd b/CM1/r-sim-precision-adj.Rmd index f73208fd..da358e95 100644 --- a/CM1/r-sim-precision-adj.Rmd +++ b/CM1/r-sim-precision-adj.Rmd @@ -8,6 +8,9 @@ output: html_document ```{r} install.packages("sandwich") install.packages("lmtest") +``` + +```{r} library(sandwich) # heterokedasticity robust standard errors library(lmtest) # coefficient testing ``` From 46aafe76b0c7ea012fdca1d0e675dd1d4ee8a55a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jul 2024 17:10:32 +0000 Subject: [PATCH 125/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in AC1 --- AC1/r-proxy-controls.Rmd | 2 ++ ...analysis-with-sensmakr-and-debiased-ml.Rmd | 3 ++ ...nalysis-with-sensmakr-and-debiased-ml.irnb | 36 +++++++++---------- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/AC1/r-proxy-controls.Rmd b/AC1/r-proxy-controls.Rmd index 8f2d3a3a..21be891b 100644 --- a/AC1/r-proxy-controls.Rmd +++ b/AC1/r-proxy-controls.Rmd @@ -30,7 +30,9 @@ Since we're considering only linear models and in a low-dimensional setting, we' ```{r} install.packages("hdm") +``` +```{r} library(hdm) set.seed(1) diff --git a/AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.Rmd b/AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.Rmd index 9bec8b4c..96f7e7d7 100644 --- a/AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.Rmd +++ b/AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.Rmd @@ -95,6 +95,9 @@ install.packages("sensemakr") install.packages("lfe") install.packages("hdm") install.packages("randomForest") +``` + +```{r} library(sensemakr) library(lfe) library(hdm) diff --git a/AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.irnb b/AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.irnb index 723d0d7d..89bec79e 100644 --- a/AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.irnb +++ b/AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.irnb @@ -174,7 +174,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aac06c33", + "id": "5", "metadata": { "vscode": { "languageId": "r" @@ -193,7 +193,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5", + "id": "6", "metadata": { "id": "zipYYvHdl60m", "vscode": { @@ -210,7 +210,7 @@ }, { "cell_type": "markdown", - "id": "6", + "id": "7", "metadata": { "id": "hidden-packing", "papermill": { @@ -230,7 +230,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7", + "id": "8", "metadata": { "id": "authorized-transformation", "papermill": { @@ -272,7 +272,7 @@ }, { "cell_type": "markdown", - "id": "8", + "id": "9", "metadata": { "id": "dpvDjIKNw7Nk" }, @@ -283,7 +283,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9", + "id": "10", "metadata": { "id": "ck8nzqQbv8dz", "vscode": { @@ -309,7 +309,7 @@ }, { "cell_type": "markdown", - "id": "10", + "id": "11", "metadata": { "id": "careful-dollar", "papermill": { @@ -328,7 +328,7 @@ { "cell_type": "code", "execution_count": null, - "id": "11", + "id": "12", "metadata": { "id": "remarkable-mozambique", "papermill": { @@ -363,7 +363,7 @@ }, { "cell_type": "markdown", - "id": "12", + "id": "13", "metadata": { "id": "built-enlargement", "papermill": { @@ -382,7 +382,7 @@ { "cell_type": "code", "execution_count": null, - "id": "13", + "id": "14", "metadata": { "id": "respective-sister", "papermill": { @@ -429,7 +429,7 @@ }, { 
"cell_type": "markdown", - "id": "14", + "id": "15", "metadata": { "id": "sorted-hands", "papermill": { @@ -448,7 +448,7 @@ { "cell_type": "code", "execution_count": null, - "id": "15", + "id": "16", "metadata": { "id": "proper-accessory", "papermill": { @@ -475,7 +475,7 @@ }, { "cell_type": "markdown", - "id": "16", + "id": "17", "metadata": { "id": "charged-mauritius", "papermill": { @@ -493,7 +493,7 @@ }, { "cell_type": "markdown", - "id": "17", + "id": "18", "metadata": { "id": "charitable-placement", "papermill": { @@ -512,7 +512,7 @@ { "cell_type": "code", "execution_count": null, - "id": "18", + "id": "19", "metadata": { "id": "collect-neutral", "papermill": { @@ -556,7 +556,7 @@ { "cell_type": "code", "execution_count": null, - "id": "19", + "id": "20", "metadata": { "id": "d8eMyN7NRhYO", "vscode": { @@ -593,7 +593,7 @@ }, { "cell_type": "markdown", - "id": "20", + "id": "21", "metadata": { "id": "cUxDc1mYdMHH" }, @@ -605,7 +605,7 @@ { "cell_type": "code", "execution_count": null, - "id": "21", + "id": "22", "metadata": { "id": "obvious-there", "papermill": { From 0a45d5681abd6fba260aca303f3579c371881e32 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jul 2024 17:11:36 +0000 Subject: [PATCH 126/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in PM1 --- PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd | 2 ++ PM1/r-ols-and-lasso-for-wage-prediction.Rmd | 2 ++ 2 files changed, 4 insertions(+) diff --git a/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd b/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd index 573705f7..ea7e6892 100644 --- a/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd +++ b/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd @@ -25,7 +25,9 @@ $W$'s are a vector of worker characteristics explaining variation in wages. 
Cons install.packages("xtable") install.packages("hdm") # a library for high-dimensional metrics install.packages("sandwich") # a package used to compute robust standard errors +``` +```{r} library(hdm) library(xtable) library(sandwich) diff --git a/PM1/r-ols-and-lasso-for-wage-prediction.Rmd b/PM1/r-ols-and-lasso-for-wage-prediction.Rmd index 627cfb37..54e2607f 100644 --- a/PM1/r-ols-and-lasso-for-wage-prediction.Rmd +++ b/PM1/r-ols-and-lasso-for-wage-prediction.Rmd @@ -25,7 +25,9 @@ The variable of interest $Y$ is the (log) hourly wage rate constructed as the ra install.packages("xtable") install.packages("hdm") # a library for high-dimensional metrics install.packages("glmnet") # for lasso CV +``` +```{r} library(hdm) library(xtable) library(glmnet) From f92a567ac578bf455d3294233585bca6e02c99d9 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 10:29:23 -0700 Subject: [PATCH 127/261] Update r_functional_approximation_by_nn_and_rf.irnb --- PM3/r_functional_approximation_by_nn_and_rf.irnb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/PM3/r_functional_approximation_by_nn_and_rf.irnb b/PM3/r_functional_approximation_by_nn_and_rf.irnb index 35e30bed..1fb62763 100644 --- a/PM3/r_functional_approximation_by_nn_and_rf.irnb +++ b/PM3/r_functional_approximation_by_nn_and_rf.irnb @@ -347,16 +347,16 @@ "source": [ "build_model <- function() {\n", "\n", - " model <- keras::keras_model_sequential() %>%\n", - " keras::layer_dense(\n", + " model <- keras_model_sequential() %>%\n", + " layer_dense(\n", " units = 200, activation = \"relu\",\n", " input_shape = 1\n", " ) %>%\n", - " keras::layer_dense(units = 20, activation = \"relu\") %>%\n", - " keras::layer_dense(units = 1)\n", + " layer_dense(units = 20, activation = \"relu\") %>%\n", + " layer_dense(units = 1)\n", "\n", - " model %>% keras::compile(\n", - " optimizer = keras::optimizer_adam(lr = 0.01),\n", + " model %>% compile(\n", + " optimizer = optimizer_adam(lr = 0.01),\n", " loss = \"mse\",\n", " metrics = c(\"mae\"),\n", " )\n", From 09d0a8c351b5573e49ae1c633a67536120ad31da Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 23:02:37 -0700 Subject: [PATCH 128/261] Added install::keras --- PM3/r_functional_approximation_by_nn_and_rf.irnb | 3 ++- PM3/r_ml_wage_prediction.irnb | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/PM3/r_functional_approximation_by_nn_and_rf.irnb b/PM3/r_functional_approximation_by_nn_and_rf.irnb index 1fb62763..47a04432 100644 --- a/PM3/r_functional_approximation_by_nn_and_rf.irnb +++ b/PM3/r_functional_approximation_by_nn_and_rf.irnb @@ -39,7 +39,8 @@ "install.packages(\"randomForest\")\n", "install.packages(\"rpart\")\n", "install.packages(\"gbm\")\n", - "install.packages(\"keras\")" + "install.packages(\"keras\")\n", + "keras::install_keras()" ] }, { diff --git a/PM3/r_ml_wage_prediction.irnb b/PM3/r_ml_wage_prediction.irnb index 33ed7b2f..d25ccb59 100644 --- a/PM3/r_ml_wage_prediction.irnb +++ b/PM3/r_ml_wage_prediction.irnb @@ -57,7 +57,8 @@ "install.packages(\"nnet\")\n", "install.packages(\"gbm\")\n", "install.packages(\"rpart.plot\")\n", - "install.packages(\"keras\")" + "install.packages(\"keras\")\n", + "keras::install_keras()" ] }, { From 5f931e93ab469f9a336b9866bb0787859f9712c7 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 16 Jul 2024 23:16:34 -0700 Subject: [PATCH 129/261] Added upper bound on tensorflow for keras to work in R --- .github/workflows/transform-R-to-Rmd.yml | 18 +++++++----------- 
..._functional_approximation_by_nn_and_rf.irnb | 3 +-- PM3/r_ml_wage_prediction.irnb | 3 +-- requirements.txt | 2 +- 4 files changed, 10 insertions(+), 16 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 0db20e87..7a9dd9cf 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -19,6 +19,11 @@ jobs: - name: Checkout repository uses: actions/checkout@v2 + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libcurl4-openssl-dev + - name: Set up Python uses: actions/setup-python@v2 with: @@ -27,19 +32,10 @@ jobs: - name: Install Python dependencies run: | python -m pip install --upgrade pip - pip install nbstripout tensorflow - - - name: Install dependencies - if: "matrix.directory == 'PM5'" - run: | + pip install nbstripout if [ -f requirements.txt ]; then pip install -r requirements.txt; fi shell: bash - - - name: Install system dependencies - run: | - sudo apt-get update - sudo apt-get install -y libcurl4-openssl-dev - + - name: Set up R uses: r-lib/actions/setup-r@v2 diff --git a/PM3/r_functional_approximation_by_nn_and_rf.irnb b/PM3/r_functional_approximation_by_nn_and_rf.irnb index 47a04432..1fb62763 100644 --- a/PM3/r_functional_approximation_by_nn_and_rf.irnb +++ b/PM3/r_functional_approximation_by_nn_and_rf.irnb @@ -39,8 +39,7 @@ "install.packages(\"randomForest\")\n", "install.packages(\"rpart\")\n", "install.packages(\"gbm\")\n", - "install.packages(\"keras\")\n", - "keras::install_keras()" + "install.packages(\"keras\")" ] }, { diff --git a/PM3/r_ml_wage_prediction.irnb b/PM3/r_ml_wage_prediction.irnb index d25ccb59..33ed7b2f 100644 --- a/PM3/r_ml_wage_prediction.irnb +++ b/PM3/r_ml_wage_prediction.irnb @@ -57,8 +57,7 @@ "install.packages(\"nnet\")\n", "install.packages(\"gbm\")\n", "install.packages(\"rpart.plot\")\n", - "install.packages(\"keras\")\n", - "keras::install_keras()" + "install.packages(\"keras\")" ] }, { diff --git a/requirements.txt b/requirements.txt index 1792c752..0ef9f635 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ torchvision lightgbm>=4.1.0 xgboost~=2.0.3 seaborn>=0.13.1 -tensorflow<3.0 +tensorflow<2.16 keras>=2.15.0,<3.0 tf_keras>=2.15.1,<3.0 plotnine>=0.12.4 From 91213d1a88e5d1abbf82f113de99af1bb2097010 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jul 2024 06:33:06 +0000 Subject: [PATCH 130/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in PM5 --- PM5/Autoencoders.Rmd | 73 ++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/PM5/Autoencoders.Rmd b/PM5/Autoencoders.Rmd index 6b6b7bc3..1758fded 100644 --- a/PM5/Autoencoders.Rmd +++ b/PM5/Autoencoders.Rmd @@ -12,11 +12,14 @@ In this notebook, we'll introduce and explore "autoencoders," which are a very s 3. Train a non-linear auto-encoder that uses a deep neural network ### Overview -As explained in the text, autoencoders are a way of discovering *latent, low-dimensional structure* in a dataset. In particular, a random data vector $X \in \mathbb{R}^d$ can be said to have low-dimensional structure if we can find some functions $f: \mathbb{R}^d \to \mathbb{R}^k$ and $g: \mathbb{R}^k \to \mathbb{R}^d$, with $k \ll d$, such that $$g(f(X)) \approx X.$$ +As explained in the text, autoencoders are a way of discovering *latent, low-dimensional structure* in a dataset. 
In particular, a random data vector $X \in \mathbb{R}^d$ can be said to have low-dimensional structure if we can find some functions $f: \mathbb{R}^d \to \mathbb{R}^k$ and $g: \mathbb{R}^k \to \mathbb{R}^d$, with $k \ll d$, such that +$$g(f(X)) \approx X.$$ + In other words, $f(X)$ is a parsimonious, $k$-dimensional representation of $X$ that contains all of the information necessary to approximately reconstruct the full vector $X$. Traditionally, $f(X)$ is called an *encoding* of $X$. It turns out that this is meaningless unless we restrict what kinds of functions $f$ and $g$ are allowed to be, because it's possible to write down some (completely ugly) one-to-one function $\mathbb{R}^d \to \mathbb{R}^1$ for any $d$. This gives rise to the notion of *autoencoders* where, given some sets of reasonable functions $F$ and $G$, we aim to minimize -$$\mathbb{E}[\mathrm{loss}(X, f(g(X))]$$ over functions $f \in F$ and $g \in G$. As usual, this is done by minimizing the sample analog. +$$\mathbb{E}[\mathrm{loss}(X, f(g(X))]$$ +over functions $f \in F$ and $g \in G$. As usual, this is done by minimizing the sample analog. @@ -39,6 +42,8 @@ install.packages("gridExtra") install.packages("dplyr") install.packages("purrr") install.packages("reshape2") +install.packages("ggplot2") +install.packages("stats") ``` ```{r} @@ -59,8 +64,11 @@ design_matrix <- array_reshape(faces$images, c(n_examples, height * width)) n_features <- dim(design_matrix)[2] # Print the dataset details -cat(sprintf("Labeled Faces in the Wild Dataset:\n Number of examples: %d\n Number of features: %d\n Image height: %d\n Image width: %d", - n_examples, n_features, height, width)) +cat(sprintf( + paste("Labeled Faces in the Wild Dataset:\n Number of examples: %d\n ", + "Number of features: %d\n Image height: %d\n Image width: %d"), + n_examples, n_features, height, width +)) ``` ```{r} @@ -71,25 +79,25 @@ library(grid) # Find indices where the label is 'Arnold Schwarzenegger' # faces$target uses python style indexing that starts at 0 rather than R style # indexing that starts at 1, so we subtract 1 so the indexing lines up -arnold_labels <- which(faces$target_names == "Arnold Schwarzenegger")-1 +arnold_labels <- which(faces$target_names == "Arnold Schwarzenegger") - 1 # Get indices of all images corresponding to Arnold arnold_pics <- which(faces$target %in% arnold_labels) -plot_faces <- function(images, n_row=2, n_col=3, width, height) { - par(mfrow=c(n_row, n_col), mar=c(0.5, 0.5, 0.5, 0.5)) +plot_faces <- function(images, n_row = 2, n_col = 3, width, height) { + par(mfrow = c(n_row, n_col), mar = c(0.5, 0.5, 0.5, 0.5)) for (i in seq_len(n_row * n_col)) { if (i <= length(images)) { # image needs to be transposed for and then flipped for correct orientation # using R "image" tmp <- t(images[[i]]) - tmp <- tmp[,ncol(tmp):1] - image(tmp, col=gray.colors(256), axes=FALSE, xlab="", ylab="") + tmp <- tmp[, rev(seq_len(ncol(tmp)))] + image(tmp, col = gray.colors(256), axes = FALSE, xlab = "", ylab = "") } } } # Ensure arnold_images contains the right amount of data and is not NULL -arnold_images <- lapply(arnold_pics[1:min(6, length(arnold_pics))], function(idx) { +arnold_images <- lapply(arnold_pics[seq_len(min(6, length(arnold_pics)))], function(idx) { faces$images[idx, , ] }) @@ -109,19 +117,21 @@ eigenfaces <- pca$rotation ```{r} # 2. 
Plot the first 6 "eigenfaces," the six images whose linear span best explains the variation in our dataset pca_images <- lapply(1:6, function(idx) { - array_reshape(eigenfaces[,idx], c(height,width)) + array_reshape(eigenfaces[, idx], c(height, width)) }) plot_faces(pca_images, height = height, width = width) -# we check the first six eigenvectors/projection axes, reshaped (the eigenvectors that captured the highest variation in our dataset of images) -# here, eigenvector1 orthog to eigenvector2 and all the others => decorrelation (there's no way to reconstruct eigenvector1 using a linear combination of all the other eigenvectors) +# we check the first six eigenvectors/projection axes, reshaped (the eigenvectors that +# captured the highest variation in our dataset of images) +# here, eigenvector1 orthog to eigenvector2 and all the others => decorrelation +# (there's no way to reconstruct eigenvector1 using a linear combination of all the other eigenvectors) ``` ```{r} reconstruct <- function(image_vector, n_components, eigenfaces) { components <- eigenfaces[, 1:n_components, drop = FALSE] compimage <- components %*% (t(components) %*% image_vector) - return(array_reshape(compimage, c(height,width))) + return(array_reshape(compimage, c(height, width))) } # Select an Arnold image for reconstruction @@ -143,14 +153,15 @@ plot_faces(reconstructions, height = height, width = width) ```{r} library(keras) + encoding_dimension <- 64 input_image <- layer_input(shape = n_features) -encoded <- layer_dense(units = encoding_dimension, activation = 'linear')(input_image) -decoded <- layer_dense(units = n_features, activation = 'linear')(encoded) +encoded <- layer_dense(units = encoding_dimension, activation = "linear")(input_image) +decoded <- layer_dense(units = n_features, activation = "linear")(encoded) autoencoder <- keras_model(inputs = input_image, outputs = decoded) autoencoder %>% compile( - optimizer = 'adam', - loss = 'mse' + optimizer = "adam", + loss = "mse" ) autoencoder %>% fit( design_matrix, @@ -174,13 +185,14 @@ autoencoder %>% fit( ``` ```{r} -# Compute neural reconstruction -face_vector_flat <- as.numeric(face_vector) -reconstruction <- predict(autoencoder, matrix(face_vector_flat, nrow = 1)) library(ggplot2) library(gridExtra) library(reshape2) +# Compute neural reconstruction +face_vector_flat <- as.numeric(face_vector) +reconstruction <- predict(autoencoder, matrix(face_vector_flat, nrow = 1)) + # Do visual comparison image_height <- 62 image_width <- 47 @@ -204,26 +216,26 @@ mse Finally, let's train a nonlinear autoencoder for the same data where $F$ and $G$ are neural networks, and we restrict the dimension to be $k=64$. 
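Before the nonlinear model, a minimal side check (illustrative only, not part of the committed notebook): under squared loss, the best 64-dimensional *linear* bottleneck is the rank-64 truncated SVD, so its reconstruction error is a natural benchmark for the linear autoencoder fitted above. The sketch assumes the `design_matrix` object defined earlier in the notebook, works on the same raw pixel scale, and uses base R only.

```r
# Rank-64 reconstruction via truncated SVD (uncentered, mirroring the notebook's
# reconstruct() helper); its MSE is a benchmark for what the linear autoencoder attains.
k <- 64
sv <- svd(design_matrix, nu = k, nv = k)
x_hat <- sv$u %*% diag(sv$d[1:k]) %*% t(sv$v) # best rank-k approximation (Eckart-Young)
mean((design_matrix - x_hat)^2) # reconstruction MSE on the raw pixel scale
```

The nonlinear autoencoder itself is trained in the diff that follows.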
```{r} -# Use a nonlinear neural network - library(tensorflow) + +# Use a nonlinear neural network n_features <- 2914 encoding_dimension <- 64 input_image <- layer_input(shape = n_features) encoded <- input_image %>% - layer_dense(units = encoding_dimension, activation = 'relu') %>% - layer_dense(units = encoding_dimension, activation = 'relu') + layer_dense(units = encoding_dimension, activation = "relu") %>% + layer_dense(units = encoding_dimension, activation = "relu") decoded <- encoded %>% - layer_dense(units = encoding_dimension, activation = 'relu') %>% - layer_dense(units = n_features, activation = 'relu') + layer_dense(units = encoding_dimension, activation = "relu") %>% + layer_dense(units = n_features, activation = "relu") autoencoder <- keras_model(inputs = input_image, outputs = decoded) autoencoder %>% compile( - optimizer = 'adam', - loss = 'mse' + optimizer = "adam", + loss = "mse" ) autoencoder %>% fit( design_matrix, @@ -238,7 +250,8 @@ autoencoder %>% fit( reconstruction <- predict(autoencoder, matrix(face_vector, nrow = 1)) # Do visual comparison -plot_faces(list(reconstructions[[4]],t(matrix(reconstruction, nrow = image_width, ncol = image_height))), n_row = 1, n_col = 2, width = image_width, height = image_height) +plot_faces(list(reconstructions[[4]], t(matrix(reconstruction, nrow = image_width, ncol = image_height))), + n_row = 1, n_col = 2, width = image_width, height = image_height) # Do numeric comparison # We also normalize the black/white gradient to take values in [0,1] (divide by 255) From d8beb6f2e0ed59bb1241683358850f5a53c342ce Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jul 2024 06:34:19 +0000 Subject: [PATCH 131/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in CM3 --- CM3/r-dagitty.Rmd | 67 +++++++++--------- CM3/r-dosearch.Rmd | 170 ++++++++++++++++++++++++--------------------- 2 files changed, 121 insertions(+), 116 deletions(-) diff --git a/CM3/r-dagitty.Rmd b/CM3/r-dagitty.Rmd index 87d99014..d7885b91 100644 --- a/CM3/r-dagitty.Rmd +++ b/CM3/r-dagitty.Rmd @@ -6,17 +6,20 @@ output: html_document There are system packages that some of the R packages need. We install them here. 
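A guarded variant of that install step (an illustrative sketch only, assuming a Debian/Ubuntu machine where `sudo` and `apt-get` exist; the chunk actually being linted follows in the diff below):

```r
# Run the system installs only when apt-get is available, so the chunk
# fails gracefully on platforms without it.
if (nzchar(Sys.which("sudo")) && nzchar(Sys.which("apt-get"))) {
  system("sudo apt-get update")
  system("sudo apt-get install -y libglpk-dev libgmp-dev libxml2-dev")
} else {
  message("apt-get not found; install libglpk, libgmp and libxml2 via your system package manager.")
}
```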
```{r} -system('sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable') -system('sudo apt-get update') -system('sudo apt-get install libglpk-dev libgmp-dev libxml2-dev') +system("sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable") +system("sudo apt-get update") +system("sudo apt-get install libglpk-dev libgmp-dev libxml2-dev") ``` # Causal Identification in DAGs using Backdoor and Swigs, Equivalence Classes, Falsifiability Tests ```{r} -#install and load package +# install and load package install.packages("dagitty") install.packages("ggdag") +``` + +```{r} library(dagitty) library(ggdag) ``` @@ -26,9 +29,9 @@ library(ggdag) The following DAG is due to Judea Pearl ```{r} -#generate a couple of DAGs and plot them +# generate a couple of DAGs and plot them -G = dagitty('dag{ +G <- dagitty('dag{ Z1 [pos="-2,-1.5"] X1 [pos="-2,0"] Z2 [pos="1.5,-1.5"] @@ -51,7 +54,7 @@ D->M }') -ggdag(G)+ theme_dag() +ggdag(G) + theme_dag() ``` # Report Relatives of X2 @@ -61,7 +64,6 @@ print(parents(G, "X2")) print(children(G, "X2")) print(ancestors(G, "X2")) print(descendants(G, "X2")) - ``` # Find Paths Between D and Y @@ -74,19 +76,19 @@ paths(G, "D", "Y") # List All Testable Implications of the Model ```{r} -print( impliedConditionalIndependencies(G) ) +print(impliedConditionalIndependencies(G)) ``` # Identification by Backdoor: List minimal adjustment sets to identify causal effecs $D \to Y$ ```{r} -print( adjustmentSets( G, "D", "Y" ) ) +print(adjustmentSets(G, "D", "Y")) ``` # Identification via SWIG and D-separation ```{r} -SWIG = dagitty('dag{ +SWIG <- dagitty('dag{ Z1 [pos="-2,-1.5"] X1 [pos="-2,0"] Z2 [pos="1.5,-1.5"] @@ -109,14 +111,14 @@ Md-> Yd d-> Md }') -ggdag(SWIG)+ theme_dag() +ggdag(SWIG) + theme_dag() ``` # Deduce Conditional Exogeneity or Ignorability by D-separation ```{r} -print( impliedConditionalIndependencies(SWIG)[5:8] ) +print(impliedConditionalIndependencies(SWIG)[5:8]) ``` This coincides with the backdoor criterion for this graph. @@ -124,24 +126,22 @@ This coincides with the backdoor criterion for this graph. # Print All Average Effects Identifiable by Conditioning ```{r} -for( n in names(G) ){ - for( m in children(G,n) ){ - a <- adjustmentSets( G, n, m ) - if( length(a) > 0 ){ - cat("The effect ",n,"->",m, - " is identifiable by controlling for:\n",sep="") - print( a, prefix=" * " ) - } +for (n in names(G)) { + for (m in children(G, n)) { + a <- adjustmentSets(G, n, m) + if (length(a) > 0) { + cat("The effect ", n, "->", m, " is identifiable by controlling for:\n", sep = "") + print(a, prefix = " * ") } + } } ``` # Equivalence Classes ```{r} -P=equivalenceClass(G) +P <- equivalenceClass(G) plot(P) -#equivalentDAGs(G,10) ``` Next Consider the elemntary Triangular Model: @@ -151,24 +151,22 @@ $$ This model has no testable implications and is Markov-equivalent to any other DAG difined on names $(X, D, Y)$. ```{r} -G3<- dagitty('dag{ +G3 <- dagitty("dag{ D -> Y X -> D X -> Y } -') +") -ggdag(G3)+ theme_dag() +ggdag(G3) + theme_dag() print(impliedConditionalIndependencies(G3)) - ``` ```{r} -P=equivalenceClass(G3) +P <- equivalenceClass(G3) plot(P) -equivalentDAGs(G3,10) - +equivalentDAGs(G3, 10) ``` # Example of Testing DAG Validity @@ -182,9 +180,7 @@ There are many other options for nonlinear models and discrete categorical varia set.seed(1) x <- simulateSEM(G) head(x) -#cov(x) localTests(G, data = x, type = c("cis")) - ``` Next we replaced $D$ by $\bar D$ generated differently: @@ -195,10 +191,9 @@ $\bar D$ is an average of $D$ and $Y$ generated by $D$. 
We then test if the res ```{r} -x.R = x -x.R$D = (x$D+ x$Y)/2 - -localTests(G, data = x.R, type = c("cis")) +xR <- x +xR$D <- (x$D + x$Y) / 2 +localTests(G, data = xR, type = c("cis")) ``` diff --git a/CM3/r-dosearch.Rmd b/CM3/r-dosearch.Rmd index babe674c..b5d2b0e0 100644 --- a/CM3/r-dosearch.Rmd +++ b/CM3/r-dosearch.Rmd @@ -12,6 +12,9 @@ NB. In my experience, the commands are sensitive to syntax ( e.g. spacing when - ```{r} install.packages("dosearch") +``` + +```{r} library("dosearch") ``` @@ -24,13 +27,15 @@ $$ Now suppose we want conditional average policy effect. ```{r} -data <- "p(y,d,x)" #data structure +data <- "p(y, d, x)" # data structure -query <- "p(y | do(d),x)" #query -- target parameter +query <- "p(y | do(d), x)" # query -- target parameter -graph <- "x -> y +graph <- " + x -> y x -> d - d -> y" + d -> y +" dosearch(data, query, graph) ``` @@ -41,14 +46,15 @@ p_{Y(d)|X}(y|x) := p(y|do(d),x) = p(y|d,x). $$ ```{r} -data <- "p(y,d,x)" +data <- "p(y, d, x)" query <- "p(y | do(d))" -graph <- "x -> y +graph <- " + x -> y x -> d - d -> y" - + d -> y +" dosearch(data, query, graph) ``` @@ -62,13 +68,15 @@ We integrate out $x$ in the previous formula. Suppose we don't observe the confounder. The effect is generally not identified. ```{r} -data <- "p(y,d)" +data <- "p(y, d)" query <- "p(y | do(d))" -graph <- "x -> y +graph <- " + x -> y x -> d - d -> y" + d -> y +" dosearch(data, query, graph) ``` @@ -78,21 +86,21 @@ The next graph is an example of J. Pearl (different notation), where the graph i Here we try conditioning on $X_2$. This would block one backdoor path from $D$ to $Y$, but would open another path on which $X_2$ is a collider, so this shouldn't work. The application below gave a correct answer (after I put the spacings carefully). ```{r} - -data <- "p(y,d,x2)" #observed only (Y, D, X_2) - -query<- "p(y|do(d))" #target parameter - -graph<- "z1 -> x1 -z1 -> x2 -z2 -> x2 -z2 -> x3 -x2 -> d -x2 -> y -x3 -> y -x1 -> d -d -> m -m -> y +data <- "p(y, d, x2)" # observed only (Y, D, X_2) + +query <- "p(y | do(d))" # target parameter + +graph <- " + z1 -> x1 + z1 -> x2 + z2 -> x2 + z2 -> x3 + x2 -> d + x2 -> y + x3 -> y + x1 -> d + d -> m + m -> y " dosearch(data, query, graph) @@ -101,25 +109,25 @@ dosearch(data, query, graph) Intuitively, we should add more common causes. For example, adding $X_3$ and using $S = (X_2, X_3)$ should work. ```{r} - -data <- "p(y,d,x2,x3)" - -conditional.query<- "p(y|do(d),x2, x3)" #can ID conditional average effect? -query<- "p(y|do(d))" #can ID unconditional effect? - -graph<- "z1 -> x1 -z1 -> x2 -z2 -> x2 -z2 -> x3 -x2 -> d -x2 -> y -x3 -> y -x1 -> d -d -> m -m -> y +data <- "p(y, d, x2, x3)" + +conditional_query <- "p(y | do(d), x2, x3)" # can ID conditional average effect? +query <- "p(y | do(d))" # can ID unconditional effect? + +graph <- " + z1 -> x1 + z1 -> x2 + z2 -> x2 + z2 -> x3 + x2 -> d + x2 -> y + x3 -> y + x1 -> d + d -> m + m -> y " -print(dosearch(data, conditional.query, graph)) +print(dosearch(data, conditional_query, graph)) print(dosearch(data, query, graph)) ``` @@ -141,25 +149,26 @@ $$ Next we suppose that we observe only $(Y,D, M)$. Can we identify the effect $D \to Y$? Can we use back-door-criterion twice to get $D \to M$ and $M \to Y$ affect? Yes, that's called front-door criterion -- so we just need to remember only the back-door and the fact that we can use it iteratively. 
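For reference, the identification result the `dosearch` call below should recover is the standard front-door formula (stated here from the identification literature, not taken from the package output):

$$
p(y \mid \mathrm{do}(d)) \;=\; \sum_{m} p(m \mid d) \sum_{d'} p(y \mid m, d')\, p(d'),
$$

which is exactly the two back-door steps composed: $p(m \mid \mathrm{do}(d)) = p(m \mid d)$ for the $D \to M$ leg, and $p(y \mid \mathrm{do}(m)) = \sum_{d'} p(y \mid m, d')\, p(d')$ for the $M \to Y$ leg.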
```{r} -data <- "p(y,d, m)" - -query.dm<- "p(m|do(d))" -query.md<- "p(y|do(m))" -query<- "p(y|do(d))" - -graph<- "z1 -> x1 -z1 -> x2 -z2 -> x2 -z2 -> x3 -x2 -> d -x2 -> y -x3 -> y -x1 -> d -d -> m -m -> y +data <- "p(y, d, m)" + +query_dm <- "p(m | do(d))" +query_md <- "p(y | do(m))" +query <- "p(y | do(d))" + +graph <- " + z1 -> x1 + z1 -> x2 + z2 -> x2 + z2 -> x3 + x2 -> d + x2 -> y + x3 -> y + x1 -> d + d -> m + m -> y " -print(dosearch(data, query.dm, graph)) -print(dosearch(data, query.md, graph)) +print(dosearch(data, query_dm, graph)) +print(dosearch(data, query_md, graph)) print(dosearch(data, query, graph)) ``` @@ -180,26 +189,27 @@ $$ The package is very rich and allows identification analysis, when the data comes from multiple sources. Suppose we observe marginal distributions $(Y,D)$ and $(D,M)$ only. Can we identify the effect of $D \to Y$. The answer is (guess) and the package correctly recovers it. ```{r} -data <- "p(y,m) - p(m,d)" - -query.dm<- "p(m|do(d))" -query.md<- "p(y|do(m))" -query<- "p(y|do(d))" - -graph<- "z1 -> x1 -z1 -> x2 -z2 -> x2 -z2 -> x3 -x2 -> d -x2 -> y -x3 -> y -x1 -> d -d -> m -m -> y +data <- "p(y, m) + p(m, d)" + +query_dm <- "p(m | do(d))" +query_md <- "p(y | do(m))" +query <- "p(y | do(d))" + +graph <- " + z1 -> x1 + z1 -> x2 + z2 -> x2 + z2 -> x3 + x2 -> d + x2 -> y + x3 -> y + x1 -> d + d -> m + m -> y " -print(dosearch(data, query.dm, graph)) -print(dosearch(data, query.md, graph)) +print(dosearch(data, query_dm, graph)) +print(dosearch(data, query_md, graph)) print(dosearch(data, query, graph)) ``` From b993d1096d371f0ffe03e6fa2224a5bc3c737460 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jul 2024 06:34:32 +0000 Subject: [PATCH 132/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in PM3 --- ..._functional_approximation_by_nn_and_rf.Rmd | 15 ++++---- ...functional_approximation_by_nn_and_rf.irnb | 38 +++++++++---------- PM3/r_ml_wage_prediction.Rmd | 11 +++++- 3 files changed, 37 insertions(+), 27 deletions(-) diff --git a/PM3/r_functional_approximation_by_nn_and_rf.Rmd b/PM3/r_functional_approximation_by_nn_and_rf.Rmd index be988103..aca4a796 100644 --- a/PM3/r_functional_approximation_by_nn_and_rf.Rmd +++ b/PM3/r_functional_approximation_by_nn_and_rf.Rmd @@ -16,8 +16,9 @@ install.packages("randomForest") install.packages("rpart") install.packages("gbm") install.packages("keras") +``` - +```{r} library(randomForest) library(rpart) library(gbm) @@ -108,16 +109,16 @@ points(x_train, pred_bt, col = 4, pch = 19) ```{r} build_model <- function() { - model <- keras::keras_model_sequential() %>% - keras::layer_dense( + model <- keras_model_sequential() %>% + layer_dense( units = 200, activation = "relu", input_shape = 1 ) %>% - keras::layer_dense(units = 20, activation = "relu") %>% - keras::layer_dense(units = 1) + layer_dense(units = 20, activation = "relu") %>% + layer_dense(units = 1) - model %>% keras::compile( - optimizer = keras::optimizer_adam(lr = 0.01), + model %>% compile( + optimizer = optimizer_adam(lr = 0.01), loss = "mse", metrics = c("mae"), ) diff --git a/PM3/r_functional_approximation_by_nn_and_rf.irnb b/PM3/r_functional_approximation_by_nn_and_rf.irnb index 1fb62763..ef445b1b 100644 --- a/PM3/r_functional_approximation_by_nn_and_rf.irnb +++ b/PM3/r_functional_approximation_by_nn_and_rf.irnb @@ -45,7 +45,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c39fef90", + "id": "2", "metadata": { "vscode": { "languageId": "r" @@ -61,7 +61,7 @@ }, { "cell_type": "markdown", 
- "id": "2", + "id": "3", "metadata": { "id": "widespread-mention", "papermill": { @@ -79,7 +79,7 @@ }, { "cell_type": "markdown", - "id": "3", + "id": "4", "metadata": { "id": "C4WFqJKmC25Z" }, @@ -92,7 +92,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4", + "id": "5", "metadata": { "id": "registered-correction", "papermill": { @@ -125,7 +125,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5", + "id": "6", "metadata": { "id": "banner-sleeve", "papermill": { @@ -156,7 +156,7 @@ }, { "cell_type": "markdown", - "id": "6", + "id": "7", "metadata": { "id": "local-saturn", "papermill": { @@ -174,7 +174,7 @@ }, { "cell_type": "markdown", - "id": "7", + "id": "8", "metadata": { "id": "international-serum", "papermill": { @@ -197,7 +197,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8", + "id": "9", "metadata": { "id": "further-siemens", "papermill": { @@ -222,7 +222,7 @@ }, { "cell_type": "markdown", - "id": "9", + "id": "10", "metadata": { "id": "infrared-belgium", "papermill": { @@ -241,7 +241,7 @@ { "cell_type": "code", "execution_count": null, - "id": "10", + "id": "11", "metadata": { "id": "naval-twenty", "papermill": { @@ -275,7 +275,7 @@ { "cell_type": "code", "execution_count": null, - "id": "11", + "id": "12", "metadata": { "id": "listed-michigan", "papermill": { @@ -307,7 +307,7 @@ }, { "cell_type": "markdown", - "id": "12", + "id": "13", "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", @@ -328,7 +328,7 @@ { "cell_type": "code", "execution_count": null, - "id": "13", + "id": "14", "metadata": { "id": "behind-redhead", "papermill": { @@ -366,7 +366,7 @@ { "cell_type": "code", "execution_count": null, - "id": "14", + "id": "15", "metadata": { "id": "modified-monitor", "papermill": { @@ -390,7 +390,7 @@ { "cell_type": "code", "execution_count": null, - "id": "15", + "id": "16", "metadata": { "id": "early-savannah", "papermill": { @@ -419,7 +419,7 @@ { "cell_type": "code", "execution_count": null, - "id": "16", + "id": "17", "metadata": { "id": "answering-ready", "papermill": { @@ -447,7 +447,7 @@ }, { "cell_type": "markdown", - "id": "17", + "id": "18", "metadata": { "id": "RAE1DNS1TL8K" }, @@ -458,7 +458,7 @@ { "cell_type": "code", "execution_count": null, - "id": "18", + "id": "19", "metadata": { "id": "_cyeRToRTORV", "vscode": { @@ -486,7 +486,7 @@ { "cell_type": "code", "execution_count": null, - "id": "19", + "id": "20", "metadata": { "id": "FuBqP_e7Te5Y", "vscode": { diff --git a/PM3/r_ml_wage_prediction.Rmd b/PM3/r_ml_wage_prediction.Rmd index 1cf232e3..c5cd0bf7 100644 --- a/PM3/r_ml_wage_prediction.Rmd +++ b/PM3/r_ml_wage_prediction.Rmd @@ -10,7 +10,6 @@ Now, we also consider nonlinear prediction rules including tree-based methods. ```{r} # Import relevant packages - install.packages("xtable") install.packages("hdm") install.packages("glmnet") @@ -20,7 +19,9 @@ install.packages("nnet") install.packages("gbm") install.packages("rpart.plot") install.packages("keras") +``` +```{r} library(hdm) library(xtable) library(glmnet) @@ -495,9 +496,17 @@ We illustrate how to predict an outcome variable Y in a high-dimensional setting We can use AutoML as a benchmark and compare it to the methods that we used previously where we applied one machine learning method after the other. +On a unix system, installation of the package h2o also requires the RCurl package, which requires the libcurl library to have been installed. 
If the installation of the h2o package fails, try installing the libcurl package first and repeating the cell. You can install the libcurl package for instance by running: +```bash +sudo apt-get install -y libcurl4-openssl-dev +``` + ```{r} # load the H2O package install.packages("h2o") +``` + +```{r} library(h2o) ``` From 8947b02286c29fb190b3573e41ee84f97506d717 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Wed, 17 Jul 2024 12:50:02 +0300 Subject: [PATCH 133/261] Update r-dml-401k-IV.irnb --- AC2/r-dml-401k-IV.irnb | 1 + 1 file changed, 1 insertion(+) diff --git a/AC2/r-dml-401k-IV.irnb b/AC2/r-dml-401k-IV.irnb index ba63f5e7..04924081 100644 --- a/AC2/r-dml-401k-IV.irnb +++ b/AC2/r-dml-401k-IV.irnb @@ -60,6 +60,7 @@ "install.packages(\"randomForest\")\n", "install.packages(\"glmnet\")\n", "install.packages(\"rpart\")\n", + "install.packages(\"data.table\")\n", "install.packages(\"gbm\")" ] }, From fdecbf998cf34176145f1bda31661c2147ad26b1 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Wed, 17 Jul 2024 12:51:26 +0300 Subject: [PATCH 134/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 7a9dd9cf..91ddf548 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -77,7 +77,7 @@ jobs: if (length(lints) > 0) { cat("Warnings found during linting:\n") print(lints) - # stop("Linting failed with warnings") + stop("Linting failed with warnings") } }) ' From b2d7222be8ad944e2b7b875d57f104693b5538ee Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Wed, 17 Jul 2024 05:17:21 -0700 Subject: [PATCH 135/261] Update dml-for-conditional-average-treatment-effect.irnb --- T/dml-for-conditional-average-treatment-effect.irnb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/T/dml-for-conditional-average-treatment-effect.irnb b/T/dml-for-conditional-average-treatment-effect.irnb index ad6b0c01..ea413404 100644 --- a/T/dml-for-conditional-average-treatment-effect.irnb +++ b/T/dml-for-conditional-average-treatment-effect.irnb @@ -595,8 +595,8 @@ "outputs": [], "source": [ "msqrt <- function(C) {\n", - " C.eig <- eigen(C)\n", - " return(C.eig$vectors %*% diag(sqrt(C.eig$values)) %*% solve(C.eig$vectors))\n", + " Ceig <- eigen(C)\n", + " return(Ceig$vectors %*% diag(sqrt(Ceig$values)) %*% solve(Ceig$vectors))\n", "}\n", "\n", "tboot <- function(regressors_grid, omega_hat, alpha, B = 10000) {\n", From ad084b2a664f3c07afde0679ad44ea8699c1077a Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Wed, 17 Jul 2024 08:25:07 -0700 Subject: [PATCH 136/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 25 +++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 91ddf548..74c82aea 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -87,24 +87,24 @@ jobs: run: | log_file="${{ matrix.directory }}_r_script_execution.log" R -e ' + options(show.error.locations = TRUE) files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.R$", full.names = TRUE, recursive = FALSE) log_con <- file("'$log_file'", open = "wt") sink(log_con, type = "output") sink(log_con, type = "message") errors <- list() for (gitrfile in files) { - tryCatch( - { - source(gitrfile) - }, + withCallingHandlers( 
+ withRestarts( + source(gitrfile), + muffleStop = function() NULL + ), error = function(e) { - traceback_info <- capture.output({ - cat("Traceback:\n") - traceback() - }) - errors[[length(errors) + 1]] <<- list(gitrfile = gitrfile, message = e$message, traceback = traceback_info) - } - ) + traceback_info <- sys.calls() + errors[[length(errors) + 1]] <<- list(gitrfile = gitrfile, location = capture.output(print(e$call)), message = e$message, traceback = traceback_info) + invokeRestart("muffleStop") + } + ) } sink(type = "output") sink(type = "message") @@ -112,8 +112,11 @@ jobs: if (length(errors) > 0) { for (error in errors) { cat("Error found in file:", error$gitrfile, "\n") + cat("at line::", error$location, "\n") cat("Error message:", error$message, "\n") + print("Traceback:\n") cat(paste(error$traceback, collapse = "\n")) + print("\n") } quit(status = 1, save = "no") # Exit with an error status if errors are found } From 9ebc5c97541c8b3c51204088711b4f81030cd245 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Wed, 17 Jul 2024 08:27:26 -0700 Subject: [PATCH 137/261] Update r-dml-401k-IV.irnb --- AC2/r-dml-401k-IV.irnb | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/AC2/r-dml-401k-IV.irnb b/AC2/r-dml-401k-IV.irnb index 04924081..7c760e3d 100644 --- a/AC2/r-dml-401k-IV.irnb +++ b/AC2/r-dml-401k-IV.irnb @@ -863,18 +863,18 @@ "cat(sprintf(\"\\nDML with Boosted Trees \\n\"))\n", "\n", "# NB: early stopping cannot easily be implemented with gbm\n", - "## set n_trees = best, where best <- gbm.perf(dreg_boost, plot.it = FALSE)\n", + "## set n.trees = best, where best <- gbm.perf(dreg_boost, plot.it = FALSE)\n", "dreg_boost <- function(x, d) {\n", " gbm(as.formula(\"D~.\"), cbind(data.frame(D = d), x), distribution = \"bernoulli\",\n", - " interaction.depth = 2, n_trees = 100, shrinkage = .1)\n", + " interaction.depth = 2, n.trees = 100, shrinkage = .1)\n", "}\n", "yreg_boost <- function(x, y) {\n", " gbm(as.formula(\"y~.\"), cbind(data.frame(y = y), x), distribution = \"gaussian\",\n", - " interaction.depth = 2, n_trees = 100, shrinkage = .1)\n", + " interaction.depth = 2, n.trees = 100, shrinkage = .1)\n", "}\n", "zreg_boost <- function(x, z) {\n", " gbm(as.formula(\"Z~.\"), cbind(data.frame(Z = z), x), distribution = \"bernoulli\",\n", - " interaction.depth = 2, n_trees = 100, shrinkage = .1)\n", + " interaction.depth = 2, n.trees = 100, shrinkage = .1)\n", "}\n", "\n", "# passing these through regression as type=\"response\", and D and Z should not be factors!\n", @@ -1513,26 +1513,26 @@ "cat(sprintf(\"\\nDML with Boosted Trees \\n\"))\n", "\n", "# NB: early stopping cannot easily be implemented with gbm\n", - "## set n_trees = best, where best <- gbm.perf(dreg_boost, plot.it = FALSE)\n", + "## set n.trees = best, where best <- gbm.perf(dreg_boost, plot.it = FALSE)\n", "dreg0 <- function(x, d) {\n", " gbm(as.formula(\"D ~ .\"), cbind(data.frame(D = d), x), distribution = \"bernoulli\",\n", - " interaction.depth = 2, n_trees = 100, shrinkage = .1)\n", + " interaction.depth = 2, n.trees = 100, shrinkage = .1)\n", "}\n", "dreg1 <- function(x, d) {\n", " gbm(as.formula(\"D ~ .\"), cbind(data.frame(D = d), x), distribution = \"bernoulli\",\n", - " interaction.depth = 2, n_trees = 100, shrinkage = .1)\n", + " interaction.depth = 2, n.trees = 100, shrinkage = .1)\n", "}\n", "yreg0 <- function(x, y) {\n", " gbm(as.formula(\"y ~ .\"), cbind(data.frame(y = y), x), distribution = \"gaussian\",\n", - " interaction.depth = 2, n_trees = 100, shrinkage = .1)\n", + " 
interaction.depth = 2, n.trees = 100, shrinkage = .1)\n", "}\n", "yreg1 <- function(x, y) {\n", " gbm(as.formula(\"y ~ .\"), cbind(data.frame(y = y), x), distribution = \"gaussian\",\n", - " interaction.depth = 2, n_trees = 100, shrinkage = .1)\n", + " interaction.depth = 2, n.trees = 100, shrinkage = .1)\n", "}\n", "zreg <- function(x, z) {\n", " gbm(as.formula(\"Z ~ .\"), cbind(data.frame(Z = z), x), distribution = \"bernoulli\",\n", - " interaction.depth = 2, n_trees = 100, shrinkage = .1)\n", + " interaction.depth = 2, n.trees = 100, shrinkage = .1)\n", "}\n", "\n", "# passing these through regression as type=\"response\", and D and Z should not be factors!\n", From 6fbbd4ba0402d5b994264f1089fe78631cf65d91 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Wed, 17 Jul 2024 10:09:47 -0700 Subject: [PATCH 138/261] Executions errors in AC2 --- AC2/r-dml-401k-IV.irnb | 8 ++++---- AC2/r-weak-iv-experiments.irnb | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/AC2/r-dml-401k-IV.irnb b/AC2/r-dml-401k-IV.irnb index 7c760e3d..e6e9ef1d 100644 --- a/AC2/r-dml-401k-IV.irnb +++ b/AC2/r-dml-401k-IV.irnb @@ -1143,7 +1143,7 @@ " } else if (type == \"prob\") {\n", " return(predict(dfit0, (Xb), type = \"prob\")[, 2])\n", " } else {\n", - " return(predict(dfit0, (Xb)))\n", + " return(predict(dfit0, (Xb))[, 2])\n", " }\n", " } else {\n", " return(0)\n", @@ -1160,7 +1160,7 @@ " } else if (type == \"prob\") {\n", " return(predict(dfit1, (Xb), type = \"response\")[, 2])\n", " } else {\n", - " return(predict(dfit1, (Xb)))\n", + " return(predict(dfit1, (Xb))[, 2])\n", " }\n", " } else {\n", " return(1)\n", @@ -1251,8 +1251,8 @@ " yfit1 <- yreg1((XZ1), yZ1)\n", " yhat1[I[[b]]] <- predict(yfit1, (Xb))\n", "\n", - " dhat0[I[[b]]] <- get_dhat0(XZ0, as.factor(DZ0), Xb, dreg0, type = \"prob\")\n", - " dhat1[I[[b]]] <- get_dhat1(XZ1, as.factor(DZ1), Xb, dreg1, type = \"prob\")\n", + " dhat0[I[[b]]] <- get_dhat0(XZ0, as.factor(DZ0), Xb, dreg0)\n", + " dhat1[I[[b]]] <- get_dhat1(XZ1, as.factor(DZ1), Xb, dreg1)\n", "\n", " } else if (method == \"boostedtrees\") {\n", " XZ0 <- as.data.frame(XZ0)\n", diff --git a/AC2/r-weak-iv-experiments.irnb b/AC2/r-weak-iv-experiments.irnb index 203c1e67..588f7e97 100644 --- a/AC2/r-weak-iv-experiments.irnb +++ b/AC2/r-weak-iv-experiments.irnb @@ -20,6 +20,7 @@ }, "outputs": [], "source": [ + "install.packages(\"stats\")\n", "install.packages(\"hdm\")" ] }, @@ -33,6 +34,7 @@ }, "outputs": [], "source": [ + "library(stats)\n", "library(hdm)" ] }, From fc6230c24eb367f609b928d52e4a9493be28df85 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Wed, 17 Jul 2024 11:54:29 -0700 Subject: [PATCH 139/261] Update r-dml-401k-IV.irnb --- AC2/r-dml-401k-IV.irnb | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/AC2/r-dml-401k-IV.irnb b/AC2/r-dml-401k-IV.irnb index e6e9ef1d..36cb1d1b 100644 --- a/AC2/r-dml-401k-IV.irnb +++ b/AC2/r-dml-401k-IV.irnb @@ -1133,11 +1133,15 @@ }, "outputs": [], "source": [ - "get_dhat0 <- function(XZ0, DZ0, Xb, dreg0, type = NULL) {\n", + "get_dhat0 <- function(XZ0, DZ0, Xb, dreg0, type = NULL, to_factor = FALSE) {\n", " # train a treatment model on training data that received Z=0 and predict treatment on all data in test set\n", " if (mean(DZ0) > 0) { # it could be that D=0, whenever Z=0 deterministically\n", " dreg0_ <- dreg0\n", - " dfit0 <- dreg0_((XZ0), DZ0)\n", + " if (to_factor == TRUE) {\n", + " dfit0 <- dreg0_((XZ0), as.factor(DZ0))\n", + " } else {\n", + " dfit0 <- dreg0_((XZ0), DZ0)\n", + " }\n", " if (type == 
\"reponse\") {\n", " return(predict(dfit0, (Xb), type = \"response\"))\n", " } else if (type == \"prob\") {\n", @@ -1150,11 +1154,15 @@ " }\n", "}\n", "\n", - "get_dhat1 <- function(XZ1, DZ1, Xb, dreg1, type = NULL) {\n", + "get_dhat1 <- function(XZ1, DZ1, Xb, dreg1, type = NULL, to_factor = FALSE) {\n", " # train a treamtent model on training data that received Z=1 and predict treatment on all data in test set\n", " if (mean(DZ1) < 1) { # it could be that D=1, whenever Z=1 deterministically\n", " dreg1_ <- dreg1\n", - " dfit1 <- dreg1_((XZ1), DZ1)\n", + " if (to_factor == TRUE) {\n", + " dfit1 <- dreg1_((XZ1), as.factor(DZ1))\n", + " } else {\n", + " dfit1 <- dreg1_((XZ1), DZ1)\n", + " }\n", " if (type == \"response\") {\n", " return(predict(dfit1, (Xb), type = \"response\"))\n", " } else if (type == \"prob\") {\n", @@ -1251,8 +1259,8 @@ " yfit1 <- yreg1((XZ1), yZ1)\n", " yhat1[I[[b]]] <- predict(yfit1, (Xb))\n", "\n", - " dhat0[I[[b]]] <- get_dhat0(XZ0, as.factor(DZ0), Xb, dreg0)\n", - " dhat1[I[[b]]] <- get_dhat1(XZ1, as.factor(DZ1), Xb, dreg1)\n", + " dhat0[I[[b]]] <- get_dhat0(XZ0, DZ0, Xb, dreg0, to_factor = TRUE)\n", + " dhat1[I[[b]]] <- get_dhat1(XZ1, DZ1, Xb, dreg1, to_factor = TRUE)\n", "\n", " } else if (method == \"boostedtrees\") {\n", " XZ0 <- as.data.frame(XZ0)\n", From 8472c2fe7c14a15375e42ba2ccc9f87fe27cebdb Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Wed, 17 Jul 2024 12:03:06 -0700 Subject: [PATCH 140/261] Update r-dml-401k-IV.irnb --- AC2/r-dml-401k-IV.irnb | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/AC2/r-dml-401k-IV.irnb b/AC2/r-dml-401k-IV.irnb index 36cb1d1b..31ff0d91 100644 --- a/AC2/r-dml-401k-IV.irnb +++ b/AC2/r-dml-401k-IV.irnb @@ -1133,14 +1133,14 @@ }, "outputs": [], "source": [ - "get_dhat0 <- function(XZ0, DZ0, Xb, dreg0, type = NULL, to_factor = FALSE) {\n", + "get_dhat0 <- function(XZ0, DZ0, Xb, dreg0, type = NULL, DZ0factor = NULL) {\n", " # train a treatment model on training data that received Z=0 and predict treatment on all data in test set\n", " if (mean(DZ0) > 0) { # it could be that D=0, whenever Z=0 deterministically\n", " dreg0_ <- dreg0\n", - " if (to_factor == TRUE) {\n", - " dfit0 <- dreg0_((XZ0), as.factor(DZ0))\n", - " } else {\n", + " if (DZ0factor == NULL) {\n", " dfit0 <- dreg0_((XZ0), DZ0)\n", + " } else {\n", + " dfit0 <- dreg0_((XZ0), DZ0factor)\n", " }\n", " if (type == \"reponse\") {\n", " return(predict(dfit0, (Xb), type = \"response\"))\n", @@ -1154,14 +1154,14 @@ " }\n", "}\n", "\n", - "get_dhat1 <- function(XZ1, DZ1, Xb, dreg1, type = NULL, to_factor = FALSE) {\n", + "get_dhat1 <- function(XZ1, DZ1, Xb, dreg1, type = NULL, DZ1factor = NULL) {\n", " # train a treamtent model on training data that received Z=1 and predict treatment on all data in test set\n", " if (mean(DZ1) < 1) { # it could be that D=1, whenever Z=1 deterministically\n", " dreg1_ <- dreg1\n", - " if (to_factor == TRUE) {\n", - " dfit1 <- dreg1_((XZ1), as.factor(DZ1))\n", - " } else {\n", + " if (DZ1factor == NULL) {\n", " dfit1 <- dreg1_((XZ1), DZ1)\n", + " } else {\n", + " dfit1 <- dreg1_((XZ1), DZ1factor)\n", " }\n", " if (type == \"response\") {\n", " return(predict(dfit1, (Xb), type = \"response\"))\n", @@ -1243,8 +1243,8 @@ " yfit1 <- yreg1((XZ1), yZ1)\n", " yhat1[I[[b]]] <- predict(yfit1, (Xb), type = \"response\")\n", "\n", - " dhat0[I[[b]]] <- get_dhat0(XZ0, DZ0factor, Xb, dreg0, type = \"prob\")\n", - " dhat1[I[[b]]] <- get_dhat1(XZ1, DZ1factor, Xb, dreg1, type = \"prob\")\n", + " dhat0[I[[b]]] <- 
get_dhat0(XZ0, DZ0, Xb, dreg0, type = \"prob\", DZ0factor = DZ0factor)\n", + " dhat1[I[[b]]] <- get_dhat1(XZ1, DZ1, Xb, dreg1, type = \"prob\", DZ1factor = DZ1factor)\n", "\n", " } else if (method == \"decisiontrees\") {\n", " XZ0 <- as.data.frame(XZ0)\n", @@ -1259,8 +1259,8 @@ " yfit1 <- yreg1((XZ1), yZ1)\n", " yhat1[I[[b]]] <- predict(yfit1, (Xb))\n", "\n", - " dhat0[I[[b]]] <- get_dhat0(XZ0, DZ0, Xb, dreg0, to_factor = TRUE)\n", - " dhat1[I[[b]]] <- get_dhat1(XZ1, DZ1, Xb, dreg1, to_factor = TRUE)\n", + " dhat0[I[[b]]] <- get_dhat0(XZ0, DZ0, Xb, dreg0, DZ0factor = as.factor(DZ0))\n", + " dhat1[I[[b]]] <- get_dhat1(XZ1, DZ1, Xb, dreg1, DZ1factor = as.factor(DZ1))\n", "\n", " } else if (method == \"boostedtrees\") {\n", " XZ0 <- as.data.frame(XZ0)\n", From 39db4dd54d66a6977d4f0fd35a7e21c7b3b59af7 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Wed, 17 Jul 2024 23:05:41 -0700 Subject: [PATCH 141/261] Linting and execution errors The function "summary" was defined in the r-dml-401k-IV notebook, but then because summary is a global function used by the stats package, redefining it , basically redefined the global variable. Any script that executes after this script, will have the summary function redefined and this throws an error. Better not re-use and re-define such global library R functions. --- AC2/r-dml-401k-IV.irnb | 68 ++++++++++++++-------------- AC2/r-weak-iv-experiments.irnb | 2 - PM2/r_experiment_non_orthogonal.irnb | 6 +-- PM5/Autoencoders.irnb | 3 +- 4 files changed, 37 insertions(+), 42 deletions(-) diff --git a/AC2/r-dml-401k-IV.irnb b/AC2/r-dml-401k-IV.irnb index 31ff0d91..ab1a959d 100644 --- a/AC2/r-dml-401k-IV.irnb +++ b/AC2/r-dml-401k-IV.irnb @@ -625,7 +625,7 @@ }, "outputs": [], "source": [ - "summary <- function(point, stderr, resy, resD, resZ, name) {\n", + "summary_for_plivm <- function(point, stderr, resy, resD, resZ, name) {\n", " data <- data.frame(\n", " estimate = point, # point estimate\n", " stderr = stderr, # standard error\n", @@ -680,8 +680,8 @@ "\n", "dml2_results <- dml2_for_plivm(as.matrix(X), D, Z, y, dreg_lasso_cv, yreg_lasso_cv, zreg_lasso_cv,\n", " nfold = 5, method = \"regression\")\n", - "sum_lasso_cv <- summary(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil,\n", - " dml2_results$ztil, name = \"LassoCV\")\n", + "sum_lasso_cv <- summary_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil,\n", + " dml2_results$ztil, name = \"LassoCV\")\n", "tableplr <- data.frame()\n", "tableplr <- rbind(sum_lasso_cv)\n", "tableplr\n", @@ -729,8 +729,8 @@ "\n", "dml2_results <- dml2_for_plivm(as.matrix(X), D, Z, y, dreg_ridge_cv, yreg_ridge_cv, zreg_ridge_cv,\n", " nfold = 5, method = \"regression\")\n", - "sum_lasso_ridge_cv <- summary(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil,\n", - " dml2_results$ztil, name = \"LassoCV/LogisticCV\")\n", + "sum_lasso_ridge_cv <- summary_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil,\n", + " dml2_results$ztil, name = \"LassoCV/LogisticCV\")\n", "tableplr <- rbind(tableplr, sum_lasso_ridge_cv)\n", "tableplr\n", "\n", @@ -777,8 +777,8 @@ "\n", "dml2_results <- dml2_for_plivm(as.matrix(X), as.factor(D), as.factor(Z), y, dreg_rf, yreg_rf, zreg_rf,\n", " nfold = 5, method = \"randomforest\")\n", - "sum_rf <- summary(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil,\n", - " dml2_results$ztil, name = \"RF\")\n", + "sum_rf <- summary_for_plivm(dml2_results$coef_est, 
dml2_results$se, dml2_results$ytil, dml2_results$dtil,\n", + " dml2_results$ztil, name = \"RF\")\n", "tableplr <- rbind(tableplr, sum_rf)\n", "tableplr\n", "\n", @@ -826,8 +826,8 @@ "\n", "dml2_results <- dml2_for_plivm(X, D, Z, y, dreg_tr, yreg_tr, zreg_tr,\n", " nfold = 5, method = \"decisiontrees\")\n", - "sum_tr <- summary(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil,\n", - " dml2_results$ztil, name = \"Decision Trees\")\n", + "sum_tr <- summary_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil,\n", + " dml2_results$ztil, name = \"Decision Trees\")\n", "tableplr <- rbind(tableplr, sum_tr)\n", "tableplr\n", "\n", @@ -880,8 +880,8 @@ "# passing these through regression as type=\"response\", and D and Z should not be factors!\n", "dml2_results <- dml2_for_plivm(X, D, Z, y, dreg_boost, yreg_boost, zreg_boost,\n", " nfold = 5, method = \"regression\")\n", - "sum_boost <- summary(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil,\n", - " dml2_results$ztil, name = \"Boosted Trees\")\n", + "sum_boost <- summary_for_plivm_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil,\n", + " dml2_results$ztil, name = \"Boosted Trees\")\n", "tableplr <- rbind(tableplr, sum_boost)\n", "tableplr\n", "\n", @@ -924,8 +924,8 @@ "source": [ "# Best fit is boosted trees for D, Z, Y\n", "\n", - "sum_best <- summary(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil,\n", - " dml2_results$ztil, name = \"Best\")\n", + "sum_best <- summary_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil,\n", + " dml2_results$ztil, name = \"Best\")\n", "tableplr <- rbind(tableplr, sum_best)\n", "tableplr" ] @@ -980,7 +980,7 @@ "coef_est <- ivfit$coef # extract coefficient\n", "se <- ivfit$se # record standard error\n", "\n", - "sum_ma <- summary(coef_est, se, ma_ytil, ma_dtil, ma_ztil, name = \"Model Average\")\n", + "sum_ma <- summary_for_plivm(coef_est, se, ma_ytil, ma_dtil, ma_ztil, name = \"Model Average\")\n", "tableplr <- rbind(tableplr, sum_ma)\n", "tableplr" ] @@ -1137,7 +1137,7 @@ " # train a treatment model on training data that received Z=0 and predict treatment on all data in test set\n", " if (mean(DZ0) > 0) { # it could be that D=0, whenever Z=0 deterministically\n", " dreg0_ <- dreg0\n", - " if (DZ0factor == NULL) {\n", + " if (is.null(DZ0factor)) {\n", " dfit0 <- dreg0_((XZ0), DZ0)\n", " } else {\n", " dfit0 <- dreg0_((XZ0), DZ0factor)\n", @@ -1158,7 +1158,7 @@ " # train a treamtent model on training data that received Z=1 and predict treatment on all data in test set\n", " if (mean(DZ1) < 1) { # it could be that D=1, whenever Z=1 deterministically\n", " dreg1_ <- dreg1\n", - " if (DZ1factor == NULL) {\n", + " if (is.null(DZ1factor)) {\n", " dfit1 <- dreg1_((XZ1), DZ1)\n", " } else {\n", " dfit1 <- dreg1_((XZ1), DZ1factor)\n", @@ -1166,7 +1166,7 @@ " if (type == \"response\") {\n", " return(predict(dfit1, (Xb), type = \"response\"))\n", " } else if (type == \"prob\") {\n", - " return(predict(dfit1, (Xb), type = \"response\")[, 2])\n", + " return(predict(dfit1, (Xb), type = \"prob\")[, 2])\n", " } else {\n", " return(predict(dfit1, (Xb))[, 2])\n", " }\n", @@ -1339,7 +1339,7 @@ }, "outputs": [], "source": [ - "summary <- function(coef_est, se, yhat, dhat, zhat, ytil, dtil, ztil, drZ, drD, X, Z, D, y, name) {\n", + "summary_for_iivm <- function(coef_est, se, yhat, dhat, zhat, ytil, dtil, ztil, drZ, drD, X, Z, D, y, name) {\n", " 
summary_data <- data.frame(\n", " estimate = coef_est, # point estimate\n", " se = se, # standard error\n", @@ -1390,9 +1390,9 @@ "\n", "dml2_results <- dml2_for_iivm(as.matrix(X), D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg,\n", " trimming = 0.01, nfold = 5, method = \"regression\")\n", - "sum_lasso_ridge_cv <- summary(dml2_results$coef_est, dml2_results$se, dml2_results$yhat, dml2_results$dhat,\n", - " dml2_results$zhat, dml2_results$ytil, dml2_results$dtil, dml2_results$ztil,\n", - " dml2_results$drZ, dml2_results$drD, name = \"LassoCV/LogisticCV\")\n", + "sum_lasso_ridge_cv <- summary_for_iivm(dml2_results$coef_est, dml2_results$se, dml2_results$yhat, dml2_results$dhat,\n", + " dml2_results$zhat, dml2_results$ytil, dml2_results$dtil, dml2_results$ztil,\n", + " dml2_results$drZ, dml2_results$drD, name = \"LassoCV/LogisticCV\")\n", "table <- data.frame()\n", "table <- rbind(table, sum_lasso_ridge_cv)\n", "table\n", @@ -1440,9 +1440,9 @@ "\n", "dml2_results <- dml2_for_iivm(X, D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg,\n", " trimming = 0.01, nfold = 5, method = \"randomforest\")\n", - "sum_rf <- summary(dml2_results$coef_est, dml2_results$se, dml2_results$yhat, dml2_results$dhat,\n", - " dml2_results$zhat, dml2_results$ytil, dml2_results$dtil, dml2_results$ztil,\n", - " dml2_results$drZ, dml2_results$drD, name = \"RF\")\n", + "sum_rf <- summary_for_iivm(dml2_results$coef_est, dml2_results$se, dml2_results$yhat, dml2_results$dhat,\n", + " dml2_results$zhat, dml2_results$ytil, dml2_results$dtil, dml2_results$ztil,\n", + " dml2_results$drZ, dml2_results$drD, name = \"RF\")\n", "table <- rbind(table, sum_rf)\n", "table\n", "\n", @@ -1489,9 +1489,9 @@ "\n", "dml2_results <- dml2_for_iivm(X, D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg,\n", " trimming = 0.01, nfold = 5, method = \"decisiontrees\")\n", - "sum_tr <- summary(dml2_results$coef_est, dml2_results$se, dml2_results$yhat, dml2_results$dhat,\n", - " dml2_results$zhat, dml2_results$ytil, dml2_results$dtil, dml2_results$ztil,\n", - " dml2_results$drZ, dml2_results$drD, name = \"Decision Trees\")\n", + "sum_tr <- summary_for_iivm(dml2_results$coef_est, dml2_results$se, dml2_results$yhat, dml2_results$dhat,\n", + " dml2_results$zhat, dml2_results$ytil, dml2_results$dtil, dml2_results$ztil,\n", + " dml2_results$drZ, dml2_results$drD, name = \"Decision Trees\")\n", "table <- rbind(table, sum_tr)\n", "table\n", "\n", @@ -1546,9 +1546,9 @@ "# passing these through regression as type=\"response\", and D and Z should not be factors!\n", "dml2_results <- dml2_for_iivm(X, D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg,\n", " trimming = 0.01, nfold = 5, method = \"boostedtrees\")\n", - "sum_boost <- summary(dml2_results$coef_est, dml2_results$se, dml2_results$yhat, dml2_results$dhat,\n", - " dml2_results$zhat, dml2_results$ytil, dml2_results$dtil, dml2_results$ztil,\n", - " dml2_results$drZ, dml2_results$drD, name = \"Boosted Trees\")\n", + "sum_boost <- summary_for_iivm(dml2_results$coef_est, dml2_results$se, dml2_results$yhat, dml2_results$dhat,\n", + " dml2_results$zhat, dml2_results$ytil, dml2_results$dtil, dml2_results$ztil,\n", + " dml2_results$drZ, dml2_results$drD, name = \"Boosted Trees\")\n", "table <- rbind(table, sum_boost)\n", "table\n", "\n", @@ -1619,8 +1619,8 @@ "variance <- mean(psi^2) / Jhat^2\n", "se <- sqrt(variance / nrow(X))\n", "\n", - "sum_best <- summary(coef_est, se, best_yhat, best_dhat, best_zhat,\n", - " ytil_best, dtil_best, ztil_best, drZ, drD, name = \"Best\")\n", + "sum_best <- summary_for_iivm(coef_est, se, best_yhat, 
best_dhat, best_zhat,\n", + " ytil_best, dtil_best, ztil_best, drZ, drD, name = \"Best\")\n", "table <- rbind(table, sum_best)\n", "table" ] @@ -1677,8 +1677,8 @@ "variance <- mean(psi^2) / Jhat^2\n", "se <- sqrt(variance / nrow(X))\n", "\n", - "sum_ma <- summary(coef_est, se, ma_yhat, ma_dhat, ma_zhat,\n", - " ma_ytil, ma_dtil, ma_ztil, drZ, drD, name = \"Model Average\")\n", + "sum_ma <- summary_for_iivm(coef_est, se, ma_yhat, ma_dhat, ma_zhat,\n", + " ma_ytil, ma_dtil, ma_ztil, drZ, drD, name = \"Model Average\")\n", "table <- rbind(table, sum_ma)\n", "table" ] diff --git a/AC2/r-weak-iv-experiments.irnb b/AC2/r-weak-iv-experiments.irnb index 588f7e97..203c1e67 100644 --- a/AC2/r-weak-iv-experiments.irnb +++ b/AC2/r-weak-iv-experiments.irnb @@ -20,7 +20,6 @@ }, "outputs": [], "source": [ - "install.packages(\"stats\")\n", "install.packages(\"hdm\")" ] }, @@ -34,7 +33,6 @@ }, "outputs": [], "source": [ - "library(stats)\n", "library(hdm)" ] }, diff --git a/PM2/r_experiment_non_orthogonal.irnb b/PM2/r_experiment_non_orthogonal.irnb index fc1e92b2..507334a3 100644 --- a/PM2/r_experiment_non_orthogonal.irnb +++ b/PM2/r_experiment_non_orthogonal.irnb @@ -20,8 +20,7 @@ }, "outputs": [], "source": [ - "install.packages(\"hdm\")\n", - "install.packages(\"stats\")" + "install.packages(\"hdm\")" ] }, { @@ -34,8 +33,7 @@ }, "outputs": [], "source": [ - "library(hdm)\n", - "library(stats)" + "library(hdm)" ] }, { diff --git a/PM5/Autoencoders.irnb b/PM5/Autoencoders.irnb index 41924058..27ca5d04 100644 --- a/PM5/Autoencoders.irnb +++ b/PM5/Autoencoders.irnb @@ -77,8 +77,7 @@ "install.packages(\"dplyr\")\n", "install.packages(\"purrr\")\n", "install.packages(\"reshape2\")\n", - "install.packages(\"ggplot2\")\n", - "install.packages(\"stats\")" + "install.packages(\"ggplot2\")" ] }, { From 2bd5db9102b4243f90b2e57be0196ecbd152594c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 18 Jul 2024 06:20:11 +0000 Subject: [PATCH 142/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in PM5 --- PM5/Autoencoders.Rmd | 1 - 1 file changed, 1 deletion(-) diff --git a/PM5/Autoencoders.Rmd b/PM5/Autoencoders.Rmd index 1758fded..95a288c4 100644 --- a/PM5/Autoencoders.Rmd +++ b/PM5/Autoencoders.Rmd @@ -43,7 +43,6 @@ install.packages("dplyr") install.packages("purrr") install.packages("reshape2") install.packages("ggplot2") -install.packages("stats") ``` ```{r} From ca4d753501f9494208c6727021854989d5199d0f Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Thu, 18 Jul 2024 00:08:28 -0700 Subject: [PATCH 143/261] Update r-dml-401k-IV.irnb --- AC2/r-dml-401k-IV.irnb | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/AC2/r-dml-401k-IV.irnb b/AC2/r-dml-401k-IV.irnb index ab1a959d..5f91ace5 100644 --- a/AC2/r-dml-401k-IV.irnb +++ b/AC2/r-dml-401k-IV.irnb @@ -1142,12 +1142,14 @@ " } else {\n", " dfit0 <- dreg0_((XZ0), DZ0factor)\n", " }\n", - " if (type == \"reponse\") {\n", - " return(predict(dfit0, (Xb), type = \"response\"))\n", + " if (is.null(type)) {\n", + " return(predict(dfit0, (Xb))[, 2])\n", " } else if (type == \"prob\") {\n", " return(predict(dfit0, (Xb), type = \"prob\")[, 2])\n", + " } else if (type == \"reponse\"){\n", + " return(predict(dfit0, (Xb), type = \"response\"))\n", " } else {\n", - " return(predict(dfit0, (Xb))[, 2])\n", + " stop(\"Invalid argument `type`.\")\n", " }\n", " } else {\n", " return(0)\n", @@ -1163,12 +1165,14 @@ " } else {\n", " dfit1 <- dreg1_((XZ1), DZ1factor)\n", " }\n", - " if (type == \"response\") 
{\n", - " return(predict(dfit1, (Xb), type = \"response\"))\n", + " if (is.null(type)) {\n", + " return(predict(dfit1, (Xb))[, 2])\n", " } else if (type == \"prob\") {\n", " return(predict(dfit1, (Xb), type = \"prob\")[, 2])\n", + " } else if (type == \"response\") {\n", + " return(predict(dfit1, (Xb), type = \"response\"))\n", " } else {\n", - " return(predict(dfit1, (Xb))[, 2])\n", + " stop(\"Invalid argument `type`.\")\n", " }\n", " } else {\n", " return(1)\n", @@ -1324,7 +1328,7 @@ " return(list(coef_est = coef_est, se = se, yhat = yhat, dhat = dhat, zhat = zhat, ytil = ytil,\n", " dtil = dtil, ztil = ztil, drZ = drZ, drD = drD,\n", " yhat0 = yhat0, yhat1 = yhat1, dhat0 = dhat0, dhat1 = dhat1))\n", - "}\n" + "}" ] }, { From 34ab8e9e117d820f683a8a01dbfda8750b069793 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Thu, 18 Jul 2024 15:10:35 +0300 Subject: [PATCH 144/261] Update r-dml-401k-IV.irnb --- AC2/r-dml-401k-IV.irnb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/AC2/r-dml-401k-IV.irnb b/AC2/r-dml-401k-IV.irnb index 5f91ace5..125e7322 100644 --- a/AC2/r-dml-401k-IV.irnb +++ b/AC2/r-dml-401k-IV.irnb @@ -1146,10 +1146,10 @@ " return(predict(dfit0, (Xb))[, 2])\n", " } else if (type == \"prob\") {\n", " return(predict(dfit0, (Xb), type = \"prob\")[, 2])\n", - " } else if (type == \"reponse\"){\n", + " } else if (type == \"reponse\") {\n", " return(predict(dfit0, (Xb), type = \"response\"))\n", " } else {\n", - " stop(\"Invalid argument `type`.\")\n", + " stop(\"Invalid argument `type`.\")\n", " }\n", " } else {\n", " return(0)\n", From 02b156f6c65bcd57707aeb34ea2a862d6ebec775 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Thu, 18 Jul 2024 08:38:36 -0700 Subject: [PATCH 145/261] Update r-dml-401k-IV.irnb --- AC2/r-dml-401k-IV.irnb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/AC2/r-dml-401k-IV.irnb b/AC2/r-dml-401k-IV.irnb index 125e7322..bb1f40ad 100644 --- a/AC2/r-dml-401k-IV.irnb +++ b/AC2/r-dml-401k-IV.irnb @@ -880,8 +880,8 @@ "# passing these through regression as type=\"response\", and D and Z should not be factors!\n", "dml2_results <- dml2_for_plivm(X, D, Z, y, dreg_boost, yreg_boost, zreg_boost,\n", " nfold = 5, method = \"regression\")\n", - "sum_boost <- summary_for_plivm_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil,\n", - " dml2_results$ztil, name = \"Boosted Trees\")\n", + "sum_boost <- summary_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil,\n", + " dml2_results$ztil, name = \"Boosted Trees\")\n", "tableplr <- rbind(tableplr, sum_boost)\n", "tableplr\n", "\n", From f08bde7f318156ed25536e839ef5c8a2d9c8a4ea Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Thu, 18 Jul 2024 10:13:59 -0700 Subject: [PATCH 146/261] Testing download problem for extralearners --- AC2/aa.irnb | 208 +++++++++++++++++++ PM1/r-linear-model-overfitting.irnb | 33 +-- PM1/r-ols-and-lasso-for-wage-prediction.irnb | 8 +- 3 files changed, 233 insertions(+), 16 deletions(-) create mode 100644 AC2/aa.irnb diff --git a/AC2/aa.irnb b/AC2/aa.irnb new file mode 100644 index 00000000..c3e00acb --- /dev/null +++ b/AC2/aa.irnb @@ -0,0 +1,208 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": { + "id": "f02fa044", + "papermill": { + "duration": 0.012988, + "end_time": "2022-04-19T09:06:48.772902", + "exception": false, + "start_time": "2022-04-19T09:06:48.759914", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Inference on Predictive and Causal 
Effects in High-Dimensional Nonlinear Models" + ] + }, + { + "cell_type": "markdown", + "id": "1", + "metadata": { + "id": "23154404", + "papermill": { + "duration": 0.009437, + "end_time": "2022-04-19T09:06:48.791895", + "exception": false, + "start_time": "2022-04-19T09:06:48.782458", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Impact of 401(k) on Financial Wealth\n", + "\n", + "We consider estimation of the effect of 401(k) participation\n", + "on accumulated assets. 401(k) plans are pension accounts sponsored by employers. The key problem in determining the effect of participation in 401(k) plans on accumulated assets is saver heterogeneity coupled with the fact that the decision to enroll in a 401(k) is non-random. It is generally recognized that some people have a higher preference for saving than others. It also seems likely that those individuals with high unobserved preference for saving would be most likely to choose to participate in tax-advantaged retirement savings plans and would tend to have otherwise high amounts of accumulated assets. The presence of unobserved savings preferences with these properties then implies that conventional estimates that do not account for saver heterogeneity and endogeneity of participation will be biased upward, tending to overstate the savings effects of 401(k) participation.\n", + "\n", + "One can argue that eligibility for enrolling in a 401(k) plan in this data can be taken as exogenous after conditioning on a few observables of which the most important for their argument is income. The basic idea is that, at least around the time 401(k)’s initially became available, people were unlikely to be basing their employment decisions on whether an employer offered a 401(k) but would instead focus on income and other aspects of the job." + ] + }, + { + "cell_type": "markdown", + "id": "66", + "metadata": { + "id": "akCGDMZJCN3h" + }, + "source": [ + "We find again that the robust inference confidence region is almost identical to the normal based inference. We are most probably in the strong instrument regime. We can check the t-statistic for the effect of the instrument on the treatment to verify this." + ] + }, + { + "cell_type": "markdown", + "id": "67", + "metadata": { + "id": "01de9f24", + "papermill": { + "duration": 0.010725, + "end_time": "2022-04-19T09:06:51.098483", + "exception": false, + "start_time": "2022-04-19T09:06:51.087758", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# DoubleML package\n", + "\n", + "There exist nice packages out there that can help us do our estimation with the simple call of a function. Such packages include `EconML` (Python) and `DoubleML` (Python and R).\n", + "\n", + "We run through IIVM using `DoubleML` below to illustrate. The `DoubleML` package internally builds on `mlr3`. We use the meta package `mlr3` to generate predictions with machine learning methods. A comprehensive introduction and description of the `mlr3` package is provided in the [mlr3book](https://mlr3book.mlr-org.com/). A list of all learners that you can use in `mlr3` can be found [here](https://mlr3extralearners.mlr-org.com/articles/learners/list_learners.html). The entry in the columns *mlr3 Package* and *Packages* indicate which packages must be installed/loaded in your R session.\n", + "\n", + "You find additional information about `DoubleML` on the package on the package website https://docs.doubleml.org/ and the R documentation page https://docs.doubleml.org/r/stable/." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68", + "metadata": { + "id": "2846a36a", + "papermill": { + "duration": 20.239271, + "end_time": "2022-04-19T09:07:11.369618", + "exception": false, + "start_time": "2022-04-19T09:06:51.130347", + "status": "completed" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "install.packages(\"DoubleML\")\n", + "install.packages(\"mlr3learners\")\n", + "install.packages(\"mlr3\")\n", + "install.packages(\"data.table\")\n", + "install.packages(\"ranger\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bd37969", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "library(DoubleML)\n", + "library(mlr3learners)\n", + "library(mlr3)\n", + "library(data.table)\n", + "library(ranger)" + ] + }, + { + "cell_type": "markdown", + "id": "79", + "metadata": { + "id": "a7461966", + "papermill": { + "duration": 0.016468, + "end_time": "2022-04-19T09:12:22.311250", + "exception": false, + "start_time": "2022-04-19T09:12:22.294782", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Again, we repeat the procedure for the other machine learning methods:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80", + "metadata": { + "id": "59YzwIcpEnyV", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# needed to run boosting\n", + "options(timeout = max(6000, getOption(\"timeout\")))\n", + "remotes::install_github(\"mlr-org/mlr3extralearners\")\n", + "install.packages(\"mlr3extralearners\")\n", + "install.packages(\"mboost\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66979b8b", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "library(mlr3extralearners)\n", + "library(mboost)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.0.5" + }, + "papermill": { + "default_parameters": {}, + "duration": 427.936706, + "end_time": "2022-04-19T09:13:53.230849", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2022-04-19T09:06:45.294143", + "version": "2.3.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/PM1/r-linear-model-overfitting.irnb b/PM1/r-linear-model-overfitting.irnb index e02072f0..60d7aafc 100644 --- a/PM1/r-linear-model-overfitting.irnb +++ b/PM1/r-linear-model-overfitting.irnb @@ -24,7 +24,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "cbgHLwp5ynNS" + "id": "cbgHLwp5ynNS", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -33,15 +36,15 @@ "n <- 1000\n", "\n", "p <- n\n", - "x <- matrix(rnorm(n * p), n, p)\n", + "X <- matrix(rnorm(n * p), n, p)\n", "y <- rnorm(n)\n", "\n", "print(\"p/n is\")\n", "print(p / n)\n", "print(\"R2 is\")\n", - "print(summary(lm(y ~ x))$r.squared)\n", + "print(summary(lm(y ~ X))$r.squared)\n", "print(\"Adjusted R2 is\")\n", - "print(summary(lm(y ~ x))$adj.r.squared)\n" + "print(summary(lm(y ~ X))$adj.r.squared)\n" ] }, { @@ -57,7 +60,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "gWbDboRYynNV" + "id": "gWbDboRYynNV", 
+ "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -66,15 +72,15 @@ "n <- 1000\n", "\n", "p <- n / 2\n", - "x <- matrix(rnorm(n * p), n, p)\n", + "X <- matrix(rnorm(n * p), n, p)\n", "y <- rnorm(n)\n", "\n", "print(\"p/n is\")\n", "print(p / n)\n", "print(\"R2 is\")\n", - "print(summary(lm(y ~ x))$r.squared)\n", + "print(summary(lm(y ~ X))$r.squared)\n", "print(\"Adjusted R2 is\")\n", - "print(summary(lm(y ~ x))$adj.r.squared)\n" + "print(summary(lm(y ~ X))$adj.r.squared)\n" ] }, { @@ -90,7 +96,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "uF5tT-MdynNV" + "id": "uF5tT-MdynNV", + "vscode": { + "languageId": "r" + } }, "outputs": [], "source": [ @@ -99,15 +108,15 @@ "n <- 1000\n", "\n", "p <- .05 * n\n", - "x <- matrix(rnorm(n * p), n, p)\n", + "X <- matrix(rnorm(n * p), n, p)\n", "y <- rnorm(n)\n", "\n", "print(\"p/n is\")\n", "print(p / n)\n", "print(\"R2 is\")\n", - "print(summary(lm(y ~ x))$r.squared)\n", + "print(summary(lm(y ~ X))$r.squared)\n", "print(\"Adjusted R2 is\")\n", - "print(summary(lm(y ~ x))$adj.r.squared)\n" + "print(summary(lm(y ~ X))$adj.r.squared)\n" ] } ], diff --git a/PM1/r-ols-and-lasso-for-wage-prediction.irnb b/PM1/r-ols-and-lasso-for-wage-prediction.irnb index 997b3fd4..0e611733 100644 --- a/PM1/r-ols-and-lasso-for-wage-prediction.irnb +++ b/PM1/r-ols-and-lasso-for-wage-prediction.irnb @@ -157,8 +157,8 @@ "# construct matrices for estimation from the data\n", "y <- log(data$wage)\n", "n <- length(y)\n", - "z <- data[- which(colnames(data) %in% c(\"wage\", \"lwage\"))]\n", - "p <- dim(z)[2]\n", + "Z <- data[- which(colnames(data) %in% c(\"wage\", \"lwage\"))]\n", + "p <- dim(Z)[2]\n", "\n", "cat(\"Number of observations:\", n, \"\\n\")\n", "cat(\"Number of raw regressors:\", p)" @@ -185,10 +185,10 @@ "outputs": [], "source": [ "# generate a table of means of variables\n", - "z_subset <- data[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", \"hsg\", \"scl\",\n", + "Zsubset <- data[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", \"hsg\", \"scl\",\n", " \"clg\", \"ad\", \"mw\", \"so\", \"we\", \"ne\", \"exp1\"))]\n", "table <- matrix(0, 12, 1)\n", - "table[1:12, 1] <- as.numeric(lapply(z_subset, mean))\n", + "table[1:12, 1] <- as.numeric(lapply(Zsubset, mean))\n", "rownames(table) <- c(\"Log Wage\", \"Sex\", \"Some High School\",\n", " \"High School Graduate\", \"Some College\", \"College Graduate\",\n", " \"Advanced Degree\", \"Midwest\", \"South\", \"West\", \"Northeast\", \"Experience\")\n", From cb3d480d3e4a100c02f260b3eaab17a202de23dd Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Thu, 18 Jul 2024 11:06:49 -0700 Subject: [PATCH 147/261] Update r-ols-and-lasso-for-wage-prediction.irnb --- PM1/r-ols-and-lasso-for-wage-prediction.irnb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PM1/r-ols-and-lasso-for-wage-prediction.irnb b/PM1/r-ols-and-lasso-for-wage-prediction.irnb index 0e611733..3f45551c 100644 --- a/PM1/r-ols-and-lasso-for-wage-prediction.irnb +++ b/PM1/r-ols-and-lasso-for-wage-prediction.irnb @@ -186,7 +186,7 @@ "source": [ "# generate a table of means of variables\n", "Zsubset <- data[which(colnames(data) %in% c(\"lwage\", \"sex\", \"shs\", \"hsg\", \"scl\",\n", - " \"clg\", \"ad\", \"mw\", \"so\", \"we\", \"ne\", \"exp1\"))]\n", + " \"clg\", \"ad\", \"mw\", \"so\", \"we\", \"ne\", \"exp1\"))]\n", "table <- matrix(0, 12, 1)\n", "table[1:12, 1] <- as.numeric(lapply(Zsubset, mean))\n", "rownames(table) <- c(\"Log Wage\", \"Sex\", \"Some High School\",\n", From 
3c8aca9f06ed22bc0a61be9c5390b376451b4c6d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 18 Jul 2024 18:22:57 +0000 Subject: [PATCH 148/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in PM1 --- PM1/r-linear-model-overfitting.Rmd | 18 +++++++++--------- PM1/r-ols-and-lasso-for-wage-prediction.Rmd | 10 +++++----- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/PM1/r-linear-model-overfitting.Rmd b/PM1/r-linear-model-overfitting.Rmd index 68ef059c..2dbba836 100644 --- a/PM1/r-linear-model-overfitting.Rmd +++ b/PM1/r-linear-model-overfitting.Rmd @@ -13,15 +13,15 @@ set.seed(123) n <- 1000 p <- n -x <- matrix(rnorm(n * p), n, p) +X <- matrix(rnorm(n * p), n, p) y <- rnorm(n) print("p/n is") print(p / n) print("R2 is") -print(summary(lm(y ~ x))$r.squared) +print(summary(lm(y ~ X))$r.squared) print("Adjusted R2 is") -print(summary(lm(y ~ x))$adj.r.squared) +print(summary(lm(y ~ X))$adj.r.squared) ``` Second, set p=n/2. @@ -32,15 +32,15 @@ set.seed(123) n <- 1000 p <- n / 2 -x <- matrix(rnorm(n * p), n, p) +X <- matrix(rnorm(n * p), n, p) y <- rnorm(n) print("p/n is") print(p / n) print("R2 is") -print(summary(lm(y ~ x))$r.squared) +print(summary(lm(y ~ X))$r.squared) print("Adjusted R2 is") -print(summary(lm(y ~ x))$adj.r.squared) +print(summary(lm(y ~ X))$adj.r.squared) ``` Third, set p/n =.05 @@ -51,14 +51,14 @@ set.seed(123) n <- 1000 p <- .05 * n -x <- matrix(rnorm(n * p), n, p) +X <- matrix(rnorm(n * p), n, p) y <- rnorm(n) print("p/n is") print(p / n) print("R2 is") -print(summary(lm(y ~ x))$r.squared) +print(summary(lm(y ~ X))$r.squared) print("Adjusted R2 is") -print(summary(lm(y ~ x))$adj.r.squared) +print(summary(lm(y ~ X))$adj.r.squared) ``` diff --git a/PM1/r-ols-and-lasso-for-wage-prediction.Rmd b/PM1/r-ols-and-lasso-for-wage-prediction.Rmd index 54e2607f..cc690a06 100644 --- a/PM1/r-ols-and-lasso-for-wage-prediction.Rmd +++ b/PM1/r-ols-and-lasso-for-wage-prediction.Rmd @@ -55,8 +55,8 @@ We construct the output variable $Y$ and the matrix $Z$ which includes the chara # construct matrices for estimation from the data y <- log(data$wage) n <- length(y) -z <- data[- which(colnames(data) %in% c("wage", "lwage"))] -p <- dim(z)[2] +Z <- data[- which(colnames(data) %in% c("wage", "lwage"))] +p <- dim(Z)[2] cat("Number of observations:", n, "\n") cat("Number of raw regressors:", p) @@ -66,10 +66,10 @@ For the outcome variable *wage* and a subset of the raw regressors, we calculate ```{r} # generate a table of means of variables -z_subset <- data[which(colnames(data) %in% c("lwage", "sex", "shs", "hsg", "scl", - "clg", "ad", "mw", "so", "we", "ne", "exp1"))] +Zsubset <- data[which(colnames(data) %in% c("lwage", "sex", "shs", "hsg", "scl", + "clg", "ad", "mw", "so", "we", "ne", "exp1"))] table <- matrix(0, 12, 1) -table[1:12, 1] <- as.numeric(lapply(z_subset, mean)) +table[1:12, 1] <- as.numeric(lapply(Zsubset, mean)) rownames(table) <- c("Log Wage", "Sex", "Some High School", "High School Graduate", "Some College", "College Graduate", "Advanced Degree", "Midwest", "South", "West", "Northeast", "Experience") From 8daa36fb86dc134faef56da0205d0e61e728fab0 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Thu, 18 Jul 2024 13:25:41 -0700 Subject: [PATCH 149/261] Made RDFlex.R deprecated Not clear what this is for. 
It does not fit the uniformity of only having irnb notebooks --- T/{ => deprecated}/RDFlex.R | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename T/{ => deprecated}/RDFlex.R (100%) diff --git a/T/RDFlex.R b/T/deprecated/RDFlex.R similarity index 100% rename from T/RDFlex.R rename to T/deprecated/RDFlex.R From 6a08d7d3788069b6354592037ab713449861ed90 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Thu, 18 Jul 2024 13:29:20 -0700 Subject: [PATCH 150/261] testing github installation --- .github/workflows/transform-R-to-Rmd.yml | 1 + AC2/aa.irnb | 16 ++++------------ 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 74c82aea..d3c9cc31 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -88,6 +88,7 @@ jobs: log_file="${{ matrix.directory }}_r_script_execution.log" R -e ' options(show.error.locations = TRUE) + options(timeout = max(6000, getOption("timeout"))) files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.R$", full.names = TRUE, recursive = FALSE) log_con <- file("'$log_file'", open = "wt") sink(log_con, type = "output") diff --git a/AC2/aa.irnb b/AC2/aa.irnb index c3e00acb..5030be34 100644 --- a/AC2/aa.irnb +++ b/AC2/aa.irnb @@ -95,11 +95,8 @@ }, "outputs": [], "source": [ - "install.packages(\"DoubleML\")\n", "install.packages(\"mlr3learners\")\n", - "install.packages(\"mlr3\")\n", - "install.packages(\"data.table\")\n", - "install.packages(\"ranger\")" + "install.packages(\"mlr3\")" ] }, { @@ -113,11 +110,8 @@ }, "outputs": [], "source": [ - "library(DoubleML)\n", "library(mlr3learners)\n", - "library(mlr3)\n", - "library(data.table)\n", - "library(ranger)" + "library(mlr3)" ] }, { @@ -153,8 +147,7 @@ "# needed to run boosting\n", "options(timeout = max(6000, getOption(\"timeout\")))\n", "remotes::install_github(\"mlr-org/mlr3extralearners\")\n", - "install.packages(\"mlr3extralearners\")\n", - "install.packages(\"mboost\")" + "install.packages(\"mlr3extralearners\")" ] }, { @@ -168,8 +161,7 @@ }, "outputs": [], "source": [ - "library(mlr3extralearners)\n", - "library(mboost)" + "library(mlr3extralearners)" ] } ], From 4514c8a7cc8af686ca1f1e64305c69d92efb3663 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Thu, 18 Jul 2024 22:43:23 -0700 Subject: [PATCH 151/261] github installation --- .github/workflows/transform-R-to-Rmd.yml | 1 + AC2/aa.irnb | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index d3c9cc31..a81e41b6 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -88,6 +88,7 @@ jobs: log_file="${{ matrix.directory }}_r_script_execution.log" R -e ' options(show.error.locations = TRUE) + options(download.file.method = "libcurl") options(timeout = max(6000, getOption("timeout"))) files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.R$", full.names = TRUE, recursive = FALSE) log_con <- file("'$log_file'", open = "wt") diff --git a/AC2/aa.irnb b/AC2/aa.irnb index 5030be34..7bae56e1 100644 --- a/AC2/aa.irnb +++ b/AC2/aa.irnb @@ -145,7 +145,6 @@ "outputs": [], "source": [ "# needed to run boosting\n", - "options(timeout = max(6000, getOption(\"timeout\")))\n", "remotes::install_github(\"mlr-org/mlr3extralearners\")\n", "install.packages(\"mlr3extralearners\")" ] From d9d93761765cff21acbff8c634143c160293f956 Mon Sep 17 00:00:00 2001 From: vsyrgkanis 
Date: Fri, 19 Jul 2024 04:18:30 -0700 Subject: [PATCH 152/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index a81e41b6..d8c02dec 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -41,7 +41,18 @@ jobs: - name: Install rmarkdown, knitr, and lintr packages run: | - R -e 'install.packages(c("rmarkdown", "knitr", "lintr", "xfun"), repos="https://cloud.r-project.org")' + R -e 'install.packages(c("rmarkdown", "knitr", "lintr", "xfun", "remotes", "devtools"), repos="https://cloud.r-project.org")' + + - name: Check github installation + id: check-github + run: | + R -e ' + options(show.error.locations = TRUE) + options(download.file.method = "libcurl") + options(timeout = max(6000, getOption("timeout"))) + remotes::install_github("mlr-org/mlr3extralearners") + library(mlr3extralearners) + ' - name: Strip outputs from .irnb files run: | @@ -80,7 +91,7 @@ jobs: stop("Linting failed with warnings") } }) - ' + ' 2>/dev/null - name: Execute R scripts and log output id: execute From e909f3d8785860c6ddb5f97c015be04c76b54105 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Fri, 19 Jul 2024 04:29:25 -0700 Subject: [PATCH 153/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 30 ++++++++++++++---------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index d8c02dec..dc0414f0 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -14,11 +14,28 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5', 'CM1', 'CM2', 'CM3', 'AC1', 'AC2', 'T'] + directory: ['PM1'] #, 'PM2', 'PM3', 'PM4', 'PM5', 'CM1', 'CM2', 'CM3', 'AC1', 'AC2', 'T'] steps: - name: Checkout repository uses: actions/checkout@v2 + - name: Set up R + uses: r-lib/actions/setup-r@v2 + + - name: Install rmarkdown, knitr, and lintr packages + run: | + R -e 'install.packages(c("remotes"), repos="https://cloud.r-project.org")' + + - name: Check github installation + id: check-github + run: | + R -e ' + remotes::install_github("mlr-org/mlr3extralearners") + library(mlr3extralearners) + ' + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + - name: Install system dependencies run: | sudo apt-get update @@ -43,17 +60,6 @@ jobs: run: | R -e 'install.packages(c("rmarkdown", "knitr", "lintr", "xfun", "remotes", "devtools"), repos="https://cloud.r-project.org")' - - name: Check github installation - id: check-github - run: | - R -e ' - options(show.error.locations = TRUE) - options(download.file.method = "libcurl") - options(timeout = max(6000, getOption("timeout"))) - remotes::install_github("mlr-org/mlr3extralearners") - library(mlr3extralearners) - ' - - name: Strip outputs from .irnb files run: | for notebook in ${{ matrix.directory }}/*.irnb; do From de5c753d277c99a3ed66f441774d53938c012343 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Fri, 19 Jul 2024 04:34:03 -0700 Subject: [PATCH 154/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 31 ++++++------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index dc0414f0..71ed8d06 100644 --- 
a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -14,28 +14,11 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - directory: ['PM1'] #, 'PM2', 'PM3', 'PM4', 'PM5', 'CM1', 'CM2', 'CM3', 'AC1', 'AC2', 'T'] + directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5', 'CM1', 'CM2', 'CM3', 'AC1', 'AC2', 'T'] steps: - name: Checkout repository uses: actions/checkout@v2 - - name: Set up R - uses: r-lib/actions/setup-r@v2 - - - name: Install rmarkdown, knitr, and lintr packages - run: | - R -e 'install.packages(c("remotes"), repos="https://cloud.r-project.org")' - - - name: Check github installation - id: check-github - run: | - R -e ' - remotes::install_github("mlr-org/mlr3extralearners") - library(mlr3extralearners) - ' - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - - name: Install system dependencies run: | sudo apt-get update @@ -52,13 +35,13 @@ jobs: pip install nbstripout if [ -f requirements.txt ]; then pip install -r requirements.txt; fi shell: bash - + - name: Set up R uses: r-lib/actions/setup-r@v2 - name: Install rmarkdown, knitr, and lintr packages run: | - R -e 'install.packages(c("rmarkdown", "knitr", "lintr", "xfun", "remotes", "devtools"), repos="https://cloud.r-project.org")' + R -e 'install.packages(c("rmarkdown", "knitr", "lintr", "xfun", "remotes"), repos="https://cloud.r-project.org")' - name: Strip outputs from .irnb files run: | @@ -97,7 +80,7 @@ jobs: stop("Linting failed with warnings") } }) - ' 2>/dev/null + ' - name: Execute R scripts and log output id: execute @@ -105,8 +88,6 @@ jobs: log_file="${{ matrix.directory }}_r_script_execution.log" R -e ' options(show.error.locations = TRUE) - options(download.file.method = "libcurl") - options(timeout = max(6000, getOption("timeout"))) files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.R$", full.names = TRUE, recursive = FALSE) log_con <- file("'$log_file'", open = "wt") sink(log_con, type = "output") @@ -139,7 +120,9 @@ jobs: } quit(status = 1, save = "no") # Exit with an error status if errors are found } - ' + ' 2>/dev/null + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - name: Upload execution log uses: actions/upload-artifact@v2 From 5f17dd97ed3610d211688ef512a58d9e3e23d19f Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Fri, 19 Jul 2024 04:34:19 -0700 Subject: [PATCH 155/261] Delete AC2/aa.irnb --- AC2/aa.irnb | 199 ---------------------------------------------------- 1 file changed, 199 deletions(-) delete mode 100644 AC2/aa.irnb diff --git a/AC2/aa.irnb b/AC2/aa.irnb deleted file mode 100644 index 7bae56e1..00000000 --- a/AC2/aa.irnb +++ /dev/null @@ -1,199 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "0", - "metadata": { - "id": "f02fa044", - "papermill": { - "duration": 0.012988, - "end_time": "2022-04-19T09:06:48.772902", - "exception": false, - "start_time": "2022-04-19T09:06:48.759914", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "# Inference on Predictive and Causal Effects in High-Dimensional Nonlinear Models" - ] - }, - { - "cell_type": "markdown", - "id": "1", - "metadata": { - "id": "23154404", - "papermill": { - "duration": 0.009437, - "end_time": "2022-04-19T09:06:48.791895", - "exception": false, - "start_time": "2022-04-19T09:06:48.782458", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Impact of 401(k) on Financial Wealth\n", - "\n", - "We consider estimation of the effect of 401(k) participation\n", - "on accumulated assets. 401(k) plans are pension accounts sponsored by employers. 
The key problem in determining the effect of participation in 401(k) plans on accumulated assets is saver heterogeneity coupled with the fact that the decision to enroll in a 401(k) is non-random. It is generally recognized that some people have a higher preference for saving than others. It also seems likely that those individuals with high unobserved preference for saving would be most likely to choose to participate in tax-advantaged retirement savings plans and would tend to have otherwise high amounts of accumulated assets. The presence of unobserved savings preferences with these properties then implies that conventional estimates that do not account for saver heterogeneity and endogeneity of participation will be biased upward, tending to overstate the savings effects of 401(k) participation.\n", - "\n", - "One can argue that eligibility for enrolling in a 401(k) plan in this data can be taken as exogenous after conditioning on a few observables of which the most important for their argument is income. The basic idea is that, at least around the time 401(k)’s initially became available, people were unlikely to be basing their employment decisions on whether an employer offered a 401(k) but would instead focus on income and other aspects of the job." - ] - }, - { - "cell_type": "markdown", - "id": "66", - "metadata": { - "id": "akCGDMZJCN3h" - }, - "source": [ - "We find again that the robust inference confidence region is almost identical to the normal based inference. We are most probably in the strong instrument regime. We can check the t-statistic for the effect of the instrument on the treatment to verify this." - ] - }, - { - "cell_type": "markdown", - "id": "67", - "metadata": { - "id": "01de9f24", - "papermill": { - "duration": 0.010725, - "end_time": "2022-04-19T09:06:51.098483", - "exception": false, - "start_time": "2022-04-19T09:06:51.087758", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "# DoubleML package\n", - "\n", - "There exist nice packages out there that can help us do our estimation with the simple call of a function. Such packages include `EconML` (Python) and `DoubleML` (Python and R).\n", - "\n", - "We run through IIVM using `DoubleML` below to illustrate. The `DoubleML` package internally builds on `mlr3`. We use the meta package `mlr3` to generate predictions with machine learning methods. A comprehensive introduction and description of the `mlr3` package is provided in the [mlr3book](https://mlr3book.mlr-org.com/). A list of all learners that you can use in `mlr3` can be found [here](https://mlr3extralearners.mlr-org.com/articles/learners/list_learners.html). The entry in the columns *mlr3 Package* and *Packages* indicate which packages must be installed/loaded in your R session.\n", - "\n", - "You find additional information about `DoubleML` on the package on the package website https://docs.doubleml.org/ and the R documentation page https://docs.doubleml.org/r/stable/." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "68", - "metadata": { - "id": "2846a36a", - "papermill": { - "duration": 20.239271, - "end_time": "2022-04-19T09:07:11.369618", - "exception": false, - "start_time": "2022-04-19T09:06:51.130347", - "status": "completed" - }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "install.packages(\"mlr3learners\")\n", - "install.packages(\"mlr3\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1bd37969", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "library(mlr3learners)\n", - "library(mlr3)" - ] - }, - { - "cell_type": "markdown", - "id": "79", - "metadata": { - "id": "a7461966", - "papermill": { - "duration": 0.016468, - "end_time": "2022-04-19T09:12:22.311250", - "exception": false, - "start_time": "2022-04-19T09:12:22.294782", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "Again, we repeat the procedure for the other machine learning methods:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "80", - "metadata": { - "id": "59YzwIcpEnyV", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# needed to run boosting\n", - "remotes::install_github(\"mlr-org/mlr3extralearners\")\n", - "install.packages(\"mlr3extralearners\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "66979b8b", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "library(mlr3extralearners)" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.0.5" - }, - "papermill": { - "default_parameters": {}, - "duration": 427.936706, - "end_time": "2022-04-19T09:13:53.230849", - "environment_variables": {}, - "exception": null, - "input_path": "__notebook__.ipynb", - "output_path": "__notebook__.ipynb", - "parameters": {}, - "start_time": "2022-04-19T09:06:45.294143", - "version": "2.3.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 999809a791341bebc11e8fc350944362cd43a223 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 19 Jul 2024 12:11:49 +0000 Subject: [PATCH 156/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in T --- T/T-3 Diff-in-Diff Minimum Wage Example.Rmd | 625 ++++++++++-------- ...ression_Discontinuity_on_Progresa_Data.Rmd | 424 ++++++------ ...r-conditional-average-treatment-effect.Rmd | 590 ++++++++--------- 3 files changed, 851 insertions(+), 788 deletions(-) diff --git a/T/T-3 Diff-in-Diff Minimum Wage Example.Rmd b/T/T-3 Diff-in-Diff Minimum Wage Example.Rmd index 985b7d70..501ba7b7 100644 --- a/T/T-3 Diff-in-Diff Minimum Wage Example.Rmd +++ b/T/T-3 Diff-in-Diff Minimum Wage Example.Rmd @@ -20,16 +20,22 @@ First, we will load some libraries. 
```{r} dependencies <- c("BMisc", "glmnet", "randomForest", "rpart", "xtable", "data.table") install.packages(dependencies) +``` + +```{r} lapply(dependencies, library, character.only = TRUE) +``` +```{r} set.seed(772023) -options(warn=-1) +options(warn = -1) ``` ## Loading the data ```{r} -data <- read.csv("https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/minwage_data.csv", row.names=1) +data <- read.csv("https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/minwage_data.csv", + row.names = 1) data <- data.table(data) ``` @@ -42,10 +48,12 @@ head(data) We remove observations that are already treated in the first observed period (2001). We drop all variables that we won't use in our analysis. ```{r} -data <- subset(data, (G==0) | (G>2001)) -data <- data[, -c("countyreal","state_name","FIPS","emp0A01_BS", - "quarter", "censusdiv","pop","annual_avg_pay", - "state_mw","fed_mw", "ever_treated")] +data <- subset(data, (G == 0) | (G > 2001)) +data <- data[, -c( + "countyreal", "state_name", "FIPS", "emp0A01_BS", + "quarter", "censusdiv", "pop", "annual_avg_pay", + "state_mw", "fed_mw", "ever_treated" +)] ``` Next, we create the treatment groups. We focus our analysis exclusively on the set of counties that had wage increases away from the federal minimum wage in 2004. That is, we treat 2003 and earlier as the pre-treatment period. @@ -74,73 +82,73 @@ categories.) Consequently, we want to extract the control variables for both treatment and control group in 2001. ```{r} -treat1 <- treat1[ , -c("year","G","region","treated")] +treat1 <- treat1[, -c("year", "G", "region", "treated")] -cont1 <- cont1[ , -c("year","G","region","treated")] +cont1 <- cont1[, -c("year", "G", "region", "treated")] ``` 2003 serves as the pre-treatment period for both counties that do receive the treatment in 2004 and those that do not. ```{r} -treatB <- merge(treat3, treat1, by = "id", suffixes = c(".pre",".0")) -treatB <- treatB[ , -c("treated","lpop.pre","lavg_pay.pre","year","G")] +treatB <- merge(treat3, treat1, by = "id", suffixes = c(".pre", ".0")) +treatB <- treatB[, -c("treated", "lpop.pre", "lavg_pay.pre", "year", "G")] -contB <- merge(cont3, cont1, by = "id", suffixes = c(".pre",".0")) -contB <- contB[ , -c("treated","lpop.pre","lavg_pay.pre","year","G")] +contB <- merge(cont3, cont1, by = "id", suffixes = c(".pre", ".0")) +contB <- contB[, -c("treated", "lpop.pre", "lavg_pay.pre", "year", "G")] ``` We estimate the ATET in 2004-2007, which corresponds to the effect in the year of treatment as well as in the three years after the treatment. The control observations are the observations that still have the federal minimum wage in each year. (The control group is shrinking in each year as additional units receive treatment). 
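In the notation used here, for each post-treatment year $t \in \{2004, \dots, 2007\}$ the outcome is the county-level change $\Delta Y_{it} = \texttt{lemp}_{it} - \texttt{lemp}_{i,2003}$ (the variable `dy` constructed below), and the year-$t$ estimand is the ATET $\theta_t = E[\Delta Y_{it}(1) - \Delta Y_{it}(0) \mid D_i = 1]$, with $D_i = 1$ indicating counties whose minimum wage rose above the federal minimum in 2004.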
```{r} -treat4 <- treat4[ , -c("lpop","lavg_pay","year","G","region")] -treat5 <- treat5[ , -c("lpop","lavg_pay","year","G","region")] -treat6 <- treat6[ , -c("lpop","lavg_pay","year","G","region")] -treat7 <- treat7[ , -c("lpop","lavg_pay","year","G","region")] +treat4 <- treat4[, -c("lpop", "lavg_pay", "year", "G", "region")] +treat5 <- treat5[, -c("lpop", "lavg_pay", "year", "G", "region")] +treat6 <- treat6[, -c("lpop", "lavg_pay", "year", "G", "region")] +treat7 <- treat7[, -c("lpop", "lavg_pay", "year", "G", "region")] -cont4 <- cont4[ , -c("lpop","lavg_pay","year","G","region")] -cont5 <- cont5[ , -c("lpop","lavg_pay","year","G","region")] -cont6 <- cont6[ , -c("lpop","lavg_pay","year","G","region")] -cont7 <- cont7[ , -c("lpop","lavg_pay","year","G","region")] +cont4 <- cont4[, -c("lpop", "lavg_pay", "year", "G", "region")] +cont5 <- cont5[, -c("lpop", "lavg_pay", "year", "G", "region")] +cont6 <- cont6[, -c("lpop", "lavg_pay", "year", "G", "region")] +cont7 <- cont7[, -c("lpop", "lavg_pay", "year", "G", "region")] tdid04 <- merge(treat4, treatB, by = "id") -dy <- tdid04$lemp-tdid04$lemp.pre +dy <- tdid04$lemp - tdid04$lemp.pre tdid04$dy <- dy -tdid04 <- tdid04[ , -c("id","lemp","lemp.pre")] +tdid04 <- tdid04[, -c("id", "lemp", "lemp.pre")] tdid05 <- merge(treat5, treatB, by = "id") -dy <- tdid05$lemp-tdid05$lemp.pre +dy <- tdid05$lemp - tdid05$lemp.pre tdid05$dy <- dy -tdid05 <- tdid05[ , -c("id","lemp","lemp.pre")] +tdid05 <- tdid05[, -c("id", "lemp", "lemp.pre")] tdid06 <- merge(treat6, treatB, by = "id") -dy <- tdid06$lemp-tdid06$lemp.pre +dy <- tdid06$lemp - tdid06$lemp.pre tdid06$dy <- dy -tdid06 <- tdid06[ , -c("id","lemp","lemp.pre")] +tdid06 <- tdid06[, -c("id", "lemp", "lemp.pre")] tdid07 <- merge(treat7, treatB, by = "id") -dy <- tdid07$lemp-tdid07$lemp.pre +dy <- tdid07$lemp - tdid07$lemp.pre tdid07$dy <- dy -tdid07 <- tdid07[ , -c("id","lemp","lemp.pre")] +tdid07 <- tdid07[, -c("id", "lemp", "lemp.pre")] cdid04 <- merge(cont4, contB, by = "id") -dy <- cdid04$lemp-cdid04$lemp.pre +dy <- cdid04$lemp - cdid04$lemp.pre cdid04$dy <- dy -cdid04 <- cdid04[ , -c("id","lemp","lemp.pre")] +cdid04 <- cdid04[, -c("id", "lemp", "lemp.pre")] cdid05 <- merge(cont5, contB, by = "id") -dy <- cdid05$lemp-cdid05$lemp.pre +dy <- cdid05$lemp - cdid05$lemp.pre cdid05$dy <- dy -cdid05 <- cdid05[ , -c("id","lemp","lemp.pre")] +cdid05 <- cdid05[, -c("id", "lemp", "lemp.pre")] cdid06 <- merge(cont6, contB, by = "id") -dy <- cdid06$lemp-cdid06$lemp.pre +dy <- cdid06$lemp - cdid06$lemp.pre cdid06$dy <- dy -cdid06 <- cdid06[ , -c("id","lemp","lemp.pre")] +cdid06 <- cdid06[, -c("id", "lemp", "lemp.pre")] cdid07 <- merge(cont7, contB, by = "id") -dy <- cdid07$lemp-cdid07$lemp.pre +dy <- cdid07$lemp - cdid07$lemp.pre cdid07$dy <- dy -cdid07 <- cdid07[ , -c("id","lemp","lemp.pre")] +cdid07 <- cdid07[, -c("id", "lemp", "lemp.pre")] ``` ### Estimation of the ATET with DML @@ -151,189 +159,208 @@ The methods indicated with CV have their tuning parameter selected by cross-vali The following code block implements the DML cross-fitting procedure. 
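As a compact reference for the computation (notation as above, with $\hat{p}(X)$ an estimate of $P(D = 1 \mid X)$, $\hat{\pi}$ an estimate of $P(D = 1)$, $\hat{\ell}_0(X)$ an estimate of $E[\Delta Y \mid D = 0, X]$, and $\mathbb{E}_n$ the sample average), each learner's estimate produced by the loop takes the doubly robust form

$$
\widehat{\theta} = \frac{\mathbb{E}_n\left[\dfrac{D - \hat{p}(X)}{\hat{\pi}\,(1 - \hat{p}(X))}\,\bigl(\Delta Y - \hat{\ell}_0(X)\bigr)\right]}{\mathbb{E}_n\left[D / \hat{\pi}\right]},
$$

with the nuisance quantities fit on the training folds, evaluated on the held-out fold, and propensity scores trimmed at 0.95.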
```{r} -att <- matrix(NA,4,10) -se.att <- matrix(NA,4,10) -RMSE.d <- matrix(NA,4,9) -RMSE.y <- matrix(NA,4,9) -trimmed <- matrix(NA,4,9) +att <- matrix(NA, 4, 10) +se_att <- matrix(NA, 4, 10) +rmse_d <- matrix(NA, 4, 9) +rmse_y <- matrix(NA, 4, 9) +trimmed <- matrix(NA, 4, 9) print("DML estimation starting, please wait") -for(ii in 1:4){ # ii refer to the 4 investigated post-treatment periods +for (ii in 1:4) { # ii refer to the 4 investigated post-treatment periods - tdata <- get(paste("tdid0",(3+ii),sep="")) # Treatment data - cdata <- get(paste("cdid0",(3+ii),sep="")) # Control data - usedata <- rbind(tdata,cdata) + tdata <- get(paste("tdid0", (3 + ii), sep = "")) # Treatment data + cdata <- get(paste("cdid0", (3 + ii), sep = "")) # Control data + usedata <- rbind(tdata, cdata) #----------------------------------------------------------------------------- # Cross-fit setup n <- nrow(usedata) Kf <- 5 # Number of folds - sampleframe <- rep(1:Kf, ceiling(n/Kf)) - cfgroup <- sample(sampleframe, size=n, replace = FALSE) # Cross-fitting groups + sampleframe <- rep(1:Kf, ceiling(n / Kf)) + cfgroup <- sample(sampleframe, size = n, replace = FALSE) # Cross-fitting groups # Initialize variables for CV predictions - yGd0x.fit <- matrix(NA,n,9) - dGx.fit <- matrix(NA,n,9) - pd.fit <-matrix(NA,n,1) + y_gd0x_fit <- matrix(NA, n, 9) + dgx_fit <- matrix(NA, n, 9) + pd_fit <- matrix(NA, n, 1) #----------------------------------------------------------------------------- # Cross-fit loop - for(k in 1:Kf) { - cat("year: ",ii+2003,"; fold: ",k,"\n") + for (k in 1:Kf) { + cat("year: ", ii + 2003, "; fold: ", k, "\n") indk <- cfgroup == k - ktrain <- usedata[!indk,] - ktest <- usedata[indk,] + ktrain <- usedata[!indk, ] + ktest <- usedata[indk, ] # Build some matrices for later - ytrain <- as.matrix(usedata[!indk,"dy"]) - ytest <- as.matrix(usedata[indk,"dy"]) - dtrain <- as.matrix(usedata[!indk,"treated"]) - dtest <- as.matrix(usedata[indk,"treated"]) + ytrain <- as.matrix(usedata[!indk, "dy"]) + ytest <- as.matrix(usedata[indk, "dy"]) + dtrain <- as.matrix(usedata[!indk, "treated"]) + dtest <- as.matrix(usedata[indk, "treated"]) # Expansion for lasso/ridge (region specific cubic polynomial) - X.expand <- model.matrix( ~ region*(polym(lemp.0 , lpop.0 , lavg_pay.0 , - degree = 3, raw = TRUE)), - data = usedata) + Xexpand <- model.matrix( + ~ region * (polym(lemp.0, lpop.0, lavg_pay.0, + degree = 3, raw = TRUE + )), + data = usedata + ) - xtrain <- as.matrix(X.expand[!indk,]) - xtest <- as.matrix(X.expand[indk,]) + xtrain <- as.matrix(Xexpand[!indk, ]) + xtest <- as.matrix(Xexpand[indk, ]) #----------------------------------------------------------------------------- - # P(D = 1) - pd.fit[indk,1] <- mean(ktrain$treated) + # Estimating P(D = 1) + pd_fit[indk, 1] <- mean(ktrain$treated) #----------------------------------------------------------------------------- - # E[D|X] + # Estimating E[D|X] # 1) Constant - dGx.fit[indk,1] <- mean(ktrain$treated) + dgx_fit[indk, 1] <- mean(ktrain$treated) # 2) Baseline controls glmXdk <- glm(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0, - family = "binomial", data = ktrain) - dGx.fit[indk,2] <- predict(glmXdk, newdata = ktest, type = "response") + family = "binomial", data = ktrain + ) + dgx_fit[indk, 2] <- predict(glmXdk, newdata = ktest, type = "response") # 3) Region specific linear index glmRXdk <- glm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0), - family = "binomial", data = ktrain) - dGx.fit[indk,3] <- predict(glmRXdk, newdata = ktest, type = "response") + 
family = "binomial", data = ktrain + ) + dgx_fit[indk, 3] <- predict(glmRXdk, newdata = ktest, type = "response") # 4) Lasso - expansion - default CV tuning - lassoXdk <- cv.glmnet(xtrain , dtrain , family = "binomial", type.measure = "mse") - dGx.fit[indk,4] <- predict(lassoXdk, newx = xtest, type = "response" , - s = "lambda.min") + lassoXdk <- cv.glmnet(xtrain, dtrain, family = "binomial", type.measure = "mse") + dgx_fit[indk, 4] <- predict(lassoXdk, + newx = xtest, type = "response", + s = "lambda.min" + ) # 5) Ridge - expansion - default CV tuning - ridgeXdk <- cv.glmnet(xtrain , dtrain , family = "binomial", - type.measure = "mse", alpha = 0) - dGx.fit[indk,5] <- predict(ridgeXdk, newx = xtest, type = "response" , - s = "lambda.min") + ridgeXdk <- cv.glmnet(xtrain, dtrain, + family = "binomial", + type.measure = "mse", alpha = 0 + ) + dgx_fit[indk, 5] <- predict(ridgeXdk, + newx = xtest, type = "response", + s = "lambda.min" + ) # 6) Random forest - rfXdk <- randomForest(as.factor(treated) ~ region + lemp.0 + lpop.0 + lavg_pay.0 , - data = ktrain , mtry = 4, ntree = 1000) - dGx.fit[indk,6] <- predict(rfXdk, ktest, type = "prob")[, 2] + rfXdk <- randomForest(as.factor(treated) ~ region + lemp.0 + lpop.0 + lavg_pay.0, + data = ktrain, mtry = 4, ntree = 1000 + ) + dgx_fit[indk, 6] <- predict(rfXdk, ktest, type = "prob")[, 2] # 7) Tree (start big) - btXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0 , - data = ktrain, method = "anova", - control=rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10)) + btXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0, + data = ktrain, method = "anova", + control = rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10) + ) # xval is the number of cross-validation splits. E.g. xval = 5 is five fold CV - dGx.fit[indk,7] <- predict(btXdk, ktest) + dgx_fit[indk, 7] <- predict(btXdk, ktest) # 8) Tree (small tree) - stXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0 , - data = ktrain, method = "anova", - control=rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10)) + stXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0, + data = ktrain, method = "anova", + control = rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10) + ) # xval is the number of cross-validation splits. E.g. 
xval = 5 is five fold CV - dGx.fit[indk,8] <- predict(stXdk, ktest) + dgx_fit[indk, 8] <- predict(stXdk, ktest) # 9) Tree (cv) - bestcp <- btXdk$cptable[which.min(btXdk$cptable[,"xerror"]),"CP"] - cvXdk <- prune(btXdk , cp=bestcp) - dGx.fit[indk,9] <- predict(cvXdk, ktest) + bestcp <- btXdk$cptable[which.min(btXdk$cptable[, "xerror"]), "CP"] + cvXdk <- prune(btXdk, cp = bestcp) + dgx_fit[indk, 9] <- predict(cvXdk, ktest) #----------------------------------------------------------------------------- - # E[Y|D=0,X] + # Estimating E[Y|D=0,X] # subset to D = 0 - ktrain0 = ktrain[ktrain$treated == 0, ] + ktrain0 <- ktrain[ktrain$treated == 0, ] - ytrain0 = ytrain[ktrain$treated == 0, ] - xtrain0 = xtrain[ktrain$treated == 0, ] + ytrain0 <- ytrain[ktrain$treated == 0, ] + xtrain0 <- xtrain[ktrain$treated == 0, ] # 1) Constant - yGd0x.fit[indk,1] <- mean(ktrain0$dy) + y_gd0x_fit[indk, 1] <- mean(ktrain0$dy) # 2) Baseline controls lmXyk <- lm(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, data = ktrain0) - yGd0x.fit[indk,2] <- predict(lmXyk, newdata = ktest) + y_gd0x_fit[indk, 2] <- predict(lmXyk, newdata = ktest) # 3) Region specific linear index lmRXyk <- lm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0), - data = ktrain) - yGd0x.fit[indk,3] <- predict(lmRXyk, newdata = ktest) + data = ktrain + ) + y_gd0x_fit[indk, 3] <- predict(lmRXyk, newdata = ktest) # 4) Lasso - expansion - default CV tuning - lassoXyk <- cv.glmnet(xtrain0 , ytrain0) - yGd0x.fit[indk,4] <- predict(lassoXyk, newx = xtest , s = "lambda.min") + lassoXyk <- cv.glmnet(xtrain0, ytrain0) + y_gd0x_fit[indk, 4] <- predict(lassoXyk, newx = xtest, s = "lambda.min") # 5) Ridge - expansion - default CV tuning - ridgeXyk <- cv.glmnet(xtrain0 , ytrain0 , alpha = 0) - yGd0x.fit[indk,5] <- predict(ridgeXyk, newx = xtest, s = "lambda.min") + ridgeXyk <- cv.glmnet(xtrain0, ytrain0, alpha = 0) + y_gd0x_fit[indk, 5] <- predict(ridgeXyk, newx = xtest, s = "lambda.min") # 6) Random forest - rfXyk <- randomForest(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 , - data = ktrain0 , mtry = 4, ntree = 1000) - yGd0x.fit[indk,6] <- predict(rfXyk, ktest) + rfXyk <- randomForest(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, + data = ktrain0, mtry = 4, ntree = 1000 + ) + y_gd0x_fit[indk, 6] <- predict(rfXyk, ktest) # 7) Tree (start big) - btXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 , - data = ktrain0, method = "anova", - control=rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10)) - yGd0x.fit[indk,7] <- predict(btXyk, ktest) + btXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, + data = ktrain0, method = "anova", + control = rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10) + ) + y_gd0x_fit[indk, 7] <- predict(btXyk, ktest) # 8) Tree (small tree) - stXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 , - data = ktrain, method = "anova", - control=rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10)) - yGd0x.fit[indk,8] <- predict(stXyk, ktest) + stXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, + data = ktrain, method = "anova", + control = rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10) + ) + y_gd0x_fit[indk, 8] <- predict(stXyk, ktest) # 9) Tree (cv) - bestcp <- btXyk$cptable[which.min(btXyk$cptable[,"xerror"]),"CP"] - cvXyk <- prune(btXyk , cp=bestcp) - yGd0x.fit[indk,9] <- predict(cvXyk, ktest) - + bestcp <- btXyk$cptable[which.min(btXyk$cptable[, "xerror"]), "CP"] + cvXyk <- prune(btXyk, cp = bestcp) + y_gd0x_fit[indk, 9] <- predict(cvXyk, ktest) } - RMSE.d[ii, ] <- 
sqrt(colMeans((usedata$treated - dGx.fit)^2)) - RMSE.y[ii, ] <- sqrt(colMeans((usedata$dy[usedata$treated == 0] - - yGd0x.fit[usedata$treated == 0, ])^2)) + rmse_d[ii, ] <- sqrt(colMeans((usedata$treated - dgx_fit)^2)) + rmse_y[ii, ] <- sqrt(colMeans((usedata$dy[usedata$treated == 0] - + y_gd0x_fit[usedata$treated == 0, ])^2)) # trim propensity scores of 1 to .95 - for(r in 1:9) { - trimmed[ii,r] = sum(dGx.fit[ , r] > .95) - dGx.fit[dGx.fit[ ,r] > .95,r] <- .95 + for (r in 1:9) { + trimmed[ii, r] <- sum(dgx_fit[, r] > .95) + dgx_fit[dgx_fit[, r] > .95, r] <- .95 } - att.num <- c(colMeans(((usedata$treated - dGx.fit)/((pd.fit%*%matrix(1,1,9))*(1-dGx.fit)))* - (usedata$dy-yGd0x.fit)) , - mean(((usedata$treated - dGx.fit[ ,which.min(RMSE.d[ii, ])]) - /(pd.fit*(1-dGx.fit[ ,which.min(RMSE.d[ii, ])])))* - (usedata$dy-yGd0x.fit[ ,which.min(RMSE.y[ii, ])]))) - att.den <- mean(usedata$treated/pd.fit) - - att[ii, ] <- att.num/att.den - - phihat <- cbind(((usedata$treated - dGx.fit)/((pd.fit%*%matrix(1,1,9))*(1-dGx.fit)))* - (usedata$dy-yGd0x.fit) , - ((usedata$treated - dGx.fit[ ,which.min(RMSE.d[ii, ])]) - /(pd.fit*(1-dGx.fit[ ,which.min(RMSE.d[ii, ])])))* - (usedata$dy-yGd0x.fit[ ,which.min(RMSE.y[ii, ])]))/att.den - se.att[ii, ] <- sqrt(colMeans((phihat^2))/n) - - + att_num <- c( + colMeans(((usedata$treated - dgx_fit) / ((pd_fit %*% matrix(1, 1, 9)) * (1 - dgx_fit))) * + (usedata$dy - y_gd0x_fit)), + mean(((usedata$treated - dgx_fit[, which.min(rmse_d[ii, ])]) + / (pd_fit * (1 - dgx_fit[, which.min(rmse_d[ii, ])]))) * + (usedata$dy - y_gd0x_fit[, which.min(rmse_y[ii, ])])) + ) + att_den <- mean(usedata$treated / pd_fit) + + att[ii, ] <- att_num / att_den + + phihat <- cbind( + ((usedata$treated - dgx_fit) / ((pd_fit %*% matrix(1, 1, 9)) * (1 - dgx_fit))) * + (usedata$dy - y_gd0x_fit), + ((usedata$treated - dgx_fit[, which.min(rmse_d[ii, ])]) + / (pd_fit * (1 - dgx_fit[, which.min(rmse_d[ii, ])]))) * + (usedata$dy - y_gd0x_fit[, which.min(rmse_y[ii, ])]) + ) / att_den + se_att[ii, ] <- sqrt(colMeans((phihat^2)) / n) } ``` @@ -341,23 +368,27 @@ We start by reporting the RMSE obtained during cross-fitting for each learner in ```{r} table1y <- matrix(0, 9, 4) -table1y <- t(RMSE.y) -colnames(table1y)<- c("2004","2005","2006","2007") -rownames(table1y)<- c("No Controls", "Basic", "Expansion", - "Lasso (CV)", "Ridge (CV)", - "Random Forest","Deep Tree", - "Shallow Tree", "Tree (CV)") +table1y <- t(rmse_y) +colnames(table1y) <- c("2004", "2005", "2006", "2007") +rownames(table1y) <- c( + "No Controls", "Basic", "Expansion", + "Lasso (CV)", "Ridge (CV)", + "Random Forest", "Deep Tree", + "Shallow Tree", "Tree (CV)" +) table1y ``` ```{r} table1d <- matrix(0, 9, 4) -table1d <- t(RMSE.d) -colnames(table1d)<- c("2004","2005","2006","2007") -rownames(table1d)<- c("No Controls", "Basic", "Expansion", - "Lasso (CV)", "Ridge (CV)", - "Random Forest","Deep Tree", - "Shallow Tree", "Tree (CV)") +table1d <- t(rmse_d) +colnames(table1d) <- c("2004", "2005", "2006", "2007") +rownames(table1d) <- c( + "No Controls", "Basic", "Expansion", + "Lasso (CV)", "Ridge (CV)", + "Random Forest", "Deep Tree", + "Shallow Tree", "Tree (CV)" +) table1d ``` @@ -368,14 +399,16 @@ We report estimates of the ATET in each period in the following table. 
```{r} table2 <- matrix(0, 20, 4) -table2[seq(1,20,2),] <- t(att) -table2[seq(2,20,2),] <- t(se.att) -colnames(table2)<- c("2004","2005","2006","2007") -rownames(table2)<- c("No Controls","s.e.","Basic","s.e.", - "Expansion","s.e.","Lasso (CV)","s.e.", - "Ridge (CV)","s.e.","Random Forest","s.e.", - "Deep Tree","s.e.","Shallow Tree","s.e.", - "Tree (CV)","s.e.","Best","s.e.") +table2[seq(1, 20, 2), ] <- t(att) +table2[seq(2, 20, 2), ] <- t(se_att) +colnames(table2) <- c("2004", "2005", "2006", "2007") +rownames(table2) <- c( + "No Controls", "s.e.", "Basic", "s.e.", + "Expansion", "s.e.", "Lasso (CV)", "s.e.", + "Ridge (CV)", "s.e.", "Random Forest", "s.e.", + "Deep Tree", "s.e.", "Shallow Tree", "s.e.", + "Tree (CV)", "s.e.", "Best", "s.e." +) table2 ``` @@ -390,206 +423,224 @@ Because we have data for the period 2001-2007, we can perform a so-called pre-tr We change the treatment status of those observations, which received treatment in 2004 in the 2002 data and create a placebo treatment as well as control group. ```{r} -treat2 <- treat2[ , -c("lpop","lavg_pay","year","G","region")] -treat2$treated <- 1 # Code these observations as treated +treat2 <- treat2[, -c("lpop", "lavg_pay", "year", "G", "region")] +treat2$treated <- 1 # Code these observations as treated tdid02 <- merge(treat2, treatB, by = "id") -dy <- tdid02$lemp-tdid02$lemp.pre +dy <- tdid02$lemp - tdid02$lemp.pre tdid02$dy <- dy -tdid02 <- tdid02[ , -c("id","lemp","lemp.pre")] +tdid02 <- tdid02[, -c("id", "lemp", "lemp.pre")] -cont2 <- cont2[ , -c("lpop","lavg_pay","year","G","region")] +cont2 <- cont2[, -c("lpop", "lavg_pay", "year", "G", "region")] cdid02 <- merge(cont2, contB, by = "id") -dy <- cdid02$lemp-cdid02$lemp.pre +dy <- cdid02$lemp - cdid02$lemp.pre cdid02$dy <- dy -cdid02 <- cdid02[ , -c("id","lemp","lemp.pre")] +cdid02 <- cdid02[, -c("id", "lemp", "lemp.pre")] ``` We repeat the exercise for obtaining our ATET estimates and standard error for 2004-2007. Particularly, we also use all the learners as mentioned above. 
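Under the conditional parallel trends assumption, this placebo ATET should be statistically indistinguishable from zero. A minimal sketch of such a check, to be run after the estimation chunk below has produced `att_p` and `se_att_p`:

```{r, eval=FALSE}
# Placebo t-statistics and two-sided p-values for each learner (last column is the "Best" combination)
t_placebo <- att_p / se_att_p
p_placebo <- 2 * pnorm(-abs(t_placebo))
placebo_check <- rbind(t_placebo, p_placebo)
rownames(placebo_check) <- c("t-stat", "p-value")
round(placebo_check, 3)
```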
```{r} -attP <- matrix(NA,1,10) -se.attP <- matrix(NA,1,10) -RMSE.dP <- matrix(NA,1,9) -RMSE.yP <- matrix(NA,1,9) -trimmedP <- matrix(NA,1,9) -for(ii in 1){ - - tdata <- get(paste("tdid0",(3-ii),sep="")) # Treatment data - cdata <- get(paste("cdid0",(3-ii),sep="")) # Control data - usedata <- rbind(tdata,cdata) +att_p <- matrix(NA, 1, 10) +se_att_p <- matrix(NA, 1, 10) +rmse_d_p <- matrix(NA, 1, 9) +rmse_y_p <- matrix(NA, 1, 9) +trimmed_p <- matrix(NA, 1, 9) +for (ii in 1) { + tdata <- get(paste("tdid0", (3 - ii), sep = "")) # Treatment data + cdata <- get(paste("cdid0", (3 - ii), sep = "")) # Control data + usedata <- rbind(tdata, cdata) #----------------------------------------------------------------------------- # Cross-fit setup n <- nrow(usedata) Kf <- 5 # Number of folds - sampleframe <- rep(1:Kf, ceiling(n/Kf)) - cfgroup <- sample(sampleframe, size=n, replace = FALSE) # Cross-fitting groups + sampleframe <- rep(1:Kf, ceiling(n / Kf)) + cfgroup <- sample(sampleframe, size = n, replace = FALSE) # Cross-fitting groups # Initialize variables for CV predictions - yGd0x.fit <- matrix(NA,n,9) - dGx.fit <- matrix(NA,n,9) - pd.fit <-matrix(NA,n,1) + y_gd0x_fit <- matrix(NA, n, 9) + dgx_fit <- matrix(NA, n, 9) + pd_fit <- matrix(NA, n, 1) #----------------------------------------------------------------------------- # Cross-fit loop - for(k in 1:Kf) { - cat("year: ",ii+2001,"; fold: ",k,"\n") + for (k in 1:Kf) { + cat("year: ", ii + 2001, "; fold: ", k, "\n") indk <- cfgroup == k - ktrain <- usedata[!indk,] - ktest <- usedata[indk,] + ktrain <- usedata[!indk, ] + ktest <- usedata[indk, ] # Build some matrices for later - ytrain <- as.matrix(usedata[!indk,"dy"]) - ytest <- as.matrix(usedata[indk,"dy"]) - dtrain <- as.matrix(usedata[!indk,"treated"]) - dtest <- as.matrix(usedata[indk,"treated"]) + ytrain <- as.matrix(usedata[!indk, "dy"]) + ytest <- as.matrix(usedata[indk, "dy"]) + dtrain <- as.matrix(usedata[!indk, "treated"]) + dtest <- as.matrix(usedata[indk, "treated"]) # Expansion for lasso/ridge (region specific cubic polynomial) - X.expand <- model.matrix( ~ region*(polym(lemp.0 , lpop.0 , lavg_pay.0 , - degree = 3, raw = TRUE)), - data = usedata) + Xexpand <- model.matrix( + ~ region * (polym(lemp.0, lpop.0, lavg_pay.0, + degree = 3, raw = TRUE + )), + data = usedata + ) - xtrain <- as.matrix(X.expand[!indk,]) - xtest <- as.matrix(X.expand[indk,]) + xtrain <- as.matrix(Xexpand[!indk, ]) + xtest <- as.matrix(Xexpand[indk, ]) #----------------------------------------------------------------------------- - # P(D = 1) - pd.fit[indk,1] <- mean(ktrain$treated) + # Estimating P(D = 1) + pd_fit[indk, 1] <- mean(ktrain$treated) #----------------------------------------------------------------------------- - # E[D|X] + # Estimating E[D|X] # 1) Constant - dGx.fit[indk,1] <- mean(ktrain$treated) + dgx_fit[indk, 1] <- mean(ktrain$treated) # 2) Baseline controls glmXdk <- glm(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0, - family = "binomial", data = ktrain) - dGx.fit[indk,2] <- predict(glmXdk, newdata = ktest, type = "response") + family = "binomial", data = ktrain + ) + dgx_fit[indk, 2] <- predict(glmXdk, newdata = ktest, type = "response") # 3) Region specific linear index glmRXdk <- glm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0), - family = "binomial", data = ktrain) - dGx.fit[indk,3] <- predict(glmRXdk, newdata = ktest, type = "response") + family = "binomial", data = ktrain + ) + dgx_fit[indk, 3] <- predict(glmRXdk, newdata = ktest, type = "response") # 4) Lasso - expansion - default 
CV tuning - lassoXdk <- cv.glmnet(xtrain , dtrain , family = "binomial", type.measure = "mse") - dGx.fit[indk,4] <- predict(lassoXdk, newx = xtest, type = "response" , - s = "lambda.min") + lassoXdk <- cv.glmnet(xtrain, dtrain, family = "binomial", type.measure = "mse") + dgx_fit[indk, 4] <- predict(lassoXdk, + newx = xtest, type = "response", + s = "lambda.min" + ) # 5) Ridge - expansion - default CV tuning - ridgeXdk <- cv.glmnet(xtrain , dtrain , family = "binomial", - type.measure = "mse", alpha = 0) - dGx.fit[indk,5] <- predict(ridgeXdk, newx = xtest, type = "response" , - s = "lambda.min") + ridgeXdk <- cv.glmnet(xtrain, dtrain, + family = "binomial", + type.measure = "mse", alpha = 0 + ) + dgx_fit[indk, 5] <- predict(ridgeXdk, + newx = xtest, type = "response", + s = "lambda.min" + ) # 6) Random forest - rfXdk <- randomForest(as.factor(treated) ~ region + lemp.0 + lpop.0 + lavg_pay.0 , - data = ktrain , mtry = 4, ntree = 1000) - dGx.fit[indk,6] <- predict(rfXdk, ktest, type = "prob")[, 2] + rfXdk <- randomForest(as.factor(treated) ~ region + lemp.0 + lpop.0 + lavg_pay.0, + data = ktrain, mtry = 4, ntree = 1000 + ) + dgx_fit[indk, 6] <- predict(rfXdk, ktest, type = "prob")[, 2] # 7) Tree (start big) - btXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0 , - data = ktrain, method = "anova", - control=rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10)) + btXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0, + data = ktrain, method = "anova", + control = rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10) + ) # xval is the number of cross-validation splits. E.g. xval = 5 is five fold CV - dGx.fit[indk,7] <- predict(btXdk, ktest) + dgx_fit[indk, 7] <- predict(btXdk, ktest) # 8) Tree (small tree) - stXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0 , - data = ktrain, method = "anova", - control=rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10)) + stXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0, + data = ktrain, method = "anova", + control = rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10) + ) # xval is the number of cross-validation splits. E.g. 
xval = 5 is five fold CV - dGx.fit[indk,8] <- predict(stXdk, ktest) + dgx_fit[indk, 8] <- predict(stXdk, ktest) # 9) Tree (cv) - bestcp <- btXdk$cptable[which.min(btXdk$cptable[,"xerror"]),"CP"] - cvXdk <- prune(btXdk , cp=bestcp) - dGx.fit[indk,9] <- predict(cvXdk, ktest) + bestcp <- btXdk$cptable[which.min(btXdk$cptable[, "xerror"]), "CP"] + cvXdk <- prune(btXdk, cp = bestcp) + dgx_fit[indk, 9] <- predict(cvXdk, ktest) #----------------------------------------------------------------------------- - # E[Y|D=0,X] + # Estimating E[Y|D=0,X] # subset to D = 0 - ktrain0 = ktrain[ktrain$treated == 0, ] + ktrain0 <- ktrain[ktrain$treated == 0, ] - ytrain0 = ytrain[ktrain$treated == 0, ] - xtrain0 = xtrain[ktrain$treated == 0, ] + ytrain0 <- ytrain[ktrain$treated == 0, ] + xtrain0 <- xtrain[ktrain$treated == 0, ] # 1) Constant - yGd0x.fit[indk,1] <- mean(ktrain0$dy) + y_gd0x_fit[indk, 1] <- mean(ktrain0$dy) # 2) Baseline controls lmXyk <- lm(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, data = ktrain0) - yGd0x.fit[indk,2] <- predict(lmXyk, newdata = ktest) + y_gd0x_fit[indk, 2] <- predict(lmXyk, newdata = ktest) # 3) Region specific linear index lmRXyk <- lm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0), - data = ktrain) - yGd0x.fit[indk,3] <- predict(lmRXyk, newdata = ktest) + data = ktrain + ) + y_gd0x_fit[indk, 3] <- predict(lmRXyk, newdata = ktest) # 4) Lasso - expansion - default CV tuning - lassoXyk <- cv.glmnet(xtrain0 , ytrain0) - yGd0x.fit[indk,4] <- predict(lassoXyk, newx = xtest , s = "lambda.min") + lassoXyk <- cv.glmnet(xtrain0, ytrain0) + y_gd0x_fit[indk, 4] <- predict(lassoXyk, newx = xtest, s = "lambda.min") # 5) Ridge - expansion - default CV tuning - ridgeXyk <- cv.glmnet(xtrain0 , ytrain0 , alpha = 0) - yGd0x.fit[indk,5] <- predict(ridgeXyk, newx = xtest, s = "lambda.min") + ridgeXyk <- cv.glmnet(xtrain0, ytrain0, alpha = 0) + y_gd0x_fit[indk, 5] <- predict(ridgeXyk, newx = xtest, s = "lambda.min") # 6) Random forest - rfXyk <- randomForest(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 , - data = ktrain0 , mtry = 4, ntree = 1000) - yGd0x.fit[indk,6] <- predict(rfXyk, ktest) + rfXyk <- randomForest(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, + data = ktrain0, mtry = 4, ntree = 1000 + ) + y_gd0x_fit[indk, 6] <- predict(rfXyk, ktest) # 7) Tree (start big) - btXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 , - data = ktrain0, method = "anova", - control=rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10)) - yGd0x.fit[indk,7] <- predict(btXyk, ktest) + btXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, + data = ktrain0, method = "anova", + control = rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10) + ) + y_gd0x_fit[indk, 7] <- predict(btXyk, ktest) # 8) Tree (small tree) - stXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0 , - data = ktrain, method = "anova", - control=rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10)) - yGd0x.fit[indk,8] <- predict(stXyk, ktest) + stXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, + data = ktrain, method = "anova", + control = rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10) + ) + y_gd0x_fit[indk, 8] <- predict(stXyk, ktest) # 9) Tree (cv) - bestcp <- btXyk$cptable[which.min(btXyk$cptable[,"xerror"]),"CP"] - cvXyk <- prune(btXyk , cp=bestcp) - yGd0x.fit[indk,9] <- predict(cvXyk, ktest) - + bestcp <- btXyk$cptable[which.min(btXyk$cptable[, "xerror"]), "CP"] + cvXyk <- prune(btXyk, cp = bestcp) + y_gd0x_fit[indk, 9] <- predict(cvXyk, ktest) } - RMSE.dP[ii, ] <- 
sqrt(colMeans((usedata$treated - dGx.fit)^2)) - RMSE.yP[ii, ] <- sqrt(colMeans((usedata$dy[usedata$treated == 0] - - yGd0x.fit[usedata$treated == 0, ])^2)) + rmse_d_p[ii, ] <- sqrt(colMeans((usedata$treated - dgx_fit)^2)) + rmse_y_p[ii, ] <- sqrt(colMeans((usedata$dy[usedata$treated == 0] - + y_gd0x_fit[usedata$treated == 0, ])^2)) # trim propensity scores of 1 to .95 - for(r in 1:9) { - trimmedP[ii,r] = sum(dGx.fit[ , r] > .95) - dGx.fit[dGx.fit[ ,r] > .95,r] <- .95 + for (r in 1:9) { + trimmed_p[ii, r] <- sum(dgx_fit[, r] > .95) + dgx_fit[dgx_fit[, r] > .95, r] <- .95 } - att.num <- c(colMeans(((usedata$treated - dGx.fit)/((pd.fit%*%matrix(1,1,9))*(1-dGx.fit)))* - (usedata$dy-yGd0x.fit)) , - mean(((usedata$treated - dGx.fit[ ,which.min(RMSE.d[ii, ])]) - /(pd.fit*(1-dGx.fit[ ,which.min(RMSE.d[ii, ])])))* - (usedata$dy-yGd0x.fit[ ,which.min(RMSE.y[ii, ])]))) - att.den <- mean(usedata$treated/pd.fit) - - attP[ii, ] <- att.num/att.den - - phihat <- cbind(((usedata$treated - dGx.fit)/((pd.fit%*%matrix(1,1,9))*(1-dGx.fit)))* - (usedata$dy-yGd0x.fit) , - ((usedata$treated - dGx.fit[ ,which.min(RMSE.d[ii, ])]) - /(pd.fit*(1-dGx.fit[ ,which.min(RMSE.d[ii, ])])))* - (usedata$dy-yGd0x.fit[ ,which.min(RMSE.y[ii, ])]))/att.den - se.attP[ii, ] <- sqrt(colMeans((phihat^2))/n) - - + att_num <- c( + colMeans(((usedata$treated - dgx_fit) / ((pd_fit %*% matrix(1, 1, 9)) * (1 - dgx_fit))) * + (usedata$dy - y_gd0x_fit)), + mean(((usedata$treated - dgx_fit[, which.min(rmse_d[ii, ])]) + / (pd_fit * (1 - dgx_fit[, which.min(rmse_d[ii, ])]))) * + (usedata$dy - y_gd0x_fit[, which.min(rmse_y[ii, ])])) + ) + att_den <- mean(usedata$treated / pd_fit) + + att_p[ii, ] <- att_num / att_den + + phihat <- cbind( + ((usedata$treated - dgx_fit) / ((pd_fit %*% matrix(1, 1, 9)) * (1 - dgx_fit))) * + (usedata$dy - y_gd0x_fit), + ((usedata$treated - dgx_fit[, which.min(rmse_d[ii, ])]) + / (pd_fit * (1 - dgx_fit[, which.min(rmse_d[ii, ])]))) * + (usedata$dy - y_gd0x_fit[, which.min(rmse_y[ii, ])]) + ) / att_den + se_att_p[ii, ] <- sqrt(colMeans((phihat^2)) / n) } ``` @@ -597,16 +648,18 @@ We report the results in the following table. ```{r} tableP <- matrix(0, 4, 10) -tableP[1,] <- c(RMSE.yP, min(RMSE.yP)) -tableP[2,] <- c(RMSE.dP, min(RMSE.dP)) -tableP[3,] <- attP -tableP[4,] <- se.attP -rownames(tableP)<- c("RMSE Y","RMSE D","ATET","s.e.") -colnames(tableP)<- c("No Controls", "Basic", "Expansion", - "Lasso (CV)", "Ridge (CV)", - "Random Forest","Deep Tree", - "Shallow Tree", "Tree (CV)" , "Best") -tableP = t(tableP) +tableP[1, ] <- c(rmse_y_p, min(rmse_y_p)) +tableP[2, ] <- c(rmse_d_p, min(rmse_d_p)) +tableP[3, ] <- att_p +tableP[4, ] <- se_att_p +rownames(tableP) <- c("RMSE Y", "RMSE D", "ATET", "s.e.") +colnames(tableP) <- c( + "No Controls", "Basic", "Expansion", + "Lasso (CV)", "Ridge (CV)", + "Random Forest", "Deep Tree", + "Shallow Tree", "Tree (CV)", "Best" +) +tableP <- t(tableP) tableP ``` diff --git a/T/T_4_Regression_Discontinuity_on_Progresa_Data.Rmd b/T/T_4_Regression_Discontinuity_on_Progresa_Data.Rmd index 8a066de6..8c4564e4 100644 --- a/T/T_4_Regression_Discontinuity_on_Progresa_Data.Rmd +++ b/T/T_4_Regression_Discontinuity_on_Progresa_Data.Rmd @@ -15,6 +15,9 @@ First, we need to install and load some packages. This can take up to 15 minutes ```{r} dependencies <- c("rdrobust", "fastDummies", "randomForest", "hdm", "gbm", "rdd") install.packages(dependencies) +``` + +```{r} lapply(dependencies, library, character.only = TRUE) ``` @@ -23,9 +26,10 @@ We use a dataset assembled by [Calonico et al. 
(2014)](https://rdpackages.github First, we open the data and remove any observations that have NaN values. ```{r} -df <- read.csv("https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/progresa.csv", row.names=1) +df <- read.csv("https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/progresa.csv", + row.names = 1) comp <- complete.cases(df) -df <- df[comp,] +df <- df[comp, ] print("Shape of Data:") print(dim(df)) print("Variable Names:") @@ -47,7 +51,7 @@ First, we will perform a very simple RD estimation with a weighted linear regres ```{r} triangular_kernel <- function(index, h) { - weights <- 1 - abs(index)/h + weights <- 1 - abs(index) / h weights[weights < 0] <- 0 return(weights) } @@ -56,15 +60,15 @@ triangular_kernel <- function(index, h) { The parameter `h` is the bandwidth that controls the range of observations that receive non-zero weights. We use the `IKbandwidth` function from the `rdd` package that implements the *Imbens-Kalyanaraman* method. Another standard approach would be to use the standard deviation of `index`. ```{r} -h <- IKbandwidth(X=df$index, Y=df$conspcfood_t1, cutpoint = 0) +h <- IKbandwidth(X = df$index, Y = df$conspcfood_t1, cutpoint = 0) ``` We use the triangular kernel function to calculate weights for each observation. After that, we can fit two seperate linear regressions for both treatment and control groups. ```{r} weights <- triangular_kernel(df$index, h) -model_treated <- lm(conspcfood_t1 ~ index, data = df[df$index > 0,], weights = weights[df$index > 0]) -model_control <- lm(conspcfood_t1 ~ index, data = df[df$index < 0,], weights = weights[df$index < 0]) +model_treated <- lm(conspcfood_t1 ~ index, data = df[df$index > 0, ], weights = weights[df$index > 0]) +model_control <- lm(conspcfood_t1 ~ index, data = df[df$index < 0, ], weights = weights[df$index < 0]) ``` The treatment effect at the cutoff point is estimated as the difference between the predictions of the two models at the cutoff point. @@ -72,7 +76,7 @@ The treatment effect at the cutoff point is estimated as the difference between ```{r} cutoff <- 0 treatment_effect <- predict(model_treated, newdata = data.frame(index = cutoff)) - - predict(model_control, newdata = data.frame(index = cutoff)) + predict(model_control, newdata = data.frame(index = cutoff)) treatment_effect ``` @@ -80,9 +84,9 @@ We estimate that the participation in the program reduced food consumption by $2 ```{r} result <- c() -for (outcome in c("conspcfood_t1", "conspcnonfood_t1", "conspcfood_t2", "conspcnonfood_t2")){ - rdd_result <- rdrobust(df[[outcome]], df$index, c = cutoff, rho=1) - result <- rbind(result, c(rdd_result$coef[1], rdd_result$se["Robust",])) +for (outcome in c("conspcfood_t1", "conspcnonfood_t1", "conspcfood_t2", "conspcnonfood_t2")) { + rdd_result <- rdrobust(df[[outcome]], df$index, c = cutoff, rho = 1) + result <- rbind(result, c(rdd_result$coef[1], rdd_result$se["Robust", ])) } resframe <- as.data.frame(result) colnames(resframe) <- c("LATE", "s.e.") @@ -95,7 +99,7 @@ While the effects in the first year after the intervention are negative, we obse The following plot visualizes the two weighted regressions at the cut-off for the last outcome variable (non-food consumption in `t2`). We can clearly see the "jump" at the cut-off, which is our LATE. 
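As a side note, only observations with a positive triangular-kernel weight contribute to these local fits. A minimal sketch of the effective sample sizes, reusing `weights` and `h` from above; the plot below then shows the discontinuity at the cutoff.

```{r}
# Effective sample inside the bandwidth, by side of the cutoff
sum(weights > 0 & df$index > 0) # treated side
sum(weights > 0 & df$index < 0) # control side
mean(abs(df$index) < h) # share of all observations inside the bandwidth window
```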
```{r} -rdplot(df$conspcfood_t1, df$index, c=0, x.lim = c(-1,1), y.lim = c(250,400)) +rdplot(df$conspcfood_t1, df$index, c = 0, x.lim = c(-1, 1), y.lim = c(250, 400)) ``` ## Estimation with Covariates @@ -106,18 +110,24 @@ For identification and estimation of the average treatment effect at the cutoff The standard approach is simply to take up the regressors in the weighted least squares regression. ```{r} -model_treated <- lm(conspcfood_t1 ~ index + hhownhouse + headage + heademp + headeduc, data = df[df$index > 0,], weights = weights[df$index > 0]) -model_control <- lm(conspcfood_t1 ~ index + hhownhouse + headage + heademp + headeduc, data = df[df$index < 0,], weights = weights[df$index < 0]) -prediction_treated <- predict(model_treated, newdata = data.frame(index = cutoff, - hhownhouse = weighted.mean(df[df$index > 0,]$hhownhouse, w =weights[df$index > 0]), - headage = weighted.mean(df[df$index > 0,]$headage, w =weights[df$index > 0]), - heademp = weighted.mean(df[df$index > 0,]$heademp, w =weights[df$index > 0]), - headeduc = weighted.mean(df[df$index > 0,]$headeduc, w =weights[df$index > 0]))) -prediction_control <- predict(model_control, newdata = data.frame(index = cutoff, - hhownhouse = weighted.mean(df[df$index < 0,]$hhownhouse, w = weights[df$index < 0]), - headage = weighted.mean(df[df$index < 0,]$headage, w = weights[df$index < 0]), - heademp = weighted.mean(df[df$index < 0,]$heademp, w = weights[df$index < 0]), - headeduc = weighted.mean(df[df$index < 0,]$headeduc, w = weights[df$index < 0]))) +model_treated <- lm(conspcfood_t1 ~ index + hhownhouse + headage + heademp + headeduc, + data = df[df$index > 0, ], weights = weights[df$index > 0]) +model_control <- lm(conspcfood_t1 ~ index + hhownhouse + headage + heademp + headeduc, + data = df[df$index < 0, ], weights = weights[df$index < 0]) +prediction_treated <- predict(model_treated, newdata = data.frame( + index = cutoff, + hhownhouse = weighted.mean(df[df$index > 0, ]$hhownhouse, w = weights[df$index > 0]), + headage = weighted.mean(df[df$index > 0, ]$headage, w = weights[df$index > 0]), + heademp = weighted.mean(df[df$index > 0, ]$heademp, w = weights[df$index > 0]), + headeduc = weighted.mean(df[df$index > 0, ]$headeduc, w = weights[df$index > 0]) +)) +prediction_control <- predict(model_control, newdata = data.frame( + index = cutoff, + hhownhouse = weighted.mean(df[df$index < 0, ]$hhownhouse, w = weights[df$index < 0]), + headage = weighted.mean(df[df$index < 0, ]$headage, w = weights[df$index < 0]), + heademp = weighted.mean(df[df$index < 0, ]$heademp, w = weights[df$index < 0]), + headeduc = weighted.mean(df[df$index < 0, ]$headeduc, w = weights[df$index < 0]) +)) treatment_effect <- prediction_treated - prediction_control treatment_effect ``` @@ -128,14 +138,14 @@ Again, we can also use `rdrobust` to repeat the estimation with all other outcom ```{r} result <- c() -for (outcome in c("conspcfood_t1", "conspcnonfood_t1", "conspcfood_t2", "conspcnonfood_t2")){ - rdd_result <- rdrobust(df[[outcome]], df$index, c = cutoff, rho=1, covs = df[,c(1:8,10:17,19,22)]) - result <- rbind(result, c(rdd_result$coef[1], rdd_result$se["Robust",])) +for (outcome in c("conspcfood_t1", "conspcnonfood_t1", "conspcfood_t2", "conspcnonfood_t2")) { + rdd_result <- rdrobust(df[[outcome]], df$index, c = cutoff, rho = 1, covs = df[, c(1:8, 10:17, 19, 22)]) + result <- rbind(result, c(rdd_result$coef[1], rdd_result$se["Robust", ])) } resframe_adj <- as.data.frame(result) colnames(resframe_adj) <- c("LATE", "s.e.") rownames(resframe_adj) <- c("Food 
T_1", "Non-Food T_1", "Food T_2", "Non-Food T_2") -resframe_adj["% reduction"] = (resframe_adj["s.e."] - resframe[,2]) * 100 / resframe[,2] +resframe_adj["% reduction"] <- (resframe_adj["s.e."] - resframe[, 2]) * 100 / resframe[, 2] print(resframe_adj) ``` @@ -155,85 +165,89 @@ There are multiple ways to implement the estimators presented in the book, we wi ```{r} # Running Variable and Outcome -df_ml = df -investigated_outcome = "conspcfood_t1" +df_ml <- df +investigated_outcome <- "conspcfood_t1" names(df_ml)[names(df_ml) == "index"] <- "X" names(df_ml)[names(df_ml) == investigated_outcome] <- "Y" # Baseline covariates including consumption -b.covs <- names(df_ml[,c(1:8,10:17,19,22)]) +b_covs <- names(df_ml[, c(1:8, 10:17, 19, 22)]) # Fixed effects for localities -i.fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE)) +i_fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE)) # Flexible covariates including localities indicators -f.covs <- as.matrix(model.matrix(~ .^2, data=df_ml[b.covs] ))[,-1] -Z.lasso <- as.matrix(cbind(i.fe, f.covs)) +f_covs <- as.matrix(model.matrix(~ .^2, data = df_ml[b_covs]))[, -1] +Zlasso <- as.matrix(cbind(i_fe, f_covs)) ``` We will use the package `rdrobust` for the RD estimation. Before starting the DML procedure, we have to estimate a bandwidth to restrict the samples in the first stage estimation. ```{r} -h.fs <- 2*rdrobust(df_ml$Y, df_ml$X, rho=1)$bws[[1]] +h_fs <- 2 * rdrobust(df_ml$Y, df_ml$X, rho = 1)$bws[[1]] ``` The next chunk sets up the crossfitting and estimates the function $\eta(Z)$, which we will use to adjust $Y$ for the second stage. We use Random Forest, a Boosting implementation, Linear Regression and Lasso with both a baseline and flexible covariate structure. ```{r} -first_stage <- function(){ +first_stage <- function() { # Set up the cross-fitting n <- nrow(df_ml) Kf <- 5 # Number of folds - sampleframe <- rep(1:Kf, ceiling(n/Kf)) - cfgroup <- sample(sampleframe, size=n, replace = FALSE) + sampleframe <- rep(1:Kf, ceiling(n / Kf)) + cfgroup <- sample(sampleframe, size = n, replace = FALSE) # Matrix to store eta predictions - eta.fit <- matrix(NA, n, 5) + eta_fit <- matrix(NA, n, 5) # Create vector of observations to be considered in the first stage model - weights <- (abs(df_ml$X)0 & !fold & weights>0,] - data_control <- df_ml[df_ml$X<0 & !fold & weights>0,] + data_treated <- df_ml[df_ml$X > 0 & !fold & weights > 0, ] + data_control <- df_ml[df_ml$X < 0 & !fold & weights > 0, ] - data_fold <- df_ml[fold,] + data_fold <- df_ml[fold, ] - model <- as.formula(paste("Y~", paste(b.covs, collapse = "+"))) + model <- as.formula(paste("Y~", paste(b_covs, collapse = "+"))) rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000) rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000) - eta.fit[fold,1] <- (predict(rf1, data_fold) + predict(rf0, data_fold))/2 - - gbm1 <- gbm(model, data = data_treated, n.trees = 100, interaction.depth = 1, - shrinkage = .1, distribution="gaussian") - gbm0 <- gbm(model, data = data_control, n.trees = 100, interaction.depth = 1, - shrinkage = .1, distribution="gaussian") - eta.fit[fold,2] <- (predict(gbm1, data_fold, n.trees=100) + predict(gbm0, data_fold, n.trees=100))/2 + eta_fit[fold, 1] <- (predict(rf1, data_fold) + predict(rf0, data_fold)) / 2 + + gbm1 <- gbm(model, + data = data_treated, n.trees = 100, interaction.depth = 1, + shrinkage = .1, distribution = "gaussian" + ) + gbm0 <- gbm(model, + data = data_control, n.trees = 100, interaction.depth = 1, + 
shrinkage = .1, distribution = "gaussian" + ) + eta_fit[fold, 2] <- (predict(gbm1, data_fold, n.trees = 100) + predict(gbm0, data_fold, n.trees = 100)) / 2 lm1 <- lm(model, data = data_treated) lm0 <- lm(model, data = data_control) - eta.fit[fold,3] <- (predict(lm1, data_fold) + predict(lm0, data_fold))/2 + eta_fit[fold, 3] <- (predict(lm1, data_fold) + predict(lm0, data_fold)) / 2 las_base1 <- rlasso(model, data = data_treated) las_base0 <- rlasso(model, data = data_control) - eta.fit[fold,4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold))/2 + eta_fit[fold, 4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold)) / 2 - data_treated_extended = cbind(Z.lasso[rownames(data_treated),], data_treated) - data_control_extended = cbind(Z.lasso[rownames(data_control),], data_control) - data_fold_extended = cbind(Z.lasso[rownames(data_fold),], data_fold) - model_flex <- as.formula(paste("Y~", paste(c(b.covs, colnames(Z.lasso)), collapse = "+"))) + data_treated_extended <- cbind(Zlasso[rownames(data_treated), ], data_treated) + data_control_extended <- cbind(Zlasso[rownames(data_control), ], data_control) + data_fold_extended <- cbind(Zlasso[rownames(data_fold), ], data_fold) + model_flex <- as.formula(paste("Y~", paste(c(b_covs, colnames(Zlasso)), collapse = "+"))) las_flex1 <- rlasso(model_flex, data = data_treated_extended) las_flex0 <- rlasso(model_flex, data = data_control_extended) - eta.fit[fold,5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended))/2 + eta_fit[fold, 5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended)) / 2 } - return(eta.fit) + return(eta_fit) } -eta.fit <- first_stage() +eta_fit <- first_stage() ``` With the estimated $\hat{\eta}(Z)$ we can correct for confounding in $Y$ and now run the RDD estimation as second stage again. @@ -241,18 +255,18 @@ With the estimated $\hat{\eta}(Z)$ we can correct for confounding in $Y$ and now ```{r} methods <- c("Random Forest", "Gradient Boosting", "Linear Regression", "Lasso Baseline", "Lasso Flexible") -second_stage <- function(eta.fit){ +second_stage <- function(eta_fit) { adj_results <- NULL - for(i in 1:length(methods)){ - M.Y <- df_ml$Y - eta.fit[,i] - rdd_result <- rdrobust(M.Y, df$index, c = cutoff, rho=1) - adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se["Robust",])) + for (i in seq_along(methods)) { + m_y <- df_ml$Y - eta_fit[, i] + rdd_result <- rdrobust(m_y, df$index, c = cutoff, rho = 1) + adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se["Robust", ])) } return(adj_results) } -adj_frame <- as.data.frame(second_stage(eta.fit)) +adj_frame <- as.data.frame(second_stage(eta_fit)) rownames(adj_frame) <- methods colnames(adj_frame) <- c("LATE", "s.e.") print(adj_frame) @@ -261,28 +275,28 @@ print(adj_frame) Finally, we create a small simulation study with only $R=20$ repetitions to show the variance reducing effect of the inclusion of ML-based estimators for the covariates. The next block runs up to ten minutes. 
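The loop is slow because every repetition redraws the cross-fitting split and re-fits all five first-stage learners on each fold. Since the repetitions are independent, a rough sketch of a parallel alternative using the base `parallel` package is given here (a hypothetical variant, not used for the results reported below; `mclapply` relies on forking and therefore needs a Unix-like system); the sequential loop actually used in this notebook follows afterwards.

```{r, eval=FALSE}
library(parallel)

R <- 19
# Each repetition re-runs the first and second stage on a fresh cross-fitting split
reps <- mclapply(seq_len(R), function(r) second_stage(first_stage()), mc.cores = 2)
# Collect LATE estimates and standard errors, prepending the run already computed above
estimates <- cbind(adj_frame[, 1], sapply(reps, function(m) m[, 1]))
std_err <- cbind(adj_frame[, 2], sapply(reps, function(m) m[, 2]))
```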
```{r} -estimates <- adj_frame[,1] -std.err <- adj_frame[,2] +estimates <- adj_frame[, 1] +std_err <- adj_frame[, 2] R <- 19 -for (i in 1:R){ - eta.fit <- first_stage() - adj_results <- second_stage(eta.fit) - estimates <- cbind(estimates, adj_results[,1]) - std.err <- cbind(std.err, adj_results[,2]) +for (i in 1:R) { + eta_fit <- first_stage() + adj_results <- second_stage(eta_fit) + estimates <- cbind(estimates, adj_results[, 1]) + std_err <- cbind(std_err, adj_results[, 2]) } ``` We aggregate the median of the estimates, the mean of the standard errors and also calculate the mean reduction of standard error compared to the "no covariates" estimation. We see, that including covariates can reduce the standard error of estimation around 15-20%. ```{r} -med.est <- apply(estimates, 1, median) -mean.se <- apply(std.err, 1, mean) -adj_frame <- as.data.frame(cbind(med.est, mean.se)) +med_est <- apply(estimates, 1, median) +mean_se <- apply(std_err, 1, mean) +adj_frame <- as.data.frame(cbind(med_est, mean_se)) rownames(adj_frame) <- methods colnames(adj_frame) <- c("LATE", "s.e.") -adj_frame["% reduction"] <- (adj_frame["s.e."] - resframe[1,2]) * 100 / resframe[1,2] -adj_frame["Linear Adjusted (no cross-fit)", ] = resframe_adj[1,] +adj_frame["% reduction"] <- (adj_frame["s.e."] - resframe[1, 2]) * 100 / resframe[1, 2] +adj_frame["Linear Adjusted (no cross-fit)", ] <- resframe_adj[1, ] print(adj_frame) ``` @@ -292,92 +306,96 @@ Non-Food Consumption (Year 1) ```{r} # Running Variable and Outcome -df_ml = df -investigated_outcome = "conspcnonfood_t1" +df_ml <- df +investigated_outcome <- "conspcnonfood_t1" names(df_ml)[names(df_ml) == "index"] <- "X" names(df_ml)[names(df_ml) == investigated_outcome] <- "Y" # Baseline covariates including consumption -b.covs <- names(df_ml[,c(1:8,10:17,19,22)]) +b_covs <- names(df_ml[, c(1:8, 10:17, 19, 22)]) # Fixed effects for localities -i.fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE)) +i_fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE)) # Flexible covariates including localities indicators -f.covs <- as.matrix(model.matrix(~ .^2, data=df_ml[b.covs] ))[,-1] -Z.lasso <- as.matrix(cbind(i.fe, f.covs)) +f_covs <- as.matrix(model.matrix(~ .^2, data = df_ml[b_covs]))[, -1] +Zlasso <- as.matrix(cbind(i_fe, f_covs)) -h.fs <- 2*rdrobust(df_ml$Y, df_ml$X, rho=1)$bws[[1]] +h_fs <- 2 * rdrobust(df_ml$Y, df_ml$X, rho = 1)$bws[[1]] -first_stage <- function(){ +first_stage <- function() { # Set up the cross-fitting n <- nrow(df_ml) Kf <- 5 # Number of folds - sampleframe <- rep(1:Kf, ceiling(n/Kf)) - cfgroup <- sample(sampleframe, size=n, replace = FALSE) + sampleframe <- rep(1:Kf, ceiling(n / Kf)) + cfgroup <- sample(sampleframe, size = n, replace = FALSE) # Matrix to store eta predictions - eta.fit <- matrix(NA, n, 5) + eta_fit <- matrix(NA, n, 5) # Create vector of observations to be considered in the first stage model - weights <- (abs(df_ml$X)0 & !fold & weights>0,] - data_control <- df_ml[df_ml$X<0 & !fold & weights>0,] + data_treated <- df_ml[df_ml$X > 0 & !fold & weights > 0, ] + data_control <- df_ml[df_ml$X < 0 & !fold & weights > 0, ] - data_fold <- df_ml[fold,] + data_fold <- df_ml[fold, ] - model <- as.formula(paste("Y~", paste(b.covs, collapse = "+"))) + model <- as.formula(paste("Y~", paste(b_covs, collapse = "+"))) rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000) rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000) - eta.fit[fold,1] <- (predict(rf1, data_fold) + predict(rf0, 
data_fold))/2 - - gbm1 <- gbm(model, data = data_treated, n.trees = 100, interaction.depth = 1, - shrinkage = .1, distribution="gaussian") - gbm0 <- gbm(model, data = data_control, n.trees = 100, interaction.depth = 1, - shrinkage = .1, distribution="gaussian") - eta.fit[fold,2] <- (predict(gbm1, data_fold, n.trees=100) + predict(gbm0, data_fold, n.trees=100))/2 + eta_fit[fold, 1] <- (predict(rf1, data_fold) + predict(rf0, data_fold)) / 2 + + gbm1 <- gbm(model, + data = data_treated, n.trees = 100, interaction.depth = 1, + shrinkage = .1, distribution = "gaussian" + ) + gbm0 <- gbm(model, + data = data_control, n.trees = 100, interaction.depth = 1, + shrinkage = .1, distribution = "gaussian" + ) + eta_fit[fold, 2] <- (predict(gbm1, data_fold, n.trees = 100) + predict(gbm0, data_fold, n.trees = 100)) / 2 lm1 <- lm(model, data = data_treated) lm0 <- lm(model, data = data_control) - eta.fit[fold,3] <- (predict(lm1, data_fold) + predict(lm0, data_fold))/2 + eta_fit[fold, 3] <- (predict(lm1, data_fold) + predict(lm0, data_fold)) / 2 las_base1 <- rlasso(model, data = data_treated) las_base0 <- rlasso(model, data = data_control) - eta.fit[fold,4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold))/2 + eta_fit[fold, 4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold)) / 2 - data_treated_extended = cbind(Z.lasso[rownames(data_treated),], data_treated) - data_control_extended = cbind(Z.lasso[rownames(data_control),], data_control) - data_fold_extended = cbind(Z.lasso[rownames(data_fold),], data_fold) - model_flex <- as.formula(paste("Y~", paste(c(b.covs, colnames(Z.lasso)), collapse = "+"))) + data_treated_extended <- cbind(Zlasso[rownames(data_treated), ], data_treated) + data_control_extended <- cbind(Zlasso[rownames(data_control), ], data_control) + data_fold_extended <- cbind(Zlasso[rownames(data_fold), ], data_fold) + model_flex <- as.formula(paste("Y~", paste(c(b_covs, colnames(Zlasso)), collapse = "+"))) las_flex1 <- rlasso(model_flex, data = data_treated_extended) las_flex0 <- rlasso(model_flex, data = data_control_extended) - eta.fit[fold,5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended))/2 + eta_fit[fold, 5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended)) / 2 } - return(eta.fit) + return(eta_fit) } -eta.fit <- first_stage() +eta_fit <- first_stage() methods <- c("Random Forest", "Gradient Boosting", "Linear Regression", "Lasso Baseline", "Lasso Flexible") -second_stage <- function(eta.fit){ +second_stage <- function(eta_fit) { adj_results <- NULL - for(i in 1:length(methods)){ - M.Y <- df_ml$Y - eta.fit[,i] - rdd_result <- rdrobust(M.Y, df$index, c = cutoff, rho=1) - adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se["Robust",])) + for (i in seq_along(methods)) { + m_y <- df_ml$Y - eta_fit[, i] + rdd_result <- rdrobust(m_y, df$index, c = cutoff, rho = 1) + adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se["Robust", ])) } return(adj_results) } -adj_frame <- as.data.frame(second_stage(eta.fit)) +adj_frame <- as.data.frame(second_stage(eta_fit)) rownames(adj_frame) <- methods colnames(adj_frame) <- c("LATE", "s.e.") print(adj_frame) @@ -387,92 +405,96 @@ Food Consumption (Year 2) ```{r} # Running Variable and Outcome -df_ml = df -investigated_outcome = "conspcfood_t2" +df_ml <- df +investigated_outcome <- "conspcfood_t2" names(df_ml)[names(df_ml) == "index"] <- "X" names(df_ml)[names(df_ml) == investigated_outcome] <- "Y" # Baseline covariates 
including consumption -b.covs <- names(df_ml[,c(1:8,10:17,19,22)]) +b_covs <- names(df_ml[, c(1:8, 10:17, 19, 22)]) # Fixed effects for localities -i.fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE)) +i_fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE)) # Flexible covariates including localities indicators -f.covs <- as.matrix(model.matrix(~ .^2, data=df_ml[b.covs] ))[,-1] -Z.lasso <- as.matrix(cbind(i.fe, f.covs)) +f_covs <- as.matrix(model.matrix(~ .^2, data = df_ml[b_covs]))[, -1] +Zlasso <- as.matrix(cbind(i_fe, f_covs)) -h.fs <- 2*rdrobust(df_ml$Y, df_ml$X, rho=1)$bws[[1]] +h_fs <- 2 * rdrobust(df_ml$Y, df_ml$X, rho = 1)$bws[[1]] -first_stage <- function(){ +first_stage <- function() { # Set up the cross-fitting n <- nrow(df_ml) Kf <- 5 # Number of folds - sampleframe <- rep(1:Kf, ceiling(n/Kf)) - cfgroup <- sample(sampleframe, size=n, replace = FALSE) + sampleframe <- rep(1:Kf, ceiling(n / Kf)) + cfgroup <- sample(sampleframe, size = n, replace = FALSE) # Matrix to store eta predictions - eta.fit <- matrix(NA, n, 5) + eta_fit <- matrix(NA, n, 5) # Create vector of observations to be considered in the first stage model - weights <- (abs(df_ml$X)0 & !fold & weights>0,] - data_control <- df_ml[df_ml$X<0 & !fold & weights>0,] + data_treated <- df_ml[df_ml$X > 0 & !fold & weights > 0, ] + data_control <- df_ml[df_ml$X < 0 & !fold & weights > 0, ] - data_fold <- df_ml[fold,] + data_fold <- df_ml[fold, ] - model <- as.formula(paste("Y~", paste(b.covs, collapse = "+"))) + model <- as.formula(paste("Y~", paste(b_covs, collapse = "+"))) rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000) rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000) - eta.fit[fold,1] <- (predict(rf1, data_fold) + predict(rf0, data_fold))/2 - - gbm1 <- gbm(model, data = data_treated, n.trees = 100, interaction.depth = 1, - shrinkage = .1, distribution="gaussian") - gbm0 <- gbm(model, data = data_control, n.trees = 100, interaction.depth = 1, - shrinkage = .1, distribution="gaussian") - eta.fit[fold,2] <- (predict(gbm1, data_fold, n.trees=100) + predict(gbm0, data_fold, n.trees=100))/2 + eta_fit[fold, 1] <- (predict(rf1, data_fold) + predict(rf0, data_fold)) / 2 + + gbm1 <- gbm(model, + data = data_treated, n.trees = 100, interaction.depth = 1, + shrinkage = .1, distribution = "gaussian" + ) + gbm0 <- gbm(model, + data = data_control, n.trees = 100, interaction.depth = 1, + shrinkage = .1, distribution = "gaussian" + ) + eta_fit[fold, 2] <- (predict(gbm1, data_fold, n.trees = 100) + predict(gbm0, data_fold, n.trees = 100)) / 2 lm1 <- lm(model, data = data_treated) lm0 <- lm(model, data = data_control) - eta.fit[fold,3] <- (predict(lm1, data_fold) + predict(lm0, data_fold))/2 + eta_fit[fold, 3] <- (predict(lm1, data_fold) + predict(lm0, data_fold)) / 2 las_base1 <- rlasso(model, data = data_treated) las_base0 <- rlasso(model, data = data_control) - eta.fit[fold,4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold))/2 + eta_fit[fold, 4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold)) / 2 - data_treated_extended = cbind(Z.lasso[rownames(data_treated),], data_treated) - data_control_extended = cbind(Z.lasso[rownames(data_control),], data_control) - data_fold_extended = cbind(Z.lasso[rownames(data_fold),], data_fold) - model_flex <- as.formula(paste("Y~", paste(c(b.covs, colnames(Z.lasso)), collapse = "+"))) + data_treated_extended <- cbind(Zlasso[rownames(data_treated), ], data_treated) + data_control_extended <- 
cbind(Zlasso[rownames(data_control), ], data_control) + data_fold_extended <- cbind(Zlasso[rownames(data_fold), ], data_fold) + model_flex <- as.formula(paste("Y~", paste(c(b_covs, colnames(Zlasso)), collapse = "+"))) las_flex1 <- rlasso(model_flex, data = data_treated_extended) las_flex0 <- rlasso(model_flex, data = data_control_extended) - eta.fit[fold,5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended))/2 + eta_fit[fold, 5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended)) / 2 } - return(eta.fit) + return(eta_fit) } -eta.fit <- first_stage() +eta_fit <- first_stage() methods <- c("Random Forest", "Gradient Boosting", "Linear Regression", "Lasso Baseline", "Lasso Flexible") -second_stage <- function(eta.fit){ +second_stage <- function(eta_fit) { adj_results <- NULL - for(i in 1:length(methods)){ - M.Y <- df_ml$Y - eta.fit[,i] - rdd_result <- rdrobust(M.Y, df$index, c = cutoff, rho=1) - adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se["Robust",])) + for (i in seq_along(methods)) { + m_y <- df_ml$Y - eta_fit[, i] + rdd_result <- rdrobust(m_y, df$index, c = cutoff, rho = 1) + adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se["Robust", ])) } return(adj_results) } -adj_frame <- as.data.frame(second_stage(eta.fit)) +adj_frame <- as.data.frame(second_stage(eta_fit)) rownames(adj_frame) <- methods colnames(adj_frame) <- c("LATE", "s.e.") print(adj_frame) @@ -482,92 +504,96 @@ Non-Food Consumption (Year 2) ```{r} # Running Variable and Outcome -df_ml = df -investigated_outcome = "conspcnonfood_t2" +df_ml <- df +investigated_outcome <- "conspcnonfood_t2" names(df_ml)[names(df_ml) == "index"] <- "X" names(df_ml)[names(df_ml) == investigated_outcome] <- "Y" # Baseline covariates including consumption -b.covs <- names(df_ml[,c(1:8,10:17,19,22)]) +b_covs <- names(df_ml[, c(1:8, 10:17, 19, 22)]) # Fixed effects for localities -i.fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE)) +i_fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE)) # Flexible covariates including localities indicators -f.covs <- as.matrix(model.matrix(~ .^2, data=df_ml[b.covs] ))[,-1] -Z.lasso <- as.matrix(cbind(i.fe, f.covs)) +f_covs <- as.matrix(model.matrix(~ .^2, data = df_ml[b_covs]))[, -1] +Zlasso <- as.matrix(cbind(i_fe, f_covs)) -h.fs <- 2*rdrobust(df_ml$Y, df_ml$X, rho=1)$bws[[1]] +h_fs <- 2 * rdrobust(df_ml$Y, df_ml$X, rho = 1)$bws[[1]] -first_stage <- function(){ +first_stage <- function() { # Set up the cross-fitting n <- nrow(df_ml) Kf <- 5 # Number of folds - sampleframe <- rep(1:Kf, ceiling(n/Kf)) - cfgroup <- sample(sampleframe, size=n, replace = FALSE) + sampleframe <- rep(1:Kf, ceiling(n / Kf)) + cfgroup <- sample(sampleframe, size = n, replace = FALSE) # Matrix to store eta predictions - eta.fit <- matrix(NA, n, 5) + eta_fit <- matrix(NA, n, 5) # Create vector of observations to be considered in the first stage model - weights <- (abs(df_ml$X)0 & !fold & weights>0,] - data_control <- df_ml[df_ml$X<0 & !fold & weights>0,] + data_treated <- df_ml[df_ml$X > 0 & !fold & weights > 0, ] + data_control <- df_ml[df_ml$X < 0 & !fold & weights > 0, ] - data_fold <- df_ml[fold,] + data_fold <- df_ml[fold, ] - model <- as.formula(paste("Y~", paste(b.covs, collapse = "+"))) + model <- as.formula(paste("Y~", paste(b_covs, collapse = "+"))) rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000) rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000) - 
eta.fit[fold,1] <- (predict(rf1, data_fold) + predict(rf0, data_fold))/2 - - gbm1 <- gbm(model, data = data_treated, n.trees = 100, interaction.depth = 1, - shrinkage = .1, distribution="gaussian") - gbm0 <- gbm(model, data = data_control, n.trees = 100, interaction.depth = 1, - shrinkage = .1, distribution="gaussian") - eta.fit[fold,2] <- (predict(gbm1, data_fold, n.trees=100) + predict(gbm0, data_fold, n.trees=100))/2 + eta_fit[fold, 1] <- (predict(rf1, data_fold) + predict(rf0, data_fold)) / 2 + + gbm1 <- gbm(model, + data = data_treated, n.trees = 100, interaction.depth = 1, + shrinkage = .1, distribution = "gaussian" + ) + gbm0 <- gbm(model, + data = data_control, n.trees = 100, interaction.depth = 1, + shrinkage = .1, distribution = "gaussian" + ) + eta_fit[fold, 2] <- (predict(gbm1, data_fold, n.trees = 100) + predict(gbm0, data_fold, n.trees = 100)) / 2 lm1 <- lm(model, data = data_treated) lm0 <- lm(model, data = data_control) - eta.fit[fold,3] <- (predict(lm1, data_fold) + predict(lm0, data_fold))/2 + eta_fit[fold, 3] <- (predict(lm1, data_fold) + predict(lm0, data_fold)) / 2 las_base1 <- rlasso(model, data = data_treated) las_base0 <- rlasso(model, data = data_control) - eta.fit[fold,4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold))/2 + eta_fit[fold, 4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold)) / 2 - data_treated_extended = cbind(Z.lasso[rownames(data_treated),], data_treated) - data_control_extended = cbind(Z.lasso[rownames(data_control),], data_control) - data_fold_extended = cbind(Z.lasso[rownames(data_fold),], data_fold) - model_flex <- as.formula(paste("Y~", paste(c(b.covs, colnames(Z.lasso)), collapse = "+"))) + data_treated_extended <- cbind(Zlasso[rownames(data_treated), ], data_treated) + data_control_extended <- cbind(Zlasso[rownames(data_control), ], data_control) + data_fold_extended <- cbind(Zlasso[rownames(data_fold), ], data_fold) + model_flex <- as.formula(paste("Y~", paste(c(b_covs, colnames(Zlasso)), collapse = "+"))) las_flex1 <- rlasso(model_flex, data = data_treated_extended) las_flex0 <- rlasso(model_flex, data = data_control_extended) - eta.fit[fold,5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended))/2 + eta_fit[fold, 5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended)) / 2 } - return(eta.fit) + return(eta_fit) } -eta.fit <- first_stage() +eta_fit <- first_stage() methods <- c("Random Forest", "Gradient Boosting", "Linear Regression", "Lasso Baseline", "Lasso Flexible") -second_stage <- function(eta.fit){ +second_stage <- function(eta_fit) { adj_results <- NULL - for(i in 1:length(methods)){ - M.Y <- df_ml$Y - eta.fit[,i] - rdd_result <- rdrobust(M.Y, df$index, c = cutoff, rho=1) - adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se["Robust",])) + for (i in seq_along(methods)) { + m_y <- df_ml$Y - eta_fit[, i] + rdd_result <- rdrobust(m_y, df$index, c = cutoff, rho = 1) + adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se["Robust", ])) } return(adj_results) } -adj_frame <- as.data.frame(second_stage(eta.fit)) +adj_frame <- as.data.frame(second_stage(eta_fit)) rownames(adj_frame) <- methods colnames(adj_frame) <- c("LATE", "s.e.") print(adj_frame) diff --git a/T/dml-for-conditional-average-treatment-effect.Rmd b/T/dml-for-conditional-average-treatment-effect.Rmd index 484a1a7f..2dad6251 100644 --- a/T/dml-for-conditional-average-treatment-effect.Rmd +++ b/T/dml-for-conditional-average-treatment-effect.Rmd 
@@ -82,19 +82,20 @@ install.packages("foreign") install.packages("quantreg") install.packages("splines") install.packages("lattice") -#install.packages("mnormt") install.packages("Hmisc") install.packages("fda") install.packages("hdm") install.packages("randomForest") install.packages("ranger") install.packages("sandwich") +install.packages("ggplot2") +``` +```{r} library(foreign) library(quantreg) library(splines) library(lattice) -#library(mnormt) library(Hmisc) library(fda) library(hdm) @@ -107,36 +108,35 @@ library(ggplot2) ```{r} ## 401k dataset data(pension) -pension$net_tfa<-pension$net_tfa/10000 +pension$net_tfa <- pension$net_tfa / 10000 ## covariate of interest -- log income -- -pension$inc = log(pension$inc) -#pension$inc[is.na(pension$inc)]<-0 -pension<-pension[!is.na(pension$inc) & pension$inc!=-Inf & pension$inc !=Inf,] +pension$inc <- log(pension$inc) +pension <- pension[!is.na(pension$inc) & pension$inc != -Inf & pension$inc != Inf, ] ## outcome variable -- total net financial assets -Y=pension$net_tfa +Y <- pension$net_tfa ## binary treatment -- indicator of 401(k) eligibility -D=pension$e401 +D <- pension$e401 -X=pension$inc +X <- pension$inc ## target parameter is CATE = E[ Y(1) - Y(0) | X] ## raw covariates so that Y(1) and Y(0) are independent of D given Z -Z = pension[,c("age","inc","fsize","educ","male","db","marr","twoearn","pira","hown","hval","hequity","hmort", - "nohs","hs","smcol")] - - -y_name <- "net_tfa"; -d_name <- "e401"; -form_z <- "(poly(age, 6) + poly(inc, 8) + poly(educ, 4) + poly(fsize,2) + as.factor(marr) + as.factor(twoearn) + as.factor(db) + as.factor(pira) + as.factor(hown))^2"; - - - -cat(sprintf("\n sample size is %g \n", length(Y) )) -cat(sprintf("\n num raw covariates z is %g \n", dim(Z)[2] )) +Z <- pension[, c( + "age", "inc", "fsize", "educ", "male", "db", "marr", "twoearn", "pira", "hown", "hval", "hequity", "hmort", + "nohs", "hs", "smcol" +)] + + +y_name <- "net_tfa" +d_name <- "e401" +form_z <- paste("(poly(age, 6) + poly(inc, 8) + poly(educ, 4) + poly(fsize,2) + as.factor(marr) + ", + "as.factor(twoearn) + as.factor(db) + as.factor(pira) + as.factor(hown))^2") +cat(sprintf("\n sample size is %g \n", length(Y))) +cat(sprintf("\n num raw covariates z is %g \n", dim(Z)[2])) ``` In Step 1, we estimate three functions: @@ -152,109 +152,105 @@ For each function, we try random forest. 
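These three first-stage estimates are combined into the orthogonal (doubly robust) signal that the second stage then projects on income groups and on spline or polynomial bases of income. Written out, the signal (call it $\widehat{Y}^{DR}$) is

$$
\widehat{Y}^{DR} = \hat{\mu}_0(1,Z) - \hat{\mu}_0(0,Z) + \frac{D\,\big(Y - \hat{\mu}_0(1,Z)\big)}{\hat{s}_0(Z)} - \frac{(1-D)\,\big(Y - \hat{\mu}_0(0,Z)\big)}{1-\hat{s}_0(Z)},
$$

which matches what the `RobustSignal` variable constructed further below computes.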
First Stage: estimate $\mu_0(1,z)$ and $\mu_0(0,z)$ and $s_0(z)$ by lasso ```{r} - -first_stage_lasso<-function(data,d_name,y_name, form_z, seed=1) { - +first_stage_lasso <- function(data, d_name, y_name, form_z, seed = 1) { # Sample size - N<-dim(data)[1] + N <- dim(data)[1] # Estimated regression function in control group - mu0.hat<-rep(1,N) + mu0_hat <- rep(1, N) # Estimated regression function in treated group - mu1.hat<-rep(1,N) + mu1_hat <- rep(1, N) # Propensity score - s.hat<-rep(1,N) - seed=1 + s_hat <- rep(1, N) + seed <- 1 ## define sample splitting set.seed(seed) - inds.train=sample(1:N,floor(N/2)) - inds.eval=setdiff(1:N,inds.train) + inds_train <- sample(1:N, floor(N / 2)) + inds_eval <- setdiff(1:N, inds_train) - print ("Estimate treatment probability, first half") + print("Estimate treatment probability, first half") ## conditional probability of 401 k eligibility (i.e., propensity score) based on random forest - fitted.lasso.pscore<-rlassologit(as.formula(paste0(d_name,"~",form_z )),data=data[inds.train,]) - - s.hat[inds.eval]<-predict(fitted.lasso.pscore,data[inds.eval,],type="response") - print ("Estimate treatment probability, second half") - fitted.lasso.pscore<-rlassologit(as.formula(paste0(d_name,"~",form_z )),data=data[inds.eval,]) - s.hat[inds.train]<-predict( fitted.lasso.pscore,data[inds.train,],type="response") - - + fitted_lasso_pscore <- rlassologit(as.formula(paste0(d_name, "~", form_z)), data = data[inds_train, ]) + s_hat[inds_eval] <- predict(fitted_lasso_pscore, data[inds_eval, ], type = "response") + print("Estimate treatment probability, second half") + fitted_lasso_pscore <- rlassologit(as.formula(paste0(d_name, "~", form_z)), data = data[inds_eval, ]) + s_hat[inds_train] <- predict(fitted_lasso_pscore, data[inds_train, ], type = "response") - data1<-data - data1[,d_name]<-1 + data1 <- data + data1[, d_name] <- 1 - data0<-data - data0[,d_name]<-0 + data0 <- data + data0[, d_name] <- 0 - print ("Estimate expectation function, first half") - fitted.lasso.mu<-rlasso(as.formula(paste0(y_name,"~",d_name,"+(",form_z,")" )),data=data[inds.train,]) - mu1.hat[inds.eval]<-predict( fitted.lasso.mu,data1[inds.eval,]) - mu0.hat[inds.eval]<-predict( fitted.lasso.mu,data0[inds.eval,]) + print("Estimate expectation function, first half") + fitted_lasso_mu <- rlasso(as.formula(paste0(y_name, "~", d_name, "+(", form_z, ")")), data = data[inds_train, ]) + mu1_hat[inds_eval] <- predict(fitted_lasso_mu, data1[inds_eval, ]) + mu0_hat[inds_eval] <- predict(fitted_lasso_mu, data0[inds_eval, ]) - print ("Estimate expectation function, second half") - fitted.lasso.mu<-rlasso(as.formula(paste0(y_name,"~",d_name,"+(",form_z,")" )),data=data[inds.eval,]) - mu1.hat[inds.train]<-predict( fitted.lasso.mu,data1[inds.train,]) - mu0.hat[inds.train]<-predict( fitted.lasso.mu,data0[inds.train,]) - - return (list(mu1.hat=mu1.hat, - mu0.hat=mu0.hat, - s.hat=s.hat)) + print("Estimate expectation function, second half") + fitted_lasso_mu <- rlasso(as.formula(paste0(y_name, "~", d_name, "+(", form_z, ")")), data = data[inds_eval, ]) + mu1_hat[inds_train] <- predict(fitted_lasso_mu, data1[inds_train, ]) + mu0_hat[inds_train] <- predict(fitted_lasso_mu, data0[inds_train, ]) + return(list( + mu1_hat = mu1_hat, + mu0_hat = mu0_hat, + s_hat = s_hat + )) } ``` First Stage: estimate $\mu_0(1,z)$ and $\mu_0(0,z)$ and $s_0(z)$ by random forest ```{r} -first_stage_rf<-function(Y,D,Z,seed=1) { - +first_stage_rf <- function(Y, D, Z, seed = 1) { # Sample size - N<-length(D) + N <- length(D) # Estimated 
regression function in control group - mu0.hat<-rep(1,N) + mu0_hat <- rep(1, N) # Estimated regression function in treated group - mu1.hat<-rep(1,N) + mu1_hat <- rep(1, N) # Propensity score - s.hat<-rep(1,N) + s_hat <- rep(1, N) ## define sample splitting - set.seed(seed) - inds.train=sample(1:N,floor(N/2)) - inds.eval=setdiff(1:N,inds.train) + set.seed(seed) + inds_train <- sample(1:N, floor(N / 2)) + inds_eval <- setdiff(1:N, inds_train) - print ("Estimate treatment probability, first half") + print("Estimate treatment probability, first half") ## conditional probability of 401 k eligibility (i.e., propensity score) based on random forest - D.f<-as.factor(as.character(D)) - fitted.rf.pscore<-randomForest(Z,D.f,subset=inds.train) - s.hat[inds.eval]<-predict(fitted.rf.pscore,Z[inds.eval,],type="prob")[,2] - print ("Estimate treatment probability, second half") - fitted.rf<-randomForest(Z,D.f,subset=inds.eval) - s.hat[inds.train]<-predict(fitted.rf.pscore,Z[inds.train,],type="prob")[,2] + Df <- as.factor(as.character(D)) + fitted_rf_pscore <- randomForest(Z, Df, subset = inds_train) + s_hat[inds_eval] <- predict(fitted_rf_pscore, Z[inds_eval, ], type = "prob")[, 2] + print("Estimate treatment probability, second half") + fitted_rf <- randomForest(Z, Df, subset = inds_eval) + s_hat[inds_train] <- predict(fitted_rf_pscore, Z[inds_train, ], type = "prob")[, 2] ## conditional expected net financial assets (i.e., regression function) based on random forest - covariates<-cbind(Z,D) + covariates <- cbind(Z, D) - covariates1<-cbind(Z,D=rep(1,N)) - covariates0<-cbind(Z,D=rep(0,N)) + covariates1 <- cbind(Z, D = rep(1, N)) + covariates0 <- cbind(Z, D = rep(0, N)) - print ("Estimate expectation function, first half") - fitted.rf.mu<-randomForest(cbind(Z,D),Y,subset=inds.train) - mu1.hat[inds.eval]<-predict( fitted.rf.mu,covariates1[inds.eval,]) - mu0.hat[inds.eval]<-predict( fitted.rf.mu,covariates0[inds.eval,]) + print("Estimate expectation function, first half") + fitted_rf_mu <- randomForest(cbind(Z, D), Y, subset = inds_train) + mu1_hat[inds_eval] <- predict(fitted_rf_mu, covariates1[inds_eval, ]) + mu0_hat[inds_eval] <- predict(fitted_rf_mu, covariates0[inds_eval, ]) - print ("Estimate expectation function, second half") - fitted.rf.mu<-randomForest(cbind(Z,D),Y,subset=inds.eval) - mu1.hat[inds.train]<-predict( fitted.rf.mu,covariates1[inds.train,]) - mu0.hat[inds.train]<-predict( fitted.rf.mu,covariates0[inds.train,]) - - return (list(mu1.hat=mu1.hat, - mu0.hat=mu0.hat, - s.hat=s.hat)) + print("Estimate expectation function, second half") + fitted_rf_mu <- randomForest(cbind(Z, D), Y, subset = inds_eval) + mu1_hat[inds_train] <- predict(fitted_rf_mu, covariates1[inds_train, ]) + mu0_hat[inds_train] <- predict(fitted_rf_mu, covariates0[inds_train, ]) + return(list( + mu1_hat = mu1_hat, + mu0_hat = mu0_hat, + s_hat = s_hat + )) } ``` @@ -291,310 +287,298 @@ Then, the Best Linear Predictor $\beta_0$ vector shows the average treatment eff ```{r} ## estimate first stage functions by random forest ## may take a while -fs.hat.rf = first_stage_rf(Y,D,Z) +fs_hat_rf <- first_stage_rf(Y, D, Z) ``` ```{r} -X=pension$inc -fs.hat<-fs.hat.rf -min_cutoff=0.01 +X <- pension$inc +fs_hat <- fs_hat_rf +min_cutoff <- 0.01 # regression function -mu1.hat<-fs.hat[["mu1.hat"]] -mu0.hat<-fs.hat[["mu0.hat"]] +mu1_hat <- fs_hat[["mu1_hat"]] +mu0_hat <- fs_hat[["mu0_hat"]] # propensity score -s.hat<-fs.hat[["s.hat"]] -s.hat<-sapply(s.hat,max,min_cutoff) -### Construct Orthogonal Signal +s_hat <- fs_hat[["s_hat"]] +s_hat <- 
sapply(s_hat, max, min_cutoff) - -RobustSignal<-(Y - mu1.hat)*D/s.hat - (Y - mu0.hat)*(1-D)/(1-s.hat) + mu1.hat - mu0.hat +### Construct Orthogonal Signal +RobustSignal <- (Y - mu1_hat) * D / s_hat - (Y - mu0_hat) * (1 - D) / (1 - s_hat) + mu1_hat - mu0_hat ``` ```{r} -qtmax <- function(C, S=10000, alpha) - {; - p <- nrow(C); - tmaxs <- apply(abs(matrix(rnorm(p*S), nrow = p, ncol = S)), 2, max); - return(quantile(tmaxs, 1-alpha)); - }; +qtmax <- function(C, S = 10000, alpha) { + p <- nrow(C) + tmaxs <- apply(abs(matrix(rnorm(p * S), nrow = p, ncol = S)), 2, max) + return(quantile(tmaxs, 1 - alpha)) +} # This function computes the square root of a symmetric matrix using the spectral decomposition; +group_average_treatment_effect <- function(X, Y, max_grid = 5, alpha = 0.05, B = 10000) { + grid <- quantile(X, probs = c((0:max_grid) / max_grid)) + Xraw <- matrix(NA, nrow = length(Y), ncol = length(grid) - 1) -group_average_treatment_effect<-function(X,Y,max_grid=5,alpha=0.05, B=10000) { - - grid<-quantile(X,probs=c((0:max_grid)/max_grid)) - X.raw<-matrix(NA, nrow=length(Y),ncol=length(grid)-1) - - for (k in 2:((length(grid)))) { - X.raw[,k-1]<-sapply(X, function (x) ifelse (x>=grid[k-1] & x=grid[k-1] & x<=grid[k],1,0) ) + for (k in 2:((length(grid)))) { + Xraw[, k - 1] <- sapply(X, function(x) ifelse(x >= grid[k - 1] & x < grid[k], 1, 0)) + } + k <- length(grid) + Xraw[, k - 1] <- sapply(X, function(x) ifelse(x >= grid[k - 1] & x <= grid[k], 1, 0)) - ols.fit<- lm(Y~X.raw-1) - coefs <- coef(ols.fit) - vars <- names(coefs) - HCV.coefs <- vcovHC(ols.fit, type = 'HC') - coefs.se <- sqrt(diag(HCV.coefs)) # White std errors - ## this is an identity matrix - ## qtmax is simplified - C.coefs <- (diag(1/sqrt(diag(HCV.coefs)))) %*% HCV.coefs %*% (diag(1/sqrt(diag(HCV.coefs)))); + ols_fit <- lm(Y ~ Xraw - 1) + coefs <- coef(ols_fit) + vars <- names(coefs) + hcv_coefs <- vcovHC(ols_fit, type = "HC") + coefs_se <- sqrt(diag(hcv_coefs)) # White std errors + ## this is an identity matrix + ## qtmax is simplified + c_coefs <- (diag(1 / sqrt(diag(hcv_coefs)))) %*% hcv_coefs %*% (diag(1 / sqrt(diag(hcv_coefs)))) - tes <- coefs - tes.se <- coefs.se - tes.cor <- C.coefs - crit.val <- qtmax(tes.cor,B,alpha); + tes <- coefs + tes_se <- coefs_se + tes_cor <- c_coefs + crit_val <- qtmax(tes_cor, B, alpha) - tes.ucb <- tes + crit.val * tes.se; - tes.lcb <- tes - crit.val * tes.se; + tes_ucb <- tes + crit_val * tes_se + tes_lcb <- tes - crit_val * tes_se - tes.uci <- tes + qnorm(1-alpha/2) * tes.se; - tes.lci <- tes + qnorm(alpha/2) * tes.se; + tes_uci <- tes + qnorm(1 - alpha / 2) * tes_se + tes_lci <- tes + qnorm(alpha / 2) * tes_se - return(list(beta.hat=coefs, ghat.lower.point=tes.lci, ghat.upper.point=tes.uci, - ghat.lower=tes.lcb, ghat.upper= tes.ucb, crit.val=crit.val )) + return(list( + beta_hat = coefs, ghat_lower_point = tes_lci, ghat_upper_point = tes_uci, + ghat_lower = tes_lcb, ghat_upper = tes_ucb, crit_val = crit_val + )) } ``` ```{r} -res<-group_average_treatment_effect(X=X,Y=RobustSignal) +res <- group_average_treatment_effect(X = X, Y = RobustSignal) ``` ```{r} ## this code is taken from L1 14.382 taught at MIT ## author: Mert Demirer -options(repr.plot.width=10, repr.plot.height=8) - -tes<-res$beta.hat -tes.lci<-res$ghat.lower.point -tes.uci<-res$ghat.upper.point - -tes.lcb<-res$ghat.lower -tes.ucb<-res$ghat.upper -tes.lev<-c('0%-20%', '20%-40%','40%-60%','60%-80%','80%-100%') - -plot( c(1,5), las = 2, xlim =c(0.6, 5.4), ylim = c(.05, 2.09), type="n",xlab="Income group", - ylab="Average Effect on 
NET TFA (per 10 K)", main="Group Average Treatment Effects on NET TFA", xaxt="n"); -axis(1, at=1:5, labels=tes.lev); -for (i in 1:5) -{; - rect(i-0.2, tes.lci[i], i+0.2, tes.uci[i], col = NA, border = "red", lwd = 3); - rect(i-0.2, tes.lcb[i], i+0.2, tes.ucb[i], col = NA, border = 4, lwd = 3 ); - segments(i-0.2, tes[i], i+0.2, tes[i], lwd = 5 ); -}; -abline(h=0); +options(repr.plot.width = 10, repr.plot.height = 8) + +tes <- res$beta_hat +tes_lci <- res$ghat_lower_point +tes_uci <- res$ghat_upper_point + +tes_lcb <- res$ghat_lower +tes_ucb <- res$ghat_upper +tes_lev <- c("0%-20%", "20%-40%", "40%-60%", "60%-80%", "80%-100%") + +plot(c(1, 5), las = 2, xlim = c(0.6, 5.4), ylim = c(.05, 2.09), type = "n", xlab = "Income group", + ylab = "Average Effect on NET TFA (per 10 K)", + main = "Group Average Treatment Effects on NET TFA", + xaxt = "n") +axis(1, at = 1:5, labels = tes_lev) +for (i in 1:5) { + rect(i - 0.2, tes_lci[i], i + 0.2, tes_uci[i], col = NA, border = "red", lwd = 3) + rect(i - 0.2, tes_lcb[i], i + 0.2, tes_ucb[i], col = NA, border = 4, lwd = 3) + segments(i - 0.2, tes[i], i + 0.2, tes[i], lwd = 5) +} +abline(h = 0) -legend(2.5, 2.0, c('Regression Estimate', '95% Simultaneous Confidence Interval', '95% Pointwise Confidence Interval'), col = c(1,4,2), lwd = c(4,3,3), horiz = F, bty = 'n', cex=0.8); +legend(2.5, 2.0, + c("Regression Estimate", "95% Simultaneous Confidence Interval", "95% Pointwise Confidence Interval"), + col = c(1, 4, 2), lwd = c(4, 3, 3), horiz = FALSE, bty = "n", cex = 0.8) dev.off() ``` ```{r} -least_squares_splines<-function(X,Y,max_knot=9,norder,nderiv,...) { +least_squares_splines <- function(X, Y, max_knot = 9, norder, nderiv, ...) { ## Create technical regressors - cv.bsp<-rep(0,max_knot-1) - for (knot in 2:max_knot) { - breaks<- quantile(X, c(0:knot)/knot) - formula.bsp <- Y ~ bsplineS(X, breaks =breaks, norder = norder, nderiv = nderiv)[ ,-1] - fit <- lm(formula.bsp); - cv.bsp[knot-1] <- sum( (fit$res / (1 - hatvalues(fit)) )^2); - } - ## Number of knots chosen by cross-validation - cv_knot<-which.min(cv.bsp)+1 - breaks<- quantile(X, c(0:cv_knot)/cv_knot) - formula.bsp <- Y ~ bsplineS(X, breaks =breaks, norder = norder, nderiv = 0)[ ,-1] - fit <- lm(formula.bsp); - - - return(list(cv_knot=cv_knot,fit=fit)) -} + cv_bsp <- rep(0, max_knot - 1) + for (knot in 2:max_knot) { + breaks <- quantile(X, c(0:knot) / knot) + formula.bsp <- Y ~ bsplineS(X, breaks = breaks, norder = norder, nderiv = nderiv)[, -1] + fit <- lm(formula.bsp) + cv_bsp[knot - 1] <- sum((fit$res / (1 - hatvalues(fit)))^2) + } + ## Number of knots chosen by cross-validation + cv_knot <- which.min(cv_bsp) + 1 + breaks <- quantile(X, c(0:cv_knot) / cv_knot) + formula.bsp <- Y ~ bsplineS(X, breaks = breaks, norder = norder, nderiv = 0)[, -1] + fit <- lm(formula.bsp) + return(list(cv_knot = cv_knot, fit = fit)) +} -least_squares_series<-function(X, Y,max_degree,...) { - cv.pol<-rep(0,max_degree) +least_squares_series <- function(X, Y, max_degree, ...) 
{ + cv_pol <- rep(0, max_degree) for (degree in 1:max_degree) { - formula.pol <- Y ~ poly(X, degree) - fit <- lm(formula.pol ); - cv.pol[degree] <- sum( (fit$res / (1 - hatvalues(fit)) )^2); + formula.pol <- Y ~ poly(X, degree) + fit <- lm(formula.pol) + cv_pol[degree] <- sum((fit$res / (1 - hatvalues(fit)))^2) } ## Number of knots chosen by cross-validation - cv_degree<-which.min(cv.pol) + cv_degree <- which.min(cv_pol) ## Estimate coefficients - formula.pol <- Y ~ poly(X, cv_degree) - fit <- lm(formula.pol); - + formula.pol <- Y ~ poly(X, cv_degree) + fit <- lm(formula.pol) - return(list(fit=fit,cv_degree=cv_degree)) + return(list(fit = fit, cv_degree = cv_degree)) } ``` ```{r} -msqrt <- function(C) - {; - C.eig <- eigen(C); - return(C.eig$vectors %*% diag(sqrt(C.eig$values)) %*% solve(C.eig$vectors)); - }; - - -tboot<-function(regressors_grid, Omega.hat ,alpha, B=10000) { - - - numerator_grid<-regressors_grid%*%msqrt( Omega.hat) - denominator_grid<-sqrt(diag(regressors_grid%*% Omega.hat%*%t(regressors_grid))) +msqrt <- function(C) { + Ceig <- eigen(C) + return(Ceig$vectors %*% diag(sqrt(Ceig$values)) %*% solve(Ceig$vectors)) +} - norm_numerator_grid<-numerator_grid - for (k in 1:dim(numerator_grid)[1]) { - norm_numerator_grid[k,]<-numerator_grid[k,]/denominator_grid[k] - } +tboot <- function(regressors_grid, omega_hat, alpha, B = 10000) { + numerator_grid <- regressors_grid %*% msqrt(omega_hat) + denominator_grid <- sqrt(diag(regressors_grid %*% omega_hat %*% t(regressors_grid))) - tmaxs <- apply(abs( norm_numerator_grid%*% matrix(rnorm(dim(numerator_grid)[2]*B), nrow = dim(numerator_grid)[2], ncol = B)), 2, max) - return(quantile(tmaxs, 1-alpha)) + norm_numerator_grid <- numerator_grid + for (k in seq_len(dim(numerator_grid)[1])) { + norm_numerator_grid[k, ] <- numerator_grid[k, ] / denominator_grid[k] + } + tmaxs <- apply(abs(norm_numerator_grid %*% matrix(rnorm(dim(numerator_grid)[2] * B), + nrow = dim(numerator_grid)[2], ncol = B)), 2, max) + return(quantile(tmaxs, 1 - alpha)) } ``` ```{r} -second_stage<-function(fs.hat,Y,D,X,max_degree=3,norder=4,nderiv=0,ss_method="poly",min_cutoff=0.01,alpha=0.05,eps=0.1,...) { - - X_grid = seq(min(X),max(X),eps) - mu1.hat<-fs.hat[["mu1.hat"]] - mu0.hat<-fs.hat[["mu0.hat"]] - # propensity score - s.hat<-fs.hat[["s.hat"]] - s.hat<-sapply(s.hat,max,min_cutoff) - ### Construct Orthogonal Signal - - RobustSignal<-(Y - mu1.hat)*D/s.hat - (Y - mu0.hat)*(1-D)/(1-s.hat) + mu1.hat - mu0.hat - - - - # Estimate the target function using least squares series - if (ss_method == "ortho_poly") { - res<-least_squares_series(X=X,Y=RobustSignal,eps=0.1,max_degree=max_degree) - fit<-res$fit - cv_degree<-res$cv_degree - regressors_grid<-cbind( rep(1,length(X_grid)), poly(X_grid,cv_degree)) - +second_stage <- function(fs_hat, Y, D, X, max_degree = 3, norder = 4, nderiv = 0, + ss_method = "poly", min_cutoff = 0.01, alpha = 0.05, eps = 0.1, ...) 
{ + x_grid <- seq(min(X), max(X), eps) + mu1_hat <- fs_hat[["mu1_hat"]] + mu0_hat <- fs_hat[["mu0_hat"]] + # propensity score + s_hat <- fs_hat[["s_hat"]] + s_hat <- sapply(s_hat, max, min_cutoff) + ### Construct Orthogonal Signal + + RobustSignal <- (Y - mu1_hat) * D / s_hat - (Y - mu0_hat) * (1 - D) / (1 - s_hat) + mu1_hat - mu0_hat + + # Estimate the target function using least squares series + if (ss_method == "ortho_poly") { + res <- least_squares_series(X = X, Y = RobustSignal, eps = 0.1, max_degree = max_degree) + fit <- res$fit + cv_degree <- res$cv_degree + regressors_grid <- cbind(rep(1, length(x_grid)), poly(x_grid, cv_degree)) } if (ss_method == "splines") { - - res<-least_squares_splines(X=X,Y=RobustSignal,eps=0.1,norder=norder,nderiv=nderiv) - fit<-res$fit - cv_knot<-res$cv_knot - breaks<- quantile(X, c(0:cv_knot)/cv_knot) - regressors_grid<-cbind( rep(1,length(X_grid)), bsplineS(X_grid, breaks =breaks, norder = norder, nderiv = nderiv)[ ,-1]) - degree=cv_knot - - + res <- least_squares_splines(X = X, Y = RobustSignal, eps = 0.1, norder = norder, nderiv = nderiv) + fit <- res$fit + cv_knot <- res$cv_knot + breaks <- quantile(X, c(0:cv_knot) / cv_knot) + regressors_grid <- cbind(rep(1, length(x_grid)), + bsplineS(x_grid, breaks = breaks, norder = norder, nderiv = nderiv)[, -1]) + degree <- cv_knot } + g_hat <- regressors_grid %*% coef(fit) - g.hat<-regressors_grid%*%coef(fit) + hcv_coefs <- vcovHC(fit, type = "HC") + standard_error <- sqrt(diag(regressors_grid %*% hcv_coefs %*% t(regressors_grid))) - - HCV.coefs <- vcovHC(fit, type = 'HC') - #Omega.hat<-white_vcov(regressors,Y,b.hat=coef(fit)) - standard_error<-sqrt(diag(regressors_grid%*% HCV.coefs%*%t(regressors_grid))) ### Lower Pointwise CI - ghat.lower.point<-g.hat+qnorm(alpha/2)*standard_error + ghat_lower_point <- g_hat + qnorm(alpha / 2) * standard_error ### Upper Pointwise CI - ghat.upper.point<-g.hat+qnorm(1-alpha/2)*standard_error - - max_tstat<-tboot(regressors_grid=regressors_grid, Omega.hat=HCV.coefs,alpha=alpha) + ghat_upper_point <- g_hat + qnorm(1 - alpha / 2) * standard_error + max_tstat <- tboot(regressors_grid = regressors_grid, omega_hat = hcv_coefs, alpha = alpha) ## Lower Uniform CI - ghat.lower<-g.hat-max_tstat*standard_error + ghat_lower <- g_hat - max_tstat * standard_error ## Upper Uniform CI - ghat.upper<-g.hat+max_tstat*standard_error - return(list(ghat.lower=ghat.lower,g.hat=g.hat, ghat.upper=ghat.upper,fit=fit,ghat.lower.point=ghat.lower.point, - ghat.upper.point=ghat.upper.point,X_grid=X_grid)) - - - + ghat_upper <- g_hat + max_tstat * standard_error + return(list( + ghat_lower = ghat_lower, g_hat = g_hat, ghat_upper = ghat_upper, fit = fit, + ghat_lower_point = ghat_lower_point, ghat_upper_point = ghat_upper_point, x_grid = x_grid + )) } ``` ```{r} -make_plot<-function(res,lowy,highy,degree,ss_method = "series",uniform=TRUE,...) { - - - title=paste0("Effect of 401(k) on Net TFA, ", ss_method) - X_grid=res$X_grid - len = length(X_grid) +make_plot <- function(res, lowy, highy, degree, ss_method = "series", uniform = TRUE, ...) 
{ + title <- paste0("Effect of 401(k) on Net TFA, ", ss_method) + x_grid <- res$x_grid + len <- length(x_grid) if (uniform) { - group <-c(rep("UCI",len), rep("PCI",len), rep("Estimate",len),rep("PCIL",len),rep("UCIL",len)) - group_type<- c(rep("CI",len), rep("CI",len), rep("Estimate",len),rep("CI",len),rep("CI",len)) - group_ci_type<-c(rep("Uniform",len), rep("Point",len), rep("Uniform",len),rep("Point",len),rep("Uniform",len)) - - df<-data.frame(income=rep(X_grid,5), outcome = c(res$ghat.lower,res$ghat.lower.point,res$g.hat,res$ghat.upper.point,res$ghat.upper),group=group, group_col = group_type,group_line =group_ci_type ) - p<-ggplot(data=df)+ - aes(x=exp(income),y=outcome,colour=group )+ - theme_bw()+ - xlab("Income")+ - ylab("Net TFA, (thousand dollars)")+ - scale_colour_manual(values=c("black","blue","blue","blue","blue"))+ - theme(plot.title = element_text(hjust = 0.5),text=element_text(size=20, family="serif"))+ - theme(legend.title=element_blank())+ - theme(legend.position="none")+ - ylim(low=lowy,high=highy)+ - geom_line(aes(linetype = group_line),size=1.5)+ - scale_linetype_manual(values=c("dashed","solid"))+ - ggtitle(title) + group <- c(rep("UCI", len), rep("PCI", len), rep("Estimate", len), rep("PCIL", len), rep("UCIL", len)) + group_type <- c(rep("CI", len), rep("CI", len), rep("Estimate", len), rep("CI", len), rep("CI", len)) + group_ci_type <- c(rep("Uniform", len), rep("Point", len), + rep("Uniform", len), rep("Point", len), rep("Uniform", len)) + + df <- data.frame(income = rep(x_grid, 5), + outcome = c(res$ghat_lower, res$ghat_lower_point, + res$g_hat, res$ghat_upper_point, res$ghat_upper), + group = group, group_col = group_type, group_line = group_ci_type) + p <- ggplot(data = df) + + aes(x = exp(income), y = outcome, colour = group) + + theme_bw() + + xlab("Income") + + ylab("Net TFA, (thousand dollars)") + + scale_colour_manual(values = c("black", "blue", "blue", "blue", "blue")) + + theme(plot.title = element_text(hjust = 0.5), text = element_text(size = 20, family = "serif")) + + theme(legend.title = element_blank()) + + theme(legend.position = "none") + + ylim(low = lowy, high = highy) + + geom_line(aes(linetype = group_line), size = 1.5) + + scale_linetype_manual(values = c("dashed", "solid")) + + ggtitle(title) } if (!uniform) { - group <-c( rep("PCI",len), rep("Estimate",len),rep("PCIL",len)) - group_type<- c(rep("CI",len), rep("Estimate",len),rep("CI",len)) - group_ci_type<-c(rep("Point",len), rep("Uniform",len),rep("Point",len)) - - df<-data.frame(income=rep(X_grid,3), outcome = c(res$ghat.lower.point,res$g.hat,res$ghat.upper.point),group=group, group_col = group_type,group_line =group_ci_type ) - - p<-ggplot(data=df)+ - aes(x=exp(income),y=outcome,colour=group )+ - theme_bw()+ - xlab("Income")+ - ylab("Net TFA, (thousand dollars)")+ - scale_colour_manual(values=c("black","blue","blue","blue","blue"))+ - theme(plot.title = element_text(hjust = 0.5),text=element_text(size=20, family="serif"))+ - theme(legend.title=element_blank())+ - theme(legend.position="none")+ - ylim(low=lowy,high=highy)+ - geom_line(aes(linetype = group_line),size=1.5)+ - scale_linetype_manual(values=c("dashed","solid"))+ - ggtitle(title) - - } - - + group <- c(rep("PCI", len), rep("Estimate", len), rep("PCIL", len)) + group_type <- c(rep("CI", len), rep("Estimate", len), rep("CI", len)) + group_ci_type <- c(rep("Point", len), rep("Uniform", len), rep("Point", len)) + + df <- data.frame(income = rep(x_grid, 3), + outcome = c(res$ghat_lower_point, res$g_hat, res$ghat_upper_point), + group 
= group, group_col = group_type, group_line = group_ci_type) + + p <- ggplot(data = df) + + aes(x = exp(income), y = outcome, colour = group) + + theme_bw() + + xlab("Income") + + ylab("Net TFA, (thousand dollars)") + + scale_colour_manual(values = c("black", "blue", "blue", "blue", "blue")) + + theme(plot.title = element_text(hjust = 0.5), text = element_text(size = 20, family = "serif")) + + theme(legend.title = element_blank()) + + theme(legend.position = "none") + + ylim(low = lowy, high = highy) + + geom_line(aes(linetype = group_line), size = 1.5) + + scale_linetype_manual(values = c("dashed", "solid")) + + ggtitle(title) + } return(p) } ``` ```{r} -res_ortho_rf_splines=second_stage(fs.hat=fs.hat.rf,X=X,D=D,Y=Y,ss_method="splines",max_degree=3) +res_ortho_rf_splines <- second_stage(fs_hat = fs_hat_rf, X = X, D = D, Y = Y, + ss_method = "splines", max_degree = 3) ``` ```{r} -res_ortho_rf_ortho_poly=second_stage(fs.hat=fs.hat.rf,X=X,D=D,Y=Y,ss_method="ortho_poly",max_degree=3) +res_ortho_rf_ortho_poly <- second_stage(fs_hat = fs_hat_rf, X = X, D = D, Y = Y, + ss_method = "ortho_poly", max_degree = 3) ``` -#plot findings: +plot findings -- black solid line shows estimated function $p(x)' \widehat{\beta}$ -- blue dashed lines show pointwise confidence bands for this function ```{r} -p<-make_plot(res_ortho_rf_ortho_poly,ss_method="ortho_poly",uniform=FALSE, lowy=-10,highy=20) -options(repr.plot.width=15, repr.plot.height=10) +p <- make_plot(res_ortho_rf_ortho_poly, ss_method = "ortho_poly", uniform = FALSE, lowy = -10, highy = 20) +options(repr.plot.width = 15, repr.plot.height = 10) print(p) ``` @@ -607,20 +591,20 @@ plot findings: -- blue solid lines show uniform confidence bands for this function. I.e., they cover the whole function $x \rightarrow p(x)'\beta_0$ with probability 0.95 on some compact range ```{r} -p<-make_plot(res_ortho_rf_ortho_poly,ss_method="ortho polynomials",uniform=TRUE,lowy=-10,highy=25) -options(repr.plot.width=15, repr.plot.height=10) +p <- make_plot(res_ortho_rf_ortho_poly, ss_method = "ortho polynomials", uniform = TRUE, lowy = -10, highy = 25) +options(repr.plot.width = 15, repr.plot.height = 10) print(p) ``` ```{r} -p<-make_plot(res_ortho_rf_splines,ss_method="splines",uniform=FALSE, lowy=-15,highy=10) -options(repr.plot.width=15, repr.plot.height=10) +p <- make_plot(res_ortho_rf_splines, ss_method = "splines", uniform = FALSE, lowy = -15, highy = 10) +options(repr.plot.width = 15, repr.plot.height = 10) print(p) ``` ```{r} -p<-make_plot(res_ortho_rf_splines,ss_method="splines",uniform=TRUE,lowy=-20,highy=20) -options(repr.plot.width=15, repr.plot.height=10) +p <- make_plot(res_ortho_rf_splines, ss_method = "splines", uniform = TRUE, lowy = -20, highy = 20) +options(repr.plot.width = 15, repr.plot.height = 10) print(p) ``` From 8930a1738c283030cab79933e07cc78debc5556a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 19 Jul 2024 12:12:23 +0000 Subject: [PATCH 157/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in AC2 --- ...iased-ml-for-partially-linear-iv-model.Rmd | 159 ++- AC2/r-dml-401k-IV.Rmd | 1236 ++++++++++------- AC2/r-dml-401k-IV.irnb | 174 +-- AC2/r-weak-iv-experiments.Rmd | 57 +- 4 files changed, 900 insertions(+), 726 deletions(-) diff --git a/AC2/r-debiased-ml-for-partially-linear-iv-model.Rmd b/AC2/r-debiased-ml-for-partially-linear-iv-model.Rmd index 8bace899..ce73b72a 100644 --- a/AC2/r-debiased-ml-for-partially-linear-iv-model.Rmd +++ b/AC2/r-debiased-ml-for-partially-linear-iv-model.Rmd 
@@ -20,10 +20,10 @@ The code is based on the book. # Partially Linear IV Model We consider the partially linear structural equation model: -\begin{eqnarray} - & Y - D\theta_0 = g_0(X) + \zeta, & E[\zeta \mid Z,X]= 0,\\ - & Z = m_0(X) + V, & E[V \mid X] = 0. -\end{eqnarray} +\begin{align} + Y :=~& D\theta_0 + g_0(X) + \zeta, & E[\zeta \mid Z,X]= 0,\\ + Z :=~& m_0(X) + V, & E[V \mid X] = 0. +\end{align} Note that this model is not a regression model unless $Z=D$. The model is a canonical @@ -104,12 +104,14 @@ install.packages("AER") install.packages("randomForest") install.packages("lfe") install.packages("glmnet") +``` +```{r} library(hdm) -library(AER) #applied econometrics library -library(randomForest) #random Forest library -library(lfe) #high-dimensional econometrics library -library(glmnet) #glm net +library(AER) # applied econometrics library +library(randomForest) # random Forest library +library(lfe) # high-dimensional econometrics library +library(glmnet) # glm net set.seed(1) ``` @@ -117,35 +119,34 @@ set.seed(1) ```{r} # DML for PLIVM -DML2.for.PLIVM <- function(x, d, z, y, dreg, yreg, zreg, nfold=5) { +dml2_for_plivm <- function(x, d, z, y, dreg, yreg, zreg, nfold = 5) { # this implements DML2 algorithm, where there moments are estimated via DML, before constructing # the pooled estimate of theta randomly split data into folds nobs <- nrow(x) - foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] + foldid <- rep.int(1:nfold, times = ceiling(nobs / nfold))[sample.int(nobs)] I <- split(1:nobs, foldid) # create residualized objects to fill - ytil <- dtil <- ztil<- rep(NA, nobs) + ytil <- dtil <- ztil <- rep(NA, nobs) # obtain cross-fitted residuals cat("fold: ") - for(b in 1:length(I)){ - dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out - zfit <- zreg(x[-I[[b]],], z[-I[[b]]]) #take a fold out - yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a folot out - dhat <- predict(dfit, x[I[[b]],], type="response") #predict the fold out - zhat <- predict(zfit, x[I[[b]],], type="response") #predict the fold out - yhat <- predict(yfit, x[I[[b]],], type="response") #predict the fold out - dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual - ztil[I[[b]]] <- (z[I[[b]]] - zhat) #record residual - ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial - cat(b," ") + for (b in seq_along(I)) { + dfit <- dreg(x[-I[[b]], ], d[-I[[b]]]) # take a fold out + zfit <- zreg(x[-I[[b]], ], z[-I[[b]]]) # take a fold out + yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a folot out + dhat <- predict(dfit, x[I[[b]], ], type = "response") # predict the fold out + zhat <- predict(zfit, x[I[[b]], ], type = "response") # predict the fold out + yhat <- predict(yfit, x[I[[b]], ], type = "response") # predict the fold out + dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual + ztil[I[[b]]] <- (z[I[[b]]] - zhat) # record residual + ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial + cat(b, " ") } - ivfit= tsls(y=ytil,d=dtil, x=NULL, z=ztil, intercept=FALSE) - coef.est <- ivfit$coef #extract coefficient - se <- ivfit$se #record standard error - cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) - return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil, ztil=ztil) ) + ivfit <- tsls(y = ytil, d = dtil, x = NULL, z = ztil, intercept = FALSE) + coef_est <- ivfit$coef # extract coefficient + se <- ivfit$se # record standard error + cat(sprintf("\ncoef (se) = %g (%g)\n", coef_est, se)) + return(list(coef_est = coef_est, se = se, dtil = dtil, ytil = ytil, ztil = ztil)) } - ``` ----- @@ -167,12 
+168,11 @@ dim(AJR) ``` ```{r} -y = AJR$GDP; -d = AJR$Exprop; -z = AJR$logMort -xraw= model.matrix(~ Latitude+ Africa+Asia + Namer + Samer, data=AJR) -x = model.matrix(~ -1 + (Latitude + Latitude2 + Africa + - Asia + Namer + Samer)^2, data=AJR) +y <- AJR$GDP +d <- AJR$Exprop +z <- AJR$logMort +xraw <- model.matrix(~ Latitude + Africa + Asia + Namer + Samer, data = AJR) +x <- model.matrix(~ -1 + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2, data = AJR) dim(x) ``` @@ -181,38 +181,50 @@ set.seed(1) # DML with PostLasso cat(sprintf("\n DML with Post-Lasso \n")) -dreg <- function(x,d){ rlasso(x, d) } #ML method=lasso -yreg <- function(x,y){ rlasso(x, y) } #ML method=lasso -zreg<- function(x,z){ rlasso(x, z)} #ML method=lasso +dreg <- function(x, d) { + hdm::rlasso(x, d) +} # ML method=lasso +yreg <- function(x, y) { + hdm::rlasso(x, y) +} # ML method=lasso +zreg <- function(x, z) { + hdm::rlasso(x, z) +} # ML method=lasso -DML2.lasso = DML2.for.PLIVM(x, d, z, y, dreg, yreg, zreg, nfold=5) +dml2_lasso <- dml2_for_plivm(x, d, z, y, dreg, yreg, zreg, nfold = 5) # DML with Random Forest cat(sprintf("\n DML with Random Forest \n")) -dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest -yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest -zreg<- function(x,z){ randomForest(x, z)} #ML method=Forest +dreg <- function(x, d) { + randomForest(x, d) +} # ML method=Forest +yreg <- function(x, y) { + randomForest(x, y) +} # ML method=Forest +zreg <- function(x, z) { + randomForest(x, z) +} # ML method=Forest -DML2.RF = DML2.for.PLIVM(xraw, d, z, y, dreg, yreg, zreg, nfold=5) +dml2_rf <- dml2_for_plivm(xraw, d, z, y, dreg, yreg, zreg, nfold = 5) # Compare Forest vs Lasso -comp.tab= matrix(NA, 3, 2) -comp.tab[1,] = c( sqrt(mean((DML2.RF$ytil)^2)), sqrt(mean((DML2.lasso$ytil)^2)) ) -comp.tab[2,] = c( sqrt(mean((DML2.RF$dtil)^2)), sqrt(mean((DML2.lasso$dtil)^2)) ) -comp.tab[3,] = c( sqrt(mean((DML2.RF$ztil)^2)), sqrt(mean((DML2.lasso$ztil)^2)) ) -rownames(comp.tab) = c("RMSE for Y:", "RMSE for D:", "RMSE for Z:") -colnames(comp.tab) = c("RF", "LASSO") -print(comp.tab, digits=3) +comp_tab <- matrix(NA, 3, 2) +comp_tab[1, ] <- c(sqrt(mean((dml2_rf$ytil)^2)), sqrt(mean((dml2_lasso$ytil)^2))) +comp_tab[2, ] <- c(sqrt(mean((dml2_rf$dtil)^2)), sqrt(mean((dml2_lasso$dtil)^2))) +comp_tab[3, ] <- c(sqrt(mean((dml2_rf$ztil)^2)), sqrt(mean((dml2_lasso$ztil)^2))) +rownames(comp_tab) <- c("RMSE for Y:", "RMSE for D:", "RMSE for Z:") +colnames(comp_tab) <- c("RF", "LASSO") +print(comp_tab, digits = 3) ``` # Weak Instruments? ```{r} # using lfe package -summary(felm(DML2.lasso$dtil~DML2.lasso$ztil), robust=T) -summary(felm(DML2.RF$dtil~DML2.RF$ztil), robust=T) +summary(felm(dml2_lasso$dtil ~ dml2_lasso$ztil), robust = TRUE) +summary(felm(dml2_rf$dtil ~ dml2_rf$ztil), robust = TRUE) ``` ## Anderson-Rubin Idea for Inference with Weak Instruments @@ -243,29 +255,32 @@ is a vector, with a suitable adjustment of the statistic $C(\theta)$. 
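Spelled out, the function in the next chunk inverts the Anderson-Rubin statistic built from the residualized outcome, treatment and instrument ($\tilde{Y}$, $\tilde{D}$, $\tilde{Z}$ — the `ytil`, `dtil`, `ztil` produced by DML above). Writing $\mathbb{E}_n$ for the sample mean and $\widehat{\operatorname{Var}}$ for the sample variance, it evaluates

$$
C(\theta) = \frac{n\,\big(\mathbb{E}_n[(\tilde{Y}-\theta\tilde{D})\,\tilde{Z}]\big)^2}{\widehat{\operatorname{Var}}\big((\tilde{Y}-\theta\tilde{D})\,\tilde{Z}\big)}
$$

over a grid of candidate values $\theta$ and keeps those with $C(\theta) \le \chi^2_{1,1-\alpha}$ (about $3.84$ for $\alpha = 0.05$, i.e. `qchisq(0.95, 1)`); the smallest and largest retained grid values give the reported confidence bounds $[LB, UB]$.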
```{r} # DML-AR (DML with Anderson-Rubin) -DML.AR.PLIV<- function(rY, rD, rZ, grid, alpha=.05){ - n = length(rY) - Cstat = rep(0, length(grid)) - for (i in 1:length(grid)) { - Cstat[i] <- n* (mean( (rY - grid[i]*rD)*rZ) )^2/var ( (rY - grid[i]*rD) * rZ ) - }; - LB<- min(grid[ Cstat < qchisq(1-alpha,1)]); - UB <- max(grid[ Cstat < qchisq(1-alpha,1)]); - plot(range(grid),range(c( Cstat)) , type="n",xlab="Effect of institutions", ylab="Statistic", main=" "); - lines(grid, Cstat, lty = 1, col = 1); - abline(h=qchisq(1-alpha,1), lty = 3, col = 4); - abline(v=LB, lty = 3, col = 2); - abline(v=UB, lty = 3, col = 2); - return(list(UB=UB, LB=LB)) - } +dml_ar_pliv <- function(rY, rD, rZ, grid, alpha = .05) { + n <- length(rY) + Cstat <- rep(0, length(grid)) + for (i in seq_along(grid)) { + Cstat[i] <- n * (mean((rY - grid[i] * rD) * rZ))^2 / var((rY - grid[i] * rD) * rZ) + } + LB <- min(grid[Cstat < qchisq(1 - alpha, 1)]) + UB <- max(grid[Cstat < qchisq(1 - alpha, 1)]) + plot(range(grid), range(c(Cstat)), type = "n", xlab = "Effect of institutions", ylab = "Statistic", main = " ") + lines(grid, Cstat, lty = 1, col = 1) + abline(h = qchisq(1 - alpha, 1), lty = 3, col = 4) + abline(v = LB, lty = 3, col = 2) + abline(v = UB, lty = 3, col = 2) + return(list(UB = UB, LB = LB)) +} ``` ```{r} -DML.AR.PLIV(rY = DML2.lasso$ytil, rD= DML2.lasso$dtil, rZ= DML2.lasso$ztil, - grid = seq(-2, 2, by =.01)) - - -DML.AR.PLIV(rY = DML2.RF$ytil, rD= DML2.RF$dtil, rZ= DML2.RF$ztil, - grid = seq(-2, 2, by =.01)) +dml_ar_pliv( + rY = dml2_lasso$ytil, rD = dml2_lasso$dtil, rZ = dml2_lasso$ztil, + grid = seq(-2, 2, by = .01) +) + +dml_ar_pliv( + rY = dml2_rf$ytil, rD = dml2_rf$dtil, rZ = dml2_rf$ztil, + grid = seq(-2, 2, by = .01) +) ``` diff --git a/AC2/r-dml-401k-IV.Rmd b/AC2/r-dml-401k-IV.Rmd index a85c9e24..602944aa 100644 --- a/AC2/r-dml-401k-IV.Rmd +++ b/AC2/r-dml-401k-IV.Rmd @@ -20,8 +20,11 @@ install.packages("ggplot2") install.packages("randomForest") install.packages("glmnet") install.packages("rpart") +install.packages("data.table") install.packages("gbm") +``` +```{r} library(xtable) library(hdm) library(sandwich) @@ -58,16 +61,18 @@ The data consist of 9,915 observations at the household level drawn from the 199 Among the $9915$ individuals, $3682$ are eligible to participate in the program. The variable *e401* indicates eligibility and *p401* indicates participation, respectively. ```{r} -hist_e401 = ggplot(data, aes(x = e401, fill = factor(e401))) + geom_bar() +hist_e401 <- ggplot(data, aes(x = e401, fill = factor(e401))) + + geom_bar() hist_e401 ``` Eligibility is highly associated with financial wealth: ```{r} -dens_net_tfa = ggplot(data, aes(x = net_tfa, color = factor(e401), fill = factor(e401)) ) + - geom_density() + xlim(c(-20000, 150000)) + - facet_wrap(.~e401) +dens_net_tfa <- ggplot(data, aes(x = net_tfa, color = factor(e401), fill = factor(e401))) + + geom_density() + + xlim(c(-20000, 150000)) + + facet_wrap(. ~ e401) dens_net_tfa ``` @@ -75,36 +80,37 @@ dens_net_tfa The unconditional APE of e401 is about $19559$: ```{r} -e1 <- data[data$e401==1,] -e0 <- data[data$e401==0,] -round(mean(e1$net_tfa)-mean(e0$net_tfa),0) +e1 <- data[data$e401 == 1, ] +e0 <- data[data$e401 == 0, ] +round(mean(e1$net_tfa) - mean(e0$net_tfa), 0) ``` Among the $3682$ individuals that are eligible, $2594$ decided to participate in the program. 
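As a quick check of these counts (an editorial aside, not part of the original analysis; it assumes the `data` frame loaded above), one can tabulate the two indicators directly:

```{r}
# Eligibility counts, and participation counts within the eligible group
table(data$e401)
table(data$p401[data$e401 == 1])
```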
The unconditional APE of p401 is about $27372$: ```{r} -p1 <- data[data$p401==1,] -p0 <- data[data$p401==0,] -round(mean(p1$net_tfa)-mean(p0$net_tfa),0) +p1 <- data[data$p401 == 1, ] +p0 <- data[data$p401 == 0, ] +round(mean(p1$net_tfa) - mean(p0$net_tfa), 0) ``` As discussed, these estimates are biased since they do not account for saver heterogeneity and endogeneity of participation. ```{r} # instrument variable -Z <- data[,'e401'] +Z <- data[, "e401"] # treatment variable -D <- data[, 'p401'] +D <- data[, "p401"] # outcome variable -y <- data[,'net_tfa'] +y <- data[, "net_tfa"] ``` ### We construct the engineered features for controls ```{r} # Constructing the controls -X_formula = "~ poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown" -X = as.data.table(model.frame(X_formula, pension)) +x_formula <- paste("~ poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + ", + "poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown") +X <- as.data.table(model.frame(x_formula, pension)) head(X) ``` @@ -113,13 +119,13 @@ head(X) ## Double ML IV under Partial Linearity Now, we consider estimation of average treatment effects of participation in 401k, i.e. `p401`, with the binary instrument being eligibility in 401k, i.e. `e401`. As before, $Y$ denotes the outcome `net_tfa`, and $X$ is the vector of covariates. We consider a partially linear structural equation model: -\begin{eqnarray*} -Y & := & g_Y(\epsilon_Y) D + f_Y(A, X, \epsilon_Y), \\ -D & := & f_D(Z, X, A, \epsilon_D), \\ -Z & := & f_Z(X, \epsilon_Z),\\ -A & : = & f_A(X, \epsilon_A), \\ -X & := & \epsilon_X, -\end{eqnarray*} +\begin{align} +Y :=~& g_Y(\epsilon_Y) D + f_Y(A, X, \epsilon_Y), \\ +D :=~& f_D(Z, X, A, \epsilon_D), \\ +Z :=~& f_Z(X, \epsilon_Z),\\ +A :=~& f_A(X, \epsilon_A), \\ +X :=~& \epsilon_X, +\end{align} where $A$ is a vector of un-observed confounders. Under this structural equation model, the average treatment effect: @@ -134,21 +140,24 @@ where for any variable $V$, we denote with $\tilde{V} = V - E[V|X]$. 
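To connect this to the code: the estimate below replaces the population moments with sample averages of the residualized variables, so $\hat{\theta}$ solves the empirical orthogonality condition

$$
\mathbb{E}_n\big[(\tilde{Y} - \theta \tilde{D})\,\tilde{Z}\big] = 0
\quad\Longleftrightarrow\quad
\hat{\theta} = \frac{\mathbb{E}_n[\tilde{Y}\tilde{Z}]}{\mathbb{E}_n[\tilde{D}\tilde{Z}]},
$$

where $\mathbb{E}_n$ denotes the sample average; this is the `mean(resy * resZ) / mean(resZ * resD)` line in the next chunk.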
```{r} set.seed(1) -yfit.lasso.cv <- cv.glmnet(as.matrix(X), y, family="gaussian", alpha=1) # family gaussian means that we'll be using square loss -Dfit.lasso.cv <- cv.glmnet(as.matrix(X), D, family="gaussian", alpha=1) # family gaussian means that we'll be using square loss -Zfit.lasso.cv <- cv.glmnet(as.matrix(X), Z, family="gaussian", alpha=1) # family gaussian means that we'll be using square loss +# family gaussian means that we'll be using square loss +yfit_lasso_cv <- cv.glmnet(as.matrix(X), y, family = "gaussian", alpha = 1) +# family gaussian means that we'll be using square loss +dfit_lasso_cv <- cv.glmnet(as.matrix(X), D, family = "gaussian", alpha = 1) +# family gaussian means that we'll be using square loss +zfit_lasso_cv <- cv.glmnet(as.matrix(X), Z, family = "gaussian", alpha = 1) -yhat.lasso.cv <- predict(yfit.lasso.cv, newx = as.matrix(X)) # predictions -Dhat.lasso.cv <- predict(Dfit.lasso.cv, newx = as.matrix(X)) # predictions -Zhat.lasso.cv <- predict(Zfit.lasso.cv, newx = as.matrix(X)) # predictions +yhat_lasso_cv <- predict(yfit_lasso_cv, newx = as.matrix(X)) # predictions +dhat_lasso_cv <- predict(dfit_lasso_cv, newx = as.matrix(X)) # predictions +zhat_lasso_cv <- predict(zfit_lasso_cv, newx = as.matrix(X)) # predictions -resy <- y-yhat.lasso.cv -resD <- D-Dhat.lasso.cv -resZ <- Z-Zhat.lasso.cv +resy <- y - yhat_lasso_cv +resD <- D - dhat_lasso_cv +resZ <- Z - zhat_lasso_cv # Estimate -mean(resy * resZ) / mean(resZ*resD) +mean(resy * resZ) / mean(resZ * resD) ``` Recall if we want to do inference, we need to either use the theoretically driven penalty paramter for Lasso or perform cross-fitting. @@ -157,69 +166,71 @@ Recall if we want to do inference, we need to either use the theoretically drive ```{r} # DML for PLIVM with D and Z as classifiers or regressors -DML2.for.PLIVM <- function(x, d, z, y, dreg, yreg, zreg, nfold=5, method="regression") { +dml2_for_plivm <- function(x, d, z, y, dreg, yreg, zreg, nfold = 5, method = "regression") { nobs <- nrow(x) - foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] + foldid <- rep.int(1:nfold, times = ceiling(nobs / nfold))[sample.int(nobs)] I <- split(1:nobs, foldid) # create residualized objects to fill - ytil <- dtil <- ztil<- rep(NA, nobs) + ytil <- dtil <- ztil <- rep(NA, nobs) # obtain cross-fitted residuals cat("fold: ") - for(b in 1:length(I)){ - if (method == "randomforest"){ - # take a fold out - dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) - zfit <- zreg(x[-I[[b]],], z[-I[[b]]]) - yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) - # predict the fold out - dhat <- predict(dfit, x[I[[b]],], type="prob")[,2] # type = "prob" is like predict_proba in scikitlearn - zhat <- predict(zfit, x[I[[b]],], type="prob")[,2] - yhat <- predict(yfit, x[I[[b]],]) # default type = "response" for regression for RF, type = "vector" for regression for Decision Trees - # record residual - dtil[I[[b]]] <- (as.numeric(d[I[[b]]])-1 - dhat) # as.numeric will turn d = as.factor(d) from 0,1 to 1,2 so subtract 1! 
- ztil[I[[b]]] <- (as.numeric(z[I[[b]]])-1 - zhat) - ytil[I[[b]]] <- (y[I[[b]]] - yhat) + for (b in seq_along(I)) { + if (method == "randomforest") { + # take a fold out + dfit <- dreg(x[-I[[b]], ], d[-I[[b]]]) + zfit <- zreg(x[-I[[b]], ], z[-I[[b]]]) + yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) + # predict the fold out + dhat <- predict(dfit, x[I[[b]], ], type = "prob")[, 2] # type = "prob" is like predict_proba in scikitlearn + zhat <- predict(zfit, x[I[[b]], ], type = "prob")[, 2] + # default type = "response" for regression for RF, type = "vector" for regression for Decision Trees + yhat <- predict(yfit, x[I[[b]], ]) + # record residual + # as.numeric will turn d = as.factor(d) from 0,1 to 1,2 so subtract 1! + dtil[I[[b]]] <- (as.numeric(d[I[[b]]]) - 1 - dhat) + ztil[I[[b]]] <- (as.numeric(z[I[[b]]]) - 1 - zhat) + ytil[I[[b]]] <- (y[I[[b]]] - yhat) } else if (method == "regression") { # works for both boosted trees and glmnet - # take a fold out - dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) - zfit <- zreg(x[-I[[b]],], z[-I[[b]]]) - yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) - # predict the fold out - dhat <- predict(dfit, x[I[[b]],], type="response") - zhat <- predict(zfit, x[I[[b]],], type="response") - yhat <- predict(yfit, x[I[[b]],], type="response") - # record residual - dtil[I[[b]]] <- (d[I[[b]]] - dhat) - ztil[I[[b]]] <- (z[I[[b]]] - zhat) - ytil[I[[b]]] <- (y[I[[b]]] - yhat) - } else if (method == "decisiontrees"){ - # take a fold out - dfit <- dreg(x[-I[[b]],], as.factor(d)[-I[[b]]]) - zfit <- zreg(x[-I[[b]],], as.factor(z)[-I[[b]]]) - yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) - # predict the fold out - dhat <- predict(dfit, x[I[[b]],])[,2] - zhat <- predict(zfit, x[I[[b]],])[,2] - yhat <- predict(yfit, x[I[[b]],]) - # record residual - dtil[I[[b]]] <- (d[I[[b]]] - dhat) - ztil[I[[b]]] <- (z[I[[b]]] - zhat) - ytil[I[[b]]] <- (y[I[[b]]] - yhat) + # take a fold out + dfit <- dreg(x[-I[[b]], ], d[-I[[b]]]) + zfit <- zreg(x[-I[[b]], ], z[-I[[b]]]) + yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) + # predict the fold out + dhat <- predict(dfit, x[I[[b]], ], type = "response") + zhat <- predict(zfit, x[I[[b]], ], type = "response") + yhat <- predict(yfit, x[I[[b]], ], type = "response") + # record residual + dtil[I[[b]]] <- (d[I[[b]]] - dhat) + ztil[I[[b]]] <- (z[I[[b]]] - zhat) + ytil[I[[b]]] <- (y[I[[b]]] - yhat) + } else if (method == "decisiontrees") { + # take a fold out + dfit <- dreg(x[-I[[b]], ], as.factor(d)[-I[[b]]]) + zfit <- zreg(x[-I[[b]], ], as.factor(z)[-I[[b]]]) + yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) + # predict the fold out + dhat <- predict(dfit, x[I[[b]], ])[, 2] + zhat <- predict(zfit, x[I[[b]], ])[, 2] + yhat <- predict(yfit, x[I[[b]], ]) + # record residual + dtil[I[[b]]] <- (d[I[[b]]] - dhat) + ztil[I[[b]]] <- (z[I[[b]]] - zhat) + ytil[I[[b]]] <- (y[I[[b]]] - yhat) } - cat(b," ") + cat(b, " ") } - ivfit = tsls(y=ytil,d=dtil, x=NULL, z=ztil, intercept=FALSE) - coef.est <- ivfit$coef #extract coefficient - se <- ivfit$se #record standard error - cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) + ivfit <- tsls(y = ytil, d = dtil, x = NULL, z = ztil, intercept = FALSE) + coef_est <- ivfit$coef # extract coefficient + se <- ivfit$se # record standard error + cat(sprintf("\ncoef (se) = %g (%g)\n", coef_est, se)) - return( list(coef.est=coef.est, se=se, dtil=dtil, ytil=ytil, ztil=ztil) ) + return(list(coef_est = coef_est, se = se, dtil = dtil, ytil = ytil, ztil = ztil)) } ``` ```{r} -summary <- function(point, stderr, resy, resD, resZ, name) { +summary_for_plivm <- function(point, 
stderr, resy, resD, resZ, name) { data <- data.frame( estimate = point, # point estimate stderr = stderr, # standard error @@ -243,19 +254,27 @@ summary <- function(point, stderr, resy, resD, resZ, name) { set.seed(123) cat(sprintf("\nDML with Lasso CV \n")) -dreg.lasso.cv <- function(x,d){ cv.glmnet(x, d, family="gaussian", alpha=1, nfolds=5)} -yreg.lasso.cv <- function(x,y){ cv.glmnet(x, y, family="gaussian", alpha=1, nfolds=5)} -zreg.lasso.cv <- function(x,z){ cv.glmnet(x, z, family="gaussian", alpha=1, nfolds=5)} +dreg_lasso_cv <- function(x, d) { + cv.glmnet(x, d, family = "gaussian", alpha = 1, nfolds = 5) +} +yreg_lasso_cv <- function(x, y) { + cv.glmnet(x, y, family = "gaussian", alpha = 1, nfolds = 5) +} +zreg_lasso_cv <- function(x, z) { + cv.glmnet(x, z, family = "gaussian", alpha = 1, nfolds = 5) +} -DML2.results <- DML2.for.PLIVM(as.matrix(X), D, Z, y, dreg.lasso.cv, yreg.lasso.cv, zreg.lasso.cv, nfold=5, method="regression") -sum.lasso.cv <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'LassoCV') +dml2_results <- dml2_for_plivm(as.matrix(X), D, Z, y, dreg_lasso_cv, yreg_lasso_cv, zreg_lasso_cv, + nfold = 5, method = "regression") +sum_lasso_cv <- summary_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, + dml2_results$ztil, name = "LassoCV") tableplr <- data.frame() -tableplr <- rbind(sum.lasso.cv) +tableplr <- rbind(sum_lasso_cv) tableplr -ytil.lasso <- DML2.results$ytil -dtil.lasso <- DML2.results$dtil -ztil.lasso <- DML2.results$ztil +ytil_lasso <- dml2_results$ytil +dtil_lasso <- dml2_results$dtil +ztil_lasso <- dml2_results$ztil ``` #### Using a $\ell_2$ Penalized Logistic Regression for D and Z @@ -265,18 +284,26 @@ ztil.lasso <- DML2.results$ztil set.seed(123) cat(sprintf("\nDML with Lasso/Logistic \n")) -dreg.ridge.cv <- function(x,d){cv.glmnet(x, d, family="binomial", alpha=0, nfolds=5)} -yreg.ridge.cv <- function(x,y){cv.glmnet(x, y, family="gaussian", alpha=1, nfolds=5)} -zreg.ridge.cv <- function(x,z){cv.glmnet(x, z, family="binomial", alpha=0, nfolds=5)} +dreg_ridge_cv <- function(x, d) { + cv.glmnet(x, d, family = "binomial", alpha = 0, nfolds = 5) +} +yreg_ridge_cv <- function(x, y) { + cv.glmnet(x, y, family = "gaussian", alpha = 1, nfolds = 5) +} +zreg_ridge_cv <- function(x, z) { + cv.glmnet(x, z, family = "binomial", alpha = 0, nfolds = 5) +} -DML2.results <- DML2.for.PLIVM(as.matrix(X), D, Z, y, dreg.ridge.cv, yreg.ridge.cv, zreg.ridge.cv, nfold=5, method="regression") -sum.lasso_ridge.cv <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'LassoCV/LogisticCV') -tableplr <- rbind(tableplr, sum.lasso_ridge.cv) +dml2_results <- dml2_for_plivm(as.matrix(X), D, Z, y, dreg_ridge_cv, yreg_ridge_cv, zreg_ridge_cv, + nfold = 5, method = "regression") +sum_lasso_ridge_cv <- summary_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, + dml2_results$ztil, name = "LassoCV/LogisticCV") +tableplr <- rbind(tableplr, sum_lasso_ridge_cv) tableplr -ytil.ridge <- DML2.results$ytil -dtil.ridge <- DML2.results$dtil -ztil.ridge <- DML2.results$ztil +ytil_ridge <- dml2_results$ytil +dtil_ridge <- dml2_results$dtil +ztil_ridge <- dml2_results$ztil ``` ### Random Forests @@ -286,18 +313,26 @@ ztil.ridge <- DML2.results$ztil set.seed(123) cat(sprintf("\nDML with Random Forest \n")) -dreg.rf <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest -yreg.rf <- 
function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest -zreg.rf <- function(x,z){randomForest(x, z, ntree=1000, nodesize=10)} #ML method=Forest - -DML2.results = DML2.for.PLIVM(as.matrix(X), as.factor(D), as.factor(Z), y, dreg.rf, yreg.rf, zreg.rf, nfold=5, method="randomforest") -sum.rf <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'RF') -tableplr <- rbind(tableplr, sum.rf) +dreg_rf <- function(x, d) { + randomForest(x, d, ntree = 1000, nodesize = 10) +} # ML method=Forest +yreg_rf <- function(x, y) { + randomForest(x, y, ntree = 1000, nodesize = 10) +} # ML method=Forest +zreg_rf <- function(x, z) { + randomForest(x, z, ntree = 1000, nodesize = 10) +} # ML method=Forest + +dml2_results <- dml2_for_plivm(as.matrix(X), as.factor(D), as.factor(Z), y, dreg_rf, yreg_rf, zreg_rf, + nfold = 5, method = "randomforest") +sum_rf <- summary_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, + dml2_results$ztil, name = "RF") +tableplr <- rbind(tableplr, sum_rf) tableplr -ytil.rf <- DML2.results$ytil -dtil.rf <- DML2.results$dtil -ztil.rf <- DML2.results$ztil +ytil_rf <- dml2_results$ytil +dtil_rf <- dml2_results$dtil +ztil_rf <- dml2_results$ztil ``` ### Decision Trees @@ -307,18 +342,27 @@ ztil.rf <- DML2.results$ztil set.seed(123) cat(sprintf("\nDML with Decision Trees \n")) -dreg.tr <- function(x,d){rpart(as.formula("D~."), cbind(data.frame(D=d),x), method = "class", minbucket=10, cp = 0.001)} -yreg.tr <- function(x,y){rpart(as.formula("y~."), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)} -zreg.tr <- function(x,z){rpart(as.formula("Z~."), cbind(data.frame(Z=z),x), method = "class", minbucket=10, cp = 0.001)} +# decision tree takes in X as dataframe, not matrix/array +dreg_tr <- function(x, d) { + rpart(as.formula("D~."), cbind(data.frame(D = d), x), method = "class", minbucket = 10, cp = 0.001) +} +yreg_tr <- function(x, y) { + rpart(as.formula("y~."), cbind(data.frame(y = y), x), minbucket = 10, cp = 0.001) +} +zreg_tr <- function(x, z) { + rpart(as.formula("Z~."), cbind(data.frame(Z = z), x), method = "class", minbucket = 10, cp = 0.001) +} -DML2.results = DML2.for.PLIVM(X, D, Z, y, dreg.tr, yreg.tr, zreg.tr, nfold=5, method="decisiontrees") # decision tree takes in X as dataframe, not matrix/array -sum.tr <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'Decision Trees') -tableplr <- rbind(tableplr, sum.tr) +dml2_results <- dml2_for_plivm(X, D, Z, y, dreg_tr, yreg_tr, zreg_tr, + nfold = 5, method = "decisiontrees") +sum_tr <- summary_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, + dml2_results$ztil, name = "Decision Trees") +tableplr <- rbind(tableplr, sum_tr) tableplr -ytil.tr <- DML2.results$ytil -dtil.tr <- DML2.results$dtil -ztil.tr <- DML2.results$ztil +ytil_tr <- dml2_results$ytil +dtil_tr <- dml2_results$dtil +ztil_tr <- dml2_results$ztil ``` ### Boosted Trees @@ -329,20 +373,31 @@ set.seed(123) cat(sprintf("\nDML with Boosted Trees \n")) # NB: early stopping cannot easily be implemented with gbm -## set n.trees = best, where best <- gbm.perf(dreg.boost, plot.it = FALSE) -dreg.boost <- function(x,d){gbm(as.formula("D~."), cbind(data.frame(D=d),x), distribution= "bernoulli", interaction.depth=2, n.trees=100, shrinkage=.1)} -yreg.boost <- function(x,y){gbm(as.formula("y~."), cbind(data.frame(y=y),x), distribution= "gaussian", interaction.depth=2, 
n.trees=100, shrinkage=.1)} -zreg.boost <- function(x,z){gbm(as.formula("Z~."), cbind(data.frame(Z=z),x), distribution= "bernoulli", interaction.depth=2, n.trees=100, shrinkage=.1)} +## set n.trees = best, where best <- gbm.perf(dreg_boost, plot.it = FALSE) +dreg_boost <- function(x, d) { + gbm(as.formula("D~."), cbind(data.frame(D = d), x), distribution = "bernoulli", + interaction.depth = 2, n.trees = 100, shrinkage = .1) +} +yreg_boost <- function(x, y) { + gbm(as.formula("y~."), cbind(data.frame(y = y), x), distribution = "gaussian", + interaction.depth = 2, n.trees = 100, shrinkage = .1) +} +zreg_boost <- function(x, z) { + gbm(as.formula("Z~."), cbind(data.frame(Z = z), x), distribution = "bernoulli", + interaction.depth = 2, n.trees = 100, shrinkage = .1) +} # passing these through regression as type="response", and D and Z should not be factors! -DML2.results = DML2.for.PLIVM(X, D, Z, y, dreg.boost, yreg.boost, zreg.boost, nfold=5, method = "regression") -sum.boost <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, name = 'Boosted Trees') -tableplr <- rbind(tableplr, sum.boost) +dml2_results <- dml2_for_plivm(X, D, Z, y, dreg_boost, yreg_boost, zreg_boost, + nfold = 5, method = "regression") +sum_boost <- summary_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, + dml2_results$ztil, name = "Boosted Trees") +tableplr <- rbind(tableplr, sum_boost) tableplr -ytil.boost <- DML2.results$ytil -dtil.boost <- DML2.results$dtil -ztil.boost <- DML2.results$ztil +ytil_boost <- dml2_results$ytil +dtil_boost <- dml2_results$dtil +ztil_boost <- dml2_results$ztil ``` ## Ensembles @@ -352,8 +407,9 @@ Boosted trees give the best RMSE for Y, D, and Z, so the ensemble based on choos ```{r} # Best fit is boosted trees for D, Z, Y -sum.best <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$dtil, name = 'Best') -tableplr <- rbind(tableplr, sum.best) +sum_best <- summary_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, + dml2_results$ztil, name = "Best") +tableplr <- rbind(tableplr, sum_best) tableplr ``` @@ -362,34 +418,34 @@ We'll form a model average with unconstrained least squares weights. 
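To spell out what the next chunk computes, using $D$ as the example (the same applies to $Y$ and $Z$): the cross-fitted predictions $\hat{D}^{(m)}$ of the five learners are combined with unconstrained least-squares weights (plus an intercept, which `lm` includes by default), and the residual of that stacking regression replaces the single-learner residual,

$$
\tilde{D}^{\mathrm{MA}} = D - \hat{w}_0 - \sum_{m} \hat{w}_m \hat{D}^{(m)},
\qquad
\hat{w} = \arg\min_{w}\ \mathbb{E}_n\Big[\big(D - w_0 - \sum_{m} w_m \hat{D}^{(m)}\big)^2\Big].
$$

The final TSLS step is then run on these model-average residuals.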
```{r} # Least squares model average -dhat.lasso <- D - dtil.lasso -dhat.ridge <- D - dtil.ridge -dhat.rf <- D - dtil.rf -dhat.tr <- D - dtil.tr -dhat.boost <- D - dtil.boost - -yhat.lasso <- y - ytil.lasso -yhat.ridge <- y - ytil.ridge -yhat.rf <- y - ytil.rf -yhat.tr <- y - ytil.tr -yhat.boost <- y - ytil.boost - -zhat.lasso <- Z - ztil.lasso -zhat.ridge <- Z - ztil.ridge -zhat.rf <- Z - ztil.rf -zhat.tr <- Z - ztil.tr -zhat.boost <- Z - ztil.boost - -ma.dtil <- lm(D~dhat.lasso+dhat.ridge+dhat.rf+dhat.tr+dhat.boost)$residuals -ma.ytil <- lm(y~yhat.lasso+yhat.ridge+yhat.rf+yhat.tr+yhat.boost)$residuals -ma.ztil <- lm(Z~zhat.lasso+zhat.ridge+zhat.rf+zhat.tr+zhat.boost)$residuals - -ivfit = tsls(y=ma.ytil,d=ma.dtil, x=NULL, z=ma.ztil, intercept=FALSE) -coef.est <- ivfit$coef #extract coefficient -se <- ivfit$se #record standard error - -sum.ma <- summary(coef.est, se, ma.ytil, ma.dtil, ma.ztil, name = 'Model Average') -tableplr <- rbind(tableplr, sum.ma) +dhat_lasso <- D - dtil_lasso +dhat_ridge <- D - dtil_ridge +dhat_rf <- D - dtil_rf +dhat_tr <- D - dtil_tr +dhat_boost <- D - dtil_boost + +yhat_lasso <- y - ytil_lasso +yhat_ridge <- y - ytil_ridge +yhat_rf <- y - ytil_rf +yhat_tr <- y - ytil_tr +yhat_boost <- y - ytil_boost + +zhat_lasso <- Z - ztil_lasso +zhat_ridge <- Z - ztil_ridge +zhat_rf <- Z - ztil_rf +zhat_tr <- Z - ztil_tr +zhat_boost <- Z - ztil_boost + +ma_dtil <- lm(D ~ dhat_lasso + dhat_ridge + dhat_rf + dhat_tr + dhat_boost)$residuals +ma_ytil <- lm(y ~ yhat_lasso + yhat_ridge + yhat_rf + yhat_tr + yhat_boost)$residuals +ma_ztil <- lm(Z ~ zhat_lasso + zhat_ridge + zhat_rf + zhat_tr + zhat_boost)$residuals + +ivfit <- tsls(y = ma_ytil, d = ma_dtil, x = NULL, z = ma_ztil, intercept = FALSE) +coef_est <- ivfit$coef # extract coefficient +se <- ivfit$se # record standard error + +sum_ma <- summary_for_plivm(coef_est, se, ma_ytil, ma_dtil, ma_ztil, name = "Model Average") +tableplr <- rbind(tableplr, sum_ma) tableplr ``` @@ -403,34 +459,37 @@ Thus, in the below analysis of robust inference, we choose Boosted Trees as they ```{r} robust_inference <- function(point, stderr, resD, resy, resZ, grid, alpha = 0.05) { - # Inference in the partially linear IV model that is robust to weak identification. - # grid: grid of theta values to search over when trying to identify the confidence region - # alpha: confidence level - - n <- dim(X)[1] - thr <- qchisq(1 - alpha, df = 1) - accept <- c() - - for (theta in grid) { - moment <- (resy - theta * resD) * resZ - test <- n * mean(moment)^2 / var(moment) - if (test <= thr) { - accept <- c(accept, theta) - } + # Inference in the partially linear IV model that is robust to weak identification. 
+ # grid: grid of theta values to search over when trying to identify the confidence region + # alpha: confidence level + + n <- dim(X)[1] + thr <- qchisq(1 - alpha, df = 1) + accept <- c() + + for (theta in grid) { + moment <- (resy - theta * resD) * resZ + test <- n * mean(moment)^2 / var(moment) + if (test <= thr) { + accept <- c(accept, theta) } + } - return(accept) + return(accept) } ``` ```{r} grid <- seq(0, 20000, length.out = 10000) -region <- robust_inference(DML2.results$coef.est, DML2.results$stderr, DML2.results$dtil, DML2.results$ytil, DML2.results$ztil, grid=grid) +region <- robust_inference(dml2_results$coef_est, dml2_results$stderr, dml2_results$dtil, dml2_results$ytil, + dml2_results$ztil, grid = grid) ``` ```{r} grid <- seq(0, 20000, length.out = 10000) -region <- robust_inference(DML2.results$coef.est, DML2.results$stderr, DML2.results$dtil, DML2.results$ytil, DML2.results$ztil, grid=grid)# Calculate min and max +region <- robust_inference(dml2_results$coef_est, dml2_results$stderr, dml2_results$dtil, dml2_results$ytil, + dml2_results$ztil, grid = grid) +# Calculate min and max min_region <- min(region) max_region <- max(region) @@ -441,12 +500,12 @@ print(max_region) # Interactive IV Model and LATE Now, we consider estimation of local average treatment effects (LATE) of participation `p401`, with the binary instrument `e401`. As before, $Y$ denotes the outcome `net_tfa`, and $X$ is the vector of covariates. Here the structural equation model is: -\begin{eqnarray} -Y &:=& f_Y (D, X, A, \epsilon_Y) \\ -D &:= & f_D(Z, X, A, \epsilon_D) \in \{0,1\}, \\ -Z &:= & f_Z(X,\epsilon_Z) \in \{0,1\}, \\ -X &:=& \epsilon_X, \quad A = \epsilon_A, -\end{eqnarray} +\begin{align} +Y :=~& f_Y (D, X, A, \epsilon_Y) \\ +D :=~& f_D(Z, X, A, \epsilon_D) \in \{0,1\}, \\ +Z :=~& f_Z(X,\epsilon_Z) \in \{0,1\}, \\ +X :=~& \epsilon_X, \quad A = \epsilon_A, +\end{align} where $\epsilon$'s are all exogenous and independent, and $$ @@ -470,8 +529,55 @@ H(Z) =~& \frac{Z}{Pr(Z=1|X)} - \frac{1 - Z}{1 - Pr(Z=1|X)} \end{align} ```{r} +get_dhat0 <- function(XZ0, DZ0, Xb, dreg0, type = NULL, DZ0factor = NULL) { + # train a treatment model on training data that received Z=0 and predict treatment on all data in test set + if (mean(DZ0) > 0) { # it could be that D=0, whenever Z=0 deterministically + dreg0_ <- dreg0 + if (is.null(DZ0factor)) { + dfit0 <- dreg0_((XZ0), DZ0) + } else { + dfit0 <- dreg0_((XZ0), DZ0factor) + } + if (is.null(type)) { + return(predict(dfit0, (Xb))[, 2]) + } else if (type == "prob") { + return(predict(dfit0, (Xb), type = "prob")[, 2]) + } else if (type == "reponse") { + return(predict(dfit0, (Xb), type = "response")) + } else { + stop("Invalid argument `type`.") + } + } else { + return(0) + } +} + +get_dhat1 <- function(XZ1, DZ1, Xb, dreg1, type = NULL, DZ1factor = NULL) { + # train a treamtent model on training data that received Z=1 and predict treatment on all data in test set + if (mean(DZ1) < 1) { # it could be that D=1, whenever Z=1 deterministically + dreg1_ <- dreg1 + if (is.null(DZ1factor)) { + dfit1 <- dreg1_((XZ1), DZ1) + } else { + dfit1 <- dreg1_((XZ1), DZ1factor) + } + if (is.null(type)) { + return(predict(dfit1, (Xb))[, 2]) + } else if (type == "prob") { + return(predict(dfit1, (Xb), type = "prob")[, 2]) + } else if (type == "response") { + return(predict(dfit1, (Xb), type = "response")) + } else { + stop("Invalid argument `type`.") + } + } else { + return(1) + } +} + # DML for IIVM with D and Z as classifiers or regressors -DML2.for.IIVM <- function(x, d, z, y, dreg0, 
dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method="classification", dt=0, bt=0) { +dml2_for_iivm <- function(x, d, z, y, dreg0, dreg1, yreg0, yreg1, zreg, + trimming = 0.01, nfold = 5, method = "classification", dt = 0, bt = 0) { # this implements DML2 algorithm, where there moments are estimated via DML, before constructing # the pooled estimate of theta randomly split data into folds @@ -486,152 +592,111 @@ DML2.for.IIVM <- function(x, d, z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming zhat <- rep(0, length(Z)) nobs <- nrow(X) - foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] + foldid <- rep.int(1:nfold, times = ceiling(nobs / nfold))[sample.int(nobs)] I <- split(1:nobs, foldid) # create residualized objects to fill - ytil <- dtil <- ztil<- rep(NA, nobs) + ytil <- dtil <- ztil <- rep(NA, nobs) # obtain cross-fitted residuals cat("fold: ") - for(b in 1:length(I)){ - + for (b in seq_along(I)) { # define helpful variables - Xb = X[I[[b]],] - Xnotb = X[-I[[b]],] - Znotb = Z[-I[[b]]] + Xb <- X[I[[b]], ] + Xnotb <- X[-I[[b]], ] + Znotb <- Z[-I[[b]]] # training dfs subsetted on the -I[[b]] fold - XZ0 = X[-I[[b]],][Z[-I[[b]]]==0] - yZ0 = y[-I[[b]]][Z[-I[[b]]]==0] - XZ1 = X[-I[[b]],][Z[-I[[b]]]==1] - yZ1 = y[-I[[b]]][Z[-I[[b]]]==1] - DZ0 = d[-I[[b]]][Z[-I[[b]]]==0] - DZ1 = d[-I[[b]]][Z[-I[[b]]]==1] + XZ0 <- X[-I[[b]], ][Z[-I[[b]]] == 0] + yZ0 <- y[-I[[b]]][Z[-I[[b]]] == 0] + XZ1 <- X[-I[[b]], ][Z[-I[[b]]] == 1] + yZ1 <- y[-I[[b]]][Z[-I[[b]]] == 1] + DZ0 <- d[-I[[b]]][Z[-I[[b]]] == 0] + DZ1 <- d[-I[[b]]][Z[-I[[b]]] == 1] if (method == "regression") { - XZ0 = as.matrix(XZ0) - XZ1 = as.matrix(XZ1) - Xb = as.matrix(Xb) - Xnotb = as.matrix(Xnotb) - - # Train an outcome model on training data that received Z=0 and predict outcome on all data in the test set - yfit0 <- yreg0((XZ0), yZ0) - yhat0[I[[b]]] <- predict(yfit0, (Xb)) # default type = "response" - - # train an outcome model on training data that received Z=1 and predict outcome on all data in test set - yfit1 <- yreg1((XZ1), yZ1) - yhat1[I[[b]]] <- predict(yfit1, (Xb)) - - # train a treatment model on training data that received Z=0 and predict treatment on all data in test set - if (mean(DZ0) > 0) { # it could be that D=0, whenever Z=0 deterministically - dreg0_ <- dreg0 - dfit0 <- dreg0_((XZ0), DZ0) - dhat0[I[[b]]] <- predict(dfit0, (Xb), type="response") # default type = "response", but for family binomial it's logg odds - } - # train a treamtent model on training data that received Z=1 and predict treatment on all data in test set - if (mean(DZ1) < 1) { # it could be that D=1, whenever Z=1 deterministically - dreg1_ <- dreg1 - dfit1 <- dreg1_((XZ1), DZ1) - dhat1[I[[b]]] <- predict(dfit1, (Xb), type="response") - } else { - dhat1[I[[b]]] <- 1 - } + XZ0 <- as.matrix(XZ0) + XZ1 <- as.matrix(XZ1) + Xb <- as.matrix(Xb) + Xnotb <- as.matrix(Xnotb) + + # Train an outcome model on training data that received Z=0 and predict outcome on all data in the test set + yfit0 <- yreg0((XZ0), yZ0) + yhat0[I[[b]]] <- predict(yfit0, (Xb)) # default type = "response" + + # train an outcome model on training data that received Z=1 and predict outcome on all data in test set + yfit1 <- yreg1((XZ1), yZ1) + yhat1[I[[b]]] <- predict(yfit1, (Xb)) + + # train a treatment model on training data that received Z=0 and predict treatment on all data in test set + # default type = "response", but for family binomial it's logg odds + dhat0[I[[b]]] <- get_dhat0(XZ0, DZ0, Xb, dreg0, type = "response") + dhat1[I[[b]]] <- get_dhat1(XZ1, DZ1, Xb, 
dreg1, type = "response") } else if (method == "randomforest") { - DZ0factor = as.factor(D)[-I[[b]]][Z[-I[[b]]]==0] - DZ1factor = as.factor(D)[-I[[b]]][Z[-I[[b]]]==1] - Znotb = as.factor(Znotb) - - yfit0 <- yreg0((XZ0), yZ0) - yhat0[I[[b]]] <- predict(yfit0, (Xb), type="response") - yfit1 <- yreg1((XZ1), yZ1) - yhat1[I[[b]]] <- predict(yfit1, (Xb), type="response") - - if (mean(DZ0) > 0) { - dreg0_ <- dreg0 - dfit0 <- dreg0_((XZ0), DZ0factor) - dhat0[I[[b]]] <- predict(dfit0, (Xb), type="prob")[,2] # get second column because type = "prob" - } - if (mean(DZ1) < 1) { - dreg1_ <- dreg1 - dfit1 <- dreg1_((XZ1), DZ1factor) - dhat1[I[[b]]] <- predict(dfit1, (Xb), type="prob")[,2] - } else { - dhat1[I[[b]]] <- 1 - } + DZ0factor <- as.factor(D)[-I[[b]]][Z[-I[[b]]] == 0] + DZ1factor <- as.factor(D)[-I[[b]]][Z[-I[[b]]] == 1] + Znotb <- as.factor(Znotb) + + yfit0 <- yreg0((XZ0), yZ0) + yhat0[I[[b]]] <- predict(yfit0, (Xb), type = "response") + yfit1 <- yreg1((XZ1), yZ1) + yhat1[I[[b]]] <- predict(yfit1, (Xb), type = "response") + + dhat0[I[[b]]] <- get_dhat0(XZ0, DZ0, Xb, dreg0, type = "prob", DZ0factor = DZ0factor) + dhat1[I[[b]]] <- get_dhat1(XZ1, DZ1, Xb, dreg1, type = "prob", DZ1factor = DZ1factor) } else if (method == "decisiontrees") { - XZ0 = as.data.frame(XZ0) - XZ1 = as.data.frame(XZ1) - Xb = as.data.frame(Xb) - Xnotb = as.data.frame(Xnotb) - - yfit0 <- yreg0((XZ0), yZ0) - yhat0[I[[b]]] <- predict(yfit0, (Xb)) # default type = "response" for decision trees for continuous response - - yfit1 <- yreg1((XZ1), yZ1) - yhat1[I[[b]]] <- predict(yfit1, (Xb)) - - if (mean(DZ0) > 0) { - dreg0_ <- dreg0 - dfit0 <- dreg0_((XZ0), as.factor(DZ0)) - dhat0[I[[b]]] <- predict(dfit0, (Xb))[,2] # for decision trees, default = "prob" for decision trees with factor responses - } - - if (mean(DZ1) < 1) { - dreg1_ <- dreg1 - dfit1 <- dreg1_((XZ1), as.factor(DZ1)) - dhat1[I[[b]]] <- predict(dfit1, (Xb))[,2] - } else { - dhat1[I[[b]]] <- 1 - } + XZ0 <- as.data.frame(XZ0) + XZ1 <- as.data.frame(XZ1) + Xb <- as.data.frame(Xb) + Xnotb <- as.data.frame(Xnotb) + + yfit0 <- yreg0((XZ0), yZ0) + # default type = "response" for decision trees for continuous response + yhat0[I[[b]]] <- predict(yfit0, (Xb)) + + yfit1 <- yreg1((XZ1), yZ1) + yhat1[I[[b]]] <- predict(yfit1, (Xb)) + + dhat0[I[[b]]] <- get_dhat0(XZ0, DZ0, Xb, dreg0, DZ0factor = as.factor(DZ0)) + dhat1[I[[b]]] <- get_dhat1(XZ1, DZ1, Xb, dreg1, DZ1factor = as.factor(DZ1)) } else if (method == "boostedtrees") { - XZ0 = as.data.frame(XZ0) - XZ1 = as.data.frame(XZ1) - Xb = as.data.frame(Xb) - Xnotb = as.data.frame(Xnotb) - - yfit0 <- yreg0((XZ0), yZ0) - yhat0[I[[b]]] <- predict(yfit0, (Xb)) # default type = "response" for boosted trees - yfit1 <- yreg1((XZ1), yZ1) - yhat1[I[[b]]] <- predict(yfit1, (Xb)) - - if (mean(DZ0) > 0) { - dreg0_ <- dreg0 - dfit0 <- dreg0_((XZ0), DZ0) - dhat0[I[[b]]] <- predict(dfit0, (Xb), type = "response") # default for boosted trees is log odds. 
- } - if (mean(DZ1) < 1) { - dreg1_ <- dreg1 - dfit1 <- dreg1_((XZ1), DZ1) - dhat1[I[[b]]] <- predict(dfit1, (Xb), type = "response") - } else { - dhat1[I[[b]]] <- 1 - } + XZ0 <- as.data.frame(XZ0) + XZ1 <- as.data.frame(XZ1) + Xb <- as.data.frame(Xb) + Xnotb <- as.data.frame(Xnotb) + + yfit0 <- yreg0((XZ0), yZ0) + yhat0[I[[b]]] <- predict(yfit0, (Xb)) # default type = "response" for boosted trees + yfit1 <- yreg1((XZ1), yZ1) + yhat1[I[[b]]] <- predict(yfit1, (Xb)) + + dhat0[I[[b]]] <- get_dhat0(XZ0, DZ0, Xb, dreg0, type = "response") + dhat1[I[[b]]] <- get_dhat1(XZ1, DZ1, Xb, dreg1, type = "response") } # propensity scores: - if (method == "regression"){ + if (method == "regression") { zfit_b <- zreg((Xnotb), Znotb) - zhat_b <- predict(zfit_b, (Xb), type="response") - } else if (method == "randomforest"){ + zhat_b <- predict(zfit_b, (Xb), type = "response") + } else if (method == "randomforest") { zfit_b <- zreg((Xnotb), Znotb) - zhat_b <- predict(zfit_b, (Xb), type = "prob")[,2] - } else if (method == "decisiontrees"){ + zhat_b <- predict(zfit_b, (Xb), type = "prob")[, 2] + } else if (method == "decisiontrees") { zfit_b <- zreg((Xnotb), as.factor(Znotb)) zhat_b <- predict(zfit_b, (Xb)) # default is prob, so get second column - zhat_b = zhat_b[,2] - } else if (method == "boostedtrees"){ + zhat_b <- zhat_b[, 2] + } else if (method == "boostedtrees") { zfit_b <- zreg((Xnotb), Znotb) zhat_b <- predict(zfit_b, (Xb), type = "response") } zhat_b <- pmax(pmin(zhat_b, 1 - trimming), trimming) # trimming so scores are between [trimming, (1-trimming)] zhat[I[[b]]] <- zhat_b - cat(b," ") + cat(b, " ") } @@ -640,37 +705,40 @@ DML2.for.IIVM <- function(x, d, z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming dhat <- dhat0 * (1 - Z) + dhat1 * Z # residuals - ytil <- y-yhat - dtil <- D-dhat - ztil <- Z-zhat + ytil <- y - yhat + dtil <- D - dhat + ztil <- Z - zhat # doubly robust quantity for every sample HZ <- Z / zhat - (1 - Z) / (1 - zhat) drZ <- yhat1 - yhat0 + (y - yhat) * HZ drD <- dhat1 - dhat0 + (D - dhat) * HZ - coef.est <- mean(drZ) / mean(drD) - cat("point", coef.est) - psi <- drZ - coef.est * drD + coef_est <- mean(drZ) / mean(drD) + cat("point", coef_est) + psi <- drZ - coef_est * drD Jhat <- mean(drD) variance <- mean(psi^2) / Jhat^2 se <- sqrt(variance / nrow(X)) cat("se", se) - return(list(coef.est = coef.est, se = se, yhat = yhat, dhat = dhat, zhat = zhat, ytil = ytil, dtil = dtil, ztil = ztil, drZ = drZ, drD = drD, yhat0 = yhat0, yhat1 = yhat1, dhat0 = dhat0, dhat1 = dhat1)) + return(list(coef_est = coef_est, se = se, yhat = yhat, dhat = dhat, zhat = zhat, ytil = ytil, + dtil = dtil, ztil = ztil, drZ = drZ, drD = drD, + yhat0 = yhat0, yhat1 = yhat1, dhat0 = dhat0, dhat1 = dhat1)) } ``` ```{r} -summary <- function(coef.est, se, yhat, dhat, zhat, ytil, dtil, ztil, drZ, drD, X, Z, D, y, name) { - summary_data <- data.frame(estimate = coef.est, # point estimate - se = se, # standard error - lower = coef.est - 1.96 * se, # lower end of 95% confidence interval - upper = coef.est + 1.96 * se, # upper end of 95% confidence interval - rmse_y = sqrt(mean(ytil^2)), # res of model that predicts outcome y - rmse_D = sqrt(mean(dtil^2)), # res of model that predicts treatment D - rmse_Z = sqrt(mean(ztil^2)), # res of model that predicts instrument Z - accuracy_D = mean(abs(dtil) < 0.5), # binary classification accuracy of model for D - accuracy_Z = mean(abs(ztil) < 0.5) # binary classification accuracy of model for Z +summary_for_iivm <- function(coef_est, se, yhat, dhat, zhat, ytil, dtil, ztil, drZ, 
drD, X, Z, D, y, name) { + summary_data <- data.frame( + estimate = coef_est, # point estimate + se = se, # standard error + lower = coef_est - 1.96 * se, # lower end of 95% confidence interval + upper = coef_est + 1.96 * se, # upper end of 95% confidence interval + rmse_y = sqrt(mean(ytil^2)), # res of model that predicts outcome y + rmse_D = sqrt(mean(dtil^2)), # res of model that predicts treatment D + rmse_Z = sqrt(mean(ztil^2)), # res of model that predicts instrument Z + accuracy_D = mean(abs(dtil) < 0.5), # binary classification accuracy of model for D + accuracy_Z = mean(abs(ztil) < 0.5) # binary classification accuracy of model for Z ) row.names(summary_data) <- name return(summary_data) @@ -682,25 +750,38 @@ summary <- function(coef.est, se, yhat, dhat, zhat, ytil, dtil, ztil, drZ, drD, set.seed(123) cat(sprintf("\nDML with Lasso/Logistic \n")) # DML with Lasso/Ridge -dreg0 <- function(x,d){cv.glmnet(x, d, family="binomial", alpha=0, nfolds=5)} -dreg1 <- function(x,d){cv.glmnet(x, d, family="binomial", alpha=0, nfolds=5)} -yreg0 <- function(x,y){cv.glmnet(x, y, family="gaussian", alpha=1, nfolds=5)} -yreg1 <- function(x,y){cv.glmnet(x, y, family="gaussian", alpha=1, nfolds=5)} -zreg <- function(x,z){cv.glmnet(x, z, family="binomial", alpha=0, nfolds=5)} - -DML2.results <- DML2.for.IIVM(as.matrix(X), D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method="regression") -sum.lasso_ridge.cv <-summary(DML2.results$coef.est, DML2.results$se, DML2.results$yhat, DML2.results$dhat, DML2.results$zhat, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, DML2.results$drZ, DML2.results$drD, name = 'LassoCV/LogisticCV') +dreg0 <- function(x, d) { + cv.glmnet(x, d, family = "binomial", alpha = 0, nfolds = 5) +} +dreg1 <- function(x, d) { + cv.glmnet(x, d, family = "binomial", alpha = 0, nfolds = 5) +} +yreg0 <- function(x, y) { + cv.glmnet(x, y, family = "gaussian", alpha = 1, nfolds = 5) +} +yreg1 <- function(x, y) { + cv.glmnet(x, y, family = "gaussian", alpha = 1, nfolds = 5) +} +zreg <- function(x, z) { + cv.glmnet(x, z, family = "binomial", alpha = 0, nfolds = 5) +} + +dml2_results <- dml2_for_iivm(as.matrix(X), D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, + trimming = 0.01, nfold = 5, method = "regression") +sum_lasso_ridge_cv <- summary_for_iivm(dml2_results$coef_est, dml2_results$se, dml2_results$yhat, dml2_results$dhat, + dml2_results$zhat, dml2_results$ytil, dml2_results$dtil, dml2_results$ztil, + dml2_results$drZ, dml2_results$drD, name = "LassoCV/LogisticCV") table <- data.frame() -table <- rbind(table, sum.lasso_ridge.cv) +table <- rbind(table, sum_lasso_ridge_cv) table -yhat.lasso = DML2.results$yhat -dhat.lasso = DML2.results$dhat -yhat0.lasso = DML2.results$yhat0 -yhat1.lasso = DML2.results$yhat1 -dhat0.lasso = DML2.results$dhat0 -dhat1.lasso = DML2.results$dhat1 -zhat.lasso = DML2.results$zhat +yhat_lasso <- dml2_results$yhat +dhat_lasso <- dml2_results$dhat +yhat0_lasso <- dml2_results$yhat0 +yhat1_lasso <- dml2_results$yhat1 +dhat0_lasso <- dml2_results$dhat0 +dhat1_lasso <- dml2_results$dhat1 +zhat_lasso <- dml2_results$zhat ``` ```{r} @@ -708,24 +789,37 @@ zhat.lasso = DML2.results$zhat set.seed(123) cat(sprintf("\nDML with Random Forest \n")) -dreg0 <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest -dreg1 <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest -yreg0 <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest -yreg1 <- function(x,y){randomForest(x, y, 
ntree=1000, nodesize=10)} #ML method=Forest -zreg <- function(x,z){randomForest(x, z, ntree=1000, nodesize=10)} #ML method=Forest - -DML2.results <- DML2.for.IIVM(X,D,Z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method="randomforest") -sum.rf <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$yhat, DML2.results$dhat, DML2.results$zhat, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, DML2.results$drZ, DML2.results$drD, name = 'RF') -table <- rbind(table, sum.rf) +dreg0 <- function(x, d) { + randomForest(x, d, ntree = 1000, nodesize = 10) +} # ML method=Forest +dreg1 <- function(x, d) { + randomForest(x, d, ntree = 1000, nodesize = 10) +} # ML method=Forest +yreg0 <- function(x, y) { + randomForest(x, y, ntree = 1000, nodesize = 10) +} # ML method=Forest +yreg1 <- function(x, y) { + randomForest(x, y, ntree = 1000, nodesize = 10) +} # ML method=Forest +zreg <- function(x, z) { + randomForest(x, z, ntree = 1000, nodesize = 10) +} # ML method=Forest + +dml2_results <- dml2_for_iivm(X, D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, + trimming = 0.01, nfold = 5, method = "randomforest") +sum_rf <- summary_for_iivm(dml2_results$coef_est, dml2_results$se, dml2_results$yhat, dml2_results$dhat, + dml2_results$zhat, dml2_results$ytil, dml2_results$dtil, dml2_results$ztil, + dml2_results$drZ, dml2_results$drD, name = "RF") +table <- rbind(table, sum_rf) table -yhat.rf = DML2.results$yhat -dhat.rf = DML2.results$dhat -yhat0.rf = DML2.results$yhat0 -yhat1.rf = DML2.results$yhat1 -dhat0.rf = DML2.results$dhat0 -dhat1.rf = DML2.results$dhat1 -zhat.rf = DML2.results$zhat +yhat_rf <- dml2_results$yhat +dhat_rf <- dml2_results$dhat +yhat0_rf <- dml2_results$yhat0 +yhat1_rf <- dml2_results$yhat1 +dhat0_rf <- dml2_results$dhat0 +dhat1_rf <- dml2_results$dhat1 +zhat_rf <- dml2_results$zhat ``` ```{r} @@ -733,24 +827,37 @@ zhat.rf = DML2.results$zhat set.seed(123) cat(sprintf("\nDML with Decision Trees \n")) -dreg0 <- function(x,d){rpart(as.formula("D~."), cbind(data.frame(D=d),x), method = "class", minbucket=10, cp = 0.001)} -dreg1 <- function(x,d){rpart(as.formula("D~."), cbind(data.frame(D=d),x), method = "class", minbucket=10, cp = 0.001)} -yreg0 <- function(x,y){rpart(as.formula("y~."), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)} -yreg1 <- function(x,y){rpart(as.formula("y~."), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)} -zreg <- function(x,z){rpart(as.formula("Z~."), cbind(data.frame(Z=z),x), method = "class", minbucket=10, cp = 0.001)} +dreg0 <- function(x, d) { + rpart(as.formula("D ~ ."), cbind(data.frame(D = d), x), method = "class", minbucket = 10, cp = 0.001) +} +dreg1 <- function(x, d) { + rpart(as.formula("D ~ ."), cbind(data.frame(D = d), x), method = "class", minbucket = 10, cp = 0.001) +} +yreg0 <- function(x, y) { + rpart(as.formula("y ~ ."), cbind(data.frame(y = y), x), minbucket = 10, cp = 0.001) +} +yreg1 <- function(x, y) { + rpart(as.formula("y ~ ."), cbind(data.frame(y = y), x), minbucket = 10, cp = 0.001) +} +zreg <- function(x, z) { + rpart(as.formula("Z ~ ."), cbind(data.frame(Z = z), x), method = "class", minbucket = 10, cp = 0.001) +} -DML2.results <- DML2.for.IIVM(X, D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method="decisiontrees") -sum.tr <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$yhat, DML2.results$dhat, DML2.results$zhat, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, DML2.results$drZ, DML2.results$drD, name = 'Decision Trees') -table <- rbind(table, sum.tr) 
+dml2_results <- dml2_for_iivm(X, D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, + trimming = 0.01, nfold = 5, method = "decisiontrees") +sum_tr <- summary_for_iivm(dml2_results$coef_est, dml2_results$se, dml2_results$yhat, dml2_results$dhat, + dml2_results$zhat, dml2_results$ytil, dml2_results$dtil, dml2_results$ztil, + dml2_results$drZ, dml2_results$drD, name = "Decision Trees") +table <- rbind(table, sum_tr) table -yhat.tr = DML2.results$yhat -dhat.tr = DML2.results$dhat -yhat0.tr = DML2.results$yhat0 -yhat1.tr = DML2.results$yhat1 -dhat0.tr = DML2.results$dhat0 -dhat1.tr = DML2.results$dhat1 -zhat.tr = DML2.results$zhat +yhat_tr <- dml2_results$yhat +dhat_tr <- dml2_results$dhat +yhat0_tr <- dml2_results$yhat0 +yhat1_tr <- dml2_results$yhat1 +dhat0_tr <- dml2_results$dhat0 +dhat1_tr <- dml2_results$dhat1 +zhat_tr <- dml2_results$zhat ``` ```{r} @@ -759,26 +866,44 @@ set.seed(123) cat(sprintf("\nDML with Boosted Trees \n")) # NB: early stopping cannot easily be implemented with gbm -## set n.trees = best, where best <- gbm.perf(dreg.boost, plot.it = FALSE) -dreg0 <- function(x,d){gbm(as.formula("D~."), cbind(data.frame(D=d),x), distribution= "bernoulli", interaction.depth=2, n.trees=100, shrinkage=.1)} -dreg1 <- function(x,d){gbm(as.formula("D~."), cbind(data.frame(D=d),x), distribution= "bernoulli", interaction.depth=2, n.trees=100, shrinkage=.1)} -yreg0 <- function(x,y){gbm(as.formula("y~."), cbind(data.frame(y=y),x), distribution= "gaussian", interaction.depth=2, n.trees=100, shrinkage=.1)} -yreg1 <- function(x,y){gbm(as.formula("y~."), cbind(data.frame(y=y),x), distribution= "gaussian", interaction.depth=2, n.trees=100, shrinkage=.1)} -zreg <- function(x,z){gbm(as.formula("Z~."), cbind(data.frame(Z=z),x), distribution= "bernoulli", interaction.depth=2, n.trees=100, shrinkage=.1)} +## set n.trees = best, where best <- gbm.perf(dreg_boost, plot.it = FALSE) +dreg0 <- function(x, d) { + gbm(as.formula("D ~ ."), cbind(data.frame(D = d), x), distribution = "bernoulli", + interaction.depth = 2, n.trees = 100, shrinkage = .1) +} +dreg1 <- function(x, d) { + gbm(as.formula("D ~ ."), cbind(data.frame(D = d), x), distribution = "bernoulli", + interaction.depth = 2, n.trees = 100, shrinkage = .1) +} +yreg0 <- function(x, y) { + gbm(as.formula("y ~ ."), cbind(data.frame(y = y), x), distribution = "gaussian", + interaction.depth = 2, n.trees = 100, shrinkage = .1) +} +yreg1 <- function(x, y) { + gbm(as.formula("y ~ ."), cbind(data.frame(y = y), x), distribution = "gaussian", + interaction.depth = 2, n.trees = 100, shrinkage = .1) +} +zreg <- function(x, z) { + gbm(as.formula("Z ~ ."), cbind(data.frame(Z = z), x), distribution = "bernoulli", + interaction.depth = 2, n.trees = 100, shrinkage = .1) +} # passing these through regression as type="response", and D and Z should not be factors! 
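# Note on the gbm interface (an assumption about the package, not part of the original code):
# with distribution = "bernoulli", gbm() expects a numeric 0/1 response rather than a factor,
# and predict(..., type = "response") returns probabilities (the default "link" scale is log-odds);
# this is why D and Z are kept numeric in the learners defined above.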
-DML2.results <- DML2.for.IIVM(X, D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, trimming=0.01, nfold=5, method="boostedtrees") -sum.boost <- summary(DML2.results$coef.est, DML2.results$se, DML2.results$yhat, DML2.results$dhat, DML2.results$zhat, DML2.results$ytil, DML2.results$dtil, DML2.results$ztil, DML2.results$drZ, DML2.results$drD, name = 'Boosted Trees') -table <- rbind(table, sum.boost) +dml2_results <- dml2_for_iivm(X, D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, + trimming = 0.01, nfold = 5, method = "boostedtrees") +sum_boost <- summary_for_iivm(dml2_results$coef_est, dml2_results$se, dml2_results$yhat, dml2_results$dhat, + dml2_results$zhat, dml2_results$ytil, dml2_results$dtil, dml2_results$ztil, + dml2_results$drZ, dml2_results$drD, name = "Boosted Trees") +table <- rbind(table, sum_boost) table -yhat.boost = DML2.results$yhat -dhat.boost = DML2.results$dhat -yhat0.boost = DML2.results$yhat0 -yhat1.boost = DML2.results$yhat1 -dhat0.boost = DML2.results$dhat0 -dhat1.boost = DML2.results$dhat1 -zhat.boost = DML2.results$zhat +yhat_boost <- dml2_results$yhat +dhat_boost <- dml2_results$dhat +yhat0_boost <- dml2_results$yhat0 +yhat1_boost <- dml2_results$yhat1 +dhat0_boost <- dml2_results$dhat0 +dhat1_boost <- dml2_results$dhat1 +zhat_boost <- dml2_results$zhat ``` ## Ensembles @@ -788,32 +913,33 @@ Boosted trees give the best RMSE for D and Z and random forests give the best RM ```{r} # Best fit is boosted trees for D, Z and random forests for Y -best.yhat0 <- yhat0.rf -best.yhat1 <- yhat1.rf -best.yhat <- yhat.rf +best_yhat0 <- yhat0_rf +best_yhat1 <- yhat1_rf +best_yhat <- yhat_rf -best.dhat0 <- dhat0.boost -best.dhat1 <- dhat1.boost -best.dhat <- dhat.boost +best_dhat0 <- dhat0_boost +best_dhat1 <- dhat1_boost +best_dhat <- dhat_boost -best.zhat <- zhat.boost +best_zhat <- zhat_boost -ytil.best <- y - best.yhat -dtil.best <- D - best.dhat -ztil.best <- Z - best.zhat +ytil_best <- y - best_yhat +dtil_best <- D - best_dhat +ztil_best <- Z - best_zhat # doubly robust quantity for every sample -HZ <- Z / best.zhat - (1 - Z) / (1 - best.zhat) -drZ <- best.yhat1 - best.yhat0 + (y - best.yhat) * HZ -drD <- best.dhat1 - best.dhat0 + (D - best.dhat) * HZ -coef.est <- mean(drZ) / mean(drD) -psi <- drZ - coef.est * drD +HZ <- Z / best_zhat - (1 - Z) / (1 - best_zhat) +drZ <- best_yhat1 - best_yhat0 + (y - best_yhat) * HZ +drD <- best_dhat1 - best_dhat0 + (D - best_dhat) * HZ +coef_est <- mean(drZ) / mean(drD) +psi <- drZ - coef_est * drD Jhat <- mean(drD) variance <- mean(psi^2) / Jhat^2 se <- sqrt(variance / nrow(X)) -sum.best <- summary(coef.est, se, best.yhat, best.dhat, best.zhat, ytil.best, dtil.best, ztil.best, drZ, drD, name = 'Best') -table <- rbind(table, sum.best) +sum_best <- summary_for_iivm(coef_est, se, best_yhat, best_dhat, best_zhat, + ytil_best, dtil_best, ztil_best, drZ, drD, name = "Best") +table <- rbind(table, sum_best) table ``` @@ -821,37 +947,38 @@ We'll form a model average with unconstrained least squares weights. 
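In other words (a sketch written in terms of the cross-fitted predictions already computed above), the weights for, say, the treatment equation solve an unconstrained least-squares problem

\begin{align}
\hat{w}^{D} = \arg\min_{w \in \mathbb{R}^{4}} \sum_{i=1}^{n} \Big(D_i - \sum_{k=1}^{4} w_k \hat{D}_i^{(k)}\Big)^2,
\end{align}

where $k$ indexes the four learners (Lasso, random forest, trees, boosting) and $w$ is restricted neither in sign nor to sum to one. The same weights are then applied to the $Z=0$ and $Z=1$ arms of each first stage, and analogous weights are fit for $y$ and $Z$.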
```{r} # Least squares model average -ma.dcoef <- lm(D~dhat.lasso+dhat.rf+dhat.tr+dhat.boost-1)$coef -ma.ycoef <- lm(y~yhat.lasso+yhat.rf+yhat.tr+yhat.boost-1)$coef -ma.zcoef <- lm(Z~zhat.lasso+zhat.rf+zhat.tr+zhat.boost-1)$coef +ma_dcoef <- lm(D ~ dhat_lasso + dhat_rf + dhat_tr + dhat_boost - 1)$coef +ma_ycoef <- lm(y ~ yhat_lasso + yhat_rf + yhat_tr + yhat_boost - 1)$coef +ma_zcoef <- lm(Z ~ zhat_lasso + zhat_rf + zhat_tr + zhat_boost - 1)$coef -ma.yhat0 <- cbind(yhat0.lasso,yhat0.rf,yhat0.tr,yhat0.boost)%*%as.matrix(ma.ycoef) -ma.yhat1 <- cbind(yhat1.lasso,yhat1.rf,yhat1.tr,yhat1.boost)%*%as.matrix(ma.ycoef) -ma.dhat0 <- cbind(dhat0.lasso,dhat0.rf,dhat0.tr,dhat0.boost)%*%as.matrix(ma.dcoef) -ma.dhat1 <- cbind(dhat1.lasso,dhat1.rf,dhat1.tr,dhat1.boost)%*%as.matrix(ma.dcoef) -ma.zhat <- cbind(zhat.lasso,zhat.rf,zhat.tr,zhat.boost)%*%as.matrix(ma.zcoef) +ma_yhat0 <- cbind(yhat0_lasso, yhat0_rf, yhat0_tr, yhat0_boost) %*% as.matrix(ma_ycoef) +ma_yhat1 <- cbind(yhat1_lasso, yhat1_rf, yhat1_tr, yhat1_boost) %*% as.matrix(ma_ycoef) +ma_dhat0 <- cbind(dhat0_lasso, dhat0_rf, dhat0_tr, dhat0_boost) %*% as.matrix(ma_dcoef) +ma_dhat1 <- cbind(dhat1_lasso, dhat1_rf, dhat1_tr, dhat1_boost) %*% as.matrix(ma_dcoef) +ma_zhat <- cbind(zhat_lasso, zhat_rf, zhat_tr, zhat_boost) %*% as.matrix(ma_zcoef) # Prediction of treatment and outcome for observed instrument -ma.yhat <- ma.yhat0 * (1 - Z) + ma.yhat1 * Z -ma.dhat <- ma.dhat0 * (1 - Z) + ma.dhat1 * Z +ma_yhat <- ma_yhat0 * (1 - Z) + ma_yhat1 * Z +ma_dhat <- ma_dhat0 * (1 - Z) + ma_dhat1 * Z # residuals -ma.ytil <- y-ma.yhat -ma.dtil <- D-ma.dhat -ma.ztil <- Z-ma.zhat +ma_ytil <- y - ma_yhat +ma_dtil <- D - ma_dhat +ma_ztil <- Z - ma_zhat # doubly robust quantity for every sample -HZ <- Z / ma.zhat - (1 - Z) / (1 - ma.zhat) -drZ <- ma.yhat1 - ma.yhat0 + (y - ma.yhat) * HZ -drD <- ma.dhat1 - ma.dhat0 + (D - ma.dhat) * HZ -coef.est <- mean(drZ) / mean(drD) -psi <- drZ - coef.est * drD +HZ <- Z / ma_zhat - (1 - Z) / (1 - ma_zhat) +drZ <- ma_yhat1 - ma_yhat0 + (y - ma_yhat) * HZ +drD <- ma_dhat1 - ma_dhat0 + (D - ma_dhat) * HZ +coef_est <- mean(drZ) / mean(drD) +psi <- drZ - coef_est * drD Jhat <- mean(drD) variance <- mean(psi^2) / Jhat^2 se <- sqrt(variance / nrow(X)) -sum.ma <- summary(coef.est, se, ma.yhat, ma.dhat, ma.zhat, ma.ytil, ma.dtil, ma.ztil, drZ, drD, name = 'Model Average') -table <- rbind(table, sum.ma) +sum_ma <- summary_for_iivm(coef_est, se, ma_yhat, ma_dhat, ma_zhat, + ma_ytil, ma_dtil, ma_ztil, drZ, drD, name = "Model Average") +table <- rbind(table, sum_ma) table ``` @@ -869,30 +996,35 @@ Again as before, ideally we would do (semi) cross-fitting with AutoML in order t As before, in the below analysis of robust inference, we choose Boosted Trees as they perform well in RMSE and accuracy on first-stage models. ```{r} -iivm_robust_inference <- function(point, stderr, yhat, Dhat, Zhat, resy, resD, resZ, drZ, drD, X, Z, D, y, grid, alpha = 0.05) { - # Inference in the partially linear IV model that is robust to weak identification. 
- # grid: grid of theta values to search over when trying to identify the confidence region - # alpha: confidence level - - n <- dim(X)[1] - thr <- qchisq(1 - alpha, df = 1) - accept <- c() - - for (theta in grid) { - moment <- drZ - theta * drD - test <- n * mean(moment)^2 / var(moment) - if (test <= thr) { - accept <- c(accept, theta) - } +iivm_robust_inference <- function(point, stderr, yhat, Dhat, Zhat, resy, resD, resZ, + drZ, drD, X, Z, D, y, grid, alpha = 0.05) { + # Inference in the partially linear IV model that is robust to weak identification. + # grid: grid of theta values to search over when trying to identify the confidence region + # alpha: confidence level + + n <- dim(X)[1] + thr <- qchisq(1 - alpha, df = 1) + accept <- c() + + for (theta in grid) { + moment <- drZ - theta * drD + test <- n * mean(moment)^2 / var(moment) + if (test <= thr) { + accept <- c(accept, theta) } + } - return(accept) + return(accept) } ``` ```{r} grid <- seq(0, 20000, length.out = 10000) -region <- iivm_robust_inference(point = DML2.results$coef.est, stderr = DML2.results$se, yhat = DML2.results$yhat, Dhat = DML2.results$dhat, Zhat = DML2.results$zhat, resy = DML2.results$ytil, resD = DML2.results$dtil, resZ = DML2.results$ztil, drZ = DML2.results$drZ, drD = DML2.results$drD, X=X, Z=Z, D=D, y=y, grid=grid) +region <- iivm_robust_inference(point = dml2_results$coef_est, stderr = dml2_results$se, + yhat = dml2_results$yhat, Dhat = dml2_results$dhat, Zhat = dml2_results$zhat, + resy = dml2_results$ytil, resD = dml2_results$dtil, resZ = dml2_results$ztil, + drZ = dml2_results$drZ, drD = dml2_results$drD, + X = X, Z = Z, D = D, y = y, grid = grid) # Calculate min and max min_region <- min(region) @@ -918,7 +1050,9 @@ install.packages("mlr3learners") install.packages("mlr3") install.packages("data.table") install.packages("ranger") +``` +```{r} library(DoubleML) library(mlr3learners) library(mlr3) @@ -932,37 +1066,42 @@ library(ranger) Now, we consider estimation of local average treatment effects (LATE) of participation with the binary instrument `e401`. As before, $Y$ denotes the outcome `net_tfa`, and $X$ is the vector of covariates. Here the structural equation model is: -\begin{eqnarray} -& Y = g_0(Z,X) + U, &\quad E[U\mid Z,X] = 0,\\ -& D = r_0(Z,X) + V, &\quad E[V\mid Z, X] = 0,\\ -& Z = m_0(X) + \zeta, &\quad E[\zeta \mid X] = 0. -\end{eqnarray} +\begin{align} +Y :=~& g_0(Z,X) + U, &\quad E[U\mid Z,X] = 0,\\ +D :=~& r_0(Z,X) + V, &\quad E[V\mid Z, X] = 0,\\ +Z :=~& m_0(X) + \zeta, &\quad E[\zeta \mid X] = 0. 
+\end{align} ```{r} # Constructing the data (as DoubleMLData) -formula_flex2 = "net_tfa ~ p401+ e401 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown" -model_flex2 = as.data.table(model.frame(formula_flex2, data)) -x_cols = colnames(model_flex2)[-c(1,2,3)] -data_IV = DoubleMLData$new(model_flex2, y_col = "net_tfa", d_cols = "p401", z_cols ="e401",x_cols=x_cols) +formula_flex2 <- paste("net_tfa ~ p401+ e401 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + ", + "poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown") +model_flex2 <- as.data.table(model.frame(formula_flex2, data)) +x_cols <- colnames(model_flex2)[-c(1, 2, 3)] +data_iv <- DoubleMLData$new(model_flex2, y_col = "net_tfa", d_cols = "p401", z_cols = "e401", x_cols = x_cols) ``` ```{r} lgr::get_logger("mlr3")$set_threshold("warn") -lasso <- lrn("regr.cv_glmnet",nfolds = 5, s = "lambda.min") +lasso <- lrn("regr.cv_glmnet", nfolds = 5, s = "lambda.min") lasso_class <- lrn("classif.cv_glmnet", nfolds = 5, s = "lambda.min") -dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = lasso, - ml_m = lasso_class, ml_r = lasso_class,n_folds=5, subgroups = list(always_takers = FALSE, - never_takers = TRUE)) -dml_MLIIVM$fit(store_predictions=TRUE) -dml_MLIIVM$summary() -lasso_MLIIVM <- dml_MLIIVM$coef -lasso_std_MLIIVM <- dml_MLIIVM$se +dml_mliivm <- DoubleMLIIVM$new(data_iv, + ml_g = lasso, + ml_m = lasso_class, ml_r = lasso_class, n_folds = 5, subgroups = list( + always_takers = FALSE, + never_takers = TRUE + ) +) +dml_mliivm$fit(store_predictions = TRUE) +dml_mliivm$summary() +lasso_mliivm <- dml_mliivm$coef +lasso_std_mliivm <- dml_mliivm$se ``` The confidence interval for the local average treatment effect of participation is given by ```{r} -dml_MLIIVM$confint(level = 0.95) +dml_mliivm$confint(level = 0.95) ``` Here we can also check the accuracy of the model: @@ -974,28 +1113,28 @@ d <- as.matrix(pension$p401) z <- as.matrix(pension$e401) # predictions -dml_MLIIVM$params_names() -g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(z=0, X) -g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(z=1, X) -g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0 -r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(z=0, X) -r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(z=1, X) -r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0 -m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o +dml_mliivm$params_names() +g0_hat <- as.matrix(dml_mliivm$predictions$ml_g0) # predictions of g_0(z=0, X) +g1_hat <- as.matrix(dml_mliivm$predictions$ml_g1) # predictions of g_0(z=1, X) +g_hat <- z * g1_hat + (1 - z) * g0_hat # predictions of g_0 +r0_hat <- as.matrix(dml_mliivm$predictions$ml_r0) # predictions of r_0(z=0, X) +r1_hat <- as.matrix(dml_mliivm$predictions$ml_r1) # predictions of r_0(z=1, X) +r_hat <- z * r1_hat + (1 - z) * r0_hat # predictions of r_0 +m_hat <- as.matrix(dml_mliivm$predictions$ml_m) # predictions of m_o ``` ```{r} # cross-fitted RMSE: outcome -lasso_y_MLIIVM <- sqrt(mean((y-g_hat)^2)) -lasso_y_MLIIVM +lasso_y_mliivm <- sqrt(mean((y - g_hat)^2)) +lasso_y_mliivm # cross-fitted RMSE: treatment -lasso_d_MLIIVM <- sqrt(mean((d-r_hat)^2)) -lasso_d_MLIIVM +lasso_d_mliivm <- sqrt(mean((d - r_hat)^2)) +lasso_d_mliivm # cross-fitted RMSE: instrument -lasso_z_MLIIVM <- sqrt(mean((z-m_hat)^2)) -lasso_z_MLIIVM +lasso_z_mliivm <- 
sqrt(mean((z - m_hat)^2)) +lasso_z_mliivm ``` Again, we repeat the procedure for the other machine learning methods: @@ -1005,6 +1144,9 @@ Again, we repeat the procedure for the other machine learning methods: remotes::install_github("mlr-org/mlr3extralearners") install.packages("mlr3extralearners") install.packages("mboost") +``` + +```{r} library(mlr3extralearners) library(mboost) ``` @@ -1012,7 +1154,7 @@ library(mboost) ```{r} # Forest randomForest <- lrn("regr.ranger") -randomForest_class <- lrn("classif.ranger") +random_forest_class <- lrn("classif.ranger") # Trees trees <- lrn("regr.rpart") @@ -1027,108 +1169,120 @@ boost_class <- lrn("classif.glmboost") ### random forest ### lgr::get_logger("mlr3")$set_threshold("warn") -dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = randomForest, - ml_m = randomForest_class, ml_r = randomForest_class,n_folds=3, subgroups = list(always_takers = FALSE, - never_takers = TRUE)) -dml_MLIIVM$fit(store_predictions=TRUE) -dml_MLIIVM$summary() -forest_MLIIVM <- dml_MLIIVM$coef -forest_std_MLIIVM <- dml_MLIIVM$se +dml_mliivm <- DoubleMLIIVM$new(data_iv, + ml_g = randomForest, + ml_m = random_forest_class, ml_r = random_forest_class, n_folds = 3, subgroups = list( + always_takers = FALSE, + never_takers = TRUE + ) +) +dml_mliivm$fit(store_predictions = TRUE) +dml_mliivm$summary() +forest_mliivm <- dml_mliivm$coef +forest_std_mliivm <- dml_mliivm$se # predictions -g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(Z=0, X) -g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(Z=1, X) -g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0 -r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(Z=0, X) -r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(Z=1, X) -r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0 -m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o +g0_hat <- as.matrix(dml_mliivm$predictions$ml_g0) # predictions of g_0(Z=0, X) +g1_hat <- as.matrix(dml_mliivm$predictions$ml_g1) # predictions of g_0(Z=1, X) +g_hat <- z * g1_hat + (1 - z) * g0_hat # predictions of g_0 +r0_hat <- as.matrix(dml_mliivm$predictions$ml_r0) # predictions of r_0(Z=0, X) +r1_hat <- as.matrix(dml_mliivm$predictions$ml_r1) # predictions of r_0(Z=1, X) +r_hat <- z * r1_hat + (1 - z) * r0_hat # predictions of r_0 +m_hat <- as.matrix(dml_mliivm$predictions$ml_m) # predictions of m_o # cross-fitted RMSE: outcome -forest_y_MLIIVM <- sqrt(mean((y-g_hat)^2)) -forest_y_MLIIVM +forest_y_mliivm <- sqrt(mean((y - g_hat)^2)) +forest_y_mliivm # cross-fitted RMSE: treatment -forest_d_MLIIVM <- sqrt(mean((d-r_hat)^2)) -forest_d_MLIIVM +forest_d_mliivm <- sqrt(mean((d - r_hat)^2)) +forest_d_mliivm # cross-fitted RMSE: instrument -forest_z_MLIIVM <- sqrt(mean((z-m_hat)^2)) -forest_z_MLIIVM +forest_z_mliivm <- sqrt(mean((z - m_hat)^2)) +forest_z_mliivm ### trees ### -dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = trees, - ml_m = trees_class, ml_r = trees_class,n_folds=3, subgroups = list(always_takers = FALSE, - never_takers = TRUE)) -dml_MLIIVM$fit(store_predictions=TRUE) -dml_MLIIVM$summary() -tree_MLIIVM <- dml_MLIIVM$coef -tree_std_MLIIVM <- dml_MLIIVM$se +dml_mliivm <- DoubleMLIIVM$new(data_iv, + ml_g = trees, + ml_m = trees_class, ml_r = trees_class, n_folds = 3, subgroups = list( + always_takers = FALSE, + never_takers = TRUE + ) +) +dml_mliivm$fit(store_predictions = TRUE) +dml_mliivm$summary() +tree_mliivm <- dml_mliivm$coef +tree_std_mliivm <- dml_mliivm$se # predictions -g0_hat <- 
as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(Z=0, X) -g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(Z=1, X) -g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0 -r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(Z=0, X) -r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(Z=1, X) -r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0 -m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o +g0_hat <- as.matrix(dml_mliivm$predictions$ml_g0) # predictions of g_0(Z=0, X) +g1_hat <- as.matrix(dml_mliivm$predictions$ml_g1) # predictions of g_0(Z=1, X) +g_hat <- z * g1_hat + (1 - z) * g0_hat # predictions of g_0 +r0_hat <- as.matrix(dml_mliivm$predictions$ml_r0) # predictions of r_0(Z=0, X) +r1_hat <- as.matrix(dml_mliivm$predictions$ml_r1) # predictions of r_0(Z=1, X) +r_hat <- z * r1_hat + (1 - z) * r0_hat # predictions of r_0 +m_hat <- as.matrix(dml_mliivm$predictions$ml_m) # predictions of m_o # cross-fitted RMSE: outcome -tree_y_MLIIVM <- sqrt(mean((y-g_hat)^2)) -tree_y_MLIIVM +tree_y_mliivm <- sqrt(mean((y - g_hat)^2)) +tree_y_mliivm # cross-fitted RMSE: treatment -tree_d_MLIIVM <- sqrt(mean((d-r_hat)^2)) -tree_d_MLIIVM +tree_d_mliivm <- sqrt(mean((d - r_hat)^2)) +tree_d_mliivm # cross-fitted RMSE: instrument -tree_z_MLIIVM <- sqrt(mean((z-m_hat)^2)) -tree_z_MLIIVM +tree_z_mliivm <- sqrt(mean((z - m_hat)^2)) +tree_z_mliivm ### boosting ### -dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = boost, - ml_m = boost_class, ml_r = boost_class,n_folds=3, subgroups = list(always_takers = FALSE, - never_takers = TRUE)) -dml_MLIIVM$fit(store_predictions=TRUE) -dml_MLIIVM$summary() -boost_MLIIVM <- dml_MLIIVM$coef -boost_std_MLIIVM <- dml_MLIIVM$se +dml_mliivm <- DoubleMLIIVM$new(data_iv, + ml_g = boost, + ml_m = boost_class, ml_r = boost_class, n_folds = 3, subgroups = list( + always_takers = FALSE, + never_takers = TRUE + ) +) +dml_mliivm$fit(store_predictions = TRUE) +dml_mliivm$summary() +boost_mliivm <- dml_mliivm$coef +boost_std_mliivm <- dml_mliivm$se # predictions -g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(Z=0, X) -g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(Z=1, X) -g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0 -r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(Z=0, X) -r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(Z=1, X) -r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0 -m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o +g0_hat <- as.matrix(dml_mliivm$predictions$ml_g0) # predictions of g_0(Z=0, X) +g1_hat <- as.matrix(dml_mliivm$predictions$ml_g1) # predictions of g_0(Z=1, X) +g_hat <- z * g1_hat + (1 - z) * g0_hat # predictions of g_0 +r0_hat <- as.matrix(dml_mliivm$predictions$ml_r0) # predictions of r_0(Z=0, X) +r1_hat <- as.matrix(dml_mliivm$predictions$ml_r1) # predictions of r_0(Z=1, X) +r_hat <- z * r1_hat + (1 - z) * r0_hat # predictions of r_0 +m_hat <- as.matrix(dml_mliivm$predictions$ml_m) # predictions of m_o # cross-fitted RMSE: outcome -boost_y_MLIIVM <- sqrt(mean((y-g_hat)^2)) -boost_y_MLIIVM +boost_y_mliivm <- sqrt(mean((y - g_hat)^2)) +boost_y_mliivm # cross-fitted RMSE: treatment -boost_d_MLIIVM <- sqrt(mean((d-r_hat)^2)) -boost_d_MLIIVM +boost_d_mliivm <- sqrt(mean((d - r_hat)^2)) +boost_d_mliivm # cross-fitted RMSE: instrument -boost_z_MLIIVM <- sqrt(mean((z-m_hat)^2)) -boost_z_MLIIVM +boost_z_mliivm <- sqrt(mean((z - m_hat)^2)) +boost_z_mliivm ``` ```{r} 
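# Collect the results across learners: rows hold the point estimate, standard error, and
# cross-fitted RMSEs for Y, D and Z; columns are Lasso, Random Forest, Trees and Boosting.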
table <- matrix(0, 5, 4) -table[1,1:4] <- c(lasso_MLIIVM,forest_MLIIVM,tree_MLIIVM,boost_MLIIVM) -table[2,1:4] <- c(lasso_std_MLIIVM,forest_std_MLIIVM,tree_std_MLIIVM,boost_std_MLIIVM) -table[3,1:4] <- c(lasso_y_MLIIVM,forest_y_MLIIVM,tree_y_MLIIVM,boost_y_MLIIVM) -table[4,1:4] <- c(lasso_d_MLIIVM,forest_d_MLIIVM,tree_d_MLIIVM,boost_d_MLIIVM) -table[5,1:4] <- c(lasso_z_MLIIVM,forest_z_MLIIVM,tree_z_MLIIVM,boost_z_MLIIVM) -rownames(table) <- c("Estimate","Std.Error","RMSE Y","RMSE D","RMSE Z") -colnames(table) <- c("Lasso","Random Forest","Trees","Boosting") -tab<- xtable(table, digits = 2) +table[1, 1:4] <- c(lasso_mliivm, forest_mliivm, tree_mliivm, boost_mliivm) +table[2, 1:4] <- c(lasso_std_mliivm, forest_std_mliivm, tree_std_mliivm, boost_std_mliivm) +table[3, 1:4] <- c(lasso_y_mliivm, forest_y_mliivm, tree_y_mliivm, boost_y_mliivm) +table[4, 1:4] <- c(lasso_d_mliivm, forest_d_mliivm, tree_d_mliivm, boost_d_mliivm) +table[5, 1:4] <- c(lasso_z_mliivm, forest_z_mliivm, tree_z_mliivm, boost_z_mliivm) +rownames(table) <- c("Estimate", "Std.Error", "RMSE Y", "RMSE D", "RMSE Z") +colnames(table) <- c("Lasso", "Random Forest", "Trees", "Boosting") +tab <- xtable(table, digits = 2) tab ``` @@ -1140,12 +1294,16 @@ We might rerun the model using the best ML method for each equation to get a fin ```{r} lgr::get_logger("mlr3")$set_threshold("warn") -dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = randomForest, - ml_m = lasso_class, ml_r = lasso_class,n_folds=5, subgroups = list(always_takers = FALSE, - never_takers = TRUE)) -dml_MLIIVM$fit(store_predictions=TRUE) -dml_MLIIVM$summary() -best_MLIIVM <- dml_MLIIVM$coef -best_std_MLIIVM <- dml_MLIIVM$se +dml_mliivm <- DoubleMLIIVM$new(data_iv, + ml_g = randomForest, + ml_m = lasso_class, ml_r = lasso_class, n_folds = 5, subgroups = list( + always_takers = FALSE, + never_takers = TRUE + ) +) +dml_mliivm$fit(store_predictions = TRUE) +dml_mliivm$summary() +best_mliivm <- dml_mliivm$coef +best_std_mliivm <- dml_mliivm$se ``` diff --git a/AC2/r-dml-401k-IV.irnb b/AC2/r-dml-401k-IV.irnb index bb1f40ad..213fda6c 100644 --- a/AC2/r-dml-401k-IV.irnb +++ b/AC2/r-dml-401k-IV.irnb @@ -67,7 +67,7 @@ { "cell_type": "code", "execution_count": null, - "id": "98d3d8e0", + "id": "3", "metadata": { "vscode": { "languageId": "r" @@ -90,7 +90,7 @@ }, { "cell_type": "markdown", - "id": "3", + "id": "4", "metadata": { "id": "7e23cba0", "papermill": { @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4", + "id": "5", "metadata": { "id": "c442abdc", "papermill": { @@ -137,7 +137,7 @@ }, { "cell_type": "markdown", - "id": "5", + "id": "6", "metadata": { "id": "e47fa9d3", "papermill": { @@ -156,7 +156,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6", + "id": "7", "metadata": { "id": "00e04b82", "papermill": { @@ -178,7 +178,7 @@ }, { "cell_type": "markdown", - "id": "7", + "id": "8", "metadata": { "id": "24b41e4a", "papermill": { @@ -196,7 +196,7 @@ }, { "cell_type": "markdown", - "id": "8", + "id": "9", "metadata": { "id": "ed9d4e82", "papermill": { @@ -215,7 +215,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9", + "id": "10", "metadata": { "id": "63519184", "papermill": { @@ -239,7 +239,7 @@ }, { "cell_type": "markdown", - "id": "10", + "id": "11", "metadata": { "id": "823d2628", "papermill": { @@ -258,7 +258,7 @@ { "cell_type": "code", "execution_count": null, - "id": "11", + "id": "12", "metadata": { "id": "5d8faf9c", "papermill": { @@ -285,7 +285,7 @@ }, { "cell_type": "markdown", - "id": "12", + "id": "13", 
"metadata": { "id": "0f4f86a7", "papermill": { @@ -304,7 +304,7 @@ { "cell_type": "code", "execution_count": null, - "id": "13", + "id": "14", "metadata": { "id": "836c6af7", "papermill": { @@ -328,7 +328,7 @@ }, { "cell_type": "markdown", - "id": "14", + "id": "15", "metadata": { "id": "22b09926", "papermill": { @@ -347,7 +347,7 @@ { "cell_type": "code", "execution_count": null, - "id": "15", + "id": "16", "metadata": { "id": "e78aaa58", "papermill": { @@ -371,7 +371,7 @@ }, { "cell_type": "markdown", - "id": "16", + "id": "17", "metadata": { "id": "e0af3c81", "papermill": { @@ -390,7 +390,7 @@ { "cell_type": "code", "execution_count": null, - "id": "17", + "id": "18", "metadata": { "id": "A03YWrvUW0Sm", "vscode": { @@ -409,7 +409,7 @@ }, { "cell_type": "markdown", - "id": "18", + "id": "19", "metadata": { "id": "RVUbOMRRWwBm" }, @@ -420,7 +420,7 @@ { "cell_type": "code", "execution_count": null, - "id": "19", + "id": "20", "metadata": { "id": "7vt1hbdBG8cb", "vscode": { @@ -438,7 +438,7 @@ }, { "cell_type": "markdown", - "id": "20", + "id": "21", "metadata": { "id": "yzNigd7YYVuA" }, @@ -448,7 +448,7 @@ }, { "cell_type": "markdown", - "id": "21", + "id": "22", "metadata": { "id": "FI2u5KU7YWIF" }, @@ -458,7 +458,7 @@ }, { "cell_type": "markdown", - "id": "22", + "id": "23", "metadata": { "id": "uhDK6Em_YWSm" }, @@ -487,7 +487,7 @@ { "cell_type": "code", "execution_count": null, - "id": "23", + "id": "24", "metadata": { "id": "bdUGB53AYf3S", "vscode": { @@ -519,7 +519,7 @@ }, { "cell_type": "markdown", - "id": "24", + "id": "25", "metadata": { "id": "Fw1ZxeBKZcRm" }, @@ -529,7 +529,7 @@ }, { "cell_type": "markdown", - "id": "25", + "id": "26", "metadata": { "id": "jnBOtXXuZnkz" }, @@ -540,7 +540,7 @@ { "cell_type": "code", "execution_count": null, - "id": "26", + "id": "27", "metadata": { "id": "K_vQlMYmz91I", "vscode": { @@ -616,7 +616,7 @@ { "cell_type": "code", "execution_count": null, - "id": "27", + "id": "28", "metadata": { "id": "puSCLNvofQxA", "vscode": { @@ -644,7 +644,7 @@ }, { "cell_type": "markdown", - "id": "28", + "id": "29", "metadata": { "id": "1Z5vrvrlbuPj" }, @@ -655,7 +655,7 @@ { "cell_type": "code", "execution_count": null, - "id": "29", + "id": "30", "metadata": { "id": "vBJm7BkUYgsG", "vscode": { @@ -693,7 +693,7 @@ }, { "cell_type": "markdown", - "id": "30", + "id": "31", "metadata": { "id": "pyrem2YniNls" }, @@ -704,7 +704,7 @@ { "cell_type": "code", "execution_count": null, - "id": "31", + "id": "32", "metadata": { "id": "FM6WvQXKYgxL", "vscode": { @@ -741,7 +741,7 @@ }, { "cell_type": "markdown", - "id": "32", + "id": "33", "metadata": { "id": "yfTdX3__jcwI" }, @@ -752,7 +752,7 @@ { "cell_type": "code", "execution_count": null, - "id": "33", + "id": "34", "metadata": { "id": "mMvJT6NZHW1_", "vscode": { @@ -789,7 +789,7 @@ }, { "cell_type": "markdown", - "id": "34", + "id": "35", "metadata": { "id": "4I1oVQutjeqE" }, @@ -800,7 +800,7 @@ { "cell_type": "code", "execution_count": null, - "id": "35", + "id": "36", "metadata": { "id": "ayrnTPeBHW88", "vscode": { @@ -838,7 +838,7 @@ }, { "cell_type": "markdown", - "id": "36", + "id": "37", "metadata": { "id": "h7Jo_WXUjgjb" }, @@ -849,7 +849,7 @@ { "cell_type": "code", "execution_count": null, - "id": "37", + "id": "38", "metadata": { "id": "nzlszy9zjiSy", "vscode": { @@ -892,7 +892,7 @@ }, { "cell_type": "markdown", - "id": "38", + "id": "39", "metadata": { "id": "oQpoYedAc4Ic" }, @@ -902,7 +902,7 @@ }, { "cell_type": "markdown", - "id": "39", + "id": "40", "metadata": { "id": "_LLsorarc8Mh" }, @@ -913,7 +913,7 @@ { 
"cell_type": "code", "execution_count": null, - "id": "40", + "id": "41", "metadata": { "id": "kAePILCadEVh", "vscode": { @@ -932,7 +932,7 @@ }, { "cell_type": "markdown", - "id": "41", + "id": "42", "metadata": { "id": "KaaDX4kkdIMx" }, @@ -943,7 +943,7 @@ { "cell_type": "code", "execution_count": null, - "id": "42", + "id": "43", "metadata": { "id": "mCsyY3iJdHm_", "vscode": { @@ -987,7 +987,7 @@ }, { "cell_type": "markdown", - "id": "43", + "id": "44", "metadata": { "id": "8OUusM2BpZH4" }, @@ -1004,7 +1004,7 @@ { "cell_type": "code", "execution_count": null, - "id": "44", + "id": "45", "metadata": { "id": "UeNF5j1ApYYy", "vscode": { @@ -1037,7 +1037,7 @@ { "cell_type": "code", "execution_count": null, - "id": "45", + "id": "46", "metadata": { "id": "X21PuuUnsa25", "vscode": { @@ -1054,7 +1054,7 @@ { "cell_type": "code", "execution_count": null, - "id": "46", + "id": "47", "metadata": { "id": "x-ZSzMkVqI45", "vscode": { @@ -1076,7 +1076,7 @@ }, { "cell_type": "markdown", - "id": "47", + "id": "48", "metadata": { "id": "nKQGPfXWIKmh" }, @@ -1086,7 +1086,7 @@ }, { "cell_type": "markdown", - "id": "48", + "id": "49", "metadata": { "id": "bCayhAlaINjL" }, @@ -1124,7 +1124,7 @@ { "cell_type": "code", "execution_count": null, - "id": "49", + "id": "50", "metadata": { "id": "rQYifUnFIt5z", "vscode": { @@ -1334,7 +1334,7 @@ { "cell_type": "code", "execution_count": null, - "id": "50", + "id": "51", "metadata": { "id": "iArB2WQHBXuV", "vscode": { @@ -1363,7 +1363,7 @@ { "cell_type": "code", "execution_count": null, - "id": "51", + "id": "52", "metadata": { "id": "Tj-8FFF3BXxV", "vscode": { @@ -1413,7 +1413,7 @@ { "cell_type": "code", "execution_count": null, - "id": "52", + "id": "53", "metadata": { "id": "sXjbvMbEkYJd", "vscode": { @@ -1462,7 +1462,7 @@ { "cell_type": "code", "execution_count": null, - "id": "53", + "id": "54", "metadata": { "id": "ZZRXpY8YkYNN", "vscode": { @@ -1511,7 +1511,7 @@ { "cell_type": "code", "execution_count": null, - "id": "54", + "id": "55", "metadata": { "id": "RYqykjPskYQJ", "vscode": { @@ -1567,7 +1567,7 @@ }, { "cell_type": "markdown", - "id": "55", + "id": "56", "metadata": { "id": "29vcyCYsktQ3" }, @@ -1577,7 +1577,7 @@ }, { "cell_type": "markdown", - "id": "56", + "id": "57", "metadata": { "id": "vuyfd9UJkw9G" }, @@ -1588,7 +1588,7 @@ { "cell_type": "code", "execution_count": null, - "id": "57", + "id": "58", "metadata": { "id": "Y9_T5SMUk3Rd", "vscode": { @@ -1631,7 +1631,7 @@ }, { "cell_type": "markdown", - "id": "58", + "id": "59", "metadata": { "id": "RyRS9zNUlMCF" }, @@ -1642,7 +1642,7 @@ { "cell_type": "code", "execution_count": null, - "id": "59", + "id": "60", "metadata": { "id": "1H4sCbO2lLpJ", "vscode": { @@ -1689,7 +1689,7 @@ }, { "cell_type": "markdown", - "id": "60", + "id": "61", "metadata": { "id": "UflbjTEG5SXV" }, @@ -1700,7 +1700,7 @@ { "cell_type": "code", "execution_count": null, - "id": "61", + "id": "62", "metadata": { "id": "CIS-58oi4sa1", "vscode": { @@ -1714,7 +1714,7 @@ }, { "cell_type": "markdown", - "id": "62", + "id": "63", "metadata": { "id": "M4Zi0FPH5VZG" }, @@ -1724,7 +1724,7 @@ }, { "cell_type": "markdown", - "id": "63", + "id": "64", "metadata": { "id": "VrBkj_pc5qgm" }, @@ -1738,7 +1738,7 @@ { "cell_type": "code", "execution_count": null, - "id": "64", + "id": "65", "metadata": { "id": "bj67nsgcCDoS", "vscode": { @@ -1772,7 +1772,7 @@ { "cell_type": "code", "execution_count": null, - "id": "65", + "id": "66", "metadata": { "id": "KqgPk1Jm4sdo", "vscode": { @@ -1798,7 +1798,7 @@ }, { "cell_type": "markdown", - "id": "66", + 
"id": "67", "metadata": { "id": "akCGDMZJCN3h" }, @@ -1808,7 +1808,7 @@ }, { "cell_type": "markdown", - "id": "67", + "id": "68", "metadata": { "id": "01de9f24", "papermill": { @@ -1833,7 +1833,7 @@ { "cell_type": "code", "execution_count": null, - "id": "68", + "id": "69", "metadata": { "id": "2846a36a", "papermill": { @@ -1860,7 +1860,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1bd37969", + "id": "70", "metadata": { "vscode": { "languageId": "r" @@ -1877,7 +1877,7 @@ }, { "cell_type": "markdown", - "id": "69", + "id": "71", "metadata": { "id": "2259ae1c", "papermill": { @@ -1895,7 +1895,7 @@ }, { "cell_type": "markdown", - "id": "70", + "id": "72", "metadata": { "id": "9c27e413", "papermill": { @@ -1913,7 +1913,7 @@ }, { "cell_type": "markdown", - "id": "71", + "id": "73", "metadata": { "id": "4fa23c70", "papermill": { @@ -1938,7 +1938,7 @@ { "cell_type": "code", "execution_count": null, - "id": "72", + "id": "74", "metadata": { "id": "cb223b75", "papermill": { @@ -1966,7 +1966,7 @@ { "cell_type": "code", "execution_count": null, - "id": "73", + "id": "75", "metadata": { "id": "e652ffad", "papermill": { @@ -2001,7 +2001,7 @@ }, { "cell_type": "markdown", - "id": "74", + "id": "76", "metadata": { "id": "63103667", "papermill": { @@ -2020,7 +2020,7 @@ { "cell_type": "code", "execution_count": null, - "id": "75", + "id": "77", "metadata": { "id": "322855c4", "papermill": { @@ -2042,7 +2042,7 @@ }, { "cell_type": "markdown", - "id": "76", + "id": "78", "metadata": { "id": "2965410d", "papermill": { @@ -2061,7 +2061,7 @@ { "cell_type": "code", "execution_count": null, - "id": "77", + "id": "79", "metadata": { "id": "1476fd27", "papermill": { @@ -2097,7 +2097,7 @@ { "cell_type": "code", "execution_count": null, - "id": "78", + "id": "80", "metadata": { "id": "444c53f4", "papermill": { @@ -2129,7 +2129,7 @@ }, { "cell_type": "markdown", - "id": "79", + "id": "81", "metadata": { "id": "a7461966", "papermill": { @@ -2148,7 +2148,7 @@ { "cell_type": "code", "execution_count": null, - "id": "80", + "id": "82", "metadata": { "id": "59YzwIcpEnyV", "vscode": { @@ -2166,7 +2166,7 @@ { "cell_type": "code", "execution_count": null, - "id": "66979b8b", + "id": "83", "metadata": { "vscode": { "languageId": "r" @@ -2181,7 +2181,7 @@ { "cell_type": "code", "execution_count": null, - "id": "81", + "id": "84", "metadata": { "id": "Ec0g3ch3EjAl", "vscode": { @@ -2206,7 +2206,7 @@ { "cell_type": "code", "execution_count": null, - "id": "82", + "id": "85", "metadata": { "id": "3935dfc5", "papermill": { @@ -2333,7 +2333,7 @@ { "cell_type": "code", "execution_count": null, - "id": "83", + "id": "86", "metadata": { "id": "7187fc74", "papermill": { @@ -2364,7 +2364,7 @@ }, { "cell_type": "markdown", - "id": "84", + "id": "87", "metadata": { "id": "f4ce7be1", "papermill": { @@ -2384,7 +2384,7 @@ }, { "cell_type": "markdown", - "id": "85", + "id": "88", "metadata": { "id": "4939cd9c", "papermill": { @@ -2403,7 +2403,7 @@ { "cell_type": "code", "execution_count": null, - "id": "86", + "id": "89", "metadata": { "id": "ca612b71", "papermill": { diff --git a/AC2/r-weak-iv-experiments.Rmd b/AC2/r-weak-iv-experiments.Rmd index 211fd149..daf93cc7 100644 --- a/AC2/r-weak-iv-experiments.Rmd +++ b/AC2/r-weak-iv-experiments.Rmd @@ -7,7 +7,9 @@ output: html_document ```{r} install.packages("hdm") +``` +```{r} library(hdm) ``` @@ -17,22 +19,20 @@ Simulation Design # Simulation Design set.seed(1) -n=100 -beta = .1 # .1 weak IV -#beta = 1 # 1 strong IV +n <- 100 +beta <- .1 # .1 weak IV (change to 1.0 for strong IV) 
# One realization -U = rnorm(n) -Z = rnorm(n) # generate instrument -D = beta*Z + U # generate endogenougs variable -Y = D + U # the true causal effect is 1 +U <- rnorm(n) +Z <- rnorm(n) # generate instrument +D <- beta * Z + U # generate endogenougs variable +Y <- D + U # the true causal effect is 1 - -summary(lm(D~Z)) # first stage is very weak here when we set beta = .1 +summary(lm(D ~ Z)) # first stage is very weak here when we set beta = .1 ``` ```{r} -summary(tsls(x=NULL, d=D, y=Y, z=Z)) +summary(tsls(x = NULL, d = D, y = Y, z = Z)) ``` Note that the instrument is weak here (strength of the instrument is controlled by setting $\beta$) -- the t-stat is smaller than any rule-of-thumb suggested in the literature (e.g. $\sqrt{10}$) . @@ -43,32 +43,33 @@ Note that the instrument is weak here (strength of the instrument is controlled # Simulation Design set.seed(1) -B= 10000 # trials -IVEst = rep(0, B) - -for(i in 1:B){ -U = rnorm(n) -Z = rnorm(n) #generate instrument -D = beta*Z + U #generate endogenougs variable -Y = D+ U # the true causal effect is 1 -IVEst[i] = coef(tsls(x=NULL, d=D, y=Y, z=Z))[1,1] +B <- 10000 # trials +IVEst <- rep(0, B) + +for (i in 1:B) { + U <- rnorm(n) + Z <- rnorm(n) # generate instrument + D <- beta * Z + U # generate endogenougs variable + Y <- D + U # the true causal effect is 1 + IVEst[i] <- coef(tsls(x = NULL, d = D, y = Y, z = Z))[1, 1] } - ``` # Plot the Actual Distribution against the Normal Approximation (based on Strong Instrument Assumption) ```{r} -plot(density(IVEst-1, n=1000, from=-5, to=5),col=4, xlim= c(-5, 5), - xlab= "IV Estimator -True Effect", main="Actual Distribution vs Gaussian") +plot(density(IVEst - 1, n = 1000, from = -5, to = 5), + col = 4, xlim = c(-5, 5), + xlab = "IV Estimator -True Effect", main = "Actual Distribution vs Gaussian" +) -val=seq(-5, 5, by=.05) -var = (1/beta^2)*(1/100) # theoretical variance of IV -sd = sqrt(var) -lines(val, dnorm(val, sd=sd), col=2, lty=2) +val <- seq(-5, 5, by = .05) +var <- (1 / beta^2) * (1 / 100) # theoretical variance of IV +sd <- sqrt(var) +lines(val, dnorm(val, sd = sd), col = 2, lty = 2) -rejection.frequency = sum(( abs(IVEst-1)/sd > 1.96))/B +rejection_frequency <- sum((abs(IVEst - 1) / sd > 1.96)) / B -cat(c("Rejection Frequency is ", rejection.frequency, " while we expect it to be .05")) +cat(c("Rejection Frequency is ", rejection_frequency, " while we expect it to be .05")) ``` From 68e45e8469ea4531821912dabcde03aad8d27dec Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Fri, 19 Jul 2024 05:44:31 -0700 Subject: [PATCH 158/261] Update r_ml_for_wage_prediction.irnb --- PM2/r_ml_for_wage_prediction.irnb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PM2/r_ml_for_wage_prediction.irnb b/PM2/r_ml_for_wage_prediction.irnb index c5e32d01..408434a2 100644 --- a/PM2/r_ml_for_wage_prediction.irnb +++ b/PM2/r_ml_for_wage_prediction.irnb @@ -551,8 +551,8 @@ }, "outputs": [], "source": [ - "x_flex <- \"sex + exp1 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we \" +\n", - " \"+ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\"\n", + "x_flex <- paste(\"sex + exp1 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we \",\n", + " \"+ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)\")\n", "formula_flex <- as.formula(paste(\"lwage\", \"~\", x_flex))\n", "model_x_flex_train <- model.matrix(formula_flex, data_train)\n", "model_x_flex_test <- model.matrix(formula_flex, data_test)\n", From 
21285c3a108bc5c87faedafc421c201cbd0b7409 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Fri, 19 Jul 2024 07:53:23 -0700 Subject: [PATCH 159/261] Mistakes in PM2 from linting changes --- PM2/r_convergence_hypothesis_double_lasso.irnb | 4 ++-- PM2/r_linear_penalized_regs.irnb | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/PM2/r_convergence_hypothesis_double_lasso.irnb b/PM2/r_convergence_hypothesis_double_lasso.irnb index abcc3c29..f8e20081 100644 --- a/PM2/r_convergence_hypothesis_double_lasso.irnb +++ b/PM2/r_convergence_hypothesis_double_lasso.irnb @@ -938,7 +938,7 @@ "outputs": [], "source": [ "tmp_df <- as.data.frame(cbind(res_y, res_d))\n", - "colnames(tmp_df) <- c(\"res_y\", \"res_D\")" + "colnames(tmp_df) <- c(\"res_y\", \"res_d\")" ] }, { @@ -953,7 +953,7 @@ "outputs": [], "source": [ "fit_cv <- lm(res_y ~ res_d, data = tmp_df)\n", - "est_cv <- summary(fit_cv)$coef[\"res_D\", 1]\n", + "est_cv <- summary(fit_cv)$coef[\"res_d\", 1]\n", "\n", "hcv_cv_coefs <- vcovHC(fit_cv, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", "se_cv <- sqrt(diag(hcv_cv_coefs))[2] # Estimated std errors\n", diff --git a/PM2/r_linear_penalized_regs.irnb b/PM2/r_linear_penalized_regs.irnb index 6bd19010..a2983739 100644 --- a/PM2/r_linear_penalized_regs.irnb +++ b/PM2/r_linear_penalized_regs.irnb @@ -33,7 +33,8 @@ "install.packages(\"xtable\")\n", "install.packages(\"hdm\")\n", "install.packages(\"glmnet\")\n", - "install.packages(\"ggplot2\")" + "install.packages(\"ggplot2\")\n", + "install.packages(\"tidyr\")" ] }, { @@ -49,7 +50,8 @@ "library(hdm)\n", "library(xtable)\n", "library(glmnet)\n", - "library(ggplot2)" + "library(ggplot2)\n", + "library(tidyr)" ] }, { From 6f3d685a481daed13a132fcb4e3f5984c7fe44f9 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Fri, 19 Jul 2024 07:58:07 -0700 Subject: [PATCH 160/261] Update r_linear_penalized_regs.irnb --- PM2/r_linear_penalized_regs.irnb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PM2/r_linear_penalized_regs.irnb b/PM2/r_linear_penalized_regs.irnb index a2983739..16ff6a14 100644 --- a/PM2/r_linear_penalized_regs.irnb +++ b/PM2/r_linear_penalized_regs.irnb @@ -50,8 +50,7 @@ "library(hdm)\n", "library(xtable)\n", "library(glmnet)\n", - "library(ggplot2)\n", - "library(tidyr)" + "library(ggplot2)" ] }, { From 9e86cedb49e1d469031e5edb57a25518adfaf44f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 19 Jul 2024 16:39:28 +0000 Subject: [PATCH 161/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in PM2 --- PM2/r_convergence_hypothesis_double_lasso.Rmd | 12 +- ...r_convergence_hypothesis_double_lasso.irnb | 1653 ++++++----------- PM2/r_experiment_non_orthogonal.Rmd | 39 +- PM2/r_heterogenous_wage_effects.Rmd | 15 +- PM2/r_linear_penalized_regs.Rmd | 25 +- PM2/r_ml_for_wage_prediction.Rmd | 27 +- PM2/r_orthogonal_orig.Rmd | 31 +- 7 files changed, 643 insertions(+), 1159 deletions(-) diff --git a/PM2/r_convergence_hypothesis_double_lasso.Rmd b/PM2/r_convergence_hypothesis_double_lasso.Rmd index b5b7f9e7..0ab246fd 100644 --- a/PM2/r_convergence_hypothesis_double_lasso.Rmd +++ b/PM2/r_convergence_hypothesis_double_lasso.Rmd @@ -12,6 +12,9 @@ install.packages("lmtest") install.packages("sandwich") install.packages("glmnet") install.packages("ggplot2") +``` + +```{r} library(hdm) library(xtable) library(lmtest) @@ -121,16 +124,15 @@ We report the results using cross validation at the end of this notebook for com ```{r} double_lasso <- function(y, D, W) { - 
require(hdm) # residualize outcome with Lasso - yfit_rlasso <- rlasso(W, y, post = FALSE) + yfit_rlasso <- hdm::rlasso(W, y, post = FALSE) yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W)) yres <- y - as.numeric(yhat_rlasso) # residualize treatment with Lasso - dfit_rlasso <- rlasso(W, D, post = FALSE) + dfit_rlasso <- hdm::rlasso(W, D, post = FALSE) dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W)) dres <- D - as.numeric(dhat_rlasso) @@ -208,12 +210,12 @@ res_d <- D - predict(lasso_model_d, s = best_lambda_d, newx = as.matrix(W)) ```{r} tmp_df <- as.data.frame(cbind(res_y, res_d)) -colnames(tmp_df) = c("res_y", "res_D") +colnames(tmp_df) <- c("res_y", "res_d") ``` ```{r} fit_cv <- lm(res_y ~ res_d, data = tmp_df) -est_cv <- summary(fit_cv)$coef["res_D", 1] +est_cv <- summary(fit_cv)$coef["res_d", 1] hcv_cv_coefs <- vcovHC(fit_cv, type = "HC1") # HC - "heteroskedasticity cosistent" se_cv <- sqrt(diag(hcv_cv_coefs))[2] # Estimated std errors diff --git a/PM2/r_convergence_hypothesis_double_lasso.irnb b/PM2/r_convergence_hypothesis_double_lasso.irnb index f8e20081..d05325b7 100644 --- a/PM2/r_convergence_hypothesis_double_lasso.irnb +++ b/PM2/r_convergence_hypothesis_double_lasso.irnb @@ -1,1104 +1,575 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "79U65py1grzb" - }, - "source": [ - "# Testing the Convergence Hypothesis" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "GK-MMvLseA2Q", - "outputId": "f429014a-9f26-4030-cdb8-6d925704172d", - "vscode": { - "languageId": "r" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "also installing the dependencies ‘iterators’, ‘foreach’, ‘shape’, ‘Rcpp’, ‘RcppEigen’, ‘glmnet’, ‘checkmate’, ‘Formula’\n", - "\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "also installing the dependency ‘zoo’\n", - "\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Installing package into ‘/usr/local/lib/R/site-library’\n", - "(as ‘lib’ is unspecified)\n", - "\n", - "Loading required package: zoo\n", - "\n", - "\n", - "Attaching package: ‘zoo’\n", - "\n", - "\n", - "The following objects are masked from ‘package:base’:\n", - "\n", - " as.Date, as.Date.numeric\n", - "\n", - "\n", - "Loading required package: Matrix\n", - "\n", - "Loaded glmnet 4.1-8\n", - "\n" - ] - } - ], - "source": [ - "install.packages(\"hdm\")\n", - "install.packages(\"xtable\")\n", - "install.packages(\"lmtest\")\n", - "install.packages(\"sandwich\")\n", - "install.packages(\"glmnet\")\n", - "install.packages(\"ggplot2\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "library(hdm)\n", - "library(xtable)\n", - "library(lmtest)\n", - "library(sandwich)\n", - "library(glmnet) # For LassoCV\n", - "library(ggplot2)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nlpSLLV6g1pc" - }, - "source": [ - "## Introduction" - ] - }, - { - "cell_type": "markdown", - "metadata": { - 
"id": "xXkzGJWag02O" - }, - "source": [ - "We provide an additional empirical example of partialling-out with Lasso to estimate the regression coefficient $\\beta_1$ in the high-dimensional linear regression model:\n", - " $$\n", - " Y = \\beta_1 D + \\beta_2'W + \\epsilon.\n", - " $$\n", - " \n", - "Specifically, we are interested in how the rates at which economies of different countries grow ($Y$) are related to the initial wealth levels in each country ($D$) controlling for country's institutional, educational, and other similar characteristics ($W$).\n", - " \n", - "The relationship is captured by $\\beta_1$, the *speed of convergence/divergence*, which measures the speed at which poor countries catch up $(\\beta_1< 0)$ or fall behind $(\\beta_1> 0)$ rich countries, after controlling for $W$. Our inference question here is: do poor countries grow faster than rich countries, controlling for educational and other characteristics? In other words, is the speed of convergence negative: $ \\beta_1 <0?$ This is the Convergence Hypothesis predicted by the Solow Growth Model. This is a structural economic model. Under some strong assumptions, that we won't state here, the predictive exercise we are doing here can be given causal interpretation.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a5Ul2ppLfUBQ" - }, - "source": [ - "## Data Analysis" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9GgPNICafYuK" - }, - "source": [ - "We consider the data set GrowthData which is included in the package *hdm*. First, let us load the data set to get familiar with the data." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "_B9DWuS6fcVW", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "getdata <- function(...) {\n", - " e <- new.env()\n", - " name <- data(..., envir = e)[1]\n", - " e[[name]]\n", - "}\n", - "\n", - "# now load your data calling getdata()\n", - "growth <- getdata(GrowthData)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "smYhqwpbffVh" - }, - "source": [ - "The sample contains $90$ countries and $63$ controls." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "1dsF7_R4j-Qv", - "outputId": "c77d3a1a-35e5-482f-d414-75304fc218c3", - "vscode": { - "languageId": "r" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\n", - "\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\t\n", - "\n", - "
- [output omitted: the old notebook stored the rendered GrowthData data.frame (90 rows × 63 columns) repeated in text/html, text/latex, text/markdown, and text/plain form]
- ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "growth" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-AMcbsgefhTg" - }, - "source": [ - "Thus $p \\approx 60$, $n=90$ and $p/n$ is not small. We expect the least squares method to provide a poor estimate of $\\beta_1$. We expect the method based on partialling-out with Lasso to provide a high quality estimate of $\\beta_1$.\n", - "To check this hypothesis, we analyze the relation between the output variable $Y$ and the other country's characteristics by running a linear regression in the first step."
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "DncWsRS9mgAp", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "## Create the outcome variable y and covariates x\n", - "y <- growth$Outcome\n", - "X <- growth[-which(colnames(growth) %in% c(\"intercept\"))]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "vPO08MjomqfZ", - "outputId": "afa89548-e9ef-4060-d5db-eb8e632e8e95", - "vscode": { - "languageId": "r" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The estimated coefficient on gdpsh465 is -0.009377989 and the corresponding robust standard error is 0.032421195% Confidence Interval: [ -0.07292335 , 0.05416737 ]" - ] - } - ], - "source": [ - "fit <- lm(Outcome ~ ., data = X)\n", - "est <- summary(fit)$coef[\"gdpsh465\", 1]\n", - "\n", - "hcv_coefs <- vcovHC(fit, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", - "se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors\n", - "\n", - "# print unconditional effect of gdpsh465 and the corresponding standard error\n", - "cat(\"The estimated coefficient on gdpsh465 is\", est,\n", - " \" and the corresponding robust standard error is\", se)\n", - "\n", - "# Calculate the 95% confidence interval for 'gdpsh465'\n", - "lower_ci <- est - 1.96 * se\n", - "upper_ci <- est + 1.96 * se\n", - "\n", - "cat(\"95% Confidence Interval: [\", lower_ci, \",\", upper_ci, \"]\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "D7nJZzhGfjQT" - }, - "source": [ - "## Summarize OLS results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "EwGVcIVAfRe5", - "outputId": "87f41279-8907-415b-f8eb-589f736089b2", - "vscode": { - "languageId": "r" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Method Estimate Std. Error lower bound CI\n", - "1 OLS -0.00937798878257774 0.0324211014253974 -0.0729233475763566\n", - " upper bound CI\n", - "1 0.0541673700112012\n" - ] - } - ], - "source": [ - "# Create an empty data frame with column names\n", - "table <- data.frame(\n", - " Method = character(0),\n", - " Estimate = character(0),\n", - " `Std. Error` = numeric(0),\n", - " `Lower Bound CI` = numeric(0),\n", - " `Upper Bound CI` = numeric(0)\n", - ")\n", - "\n", - "# Add OLS results to the table\n", - "table <- rbind(table, c(\"OLS\", est, se, lower_ci, upper_ci))\n", - "\n", - "# Rename the columns to match the Python table\n", - "colnames(table) <- c(\"Method\", \"Estimate\", \"Std. Error\", \"lower bound CI\", \"upper bound CI\")\n", - "\n", - "# Print the table\n", - "print(table)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KfrhJqKhfwKB" - }, - "source": [ - "Least squares provides a rather noisy estimate (high standard error) of the\n", - "speed of convergence, and does not allow us to answer the question\n", - "about the convergence hypothesis since the confidence interval includes zero.\n", - "\n", - "In contrast, we can use the partialling-out approach based on lasso regression (\"Double Lasso\")." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "D9Y2U1Ldf1eB", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "y <- growth$Outcome\n", - "W <- growth[-which(colnames(growth) %in% c(\"Outcome\", \"intercept\", \"gdpsh465\"))]\n", - "D <- growth$gdpsh465" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8yNU2UgefzCZ" - }, - "source": [ - "## Method 1: Lasso with Theoretical Penalty using HDM" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tQPxdzQ2f84M" - }, - "source": [ - "While cross validation is commonly employed for choosing penalty parameters in Lasso, it can be very noisy and tends to choose relatively small penalty leading to some overfitting. For this reason, one should not use cross validation to choose tuning parameters unless sample splitting is employed. We illustrate the use of sample combined with cross validation in later chapters in the book. Since we are using the full sample here, it is much better (and theoretically valid) to use penalties that provably control overfitting, which is what we do here.\n", - "\n", - "We report the results using cross validation at the end of this notebook for comparison. There, we observe overfitting for the prediction of the outcome." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "DIzy51tZsoWp", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "double_lasso <- function(y, D, W) {\n", - "\n", - " # residualize outcome with Lasso\n", - " yfit_rlasso <- hdm::rlasso(W, y, post = FALSE)\n", - " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))\n", - " yres <- y - as.numeric(yhat_rlasso)\n", - "\n", - "\n", - " # residualize treatment with Lasso\n", - " dfit_rlasso <- hdm::rlasso(W, D, post = FALSE)\n", - " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W))\n", - " dres <- D - as.numeric(dhat_rlasso)\n", - "\n", - " # rest is the same as in the OLS case\n", - " hat <- mean(yres * dres) / mean(dres^2)\n", - " epsilon <- yres - hat * dres\n", - " V <- mean(epsilon^2 * dres^2) / mean(dres^2)^2\n", - " stderr <- sqrt(V / length(y))\n", - "\n", - " return(list(hat = hat, stderr = stderr))\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "Ncz7Uqn5sqqU", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "results <- double_lasso(y, D, W)\n", - "hat <- results$hat\n", - "stderr <- results$stderr\n", - "# Calculate the 95% confidence interval\n", - "ci_lower <- hat - 1.96 * stderr\n", - "ci_upper <- hat + 1.96 * stderr" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "P5PEjKw9gLvC" - }, - "source": [ - "The least square method provides a rather noisy estimate of the speed of convergence. We can not answer the question if poor countries grow faster than rich countries. The least square method does not work when the ratio $p/n$ is large.\n", - "\n", - "In sharp contrast, partialling-out via Lasso provides a more precise estimate. The Lasso based point estimate is $-5\\%$ and the $95\\%$ confidence interval for the (annual) rate of convergence $[-7.8\\%,-2.2\\%]$ only includes negative numbers. This empirical evidence does support the convergence hypothesis." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "tNLVM4WEgL9v", - "outputId": "1f2683b7-630a-43c5-e110-74c527603850", - "vscode": { - "languageId": "r" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Method Estimate Std. Error lower bound CI\n", - "1 OLS -0.00937798878257774 0.0324211014253974 -0.0729233475763566\n", - "2 Double Lasso -0.0446926781072429 0.0178230525741694 -0.0796258611526148\n", - " upper bound CI\n", - "1 0.0541673700112012\n", - "2 -0.00975949506187093\n" - ] - } - ], - "source": [ - "# Add Double Lasso results to the table\n", - "table <- rbind(table, c(\"Double Lasso\", hat, stderr, ci_lower, ci_upper))\n", - "\n", - "# Print the table\n", - "print(table)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "smPkxqCpgMR8" - }, - "source": [ - "## Method 2: Lasso with Cross-Validation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MH-eUye8liRq" - }, - "source": [ - "This section is for illustration purposes only. Given that we are using the full sample, cross validation *should not* be used for choosing tuning parameters here. Cross validation tends to (mildly) overfit, and this overfitting can lead to substantial problems when inference about parameters is the goal." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "YhpTUkE_wQz9", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Choose penalty based on KFold cross validation\n", - "set.seed(123)\n", - "# Given small sample size, we use an aggressive number of 20 folds\n", - "n_folds <- 20\n", - "\n", - "\n", - "# Define LassoCV models for y and D\n", - "model_y <- cv.glmnet(\n", - " x = as.matrix(W),\n", - " y = y,\n", - " alpha = 1, # Lasso penalty\n", - " nfolds = n_folds,\n", - " family = \"gaussian\"\n", - ")\n", - "\n", - "model_d <- cv.glmnet(\n", - " x = as.matrix(W),\n", - " y = D,\n", - " alpha = 1, # Lasso penalty\n", - " nfolds = n_folds,\n", - " family = \"gaussian\"\n", - ")\n", - "\n", - "# Get the best lambda values for y and D\n", - "best_lambda_y <- model_y$lambda.min\n", - "best_lambda_d <- model_d$lambda.min\n", - "\n", - "# Fit Lasso models with the best lambda values\n", - "lasso_model_y <- glmnet(as.matrix(W), y, alpha = 1, lambda = best_lambda_y)\n", - "lasso_model_d <- glmnet(as.matrix(W), D, alpha = 1, lambda = best_lambda_d)\n", - "\n", - "# Calculate the residuals\n", - "res_y <- y - predict(lasso_model_y, s = best_lambda_y, newx = as.matrix(W))\n", - "res_d <- D - predict(lasso_model_d, s = best_lambda_d, newx = as.matrix(W))" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "79U65py1grzb" + }, + "source": [ + "# Testing the Convergence Hypothesis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "cbVsr86tyqTY", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "tmp_df <- as.data.frame(cbind(res_y, res_d))\n", - "colnames(tmp_df) <- c(\"res_y\", \"res_d\")" - ] + "id": "GK-MMvLseA2Q", + "outputId": "f429014a-9f26-4030-cdb8-6d925704172d", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "install.packages(\"hdm\")\n", + "install.packages(\"xtable\")\n", + "install.packages(\"lmtest\")\n", + "install.packages(\"sandwich\")\n", + 
"install.packages(\"glmnet\")\n", + "install.packages(\"ggplot2\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "library(hdm)\n", + "library(xtable)\n", + "library(lmtest)\n", + "library(sandwich)\n", + "library(glmnet) # For LassoCV\n", + "library(ggplot2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nlpSLLV6g1pc" + }, + "source": [ + "## Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xXkzGJWag02O" + }, + "source": [ + "We provide an additional empirical example of partialling-out with Lasso to estimate the regression coefficient $\\beta_1$ in the high-dimensional linear regression model:\n", + " $$\n", + " Y = \\beta_1 D + \\beta_2'W + \\epsilon.\n", + " $$\n", + " \n", + "Specifically, we are interested in how the rates at which economies of different countries grow ($Y$) are related to the initial wealth levels in each country ($D$) controlling for country's institutional, educational, and other similar characteristics ($W$).\n", + " \n", + "The relationship is captured by $\\beta_1$, the *speed of convergence/divergence*, which measures the speed at which poor countries catch up $(\\beta_1< 0)$ or fall behind $(\\beta_1> 0)$ rich countries, after controlling for $W$. Our inference question here is: do poor countries grow faster than rich countries, controlling for educational and other characteristics? In other words, is the speed of convergence negative: $ \\beta_1 <0?$ This is the Convergence Hypothesis predicted by the Solow Growth Model. This is a structural economic model. Under some strong assumptions, that we won't state here, the predictive exercise we are doing here can be given causal interpretation.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a5Ul2ppLfUBQ" + }, + "source": [ + "## Data Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9GgPNICafYuK" + }, + "source": [ + "We consider the data set GrowthData which is included in the package *hdm*. First, let us load the data set to get familiar with the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_B9DWuS6fcVW", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "getdata <- function(...) {\n", + " e <- new.env()\n", + " name <- data(..., envir = e)[1]\n", + " e[[name]]\n", + "}\n", + "\n", + "# now load your data calling getdata()\n", + "growth <- getdata(GrowthData)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "smYhqwpbffVh" + }, + "source": [ + "The sample contains $90$ countries and $63$ controls." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "D7SzuZ2P0P0X", - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "fit_cv <- lm(res_y ~ res_d, data = tmp_df)\n", - "est_cv <- summary(fit_cv)$coef[\"res_d\", 1]\n", - "\n", - "hcv_cv_coefs <- vcovHC(fit_cv, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", - "se_cv <- sqrt(diag(hcv_cv_coefs))[2] # Estimated std errors\n", - "\n", - "# Calculate the 95% confidence interval for 'gdpsh465'\n", - "lower_ci_cv <- est_cv - 1.96 * se_cv\n", - "upper_ci_cv <- est_cv + 1.96 * se_cv" - ] + "id": "1dsF7_R4j-Qv", + "outputId": "c77d3a1a-35e5-482f-d414-75304fc218c3", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "growth" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-AMcbsgefhTg" + }, + "source": [ + "Thus $p \\approx 60$, $n=90$ and $p/n$ is not small. We expect the least squares method to provide a poor estimate of $\\beta_1$. We expect the method based on partialling-out with Lasso to provide a high quality estimate of $\\beta_1$.\n", + "To check this hypothesis, we analyze the relation between the output variable $Y$ and the other country's characteristics by running a linear regression in the first step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DncWsRS9mgAp", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "## Create the outcome variable y and covariates x\n", + "y <- growth$Outcome\n", + "X <- growth[-which(colnames(growth) %in% c(\"intercept\"))]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Ctl5T5vUygRk", - "outputId": "1fc3990f-10c2-4e94-b1e9-a13b7a08cbab", - "vscode": { - "languageId": "r" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Method Estimate Std. 
Error lower bound CI\n", - "1 OLS -0.00937798878257774 0.0324211014253974 -0.0729233475763566\n", - "2 Double Lasso -0.0446926781072429 0.0178230525741694 -0.0796258611526148\n", - "3 Double Lasso CV -0.00210480949226998 0.00822866735729585 -0.0182329975125698\n", - " upper bound CI\n", - "1 0.0541673700112012\n", - "2 -0.00975949506187093\n", - "3 0.0140233785280299\n" - ] - } - ], - "source": [ - "# Add LassoCV results to the table\n", - "table <- rbind(table, c(\"Double Lasso CV\", est_cv, se_cv, lower_ci_cv, upper_ci_cv))\n", - "\n", - "# Print the table\n", - "print(table)" - ] + "id": "vPO08MjomqfZ", + "outputId": "afa89548-e9ef-4060-d5db-eb8e632e8e95", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "fit <- lm(Outcome ~ ., data = X)\n", + "est <- summary(fit)$coef[\"gdpsh465\", 1]\n", + "\n", + "hcv_coefs <- vcovHC(fit, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", + "se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors\n", + "\n", + "# print unconditional effect of gdpsh465 and the corresponding standard error\n", + "cat(\"The estimated coefficient on gdpsh465 is\", est,\n", + " \" and the corresponding robust standard error is\", se)\n", + "\n", + "# Calculate the 95% confidence interval for 'gdpsh465'\n", + "lower_ci <- est - 1.96 * se\n", + "upper_ci <- est + 1.96 * se\n", + "\n", + "cat(\"95% Confidence Interval: [\", lower_ci, \",\", upper_ci, \"]\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D7nJZzhGfjQT" + }, + "source": [ + "## Summarize OLS results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "markdown", - "metadata": { - "id": "0LzDsUi8gmQM" - }, - "source": [ - "We find that the outcome model chooses too small of a penalty based on cross-validation, leading to overfitting of the outcome and tiny outcome residuals. This leads to artificially small standard errors and a zero treatment effect. Theoretically driven penalty should be preferred for such small sample sizes." - ] + "id": "EwGVcIVAfRe5", + "outputId": "87f41279-8907-415b-f8eb-589f736089b2", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Create an empty data frame with column names\n", + "table <- data.frame(\n", + " Method = character(0),\n", + " Estimate = character(0),\n", + " `Std. Error` = numeric(0),\n", + " `Lower Bound CI` = numeric(0),\n", + " `Upper Bound CI` = numeric(0)\n", + ")\n", + "\n", + "# Add OLS results to the table\n", + "table <- rbind(table, c(\"OLS\", est, se, lower_ci, upper_ci))\n", + "\n", + "# Rename the columns to match the Python table\n", + "colnames(table) <- c(\"Method\", \"Estimate\", \"Std. Error\", \"lower bound CI\", \"upper bound CI\")\n", + "\n", + "# Print the table\n", + "print(table)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KfrhJqKhfwKB" + }, + "source": [ + "Least squares provides a rather noisy estimate (high standard error) of the\n", + "speed of convergence, and does not allow us to answer the question\n", + "about the convergence hypothesis since the confidence interval includes zero.\n", + "\n", + "In contrast, we can use the partialling-out approach based on lasso regression (\"Double Lasso\")." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "D9Y2U1Ldf1eB", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "y <- growth$Outcome\n", + "W <- growth[-which(colnames(growth) %in% c(\"Outcome\", \"intercept\", \"gdpsh465\"))]\n", + "D <- growth$gdpsh465" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8yNU2UgefzCZ" + }, + "source": [ + "## Method 1: Lasso with Theoretical Penalty using HDM" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tQPxdzQ2f84M" + }, + "source": [ + "While cross validation is commonly employed for choosing penalty parameters in Lasso, it can be very noisy and tends to choose relatively small penalty leading to some overfitting. For this reason, one should not use cross validation to choose tuning parameters unless sample splitting is employed. We illustrate the use of sample combined with cross validation in later chapters in the book. Since we are using the full sample here, it is much better (and theoretically valid) to use penalties that provably control overfitting, which is what we do here.\n", + "\n", + "We report the results using cross validation at the end of this notebook for comparison. There, we observe overfitting for the prediction of the outcome." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DIzy51tZsoWp", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "double_lasso <- function(y, D, W) {\n", + "\n", + " # residualize outcome with Lasso\n", + " yfit_rlasso <- hdm::rlasso(W, y, post = FALSE)\n", + " yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W))\n", + " yres <- y - as.numeric(yhat_rlasso)\n", + "\n", + "\n", + " # residualize treatment with Lasso\n", + " dfit_rlasso <- hdm::rlasso(W, D, post = FALSE)\n", + " dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W))\n", + " dres <- D - as.numeric(dhat_rlasso)\n", + "\n", + " # rest is the same as in the OLS case\n", + " hat <- mean(yres * dres) / mean(dres^2)\n", + " epsilon <- yres - hat * dres\n", + " V <- mean(epsilon^2 * dres^2) / mean(dres^2)^2\n", + " stderr <- sqrt(V / length(y))\n", + "\n", + " return(list(hat = hat, stderr = stderr))\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ncz7Uqn5sqqU", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "results <- double_lasso(y, D, W)\n", + "hat <- results$hat\n", + "stderr <- results$stderr\n", + "# Calculate the 95% confidence interval\n", + "ci_lower <- hat - 1.96 * stderr\n", + "ci_upper <- hat + 1.96 * stderr" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "P5PEjKw9gLvC" + }, + "source": [ + "The least square method provides a rather noisy estimate of the speed of convergence. We can not answer the question if poor countries grow faster than rich countries. The least square method does not work when the ratio $p/n$ is large.\n", + "\n", + "In sharp contrast, partialling-out via Lasso provides a more precise estimate. The Lasso based point estimate is $-5\\%$ and the $95\\%$ confidence interval for the (annual) rate of convergence $[-7.8\\%,-2.2\\%]$ only includes negative numbers. This empirical evidence does support the convergence hypothesis." 
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
 },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 857
- },
- "id": "7uzcIGhVgmei",
- "outputId": "ecff8a3f-60da-4b92-c6f7-cb40a116ec82",
- "vscode": {
- "languageId": "r"
- }
- },
- "outputs": [
- {
- "data": {
- "image/png": "[base64-encoded PNG data omitted: outcome Lasso-CV out-of-sample R-squared vs. penalty level plot]",
- "text/plain": [
- "plot without title"
- ]
- },
- "metadata": {
- "image/png": {
- "height": 420,
- "width": 420
- }
- },
- "output_type": "display_data"
- },
- {
- "data": {
- "image/png": "[base64-encoded PNG data omitted: treatment Lasso-CV out-of-sample R-squared vs. penalty level plot]",
- "text/plain": [
- "plot without title"
- ]
- },
- "metadata": {
- "image/png": {
- "height": 420,
- "width": 420
- }
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Create a data frame to store the results\n",
- "results_y <- data.frame(\n",
- " Alphas = model_y$lambda,\n",
- " OutOfSampleR2 = 1 - model_y$cvm / var(y)\n",
- ")\n",
- "\n",
- "results_d <- data.frame(\n",
- " Alphas = model_d$lambda,\n",
- " OutOfSampleR2 = 1 - model_d$cvm / var(D)\n",
- ")\n",
- "\n",
- "# Plot Outcome Lasso-CV Model\n",
- "ggplot(data = results_y, aes(x = Alphas, y = OutOfSampleR2)) +\n",
- " geom_line() +\n",
- " labs(\n",
- " title = \"Outcome Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n",
- " x = \"Penalty Level\",\n",
- " y = \"Out-of-sample R-squared\"\n",
- " )\n",
- "\n",
- "# Plot Treatment Lasso-CV Model\n",
- "ggplot(data = results_d, aes(x = (Alphas), y = OutOfSampleR2)) +\n",
- " geom_line() +\n",
- " labs(\n",
- " title = \"Treatment Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n",
- " x = \"Penalty Level\",\n",
- " y = \"Out-of-sample R-squared\"\n",
- " )\n"
- ]
 }
- ],
- "metadata": {
 "colab": {
- "provenance": []
 },
- "kernelspec": {
- "display_name": "R",
- "name": "ir"
 "id": "tNLVM4WEgL9v",
+ "outputId": "1f2683b7-630a-43c5-e110-74c527603850",
+ "vscode": {
+ "languageId": "r"
 }
 },
+ "outputs": [],
+ "source": [
+ "# Add Double Lasso results to the table\n",
+ "table <- rbind(table, c(\"Double Lasso\", hat, stderr, ci_lower, 
ci_upper))\n", + "\n", + "# Print the table\n", + "print(table)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "smPkxqCpgMR8" + }, + "source": [ + "## Method 2: Lasso with Cross-Validation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MH-eUye8liRq" + }, + "source": [ + "This section is for illustration purposes only. Given that we are using the full sample, cross validation *should not* be used for choosing tuning parameters here. Cross validation tends to (mildly) overfit, and this overfitting can lead to substantial problems when inference about parameters is the goal." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YhpTUkE_wQz9", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Choose penalty based on KFold cross validation\n", + "set.seed(123)\n", + "# Given small sample size, we use an aggressive number of 20 folds\n", + "n_folds <- 20\n", + "\n", + "\n", + "# Define LassoCV models for y and D\n", + "model_y <- cv.glmnet(\n", + " x = as.matrix(W),\n", + " y = y,\n", + " alpha = 1, # Lasso penalty\n", + " nfolds = n_folds,\n", + " family = \"gaussian\"\n", + ")\n", + "\n", + "model_d <- cv.glmnet(\n", + " x = as.matrix(W),\n", + " y = D,\n", + " alpha = 1, # Lasso penalty\n", + " nfolds = n_folds,\n", + " family = \"gaussian\"\n", + ")\n", + "\n", + "# Get the best lambda values for y and D\n", + "best_lambda_y <- model_y$lambda.min\n", + "best_lambda_d <- model_d$lambda.min\n", + "\n", + "# Fit Lasso models with the best lambda values\n", + "lasso_model_y <- glmnet(as.matrix(W), y, alpha = 1, lambda = best_lambda_y)\n", + "lasso_model_d <- glmnet(as.matrix(W), D, alpha = 1, lambda = best_lambda_d)\n", + "\n", + "# Calculate the residuals\n", + "res_y <- y - predict(lasso_model_y, s = best_lambda_y, newx = as.matrix(W))\n", + "res_d <- D - predict(lasso_model_d, s = best_lambda_d, newx = as.matrix(W))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cbVsr86tyqTY", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "tmp_df <- as.data.frame(cbind(res_y, res_d))\n", + "colnames(tmp_df) <- c(\"res_y\", \"res_d\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "D7SzuZ2P0P0X", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "fit_cv <- lm(res_y ~ res_d, data = tmp_df)\n", + "est_cv <- summary(fit_cv)$coef[\"res_d\", 1]\n", + "\n", + "hcv_cv_coefs <- vcovHC(fit_cv, type = \"HC1\") # HC - \"heteroskedasticity cosistent\"\n", + "se_cv <- sqrt(diag(hcv_cv_coefs))[2] # Estimated std errors\n", + "\n", + "# Calculate the 95% confidence interval for 'gdpsh465'\n", + "lower_ci_cv <- est_cv - 1.96 * se_cv\n", + "upper_ci_cv <- est_cv + 1.96 * se_cv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "provenance": [] + "base_uri": "https://localhost:8080/" }, - "kernelspec": { - "display_name": "R", - "name": "ir" + "id": "Ctl5T5vUygRk", + "outputId": "1fc3990f-10c2-4e94-b1e9-a13b7a08cbab", + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Add LassoCV results to the table\n", + "table <- rbind(table, c(\"Double Lasso CV\", est_cv, se_cv, lower_ci_cv, upper_ci_cv))\n", + "\n", + "# Print the table\n", + "print(table)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0LzDsUi8gmQM" + }, + "source": [ + "We find that the outcome model chooses too small of a penalty based on 
cross-validation, leading to overfitting of the outcome and tiny outcome residuals. This leads to artificially small standard errors and a zero treatment effect. Theoretically driven penalty should be preferred for such small sample sizes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 857 }, - "language_info": { - "name": "R" + "id": "7uzcIGhVgmei", + "outputId": "ecff8a3f-60da-4b92-c6f7-cb40a116ec82", + "vscode": { + "languageId": "r" } + }, + "outputs": [], + "source": [ + "# Create a data frame to store the results\n", + "results_y <- data.frame(\n", + " Alphas = model_y$lambda,\n", + " OutOfSampleR2 = 1 - model_y$cvm / var(y)\n", + ")\n", + "\n", + "results_d <- data.frame(\n", + " Alphas = model_d$lambda,\n", + " OutOfSampleR2 = 1 - model_d$cvm / var(D)\n", + ")\n", + "\n", + "# Plot Outcome Lasso-CV Model\n", + "ggplot(data = results_y, aes(x = Alphas, y = OutOfSampleR2)) +\n", + " geom_line() +\n", + " labs(\n", + " title = \"Outcome Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n", + " x = \"Penalty Level\",\n", + " y = \"Out-of-sample R-squared\"\n", + " )\n", + "\n", + "# Plot Treatment Lasso-CV Model\n", + "ggplot(data = results_d, aes(x = (Alphas), y = OutOfSampleR2)) +\n", + " geom_line() +\n", + " labs(\n", + " title = \"Treatment Lasso-CV Model: Out-of-sample R-squared as function of penalty level\",\n", + " x = \"Penalty Level\",\n", + " y = \"Out-of-sample R-squared\"\n", + " )\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "R", + "name": "ir" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "name": "R" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/PM2/r_experiment_non_orthogonal.Rmd b/PM2/r_experiment_non_orthogonal.Rmd index ef14f649..42778dec 100644 --- a/PM2/r_experiment_non_orthogonal.Rmd +++ b/PM2/r_experiment_non_orthogonal.Rmd @@ -7,8 +7,10 @@ output: html_document ```{r} install.packages("hdm") +``` + +```{r} library(hdm) -library(stats) ``` ## Generating RCT data @@ -20,7 +22,7 @@ gen_data <- function(n, d, p, delta, base) { y0 <- base - X[, 1] + rnorm(n, mean = 0, sd = 0.1) y1 <- delta + base - X[, 1] + rnorm(n, mean = 0, sd = 0.1) y <- y1 * D + y0 * (1 - D) - return(list(y=y, D=D, X=X)) + return(list(y = y, D = D, X = X)) } ``` @@ -41,7 +43,8 @@ twomeans <- function(y, D) { V0 <- var(y[D == 0]) / mean(1 - D) # asymptotic variance of the mean of outcome of untreated V1 <- var(y[D == 1]) / mean(D) # asymptotic variance of the mean of outcome of treated hat <- hat1 - hat0 # estimate of the treatment effect - stderr <- sqrt((V0 + V1) / n) # standard error of the estimate of the treatment effect + # standard error of the estimate of the treatment effect + stderr <- sqrt((V0 + V1) / length(y)) return(list(hat = hat, stderr = stderr)) } ``` @@ -99,16 +102,15 @@ partialling_out(y, D, cbind(D * X, X)) # Now we simply replace OLS with Lasso to implement the Double Lasso process double_lasso <- function(y, D, W) { - require(hdm) # residualize outcome with Lasso - yfit_rlasso <- rlasso(W, y, post = FALSE) + yfit_rlasso <- hdm::rlasso(W, y, post = FALSE) yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W)) yres <- y - as.numeric(yhat_rlasso) # residualize treatment with Lasso - dfit_rlasso <- rlasso(W, D, post = FALSE) + dfit_rlasso <- hdm::rlasso(W, D, post = FALSE) dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W)) Dres <- D - as.numeric(dhat_rlasso) @@ 
-294,16 +296,15 @@ mean(stderrs) # and close to the calculate standard errors; we correctly estimat # Now we simply replace OLS with Lasso to implement the Double Lasso process double_lasso <- function(y, D, W) { - require(hdm) # residualize outcome with Lasso - yfit_rlasso <- rlasso(W, y, post = FALSE) + yfit_rlasso <- hdm::rlasso(W, y, post = FALSE) yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W)) yres <- y - as.numeric(yhat_rlasso) # residualize treatment with Lasso - dfit_rlasso <- rlasso(W, D, post=FALSE) + dfit_rlasso <- hdm::rlasso(W, D, post = FALSE) dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W)) Dres <- D - as.numeric(dhat_rlasso) @@ -311,7 +312,7 @@ double_lasso <- function(y, D, W) { hat <- mean(yres * Dres) / mean(Dres^2) epsilon <- yres - hat * Dres V <- mean(epsilon^2 * Dres^2) / mean(Dres^2)^2 - stderr = sqrt(V / length(y)) + stderr <- sqrt(V / length(y)) return(list(hat = hat, stderr = stderr)) } @@ -333,7 +334,7 @@ for (i in 1:n_experiments) { # Calculate single lasso estimate - yfit_rlasso <- rlasso(cbind(D, X), y, post = FALSE) + yfit_rlasso <- hdm::rlasso(cbind(D, X), y, post = FALSE) hat <- yfit_rlasso$coefficients[2] hats[i] <- hat @@ -369,8 +370,8 @@ for (i in 1:n_experiments) { # run a big lasso y ~ D, X - DX = cbind(D,X) - yfit_rlasso <- rlasso(DX, y, post=FALSE) # could just use this functionality + DX <- cbind(D, X) + yfit_rlasso <- hdm::rlasso(DX, y, post = FALSE) # could just use this functionality coefs <- yfit_rlasso$coefficients[2:n] selected_columns <- X[, abs(coefs) > 0.0] # run OLS on y ~ D, X[chosen by lasso] @@ -412,9 +413,9 @@ mean(stderrs) ### Not RCT Data ```{r} -gen_data_nonRCT <- function(n, d, p, delta, base) { +gen_data_non_rct <- function(n, d, p, delta, base) { X <- matrix(rnorm(n * d), nrow = n, ncol = d) - D <- X[, 1] + rnorm(n, mean = 0, sd = 1/4) + D <- X[, 1] + rnorm(n, mean = 0, sd = 1 / 4) y <- delta * D + base - X[, 1] + rnorm(n, mean = 0, sd = 1) return(list(y = y, D = D, X = X)) } @@ -429,15 +430,15 @@ stderrs <- numeric(n_experiments) for (i in 1:n_experiments) { # Generate data for each experiment - data <- gen_data_nonRCT(n, d, p, delta, base) + data <- gen_data_non_rct(n, d, p, delta, base) y <- data$y D <- data$D X <- data$X # run a big lasso y ~ D, X - DX = cbind(D, X) - yfit_rlasso <- rlasso(DX, y, post=FALSE) # could just use this functionality + DX <- cbind(D, X) + yfit_rlasso <- hdm::rlasso(DX, y, post = FALSE) # could just use this functionality coefs <- yfit_rlasso$coefficients[2:n] selected_columns <- X[, abs(coefs) > 0.0] # run OLS on y ~ D, X[chosen by lasso] @@ -477,7 +478,7 @@ stderrs <- numeric(n_experiments) for (i in 1:n_experiments) { # Generate data for each experiment - data <- gen_data_nonRCT(n, d, p, delta, base) + data <- gen_data_non_rct(n, d, p, delta, base) y <- data$y D <- data$D X <- data$X diff --git a/PM2/r_heterogenous_wage_effects.Rmd b/PM2/r_heterogenous_wage_effects.Rmd index f588c35a..d2709e27 100644 --- a/PM2/r_heterogenous_wage_effects.Rmd +++ b/PM2/r_heterogenous_wage_effects.Rmd @@ -15,6 +15,9 @@ This analysis allows a closer look how the gender wage gap is related to other s ```{r} install.packages("hdm") install.packages("xtable") +``` + +```{r} library(hdm) library(xtable) ``` @@ -33,14 +36,14 @@ Z <- subset(data, select = -c(lwage, wage)) ```{r} center_colmeans <- function(x) { - xcenter <- colMeans(x) - x - rep(xcenter, rep.int(nrow(x), ncol(x))) + xcenter <- colMeans(x) + x - rep(xcenter, rep.int(nrow(x), ncol(x))) } ``` ```{r} # create the model matrix for the covariates 
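 # `0 +` drops the intercept; wrapping the listed regressors in `(...)**2` expands
 # them into all main effects plus all pairwise interactions, and `C(occ2)` / `C(ind2)`
 # enter the occupation and industry codes as categorical (dummy-coded) variables.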
-controls_formula <- '~ 0 + (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we + exp1 + exp2 + exp3 + exp4)**2' +controls_formula <- "~ 0 + (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we + exp1 + exp2 + exp3 + exp4)**2" Zcontrols <- model.matrix(as.formula(controls_formula), data = Z) Zcontrols <- center_colmeans(Zcontrols) ``` @@ -49,7 +52,7 @@ Construct all the variables that we will use to model heterogeneity of effect in ```{r} # create the model matrix for the linear heterogeneity -linear_het_formula <- '~ -1 + (shs + hsg + scl + clg + mw + so + we + exp1 + exp2 + exp3 + exp4)' +linear_het_formula <- "~ -1 + (shs + hsg + scl + clg + mw + so + we + exp1 + exp2 + exp3 + exp4)" Zhet <- model.matrix(as.formula(linear_het_formula), data = Z) Zhet <- center_colmeans(Zhet) ``` @@ -59,7 +62,7 @@ Construct all interaction variables between sex and heterogeneity variables ```{r} # create the model matrix for the higher order heterogeneity Zhet <- as.data.frame(cbind(Zhet, "sex" = Z$sex)) -nonlin_het_formula <- '~ -1 + sex + sex * (shs + hsg + scl + clg + mw + so + we + exp1 + exp2 + exp3 + exp4)' +nonlin_het_formula <- "~ -1 + sex + sex * (shs + hsg + scl + clg + mw + so + we + exp1 + exp2 + exp3 + exp4)" Zinteractions <- model.matrix(as.formula(nonlin_het_formula), data = Zhet) interaction_cols <- Zinteractions[, grepl("sex", colnames(Zinteractions))] ``` @@ -75,7 +78,7 @@ Get estimates and CIs ```{r} # this cell takes 30 minutes to run index_gender <- grep("sex", colnames(Zinteractions)) -effects_female <- rlassoEffects(x = X, y = y, index = index_gender, post = FALSE) +effects_female <- hdm::rlassoEffects(x = X, y = y, index = index_gender, post = FALSE) result <- summary(effects_female) result$coef print(xtable(result$coef[, c(1, 2, 4)], type = "latex"), digits = 3) diff --git a/PM2/r_linear_penalized_regs.Rmd b/PM2/r_linear_penalized_regs.Rmd index b1ed57c1..a2f52db0 100644 --- a/PM2/r_linear_penalized_regs.Rmd +++ b/PM2/r_linear_penalized_regs.Rmd @@ -9,7 +9,11 @@ output: html_document install.packages("xtable") install.packages("hdm") install.packages("glmnet") +install.packages("ggplot2") +install.packages("tidyr") +``` +```{r} library(hdm) library(xtable) library(glmnet) @@ -210,8 +214,8 @@ To select those parameters, they use $\lambda$ as variable to penalize\ **Funny thing: the function rlasso was named like that because it is the "rigorous" Lasso.** ```{r} -fit_rlasso <- rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level -fit_rlasso_post <- rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level +fit_rlasso <- hdm::rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level +fit_rlasso_post <- hdm::rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level ``` ```{r} @@ -242,19 +246,18 @@ Next we code up lava, which alternates the fitting of lasso and ridge # Define function to compute lava estimator. Doing an iterative scheme with fixed # number of iteration. Could iterate until a convergence criterion is met. 
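 # In the lava decomposition the coefficient vector is modeled as the sum of a sparse
 # part (fit by lasso with penalty lambda1) and a dense part (fit by ridge, alpha = 0,
 # with penalty lambda2); each pass of the loop below refits one component on the
 # residuals left by the other, which is the alternation described above.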
lava_predict <- function(X, Y, newX, lambda1, lambda2, iter = 5) { - require(glmnet) # Need to demean internally dy <- Y - mean(Y) dx <- scale(X, scale = FALSE) - sp1 <- glmnet(dx, dy, lambda = lambda1) # lasso step fits "sparse part" - de1 <- glmnet(dx, dy - predict(sp1, newx = dx), alpha = 0, lambda = lambda2) + sp1 <- glmnet::glmnet(dx, dy, lambda = lambda1) # lasso step fits "sparse part" + de1 <- glmnet::glmnet(dx, dy - predict(sp1, newx = dx), alpha = 0, lambda = lambda2) i <- 1 while (i <= iter) { - sp1 <- glmnet(dx, dy - predict(de1, newx = dx, s = "lambda.min"), lambda = lambda1) - de1 <- glmnet(dx, dy - predict(sp1, newx = dx, s = "lambda.min"), alpha = 0, lambda = lambda2) + sp1 <- glmnet::glmnet(dx, dy - predict(de1, newx = dx, s = "lambda.min"), lambda = lambda1) + de1 <- glmnet::glmnet(dx, dy - predict(sp1, newx = dx, s = "lambda.min"), alpha = 0, lambda = lambda2) i <- i + 1 } @@ -290,11 +293,11 @@ lava_yhat_r2 <- function(xtr_mod, xte_mod, ytr, yte, num_folds = 5) { # of the default implementation in glmnet for everything. Could do better here - maybe ## Fit ridge on grid of lambda values (chosen by default using glmnet) using basic model. - ridge_mod <- glmnet(xtr_mod, ytr, alpha = 0) # alpha = 0 gives ridge + ridge_mod <- glmnet::glmnet(xtr_mod, ytr, alpha = 0) # alpha = 0 gives ridge ridge_lambda <- ridge_mod$lambda # values of penalty parameter ## Fit lasso on grid of lambda values (chosen by default using glmnet) using basic model. - lasso_mod <- glmnet(xtr_mod, ytr) # default is lasso (equivalent to alpha = 1) + lasso_mod <- glmnet::glmnet(xtr_mod, ytr) # default is lasso (equivalent to alpha = 1) lasso_lambda <- lasso_mod$lambda # values of penalty parameter ## ------------------------------------------------------------ @@ -529,8 +532,8 @@ fit_lasso_cv <- cv.glmnet(X, y, family = "gaussian", alpha = 1, nfolds = 5) fit_ridge <- cv.glmnet(X, y, family = "gaussian", alpha = 0, nfolds = 5) # family gaussian means that we'll be using square loss fit_elnet <- cv.glmnet(X, y, family = "gaussian", alpha = .5, nfolds = 5) -fit_rlasso <- rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level -fit_rlasso_post <- rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level +fit_rlasso <- hdm::rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level +fit_rlasso_post <- hdm::rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level r2_lasso_cv <- r2_score(predict(fit_lasso_cv, newx = Xpop, s = "lambda.min"), ypop) r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = "lambda.min"), ypop) diff --git a/PM2/r_ml_for_wage_prediction.Rmd b/PM2/r_ml_for_wage_prediction.Rmd index 8e10ee86..9856acde 100644 --- a/PM2/r_ml_for_wage_prediction.Rmd +++ b/PM2/r_ml_for_wage_prediction.Rmd @@ -12,7 +12,9 @@ install.packages("xtable") install.packages("hdm") install.packages("glmnet") install.packages("MLmetrics") +``` +```{r} library(hdm) library(xtable) library(glmnet) @@ -120,8 +122,8 @@ cat("Basic model R^2 (OLS): ", r2_lm_basic) # MSE OLS (basic model) ### High-dimensional specification (flexible) ```{r} -x_flex <- "sex + exp1 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we " + - "+ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)" +x_flex <- paste("sex + exp1 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we ", + "+ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)") formula_flex <- as.formula(paste("lwage", "~", x_flex)) model_x_flex_train <- 
model.matrix(formula_flex, data_train) model_x_flex_test <- model.matrix(formula_flex, data_test) @@ -181,8 +183,8 @@ In practice, many people choose to use cross-validation, which is perfectly fine Now, we repeat the same procedure for the flexible model. ```{r} -fit_rlasso_flex <- rlasso(formula_flex, data_train, post = FALSE) -fit_rlasso_post_flex <- rlasso(formula_flex, data_train, post = TRUE) +fit_rlasso_flex <- hdm::rlasso(formula_flex, data_train, post = FALSE) +fit_rlasso_post_flex <- hdm::rlasso(formula_flex, data_train, post = TRUE) yhat_rlasso_flex <- predict(fit_rlasso_flex, newdata = data_test) yhat_rlasso_post_flex <- predict(fit_rlasso_post_flex, newdata = data_test) @@ -202,19 +204,18 @@ Finally, we try the combination of a sparse and a dense coefficient using the LA # Define function to compute lava estimator. Doing an iterative scheme with fixed # number of iteration. Could iterate until a convergence criterion is met. lava_predict <- function(X, Y, newX, lambda1, lambda2, iter = 5) { - require(glmnet) # Need to demean internally dy <- Y - mean(Y) dx <- scale(X, scale = FALSE) - sp1 <- glmnet(dx, dy, lambda = lambda1) # lasso step fits "sparse part" - de1 <- glmnet(dx, dy - predict(sp1, newx = dx), alpha = 0, lambda = lambda2) + sp1 <- glmnet::glmnet(dx, dy, lambda = lambda1) # lasso step fits "sparse part" + de1 <- glmnet::glmnet(dx, dy - predict(sp1, newx = dx), alpha = 0, lambda = lambda2) i <- 1 while (i <= iter) { - sp1 <- glmnet(dx, dy - predict(de1, newx = dx, s = "lambda.min"), lambda = lambda1) - de1 <- glmnet(dx, dy - predict(sp1, newx = dx, s = "lambda.min"), alpha = 0, lambda = lambda2) + sp1 <- glmnet::glmnet(dx, dy - predict(de1, newx = dx, s = "lambda.min"), lambda = lambda1) + de1 <- glmnet::glmnet(dx, dy - predict(sp1, newx = dx, s = "lambda.min"), alpha = 0, lambda = lambda2) i <- i + 1 } @@ -249,11 +250,11 @@ lava_yhat_r2 <- function(xtr_mod, xte_mod, ytr, yte, num_folds = 5) { # of the default implementation in glmnet for everything. Could do better here - maybe ## Fit ridge on grid of lambda values (chosen by default using glmnet) using basic model. - ridge_mod <- glmnet(xtr_mod, ytr, alpha = 0) # alpha = 0 gives ridge + ridge_mod <- glmnet::glmnet(xtr_mod, ytr, alpha = 0) # alpha = 0 gives ridge ridge_lambda <- ridge_mod$lambda # values of penalty parameter ## Fit lasso on grid of lambda values (chosen by default using glmnet) using basic model. - lasso_mod <- glmnet(xtr_mod, ytr) # default is lasso (equivalent to alpha = 1) + lasso_mod <- glmnet::glmnet(xtr_mod, ytr) # default is lasso (equivalent to alpha = 1) lasso_lambda <- lasso_mod$lambda # values of penalty parameter ## ------------------------------------------------------------ @@ -408,8 +409,8 @@ Now let's repeat our penalized regression analysis for the extra flexible model. 
fit_lasso_cv_extra <- cv.glmnet(model_x_extra_train, y_train, family = "gaussian", alpha = 1) fit_ridge_extra <- cv.glmnet(model_x_extra_train, y_train, family = "gaussian", alpha = 0) fit_elnet_extra <- cv.glmnet(model_x_extra_train, y_train, family = "gaussian", alpha = .5) -fit_rlasso_extra <- rlasso(formula_extra, data_train, post = FALSE) -fit_rlasso_post_extra <- rlasso(formula_extra, data_train, post = TRUE) +fit_rlasso_extra <- hdm::rlasso(formula_extra, data_train, post = FALSE) +fit_rlasso_post_extra <- hdm::rlasso(formula_extra, data_train, post = TRUE) fit_lava_extra <- lava_yhat_r2(model_x_extra_train, model_x_extra_test, y_train, y_test) yhat_lasso_cv_extra <- predict(fit_lasso_cv_extra, newx = model_x_extra_test) diff --git a/PM2/r_orthogonal_orig.Rmd b/PM2/r_orthogonal_orig.Rmd index 9e85eb1d..a49aef69 100644 --- a/PM2/r_orthogonal_orig.Rmd +++ b/PM2/r_orthogonal_orig.Rmd @@ -14,6 +14,10 @@ The true treatment effect here is 1. From the plots produced in this notebook (e ```{r} install.packages("hdm") +install.packages("ggplot2") +``` + +```{r} library(hdm) library(ggplot2) ``` @@ -29,8 +33,8 @@ Naive <- rep(0, B) Orthogonal <- rep(0, B) -lambdaYs <- rep(0,B) -lambdaDs <- rep(0,B) +lambdaYs <- rep(0, B) +lambdaDs <- rep(0, B) for (i in 1:B) { # Generate parameters @@ -45,25 +49,23 @@ for (i in 1:B) { Y <- D + X %*% beta + rnorm(n) # Single selection method - rlasso_result <- rlasso(Y ~ D + X) # Fit lasso regression - SX_IDs <- which(rlasso_result$coef[-c(1, 2)] != 0) # Selected covariates + rlasso_result <- hdm::rlasso(Y ~ D + X) # Fit lasso regression + sx_ids <- which(rlasso_result$coef[-c(1, 2)] != 0) # Selected covariates # Check if any Xs are selected - if (sum(SX_IDs) == 0) { + if (sum(sx_ids) == 0) { Naive[i] <- lm(Y ~ D)$coef[2] # Fit linear regression with only D if no Xs are selected } else { - Naive[i] <- lm(Y ~ D + X[, SX_IDs])$coef[2] # Fit linear regression with selected X otherwise + Naive[i] <- lm(Y ~ D + X[, sx_ids])$coef[2] # Fit linear regression with selected X otherwise } # Partialling out / Double Lasso - fitY <- rlasso(Y ~ X, post = TRUE) + fitY <- hdm::rlasso(Y ~ X, post = TRUE) resY <- fitY$res - #cat("lambda Y mean: ", mean(fitY$lambda)) - fitD <- rlasso(D ~ X, post = TRUE) + fitD <- hdm::rlasso(D ~ X, post = TRUE) resD <- fitD$res - #cat("\nlambda D mean: ", mean(fitD$lambda)) Orthogonal[i] <- lm(resY ~ resD)$coef[2] # Fit linear regression for residuals } @@ -72,14 +74,15 @@ for (i in 1:B) { ## Make a Nice Plot ```{r} -#Specify ratio -img_width = 15 -img_height = img_width/2 +# Specify ratio +img_width <- 15 +img_height <- img_width / 2 ``` ```{r} # Create a data frame for the estimates -df <- data.frame(Method = rep(c("Naive", "Orthogonal"), each = B), Value = c(Naive-1,Orthogonal-1)) +df <- data.frame(Method = rep(c("Naive", "Orthogonal"), each = B), + Value = c(Naive - 1, Orthogonal - 1)) # Create the histogram using ggplot2 hist_plot <- ggplot(df, aes(x = Value, fill = Method)) + From 9d33bc2015fdc37ec3a94f6a1629aea2fc191b70 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 19 Jul 2024 17:17:56 +0000 Subject: [PATCH 162/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in PM4 --- PM4/r-dml-401k.Rmd | 687 ++++++++++-------- PM4/r-dml-401k.irnb | 174 ++--- ...ation-analysis-of-401-k-example-w-dags.Rmd | 45 +- ...d_ml_for_partially_linear_model_growth.Rmd | 155 ++-- PM4/r_dml_inference_for_gun_ownership.Rmd | 450 ++++++------ 5 files changed, 823 insertions(+), 688 deletions(-) diff --git 
a/PM4/r-dml-401k.Rmd b/PM4/r-dml-401k.Rmd index 6c30cead..ed199079 100644 --- a/PM4/r-dml-401k.Rmd +++ b/PM4/r-dml-401k.Rmd @@ -21,7 +21,9 @@ install.packages("randomForest") install.packages("glmnet") install.packages("rpart") install.packages("gbm") +``` +```{r} library(xtable) library(hdm) library(sandwich) @@ -56,16 +58,18 @@ The data consist of 9,915 observations at the household level drawn from the 199 Among the $9915$ individuals, $3682$ are eligible to participate in the program. The variable *e401* indicates eligibility and *p401* indicates participation, respectively. ```{r} -hist_e401 = ggplot(data, aes(x = e401, fill = factor(e401))) + geom_bar() +hist_e401 <- ggplot(data, aes(x = e401, fill = factor(e401))) + + geom_bar() hist_e401 ``` Eligibility is highly associated with financial wealth: ```{r} -dens_net_tfa = ggplot(data, aes(x = net_tfa, color = factor(e401), fill = factor(e401)) ) + - geom_density() + xlim(c(-20000, 150000)) + - facet_wrap(.~e401) +dens_net_tfa <- ggplot(data, aes(x = net_tfa, color = factor(e401), fill = factor(e401))) + + geom_density() + + xlim(c(-20000, 150000)) + + facet_wrap(. ~ e401) dens_net_tfa ``` @@ -73,35 +77,37 @@ dens_net_tfa The unconditional APE of e401 is about $19559$: ```{r} -e1 <- data[data$e401==1,] -e0 <- data[data$e401==0,] -round(mean(e1$net_tfa)-mean(e0$net_tfa),0) +e1 <- data[data$e401 == 1, ] +e0 <- data[data$e401 == 0, ] +round(mean(e1$net_tfa) - mean(e0$net_tfa), 0) ``` Among the $3682$ individuals that are eligible, $2594$ decided to participate in the program. The unconditional APE of p401 is about $27372$: ```{r} -p1 <- data[data$p401==1,] -p0 <- data[data$p401==0,] -round(mean(p1$net_tfa)-mean(p0$net_tfa),0) +p1 <- data[data$p401 == 1, ] +p0 <- data[data$p401 == 0, ] +round(mean(p1$net_tfa) - mean(p0$net_tfa), 0) ``` As discussed, these estimates are biased since they do not account for saver heterogeneity and endogeneity of participation. ```{r} # outcome variable -y <- data[,'net_tfa'] +y <- data[, "net_tfa"] # treatment variable -D <- data[,'e401'] -D2 <- data[,"p401"] -D3 <- data[,"a401"] - -columns_to_drop <- c('e401', 'p401', 'a401', 'tw', 'tfa', 'net_tfa', 'tfa_he', - 'hval', 'hmort', 'hequity', - 'nifa', 'net_nifa', 'net_n401', 'ira', - 'dum91', 'icat', 'ecat', 'zhat', - 'i1', 'i2', 'i3', 'i4', 'i5', 'i6', 'i7', - 'a1', 'a2', 'a3', 'a4', 'a5') +D <- data[, "e401"] +D2 <- data[, "p401"] +D3 <- data[, "a401"] + +columns_to_drop <- c( + "e401", "p401", "a401", "tw", "tfa", "net_tfa", "tfa_he", + "hval", "hmort", "hequity", + "nifa", "net_nifa", "net_n401", "ira", + "dum91", "icat", "ecat", "zhat", + "i1", "i2", "i3", "i4", "i5", "i6", "i7", + "a1", "a2", "a3", "a4", "a5" +) # covariates X <- data[, !(names(data) %in% columns_to_drop)] @@ -109,8 +115,9 @@ X <- data[, !(names(data) %in% columns_to_drop)] ```{r} # Constructing the controls -X_formula = "~ 0 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown" -X = as.data.table(model.frame(X_formula, pension)) +x_formula <- paste("~ 0 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) ", + "+ poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown") +X <- as.data.table(model.frame(x_formula, X)) head(X) ``` @@ -118,59 +125,57 @@ head(X) We are interested in valid estimators of the average treatment effect of `e401` and `p401` on `net_tfa`. 
We start using ML approaches to estimate the function $g_0$ and $m_0$ in the following PLR model: -\begin{eqnarray} +\begin{align} & Y = D\theta_0 + g_0(X) + \zeta, & E[\zeta \mid D,X]= 0,\\ & D = m_0(X) + V, & E[V \mid X] = 0. -\end{eqnarray} +\end{align} ## Partially Linear Regression Models (PLR) ```{r} -DML2.for.PLM <- function(x, d, y, dreg, yreg, nfold=3, method = "regression") { - nobs <- nrow(x) #number of observations - foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices - I <- split(1:nobs, foldid) #split observation indices into folds +dml2_for_plm <- function(x, d, y, dreg, yreg, nfold = 3, method = "regression") { + nobs <- nrow(x) # number of observations + foldid <- rep.int(1:nfold, times = ceiling(nobs / nfold))[sample.int(nobs)] # define folds indices + I <- split(1:nobs, foldid) # split observation indices into folds ytil <- dtil <- rep(NA, nobs) cat("fold: ") - for(b in 1:length(I)){ - + for (b in seq_along(I)) { if (method == "regression") { - dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out - yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a foldt out - dhat <- predict(dfit, x[I[[b]],], type="response") #predict the left-out fold - yhat <- predict(yfit, x[I[[b]],], type="response") #predict the left-out fold - dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold - ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold + dfit <- dreg(x[-I[[b]], ], d[-I[[b]]]) # take a fold out + yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a foldt out + dhat <- predict(dfit, x[I[[b]], ], type = "response") # predict the left-out fold + yhat <- predict(yfit, x[I[[b]], ], type = "response") # predict the left-out fold + dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold + ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold } else if (method == "randomforest") { - dfit <- dreg(x[-I[[b]],], as.factor(d)[-I[[b]]]) #take a fold out - yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a fold out - dhat <- predict(dfit, x[I[[b]],], type="prob")[,2] #predict the left-out fold - yhat <- predict(yfit, x[I[[b]],], type="response") #predict the left-out fold - dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold - ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold + dfit <- dreg(x[-I[[b]], ], as.factor(d)[-I[[b]]]) # take a fold out + yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a fold out + dhat <- predict(dfit, x[I[[b]], ], type = "prob")[, 2] # predict the left-out fold + yhat <- predict(yfit, x[I[[b]], ], type = "response") # predict the left-out fold + dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold + ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold } else if (method == "decisiontrees") { - dfit <- dreg(x[-I[[b]],], as.factor(d)[-I[[b]]]) #take a fold out - yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a fold out - dhat <- predict(dfit, x[I[[b]],])[,2] #predict the left-out fold - yhat <- predict(yfit, x[I[[b]],]) #predict the left-out fold - dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold - ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold + dfit <- dreg(x[-I[[b]], ], as.factor(d)[-I[[b]]]) # take a fold out + yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a fold out + dhat <- predict(dfit, x[I[[b]], ])[, 2] # predict the left-out fold + yhat <- predict(yfit, x[I[[b]], ]) # predict the left-out fold + dtil[I[[b]]] <- 
(d[I[[b]]] - dhat) # record residual for the left-out fold + ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold } else if (method == "boostedtrees") { - dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out - yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a fold out - dhat <- predict(dfit, x[I[[b]],], type="response") #predict the left-out fold - yhat <- predict(yfit, x[I[[b]],], type="response") #predict the left-out fold - dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold - ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold + dfit <- dreg(x[-I[[b]], ], d[-I[[b]]]) # take a fold out + yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a fold out + dhat <- predict(dfit, x[I[[b]], ], type = "response") # predict the left-out fold + yhat <- predict(yfit, x[I[[b]], ], type = "response") # predict the left-out fold + dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold + ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold } - cat(b," ") - + cat(b, " ") } - rfit <- lm(ytil ~ dtil) #estimate the main parameter by regressing one residual on the other - coef.est <- coef(rfit)[2] #extract coefficient - se <- sqrt(vcovHC(rfit)[2,2]) #record robust standard error - cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) #printing output - return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil) ) #save output and residuals + rfit <- lm(ytil ~ dtil) # estimate the main parameter by regressing one residual on the other + coef_est <- coef(rfit)[2] # extract coefficient + se <- sqrt(vcovHC(rfit)[2, 2]) # record robust standard error + cat(sprintf("\ncoef (se) = %g (%g)\n", coef_est, se)) # printing output + return(list(coef_est = coef_est, se = se, dtil = dtil, ytil = ytil)) # save output and residuals } ``` @@ -183,7 +188,7 @@ summaryPLR <- function(point, stderr, resD, resy, name) { upper = point + 1.96 * stderr, # upper end of 95% confidence interval `rmse y` = sqrt(mean(resy^2)), # RMSE of model that predicts outcome y `rmse D` = sqrt(mean(resD^2)), # RMSE of model that predicts treatment D - `accuracy D` = mean(abs(resD) < 0.5)# binary classification accuracy of model for D + `accuracy D` = mean(abs(resD) < 0.5) # binary classification accuracy of model for D ) rownames(data) <- name return(data) @@ -197,21 +202,26 @@ summaryPLR <- function(point, stderr, resD, resy, name) { set.seed(123) cat(sprintf("\nDML with Lasso CV \n")) -dreg.lasso.cv <- function(x,d){ cv.glmnet(x, d, family="gaussian", alpha=1, nfolds=5)} -yreg.lasso.cv <- function(x,y){ cv.glmnet(x, y, family="gaussian", alpha=1, nfolds=5)} +dreg_lasso_cv <- function(x, d) { + cv.glmnet(x, d, family = "gaussian", alpha = 1, nfolds = 5) +} +yreg_lasso_cv <- function(x, y) { + cv.glmnet(x, y, family = "gaussian", alpha = 1, nfolds = 5) +} -DML2.results <- DML2.for.PLM(as.matrix(X), D, y, dreg.lasso.cv, yreg.lasso.cv, nfold=5) +dml2_results <- dml2_for_plm(as.matrix(X), D, y, dreg_lasso_cv, yreg_lasso_cv, nfold = 5) -sum.lasso.cv <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'LassoCV') +sum_lasso_cv <- summaryPLR(dml2_results$coef_est, dml2_results$se, dml2_results$dtil, + dml2_results$ytil, name = "LassoCV") tableplr <- data.frame() -tableplr <- rbind(sum.lasso.cv) +tableplr <- rbind(sum_lasso_cv) tableplr ``` ```{r} # Because residuals are output, reconstruct fitted values for use in ensemble -dhat.lasso <- D - DML2.results$dtil -yhat.lasso <- y - DML2.results$ytil +dhat_lasso 
<- D - dml2_results$dtil +yhat_lasso <- y - dml2_results$ytil ``` #### Using a $\ell_2$ Penalized Logistic Regression for D @@ -223,19 +233,24 @@ Note we are using the $\ell_2$ penalty here. You can use the $\ell_1$ penalty as set.seed(123) cat(sprintf("\nDML with Lasso/Logistic \n")) -dreg.logistic.cv <- function(x,d){cv.glmnet(x, d, family="binomial", alpha=0, nfolds=5)} -yreg.lasso.cv <- function(x,y){cv.glmnet(x, y, family="gaussian", alpha=1, nfolds=5)} +dreg_logistic_cv <- function(x, d) { + cv.glmnet(x, d, family = "binomial", alpha = 0, nfolds = 5) +} +yreg_lasso_cv <- function(x, y) { + cv.glmnet(x, y, family = "gaussian", alpha = 1, nfolds = 5) +} -DML2.results <- DML2.for.PLM(as.matrix(X), D, y, dreg.logistic.cv, yreg.lasso.cv, nfold=5) -sum.lasso_logistic.cv <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'LassoCV/LogisticCV') -tableplr <- rbind(tableplr, sum.lasso_logistic.cv) +dml2_results <- dml2_for_plm(as.matrix(X), D, y, dreg_logistic_cv, yreg_lasso_cv, nfold = 5) +sum_lasso_logistic_cv <- summaryPLR(dml2_results$coef_est, dml2_results$se, dml2_results$dtil, + dml2_results$ytil, name = "LassoCV/LogisticCV") +tableplr <- rbind(tableplr, sum_lasso_logistic_cv) tableplr ``` ```{r} # Because residuals are output, reconstruct fitted values for use in ensemble -dhat.lasso_logistic <- D - DML2.results$dtil -yhat.lasso_logistic <- y - DML2.results$ytil +dhat_lasso_logistic <- D - dml2_results$dtil +yhat_lasso_logistic <- y - dml2_results$ytil ``` #### Random Forests @@ -245,19 +260,24 @@ yhat.lasso_logistic <- y - DML2.results$ytil set.seed(123) cat(sprintf("\nDML with Random Forest \n")) -dreg.rf <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest -yreg.rf <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest - -DML2.results <- DML2.for.PLM(as.matrix(X), D, y, dreg.rf, yreg.rf, nfold=5, method = "randomforest") -sum.rf <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Random Forest') -tableplr <- rbind(tableplr, sum.rf) +dreg_rf <- function(x, d) { + randomForest(x, d, ntree = 1000, nodesize = 10) +} # ML method=Forest +yreg_rf <- function(x, y) { + randomForest(x, y, ntree = 1000, nodesize = 10) +} # ML method=Forest + +dml2_results <- dml2_for_plm(as.matrix(X), D, y, dreg_rf, yreg_rf, nfold = 5, method = "randomforest") +sum_rf <- summaryPLR(dml2_results$coef_est, dml2_results$se, dml2_results$dtil, + dml2_results$ytil, name = "Random Forest") +tableplr <- rbind(tableplr, sum_rf) tableplr ``` ```{r} # Because residuals are output, reconstruct fitted values for use in ensemble -dhat.rf <- D - DML2.results$dtil -yhat.rf <- y - DML2.results$ytil +dhat_rf <- D - dml2_results$dtil +dhat_rf <- y - dml2_results$ytil ``` #### Decision Trees @@ -267,19 +287,25 @@ yhat.rf <- y - DML2.results$ytil set.seed(123) cat(sprintf("\nDML with Decision Trees \n")) -dreg.tr <- function(x,d){rpart(as.formula("D~."), cbind(data.frame(D=d),x), method = "class", minbucket=10, cp = 0.001)} -yreg.tr <- function(x,y){rpart(as.formula("y~."), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)} +dreg_tr <- function(x, d) { + rpart(as.formula("D~."), cbind(data.frame(D = d), x), method = "class", minbucket = 10, cp = 0.001) +} +dreg_tr <- function(x, y) { + rpart(as.formula("y~."), cbind(data.frame(y = y), x), minbucket = 10, cp = 0.001) +} -DML2.results <- DML2.for.PLM(X, D, y, dreg.tr, yreg.tr, nfold=5, method = "decisiontrees") # decision tree takes 
in X as dataframe, not matrix/array -sum.tr <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Decision Trees') -tableplr <- rbind(tableplr, sum.tr) +# decision tree takes in X as dataframe, not matrix/array +dml2_results <- dml2_for_plm(X, D, y, dreg_tr, dreg_tr, nfold = 5, method = "decisiontrees") +sum_tr <- summaryPLR(dml2_results$coef_est, dml2_results$se, dml2_results$dtil, + dml2_results$ytil, name = "Decision Trees") +tableplr <- rbind(tableplr, sum_tr) tableplr ``` ```{r} # Because residuals are output, reconstruct fitted values for use in ensemble -dhat.tr <- D - DML2.results$dtil -yhat.tr <- y - DML2.results$ytil +dhat_tr <- D - dml2_results$dtil +yhat_tr <- y - dml2_results$ytil ``` @@ -293,21 +319,28 @@ set.seed(123) cat(sprintf("\nDML with Boosted Trees \n")) # NB: early stopping cannot easily be implemented with gbm -## set n.trees = best, where best <- gbm.perf(dreg.boost, plot.it = FALSE) -dreg.boost <- function(x,d){gbm(as.formula("D~."), cbind(data.frame(D=d),x), distribution= "bernoulli", interaction.depth=2, n.trees=100, shrinkage=.1)} -yreg.boost <- function(x,y){gbm(as.formula("y~."), cbind(data.frame(y=y),x), distribution= "gaussian", interaction.depth=2, n.trees=100, shrinkage=.1)} +## set n.trees = best, where best <- gbm.perf(dreg_boost, plot.it = FALSE) +dreg_boost <- function(x, d) { + gbm(as.formula("D~."), cbind(data.frame(D = d), x), distribution = "bernoulli", + interaction.depth = 2, n.trees = 100, shrinkage = .1) +} +yreg_boost <- function(x, y) { + gbm(as.formula("y~."), cbind(data.frame(y = y), x), distribution = "gaussian", + interaction.depth = 2, n.trees = 100, shrinkage = .1) +} # passing these through regression as type="response", and D should not be factor! -DML2.results = DML2.for.PLM(X, D, y, dreg.boost, yreg.boost, nfold=5, method = "boostedtrees") -sum.boost <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Boosted Trees') -tableplr <- rbind(tableplr, sum.boost) +dml2_results <- dml2_for_plm(X, D, y, dreg_boost, yreg_boost, nfold = 5, method = "boostedtrees") +sum_boost <- summaryPLR(dml2_results$coef_est, dml2_results$se, dml2_results$dtil, + dml2_results$ytil, name = "Boosted Trees") +tableplr <- rbind(tableplr, sum_boost) tableplr ``` ```{r} # Because residuals are output, reconstruct fitted values for use in ensemble -dhat.boost <- D - DML2.results$dtil -yhat.boost <- y - DML2.results$ytil +dhat_boost <- D - dml2_results$dtil +yhat_boost <- y - dml2_results$ytil ``` ## Ensembles @@ -317,8 +350,9 @@ Boosted trees give the best RMSE for both Y and D, so the ensemble based on choo ```{r} # Best fit is boosted trees for both D and Y -sum.best <- summaryPLR(DML2.results$coef.est, DML2.results$se, DML2.results$dtil, DML2.results$ytil, name = 'Best') -tableplr <- rbind(tableplr, sum.best) +sum_best <- summaryPLR(dml2_results$coef_est, dml2_results$se, dml2_results$dtil, + dml2_results$ytil, name = "Best") +tableplr <- rbind(tableplr, sum_best) tableplr ``` @@ -327,14 +361,14 @@ We'll form a model average with unconstrained least squares weights. 
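 Concretely, for the treatment equation the ensemble weights solve the unconstrained least-squares (stacking) problem
 
 \begin{align}
 \min_{w_0,\ldots,w_K} \sum_{i=1}^n \Big( D_i - w_0 - \sum_{k=1}^K w_k\, \widehat{m}_k(X_i) \Big)^2,
 \end{align}
 
 where $\widehat{m}_1,\ldots,\widehat{m}_K$ are the candidate treatment fits (lasso, logistic, random forest, tree, and boosting); the outcome fits are combined with weights from the analogous regression, and the residuals computed in the next chunk are taken with respect to these weighted predictions.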
```{r} # Least squares model average -ma.dtil <- lm(D~dhat.lasso+dhat.lasso_logistic+dhat.rf+dhat.tr+dhat.boost)$residuals -ma.ytil <- lm(y~yhat.lasso+yhat.lasso_logistic+yhat.rf+yhat.tr+yhat.boost)$residuals +ma_dtil <- lm(D ~ dhat_lasso + dhat_lasso_logistic + dhat_rf + dhat_tr + dhat_boost)$residuals +ma_ytil <- lm(y ~ yhat_lasso + yhat_lasso_logistic + dhat_rf + yhat_tr + yhat_boost)$residuals -rfit <- lm(ma.ytil ~ ma.dtil) #estimate the main parameter by regressing one residual on the other -coef.est <- coef(rfit)[2] #extract coefficient -se <- sqrt(vcovHC(rfit)[2,2]) #record robust standard error +rfit <- lm(ma_ytil ~ ma_dtil) # estimate the main parameter by regressing one residual on the other +coef_est <- coef(rfit)[2] # extract coefficient +se <- sqrt(vcovHC(rfit)[2, 2]) # record robust standard error -sum.ma <- summaryPLR(coef.est, se, ma.dtil, ma.ytil, name = 'Model Average') +sum.ma <- summaryPLR(coef_est, se, ma_dtil, ma_ytil, name = "Model Average") tableplr <- rbind(tableplr, sum.ma) tableplr ``` @@ -343,107 +377,107 @@ tableplr Next, we consider estimation of average treatment effects when treatment effects are fully heterogeneous: - \begin{eqnarray}\label{eq: HetPL1} + \begin{align} & Y = g_0(D, X) + U, & \quad E[U \mid X, D]= 0,\\ & D = m_0(X) + V, & \quad E[V\mid X] = 0. -\end{eqnarray} +\end{align} To reduce the disproportionate impact of extreme propensity score weights in the interactive model we trim the propensity scores which are close to the bounds. ```{r} -DML2.for.IRM <- function(x, d, y, dreg, yreg0, yreg1, trimming=0.01, nfold=5, method="regression") { +dml2_for_irm <- function(x, d, y, dreg, yreg0, yreg1, trimming = 0.01, nfold = 5, method = "regression") { yhat0 <- rep(0, length(y)) yhat1 <- rep(0, length(y)) Dhat <- rep(0, length(d)) - nobs <- nrow(x) #number of observations - foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices - I <- split(1:nobs, foldid) #split observation indices into folds + nobs <- nrow(x) # number of observations + foldid <- rep.int(1:nfold, times = ceiling(nobs / nfold))[sample.int(nobs)] # define folds indices + I <- split(1:nobs, foldid) # split observation indices into folds ytil <- dtil <- rep(NA, nobs) cat("fold: ") - for(b in 1:length(I)){ - + for (b in seq_along(I)) { # define helpful variables - Dnotb = d[-I[[b]]] - Xb = X[I[[b]],] - Xnotb = X[-I[[b]],] + Dnotb <- d[-I[[b]]] + Xb <- X[I[[b]], ] + Xnotb <- X[-I[[b]], ] # training dfs subsetted on the -I[[b]] fold - XD0 = X[-I[[b]],][d[-I[[b]]]==0] - yD0 = y[-I[[b]]][d[-I[[b]]]==0] - XD1 = X[-I[[b]],][d[-I[[b]]]==1] - yD1 = y[-I[[b]]][d[-I[[b]]]==1] + XD0 <- X[-I[[b]], ][d[-I[[b]]] == 0] + yD0 <- y[-I[[b]]][d[-I[[b]]] == 0] + XD1 <- X[-I[[b]], ][d[-I[[b]]] == 1] + yD1 <- y[-I[[b]]][d[-I[[b]]] == 1] if (method == "regression") { - yfit0 <- yreg0(as.matrix(XD0), yD0) - yfit1 <- yreg1(as.matrix(XD1), yD1) - yhat0[I[[b]]] <- predict(yfit0, as.matrix(Xb)) # default is type = "response" for glmnet family gaussian - yhat1[I[[b]]] <- predict(yfit1, as.matrix(Xb)) + yfit0 <- yreg0(as.matrix(XD0), yD0) + yfit1 <- yreg1(as.matrix(XD1), yD1) + yhat0[I[[b]]] <- predict(yfit0, as.matrix(Xb)) # default is type = "response" for glmnet family gaussian + yhat1[I[[b]]] <- predict(yfit1, as.matrix(Xb)) } else if (method == "randomforest") { - yfit0 <- yreg0(XD0, yD0) - yfit1 <- yreg1(XD1, yD1) - yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = "response" for rf - yhat1[I[[b]]] <- predict(yfit1, Xb) + yfit0 <- yreg0(XD0, yD0) + yfit1 <- yreg1(XD1, 
yD1) + yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = "response" for rf + yhat1[I[[b]]] <- predict(yfit1, Xb) } else if (method == "decisiontrees") { - yfit0 <- yreg0(XD0, yD0) - yfit1 <- yreg1(XD1, yD1) - yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = "vector" for decision - yhat1[I[[b]]] <- predict(yfit1, Xb) + yfit0 <- yreg0(XD0, yD0) + yfit1 <- yreg1(XD1, yD1) + yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = "vector" for decision + yhat1[I[[b]]] <- predict(yfit1, Xb) } else if (method == "boostedtrees") { - yfit0 <- yreg0(as.data.frame(XD0), yD0) - yfit1 <- yreg1(as.data.frame(XD1), yD1) - yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = "response" for boosted - yhat1[I[[b]]] <- predict(yfit1, Xb) + yfit0 <- yreg0(as.data.frame(XD0), yD0) + yfit1 <- yreg1(as.data.frame(XD1), yD1) + yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = "response" for boosted + yhat1[I[[b]]] <- predict(yfit1, Xb) } # propensity scores: - if (method == "regression"){ - Dfit_b <- dreg(as.matrix(Xnotb), Dnotb) - Dhat_b <- predict(Dfit_b, as.matrix(Xb), type="response") # default is type="link" for family binomial! + if (method == "regression") { + dfit_b <- dreg(as.matrix(Xnotb), Dnotb) + dhat_b <- predict(dfit_b, as.matrix(Xb), type = "response") # default is type="link" for family binomial! } else if (method == "randomforest") { - Dfit_b <- dreg(Xnotb, as.factor(Dnotb)) - Dhat_b <- predict(Dfit_b, Xb, type = "prob")[,2] + dfit_b <- dreg(Xnotb, as.factor(Dnotb)) + dhat_b <- predict(dfit_b, Xb, type = "prob")[, 2] } else if (method == "decisiontrees") { - Dfit_b <- dreg(Xnotb, Dnotb) - Dhat_b <- predict(Dfit_b, Xb)[,2] + dfit_b <- dreg(Xnotb, Dnotb) + dhat_b <- predict(dfit_b, Xb)[, 2] } else if (method == "boostedtrees") { - Dfit_b <- dreg(as.data.frame(Xnotb), Dnotb) - Dhat_b <- predict(Dfit_b, Xb, type="response") + dfit_b <- dreg(as.data.frame(Xnotb), Dnotb) + dhat_b <- predict(dfit_b, Xb, type = "response") } - Dhat_b <- pmax(pmin(Dhat_b, 1 - trimming), trimming) # trimming so scores are between [trimming, (1-trimming)] - Dhat[I[[b]]] <- Dhat_b - + dhat_b <- pmax(pmin(dhat_b, 1 - trimming), trimming) # trimming so scores are between [trimming, (1-trimming)] + Dhat[I[[b]]] <- dhat_b - cat(b," ") + cat(b, " ") } # Prediction of treatment and outcome for observed instrument yhat <- yhat0 * (1 - D) + yhat1 * D # residuals - ytil <- y-yhat - dtil <- D-Dhat + ytil <- y - yhat + dtil <- D - Dhat # doubly robust quantity for every sample - drhat <- yhat1 - yhat0 + (y-yhat)* (D/Dhat - (1 - D)/(1 - Dhat)) - coef.est <- mean(drhat) + drhat <- yhat1 - yhat0 + (y - yhat) * (D / Dhat - (1 - D) / (1 - Dhat)) + coef_est <- mean(drhat) vari <- var(drhat) - se <- sqrt(vari/nrow(X)) - cat("point", coef.est) + se <- sqrt(vari / nrow(X)) + cat("point", coef_est) cat("se", se) - return(list(coef.est = coef.est, se = se, ytil = ytil, dtil = dtil, drhat = drhat, yhat0 = yhat0, yhat1 = yhat1, dhat = Dhat, yhat = yhat)) + return(list(coef_est = coef_est, se = se, ytil = ytil, dtil = dtil, drhat = drhat, + yhat0 = yhat0, yhat1 = yhat1, dhat = Dhat, yhat = yhat)) } ``` ```{r} -summaryIRM <- function(coef.est, se, ytil, dtil, drhat, name) { - summary_data <- data.frame(estimate = coef.est, # point estimate - se = se, # standard error - lower = coef.est - 1.96 * se, # lower end of 95% confidence interval - upper = coef.est + 1.96 * se, # upper end of 95% confidence interval - rmse_y = sqrt(mean(ytil^2)), # res of model that predicts outcome y - rmse_D = sqrt(mean(dtil^2)), # res of model that 
predicts treatment D - accuracy_D = mean(abs(dtil) < 0.5) # binary classification accuracy of model for D +summaryIRM <- function(coef_est, se, ytil, dtil, drhat, name) { + summary_data <- data.frame( + estimate = coef_est, # point estimate + se = se, # standard error + lower = coef_est - 1.96 * se, # lower end of 95% confidence interval + upper = coef_est + 1.96 * se, # upper end of 95% confidence interval + rmse_y = sqrt(mean(ytil^2)), # res of model that predicts outcome y + rmse_D = sqrt(mean(dtil^2)), # res of model that predicts treatment D + accuracy_D = mean(abs(dtil) < 0.5) # binary classification accuracy of model for D ) row.names(summary_data) <- name return(summary_data) @@ -457,20 +491,28 @@ summaryIRM <- function(coef.est, se, ytil, dtil, drhat, name) { set.seed(123) cat(sprintf("\nDML with LassoCV/Logistic \n")) -dreg.lasso.cv <- function(x,d){cv.glmnet(x, d, family="binomial", alpha=0, nfolds=5)} -yreg0.lasso.cv <- function(x,y){cv.glmnet(x, y, family="gaussian", alpha=1, nfolds=5)} -yreg1.lasso.cv <- function(x,y){cv.glmnet(x, y, family="gaussian", alpha=1, nfolds=5)} +dreg_lasso_cv <- function(x, d) { + cv.glmnet(x, d, family = "binomial", alpha = 0, nfolds = 5) +} +yreg0_lasso_cv <- function(x, y) { + cv.glmnet(x, y, family = "gaussian", alpha = 1, nfolds = 5) +} +yreg1_lasso_cv <- function(x, y) { + cv.glmnet(x, y, family = "gaussian", alpha = 1, nfolds = 5) +} -DML2.results <- DML2.for.IRM(X, D, y, dreg.lasso.cv, yreg0.lasso.cv, yreg1.lasso.cv, nfold=5) # more folds seems to help stabilize finite sample performance -sum.lasso.cv <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'LassoCVLogistic') +# more folds seems to help stabilize finite sample performance +dml2_results <- dml2_for_irm(X, D, y, dreg_lasso_cv, yreg0_lasso_cv, yreg1_lasso_cv, nfold = 5) +sum_lasso_cv <- summaryIRM(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, + dml2_results$drhat, name = "LassoCVLogistic") tableirm <- data.frame() -tableirm <- rbind(sum.lasso.cv) +tableirm <- rbind(sum_lasso_cv) tableirm -yhat0.lasso <- DML2.results$yhat0 -yhat1.lasso <- DML2.results$yhat1 -dhat.lasso <- DML2.results$dhat -yhat.lasso <- DML2.results$yhat +yhat0_lasso <- dml2_results$yhat0 +yhat1_lasso <- dml2_results$yhat1 +dhat_lasso <- dml2_results$dhat +yhat_lasso <- dml2_results$yhat ``` ```{r} @@ -478,20 +520,27 @@ yhat.lasso <- DML2.results$yhat set.seed(123) cat(sprintf("\nDML with Random Forest \n")) -dreg.rf <- function(x,d){randomForest(x, d, ntree=1000, nodesize=10)} #ML method=Forest -yreg0.rf <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest -yreg1.rf <- function(x,y){randomForest(x, y, ntree=1000, nodesize=10)} #ML method=Forest - - -DML2.results <- DML2.for.IRM(as.matrix(X), D, y, dreg.rf, yreg0.rf, yreg1.rf, nfold=5, method = "randomforest") -sum.rf <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'Random Forest') -tableirm <- rbind(tableirm, sum.rf) +dreg_rf <- function(x, d) { + randomForest(x, d, ntree = 1000, nodesize = 10) +} # ML method=Forest +yreg0_rf <- function(x, y) { + randomForest(x, y, ntree = 1000, nodesize = 10) +} # ML method=Forest +yreg1_rf <- function(x, y) { + randomForest(x, y, ntree = 1000, nodesize = 10) +} # ML method=Forest + + +dml2_results <- dml2_for_irm(as.matrix(X), D, y, dreg_rf, yreg0_rf, yreg1_rf, nfold = 5, method = "randomforest") +sum_rf <- 
summaryIRM(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, + dml2_results$drhat, name = "Random Forest") +tableirm <- rbind(tableirm, sum_rf) tableirm -yhat0.rf <- DML2.results$yhat0 -yhat1.rf <- DML2.results$yhat1 -dhat.rf <- DML2.results$dhat -yhat.rf <- DML2.results$yhat +yhat0_rf <- dml2_results$yhat0 +yhat1_rf <- dml2_results$yhat1 +dhat_rf <- dml2_results$dhat +dhat_rf <- dml2_results$yhat ``` ```{r} @@ -499,19 +548,26 @@ yhat.rf <- DML2.results$yhat set.seed(123) cat(sprintf("\nDML with Decision Trees \n")) -dreg.tr <- function(x,d){rpart(as.formula("D~."), cbind(data.frame(D=d),x), method = "class", minbucket=10, cp = 0.001)} -yreg0.tr <- function(x,y){rpart(as.formula("y~."), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)} -yreg1.tr <- function(x,y){rpart(as.formula("y~."), cbind(data.frame(y=y),x), minbucket=10, cp = 0.001)} +dreg_tr <- function(x, d) { + rpart(as.formula("D~."), cbind(data.frame(D = d), x), method = "class", minbucket = 10, cp = 0.001) +} +yreg0_tr <- function(x, y) { + rpart(as.formula("y~."), cbind(data.frame(y = y), x), minbucket = 10, cp = 0.001) +} +yreg1_tr <- function(x, y) { + rpart(as.formula("y~."), cbind(data.frame(y = y), x), minbucket = 10, cp = 0.001) +} -DML2.results <- DML2.for.IRM(X, D, y, dreg.tr, yreg0.tr, yreg1.tr, nfold=5, method = "decisiontrees") -sum.tr <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'Decision Trees') -tableirm <- rbind(tableirm, sum.tr) +dml2_results <- dml2_for_irm(X, D, y, dreg_tr, yreg0_tr, yreg1_tr, nfold = 5, method = "decisiontrees") +sum_tr <- summaryIRM(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, + dml2_results$drhat, name = "Decision Trees") +tableirm <- rbind(tableirm, sum_tr) tableirm -yhat0.tr <- DML2.results$yhat0 -yhat1.tr <- DML2.results$yhat1 -dhat.tr <- DML2.results$dhat -yhat.tr <- DML2.results$yhat +yhat0_tr <- dml2_results$yhat0 +yhat1_tr <- dml2_results$yhat1 +dhat_tr <- dml2_results$dhat +yhat_tr <- dml2_results$yhat ``` ```{r} @@ -520,21 +576,31 @@ set.seed(123) cat(sprintf("\nDML with Boosted Trees \n")) # NB: early stopping cannot easily be implemented with gbm -## set n.trees = best, where best <- gbm.perf(dreg.boost, plot.it = FALSE) -dreg.boost <- function(x,d){gbm(as.formula("D~."), cbind(data.frame(D=d),x), distribution= "bernoulli", interaction.depth=2, n.trees=100, shrinkage=.1)} -yreg0.boost <- function(x,y){gbm(as.formula("y~."), cbind(data.frame(y=y),x), distribution= "gaussian", interaction.depth=2, n.trees=100, shrinkage=.1)} -yreg1.boost <- function(x,y){gbm(as.formula("y~."), cbind(data.frame(y=y),x), distribution= "gaussian", interaction.depth=2, n.trees=100, shrinkage=.1)} +## set n.trees = best, where best <- gbm.perf(dreg_boost, plot.it = FALSE) +dreg_boost <- function(x, d) { + gbm(as.formula("D~."), cbind(data.frame(D = d), x), distribution = "bernoulli", + interaction.depth = 2, n.trees = 100, shrinkage = .1) +} +yreg0_boost <- function(x, y) { + gbm(as.formula("y~."), cbind(data.frame(y = y), x), distribution = "gaussian", + interaction.depth = 2, n.trees = 100, shrinkage = .1) +} +yreg1_boost <- function(x, y) { + gbm(as.formula("y~."), cbind(data.frame(y = y), x), distribution = "gaussian", + interaction.depth = 2, n.trees = 100, shrinkage = .1) +} # passing these through regression as type="response", and D should not be factor! 
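 # With distribution = "bernoulli", predict(..., type = "response") returns fitted
 # probabilities P(D = 1 | X), so the gbm propensity model can go through the same
 # "response"-type prediction path as the gaussian outcome models.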
-DML2.results = DML2.for.IRM(X, D, y, dreg.boost, yreg0.boost, yreg1.boost, nfold=5, method = "boostedtrees")
-sum.boost <- summaryIRM(DML2.results$coef.est, DML2.results$se, DML2.results$ytil, DML2.results$dtil, DML2.results$drhat, name = 'Boosted Trees')
-tableirm <- rbind(tableirm, sum.boost)
+dml2_results <- dml2_for_irm(X, D, y, dreg_boost, yreg0_boost, yreg1_boost, nfold = 5, method = "boostedtrees")
+sum_boost <- summaryIRM(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil,
+                        dml2_results$drhat, name = "Boosted Trees")
+tableirm <- rbind(tableirm, sum_boost)
 tableirm
-yhat0.boost <- DML2.results$yhat0
-yhat1.boost <- DML2.results$yhat1
-dhat.boost <- DML2.results$dhat
-yhat.boost <- DML2.results$yhat
+yhat0_boost <- dml2_results$yhat0
+yhat1_boost <- dml2_results$yhat1
+dhat_boost <- dml2_results$dhat
+yhat_boost <- dml2_results$yhat
 ```

 ```{r}
@@ -545,16 +611,16 @@ yhat.boost <- DML2.results$yhat

 # Here, the best performance for Y is the random forest and for D the boosted tree

 # residuals
-ytil <- y-yhat.rf
-dtil <- D-dhat.boost
+ytil <- y - yhat_rf
+dtil <- D - dhat_boost
 # doubly robust quantity for every sample
-drhat <- yhat1.rf - yhat0.rf + (y-yhat.rf)* (D/dhat.boost - (1 - D)/(1 - dhat.boost))
-coef.est <- mean(drhat)
+drhat <- yhat1_rf - yhat0_rf + (y - yhat_rf) * (D / dhat_boost - (1 - D) / (1 - dhat_boost))
+coef_est <- mean(drhat)
 vari <- var(drhat)
-se <- sqrt(vari/nrow(X))
+se <- sqrt(vari / nrow(X))

-sum.best <- summaryIRM(coef.est, se, ytil, dtil, drhat, name = 'Best')
-tableirm <- rbind(tableirm, sum.best)
+sum_best <- summaryIRM(coef_est, se, ytil, dtil, drhat, name = "Best")
+tableirm <- rbind(tableirm, sum_best)
 tableirm
 ```

 ```{r}
@@ -562,29 +628,29 @@ tableirm

 # Least squares model average
 # We'll look at weights that do best job for Y overall. Could also use different weights for Y0 and Y1
-ma.dw <- lm(D~dhat.lasso+dhat.rf+dhat.tr+dhat.boost)$coef
-ma.yw <- lm(y~yhat.lasso+yhat.rf+yhat.tr+yhat.boost)$coef
+ma_dw <- lm(D ~ dhat_lasso + dhat_rf + dhat_tr + dhat_boost)$coef
+ma_yw <- lm(y ~ yhat_lasso + yhat_rf + yhat_tr + yhat_boost)$coef

-Dhats <- cbind(as.matrix(rep(1,nrow(X))),dhat.lasso,dhat.rf,dhat.tr,dhat.boost)
-Y0s <- cbind(as.matrix(rep(1,nrow(X))),yhat0.lasso,yhat0.rf,yhat0.tr,yhat0.boost)
-Y1s <- cbind(as.matrix(rep(1,nrow(X))),yhat1.lasso,yhat1.rf,yhat1.tr,yhat1.boost)
+Dhats <- cbind(as.matrix(rep(1, nrow(X))), dhat_lasso, dhat_rf, dhat_tr, dhat_boost)
+Y0s <- cbind(as.matrix(rep(1, nrow(X))), yhat0_lasso, yhat0_rf, yhat0_tr, yhat0_boost)
+Y1s <- cbind(as.matrix(rep(1, nrow(X))), yhat1_lasso, yhat1_rf, yhat1_tr, yhat1_boost)

-dhat <- Dhats%*%as.matrix(ma.dw)
-yhat0 <- Y0s%*%as.matrix(ma.yw)
-yhat1 <- Y1s%*%as.matrix(ma.yw)
+dhat <- Dhats %*% as.matrix(ma_dw)
+yhat0 <- Y0s %*% as.matrix(ma_yw)
+yhat1 <- Y1s %*% as.matrix(ma_yw)

 # Prediction of treatment and outcome for observed treatment
 yhat <- yhat0 * (1 - D) + yhat1 * D

 # residuals
-ytil <- y-yhat
-dtil <- D-dhat
+ytil <- y - yhat
+dtil <- D - dhat
 # doubly robust quantity for every sample
-drhat <- yhat1 - yhat0 + (y-yhat)* (D/dhat - (1 - D)/(1 - dhat))
-coef.est <- mean(drhat)
+drhat <- yhat1 - yhat0 + (y - yhat) * (D / dhat - (1 - D) / (1 - dhat))
+coef_est <- mean(drhat)
 vari <- var(drhat)
-se <- sqrt(vari/nrow(X))
+se <- sqrt(vari / nrow(X))

-sum.ma <- summaryIRM(coef.est, se, ytil, dtil, drhat, name = 'Model Average')
+sum.ma <- summaryIRM(coef_est, se, ytil, dtil, drhat, name = "Model Average")
 tableirm <- rbind(tableirm, sum.ma)
 tableirm
 ```
@@ -615,17 +681,14 @@ library(ranger)

 ```{r}
 # Constructing the data (as DoubleMLData)
-formula_flex = "net_tfa ~ e401 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + marr + twoearn + db + pira + hown"
-model_flex = as.data.table(model.frame(formula_flex, pension))
-x_cols = colnames(model_flex)[-c(1,2)]
-data_ml = DoubleMLData$new(model_flex, y_col = "net_tfa", d_cols = "e401", x_cols=x_cols)
-
+formula_flex <- paste("net_tfa ~ e401 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) ",
+                      "+ poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + marr + twoearn + db + pira + hown")
+model_flex <- as.data.table(model.frame(formula_flex, pension))
+x_cols <- colnames(model_flex)[-c(1, 2)]
+data_ml <- DoubleMLData$new(model_flex, y_col = "net_tfa", d_cols = "e401", x_cols = x_cols)

-p <- dim(model_flex)[2]-2
+p <- dim(model_flex)[2] - 2
 p
-
-# complex model with two-way interactions
-#data_interactions = fetch_401k(polynomial_features = TRUE, instrument = FALSE)
 ```

As mentioned, in the tutorial we use the meta package `mlr3` to generate predictions with machine learning methods. A comprehensive introduction and description of the `mlr3` package is provided in the [mlr3book](https://mlr3book.mlr-org.com/). A list of all learners that you can use in `mlr3` can be found [here](https://mlr3extralearners.mlr-org.com/articles/learners/list_learners.html). The entries in the columns *mlr3 Package* and *Packages* indicate which packages must be installed/loaded in your R session.
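As a quick orientation, here is a minimal sketch (assuming `mlr3`, `mlr3learners`, and `data.table` are attached, as in the surrounding chunks; the name `learner_table` is used only for illustration) of how to browse the learner dictionary and check which add-on packages a given learner needs:

```{r}
# list the learners registered in the mlr3 dictionary together with the packages they require
learner_table <- as.data.table(mlr_learners)
head(learner_table[, .(key, packages)])

# a learner object also reports its required packages directly,
# e.g. the ranger-based regression learner used further below
lrn("regr.ranger")$packages
```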
@@ -635,11 +698,11 @@ As mentioned, in the tutorial we use the meta package `mlr3` to generate predict ```{r} # Estimating the PLR lgr::get_logger("mlr3")$set_threshold("warn") -lasso <- lrn("regr.cv_glmnet",nfolds = 5, s = "lambda.min") +lasso <- lrn("regr.cv_glmnet", nfolds = 5, s = "lambda.min") lasso_class <- lrn("classif.cv_glmnet", nfolds = 5, s = "lambda.min") -dml_plr <- DoubleMLPLR$new(data_ml, ml_l = lasso, ml_m = lasso_class, n_folds=5) -dml_plr$fit(store_predictions=TRUE) +dml_plr <- DoubleMLPLR$new(data_ml, ml_l = lasso, ml_m = lasso_class, n_folds = 5) +dml_plr$fit(store_predictions = TRUE) dml_plr$summary() lasso_plr <- dml_plr$coef lasso_std_plr <- dml_plr$se @@ -658,15 +721,15 @@ m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o y <- as.matrix(pension$net_tfa) # true observations theta <- as.numeric(dml_plr$coef) # estimated regression coefficient d <- as.matrix(pension$e401) -predictions_y <- as.matrix(d*theta)+g_hat # predictions for y -lasso_y_rmse <- sqrt(mean((y-predictions_y)^2)) +predictions_y <- as.matrix(d * theta) + g_hat # predictions for y +lasso_y_rmse <- sqrt(mean((y - predictions_y)^2)) lasso_y_rmse ``` ```{r} # cross-fitted RMSE: treatment d <- as.matrix(pension$e401) -lasso_d_rmse <- sqrt(mean((d-m_hat)^2)) +lasso_d_rmse <- sqrt(mean((d - m_hat)^2)) lasso_d_rmse # cross-fitted ce: treatment @@ -679,10 +742,10 @@ Then, we repeat this procedure for various machine learning methods. # Random Forest lgr::get_logger("mlr3")$set_threshold("warn") randomForest <- lrn("regr.ranger") -randomForest_class <- lrn("classif.ranger") +random_forest_class <- lrn("classif.ranger") -dml_plr <- DoubleMLPLR$new(data_ml, ml_l = randomForest, ml_m = randomForest_class, n_folds=5) -dml_plr$fit(store_predictions=TRUE) # set store_predictions=TRUE to evaluate the model +dml_plr <- DoubleMLPLR$new(data_ml, ml_l = randomForest, ml_m = random_forest_class, n_folds = 5) +dml_plr$fit(store_predictions = TRUE) # set store_predictions=TRUE to evaluate the model dml_plr$summary() forest_plr <- dml_plr$coef forest_std_plr <- dml_plr$se @@ -695,12 +758,12 @@ We can compare the accuracy of this model to the model that has been estimated w g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o theta <- as.numeric(dml_plr$coef) # estimated regression coefficient -predictions_y <- as.matrix(d*theta)+g_hat # predictions for y -forest_y_rmse <- sqrt(mean((y-predictions_y)^2)) +predictions_y <- as.matrix(d * theta) + g_hat # predictions for y +forest_y_rmse <- sqrt(mean((y - predictions_y)^2)) forest_y_rmse # cross-fitted RMSE: treatment -forest_d_rmse <- sqrt(mean((d-m_hat)^2)) +forest_d_rmse <- sqrt(mean((d - m_hat)^2)) forest_d_rmse # cross-fitted ce: treatment @@ -714,8 +777,8 @@ lgr::get_logger("mlr3")$set_threshold("warn") trees <- lrn("regr.rpart") trees_class <- lrn("classif.rpart") -dml_plr <- DoubleMLPLR$new(data_ml, ml_l = trees, ml_m = trees_class, n_folds=5) -dml_plr$fit(store_predictions=TRUE) +dml_plr <- DoubleMLPLR$new(data_ml, ml_l = trees, ml_m = trees_class, n_folds = 5) +dml_plr$fit(store_predictions = TRUE) dml_plr$summary() tree_plr <- dml_plr$coef tree_std_plr <- dml_plr$se @@ -724,12 +787,12 @@ tree_std_plr <- dml_plr$se g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o theta <- as.numeric(dml_plr$coef) # estimated regression coefficient -predictions_y <- as.matrix(d*theta)+g_hat # predictions for y 
-tree_y_rmse <- sqrt(mean((y-predictions_y)^2)) +predictions_y <- as.matrix(d * theta) + g_hat # predictions for y +tree_y_rmse <- sqrt(mean((y - predictions_y)^2)) tree_y_rmse # cross-fitted RMSE: treatment -tree_d_rmse <- sqrt(mean((d-m_hat)^2)) +tree_d_rmse <- sqrt(mean((d - m_hat)^2)) tree_d_rmse # cross-fitted ce: treatment @@ -747,11 +810,11 @@ library(mboost) ```{r} # Boosting -boost<- lrn("regr.glmboost") +boost <- lrn("regr.glmboost") boost_class <- lrn("classif.glmboost") -dml_plr <- DoubleMLPLR$new(data_ml, ml_l = boost, ml_m = boost_class, n_folds=5) -dml_plr$fit(store_predictions=TRUE) +dml_plr <- DoubleMLPLR$new(data_ml, ml_l = boost, ml_m = boost_class, n_folds = 5) +dml_plr$fit(store_predictions = TRUE) dml_plr$summary() boost_plr <- dml_plr$coef boost_std_plr <- dml_plr$se @@ -760,12 +823,12 @@ boost_std_plr <- dml_plr$se g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o theta <- as.numeric(dml_plr$coef) # estimated regression coefficient -predictions_y <- as.matrix(d*theta)+g_hat # predictions for y -boost_y_rmse <- sqrt(mean((y-predictions_y)^2)) +predictions_y <- as.matrix(d * theta) + g_hat # predictions for y +boost_y_rmse <- sqrt(mean((y - predictions_y)^2)) boost_y_rmse # cross-fitted RMSE: treatment -boost_d_rmse <- sqrt(mean((d-m_hat)^2)) +boost_d_rmse <- sqrt(mean((d - m_hat)^2)) boost_d_rmse # cross-fitted ce: treatment @@ -776,13 +839,13 @@ Let's sum up the results: ```{r} table <- matrix(0, 4, 4) -table[1,1:4] <- c(lasso_plr,forest_plr,tree_plr,boost_plr) -table[2,1:4] <- c(lasso_std_plr,forest_std_plr,tree_std_plr,boost_std_plr) -table[3,1:4] <- c(lasso_y_rmse,forest_y_rmse,tree_y_rmse,boost_y_rmse) -table[4,1:4] <- c(lasso_d_rmse,forest_d_rmse,tree_d_rmse,boost_d_rmse) -rownames(table) <- c("Estimate","Std.Error","RMSE Y","RMSE D") -colnames(table) <- c("Lasso","Random Forest","Trees","Boosting") -tab<- xtable(table, digits = 2) +table[1, 1:4] <- c(lasso_plr, forest_plr, tree_plr, boost_plr) +table[2, 1:4] <- c(lasso_std_plr, forest_std_plr, tree_std_plr, boost_std_plr) +table[3, 1:4] <- c(lasso_y_rmse, forest_y_rmse, tree_y_rmse, boost_y_rmse) +table[4, 1:4] <- c(lasso_d_rmse, forest_d_rmse, tree_d_rmse, boost_d_rmse) +rownames(table) <- c("Estimate", "Std.Error", "RMSE Y", "RMSE D") +colnames(table) <- c("Lasso", "Random Forest", "Trees", "Boosting") +tab <- xtable(table, digits = 2) tab ``` @@ -796,10 +859,12 @@ lasso_plr ```{r} lgr::get_logger("mlr3")$set_threshold("warn") -dml_irm = DoubleMLIRM$new(data_ml, ml_g = lasso, - ml_m = lasso_class, - trimming_threshold = 0.01, n_folds=5) -dml_irm$fit(store_predictions=TRUE) +dml_irm <- DoubleMLIRM$new(data_ml, + ml_g = lasso, + ml_m = lasso_class, + trimming_threshold = 0.01, n_folds = 5 +) +dml_irm$fit(store_predictions = TRUE) dml_irm$summary() lasso_irm <- dml_irm$coef lasso_std_irm <- dml_irm$se @@ -809,7 +874,7 @@ lasso_std_irm <- dml_irm$se dml_irm$params_names() g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) -g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0 +g_hat <- d * g1_hat + (1 - d) * g0_hat # predictions of g_0 m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o ``` @@ -817,11 +882,11 @@ m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o # cross-fitted RMSE: outcome y <- as.matrix(pension$net_tfa) # true observations d <- as.matrix(pension$e401) -lasso_y_irm <- 
sqrt(mean((y-g_hat)^2)) +lasso_y_irm <- sqrt(mean((y - g_hat)^2)) lasso_y_irm # cross-fitted RMSE: treatment -lasso_d_irm <- sqrt(mean((d-m_hat)^2)) +lasso_d_irm <- sqrt(mean((d - m_hat)^2)) lasso_d_irm # cross-fitted ce: treatment @@ -831,10 +896,12 @@ mean(ifelse(m_hat > 0.5, 1, 0) != d) ```{r} ##### forest ##### -dml_irm = DoubleMLIRM$new(data_ml, ml_g = randomForest, - ml_m = randomForest_class, - trimming_threshold = 0.01, n_folds=5) -dml_irm$fit(store_predictions=TRUE) +dml_irm <- DoubleMLIRM$new(data_ml, + ml_g = randomForest, + ml_m = random_forest_class, + trimming_threshold = 0.01, n_folds = 5 +) +dml_irm$fit(store_predictions = TRUE) dml_irm$summary() forest_irm <- dml_irm$coef forest_std_irm <- dml_plr$se @@ -842,17 +909,17 @@ forest_std_irm <- dml_plr$se # predictions g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) -g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0 +g_hat <- d * g1_hat + (1 - d) * g0_hat # predictions of g_0 m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_0 # cross-fitted RMSE: outcome y <- as.matrix(pension$net_tfa) # true observations d <- as.matrix(pension$e401) -forest_y_irm <- sqrt(mean((y-g_hat)^2)) +forest_y_irm <- sqrt(mean((y - g_hat)^2)) forest_y_irm # cross-fitted RMSE: treatment -forest_d_irm <- sqrt(mean((d-m_hat)^2)) +forest_d_irm <- sqrt(mean((d - m_hat)^2)) forest_d_irm # cross-fitted ce: treatment @@ -860,9 +927,11 @@ mean(ifelse(m_hat > 0.5, 1, 0) != d) ##### trees ##### -dml_irm <- DoubleMLIRM$new(data_ml, ml_g = trees, ml_m = trees_class, - trimming_threshold = 0.01, n_folds=5) -dml_irm$fit(store_predictions=TRUE) +dml_irm <- DoubleMLIRM$new(data_ml, + ml_g = trees, ml_m = trees_class, + trimming_threshold = 0.01, n_folds = 5 +) +dml_irm$fit(store_predictions = TRUE) dml_irm$summary() tree_irm <- dml_irm$coef tree_std_irm <- dml_irm$se @@ -870,17 +939,17 @@ tree_std_irm <- dml_irm$se # predictions g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) -g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0 +g_hat <- d * g1_hat + (1 - d) * g0_hat # predictions of g_0 m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o # cross-fitted RMSE: outcome y <- as.matrix(pension$net_tfa) # true observations d <- as.matrix(pension$e401) -tree_y_irm <- sqrt(mean((y-g_hat)^2)) +tree_y_irm <- sqrt(mean((y - g_hat)^2)) tree_y_irm # cross-fitted RMSE: treatment -tree_d_irm <- sqrt(mean((d-m_hat)^2)) +tree_d_irm <- sqrt(mean((d - m_hat)^2)) tree_d_irm # cross-fitted ce: treatment @@ -889,9 +958,11 @@ mean(ifelse(m_hat > 0.5, 1, 0) != d) ##### boosting ##### -dml_irm <- DoubleMLIRM$new(data_ml, ml_g = boost, ml_m = boost_class, - trimming_threshold = 0.01, n_folds=5) -dml_irm$fit(store_predictions=TRUE) +dml_irm <- DoubleMLIRM$new(data_ml, + ml_g = boost, ml_m = boost_class, + trimming_threshold = 0.01, n_folds = 5 +) +dml_irm$fit(store_predictions = TRUE) dml_irm$summary() boost_irm <- dml_irm$coef boost_std_irm <- dml_irm$se @@ -899,17 +970,17 @@ boost_std_irm <- dml_irm$se # predictions g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) -g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0 +g_hat <- d * g1_hat + (1 - d) * g0_hat # predictions of g_0 m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o # cross-fitted 
RMSE: outcome y <- as.matrix(pension$net_tfa) # true observations d <- as.matrix(pension$e401) -boost_y_irm <- sqrt(mean((y-g_hat)^2)) +boost_y_irm <- sqrt(mean((y - g_hat)^2)) boost_y_irm # cross-fitted RMSE: treatment -boost_d_irm <- sqrt(mean((d-m_hat)^2)) +boost_d_irm <- sqrt(mean((d - m_hat)^2)) boost_d_irm # cross-fitted ce: treatment @@ -918,13 +989,13 @@ mean(ifelse(m_hat > 0.5, 1, 0) != d) ```{r} table <- matrix(0, 4, 4) -table[1,1:4] <- c(lasso_irm,forest_irm,tree_irm,boost_irm) -table[2,1:4] <- c(lasso_std_irm,forest_std_irm,tree_std_irm,boost_std_irm) -table[3,1:4] <- c(lasso_y_irm,forest_y_irm,tree_y_irm,boost_y_irm) -table[4,1:4] <- c(lasso_d_irm,forest_d_irm,tree_d_irm,boost_d_irm) -rownames(table) <- c("Estimate","Std.Error","RMSE Y","RMSE D") -colnames(table) <- c("Lasso","Random Forest","Trees","Boosting") -tab<- xtable(table, digits = 2) +table[1, 1:4] <- c(lasso_irm, forest_irm, tree_irm, boost_irm) +table[2, 1:4] <- c(lasso_std_irm, forest_std_irm, tree_std_irm, boost_std_irm) +table[3, 1:4] <- c(lasso_y_irm, forest_y_irm, tree_y_irm, boost_y_irm) +table[4, 1:4] <- c(lasso_d_irm, forest_d_irm, tree_d_irm, boost_d_irm) +rownames(table) <- c("Estimate", "Std.Error", "RMSE Y", "RMSE D") +colnames(table) <- c("Lasso", "Random Forest", "Trees", "Boosting") +tab <- xtable(table, digits = 2) tab ``` @@ -932,10 +1003,12 @@ Here, Random Forest gives the best prediction rule for $g_0$ and Lasso the best ```{r} lgr::get_logger("mlr3")$set_threshold("warn") -dml_irm = DoubleMLIRM$new(data_ml, ml_g = randomForest, - ml_m = lasso_class, - trimming_threshold = 0.01, n_folds=5) -dml_irm$fit(store_predictions=TRUE) +dml_irm <- DoubleMLIRM$new(data_ml, + ml_g = randomForest, + ml_m = lasso_class, + trimming_threshold = 0.01, n_folds = 5 +) +dml_irm$fit(store_predictions = TRUE) dml_irm$summary() best_irm <- dml_irm$coef best_std_irm <- dml_irm$se diff --git a/PM4/r-dml-401k.irnb b/PM4/r-dml-401k.irnb index 0d5b565d..8879759a 100644 --- a/PM4/r-dml-401k.irnb +++ b/PM4/r-dml-401k.irnb @@ -66,7 +66,7 @@ { "cell_type": "code", "execution_count": null, - "id": "04026d3c", + "id": "3", "metadata": { "vscode": { "languageId": "r" @@ -87,7 +87,7 @@ }, { "cell_type": "markdown", - "id": "3", + "id": "4", "metadata": { "id": "7e23cba0", "papermill": { @@ -110,7 +110,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4", + "id": "5", "metadata": { "id": "c442abdc", "papermill": { @@ -134,7 +134,7 @@ }, { "cell_type": "markdown", - "id": "5", + "id": "6", "metadata": { "id": "e47fa9d3", "papermill": { @@ -153,7 +153,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6", + "id": "7", "metadata": { "id": "00e04b82", "papermill": { @@ -175,7 +175,7 @@ }, { "cell_type": "markdown", - "id": "7", + "id": "8", "metadata": { "id": "24b41e4a", "papermill": { @@ -193,7 +193,7 @@ }, { "cell_type": "markdown", - "id": "8", + "id": "9", "metadata": { "id": "ed9d4e82", "papermill": { @@ -212,7 +212,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9", + "id": "10", "metadata": { "id": "63519184", "papermill": { @@ -236,7 +236,7 @@ }, { "cell_type": "markdown", - "id": "10", + "id": "11", "metadata": { "id": "823d2628", "papermill": { @@ -255,7 +255,7 @@ { "cell_type": "code", "execution_count": null, - "id": "11", + "id": "12", "metadata": { "id": "5d8faf9c", "papermill": { @@ -282,7 +282,7 @@ }, { "cell_type": "markdown", - "id": "12", + "id": "13", "metadata": { "id": "0f4f86a7", "papermill": { @@ -301,7 +301,7 @@ { "cell_type": "code", "execution_count": null, - "id": "13", + 
"id": "14", "metadata": { "id": "836c6af7", "papermill": { @@ -325,7 +325,7 @@ }, { "cell_type": "markdown", - "id": "14", + "id": "15", "metadata": { "id": "22b09926", "papermill": { @@ -344,7 +344,7 @@ { "cell_type": "code", "execution_count": null, - "id": "15", + "id": "16", "metadata": { "id": "e78aaa58", "papermill": { @@ -368,7 +368,7 @@ }, { "cell_type": "markdown", - "id": "16", + "id": "17", "metadata": { "id": "e0af3c81", "papermill": { @@ -387,7 +387,7 @@ { "cell_type": "code", "execution_count": null, - "id": "17", + "id": "18", "metadata": { "id": "1hBrSMQGzZBR", "vscode": { @@ -419,7 +419,7 @@ { "cell_type": "code", "execution_count": null, - "id": "18", + "id": "19", "metadata": { "id": "DD0Hwcb6z4u5", "vscode": { @@ -437,7 +437,7 @@ }, { "cell_type": "markdown", - "id": "19", + "id": "20", "metadata": { "id": "MZThhulbKA9W" }, @@ -447,7 +447,7 @@ }, { "cell_type": "markdown", - "id": "20", + "id": "21", "metadata": { "id": "UuYqY89D0pvs" }, @@ -457,7 +457,7 @@ }, { "cell_type": "markdown", - "id": "21", + "id": "22", "metadata": { "id": "vEAeB2ih0r8B" }, @@ -470,7 +470,7 @@ }, { "cell_type": "markdown", - "id": "22", + "id": "23", "metadata": { "id": "cde447aa", "papermill": { @@ -489,7 +489,7 @@ { "cell_type": "code", "execution_count": null, - "id": "23", + "id": "24", "metadata": { "id": "tqFlcClUNr9Z", "vscode": { @@ -547,7 +547,7 @@ { "cell_type": "code", "execution_count": null, - "id": "24", + "id": "25", "metadata": { "id": "sS0P4CVySjDP", "vscode": { @@ -573,7 +573,7 @@ }, { "cell_type": "markdown", - "id": "25", + "id": "26", "metadata": { "id": "pdGcjnngSn5Q" }, @@ -584,7 +584,7 @@ { "cell_type": "code", "execution_count": null, - "id": "26", + "id": "27", "metadata": { "id": "LOVuR5QO1bkB", "vscode": { @@ -616,7 +616,7 @@ { "cell_type": "code", "execution_count": null, - "id": "27", + "id": "28", "metadata": { "id": "KatOw36Z0ghO", "vscode": { @@ -632,7 +632,7 @@ }, { "cell_type": "markdown", - "id": "28", + "id": "29", "metadata": { "id": "4wvLEj12SpDf" }, @@ -645,7 +645,7 @@ { "cell_type": "code", "execution_count": null, - "id": "29", + "id": "30", "metadata": { "id": "b9Nvp5ZlSuwB", "vscode": { @@ -675,7 +675,7 @@ { "cell_type": "code", "execution_count": null, - "id": "30", + "id": "31", "metadata": { "id": "hJqMdcZV05lr", "vscode": { @@ -691,7 +691,7 @@ }, { "cell_type": "markdown", - "id": "31", + "id": "32", "metadata": { "id": "txyv6IDXSu64" }, @@ -702,7 +702,7 @@ { "cell_type": "code", "execution_count": null, - "id": "32", + "id": "33", "metadata": { "id": "nt0oTHTfSwMr", "vscode": { @@ -732,7 +732,7 @@ { "cell_type": "code", "execution_count": null, - "id": "33", + "id": "34", "metadata": { "id": "TG476dPX1BI_", "vscode": { @@ -748,7 +748,7 @@ }, { "cell_type": "markdown", - "id": "34", + "id": "35", "metadata": { "id": "k8EFP-w_SwXZ" }, @@ -759,7 +759,7 @@ { "cell_type": "code", "execution_count": null, - "id": "35", + "id": "36", "metadata": { "id": "3Nu4daQRSyRb", "vscode": { @@ -790,7 +790,7 @@ { "cell_type": "code", "execution_count": null, - "id": "36", + "id": "37", "metadata": { "id": "RnCGwVbN1KJJ", "vscode": { @@ -806,7 +806,7 @@ }, { "cell_type": "markdown", - "id": "37", + "id": "38", "metadata": { "id": "jODHt0hjntdP" }, @@ -817,7 +817,7 @@ }, { "cell_type": "markdown", - "id": "38", + "id": "39", "metadata": { "id": "SaPGNW0SSxWk" }, @@ -828,7 +828,7 @@ { "cell_type": "code", "execution_count": null, - "id": "39", + "id": "40", "metadata": { "id": "Ekg5qeEOSxep", "vscode": { @@ -863,7 +863,7 @@ { "cell_type": "code", "execution_count": 
null, - "id": "40", + "id": "41", "metadata": { "id": "WSyqSd5Z1hne", "vscode": { @@ -879,7 +879,7 @@ }, { "cell_type": "markdown", - "id": "41", + "id": "42", "metadata": { "id": "7UZphpPS10Hz" }, @@ -889,7 +889,7 @@ }, { "cell_type": "markdown", - "id": "42", + "id": "43", "metadata": { "id": "Hqsqpgs6C4fJ" }, @@ -900,7 +900,7 @@ { "cell_type": "code", "execution_count": null, - "id": "43", + "id": "44", "metadata": { "id": "gDrZqZXR12hA", "vscode": { @@ -919,7 +919,7 @@ }, { "cell_type": "markdown", - "id": "44", + "id": "45", "metadata": { "id": "pG8mmrQw2GRC" }, @@ -930,7 +930,7 @@ { "cell_type": "code", "execution_count": null, - "id": "45", + "id": "46", "metadata": { "id": "Pkg7pw5h2N0z", "vscode": { @@ -955,7 +955,7 @@ }, { "cell_type": "markdown", - "id": "46", + "id": "47", "metadata": { "id": "67fa5873", "papermill": { @@ -973,7 +973,7 @@ }, { "cell_type": "markdown", - "id": "47", + "id": "48", "metadata": { "id": "86393e4c", "papermill": { @@ -991,7 +991,7 @@ }, { "cell_type": "markdown", - "id": "48", + "id": "49", "metadata": { "id": "830bb508", "papermill": { @@ -1012,7 +1012,7 @@ }, { "cell_type": "markdown", - "id": "49", + "id": "50", "metadata": { "id": "9e5ec32b", "papermill": { @@ -1032,7 +1032,7 @@ { "cell_type": "code", "execution_count": null, - "id": "50", + "id": "51", "metadata": { "id": "-hCmnqC-N0nS", "vscode": { @@ -1126,7 +1126,7 @@ { "cell_type": "code", "execution_count": null, - "id": "51", + "id": "52", "metadata": { "id": "bCj1D8_MSg09", "vscode": { @@ -1152,7 +1152,7 @@ }, { "cell_type": "markdown", - "id": "52", + "id": "53", "metadata": { "id": "6mCdfifchkgZ" }, @@ -1163,7 +1163,7 @@ { "cell_type": "code", "execution_count": null, - "id": "53", + "id": "54", "metadata": { "id": "AUiHMoNTvo-m", "vscode": { @@ -1203,7 +1203,7 @@ { "cell_type": "code", "execution_count": null, - "id": "54", + "id": "55", "metadata": { "id": "JPABXLYyvyqy", "vscode": { @@ -1242,7 +1242,7 @@ { "cell_type": "code", "execution_count": null, - "id": "55", + "id": "56", "metadata": { "id": "SukZCfEbvyzC", "vscode": { @@ -1280,7 +1280,7 @@ { "cell_type": "code", "execution_count": null, - "id": "56", + "id": "57", "metadata": { "id": "bTfgiCabvy6f", "vscode": { @@ -1324,7 +1324,7 @@ { "cell_type": "code", "execution_count": null, - "id": "57", + "id": "58", "metadata": { "id": "7rxqwK-R4Z2q", "vscode": { @@ -1356,7 +1356,7 @@ { "cell_type": "code", "execution_count": null, - "id": "58", + "id": "59", "metadata": { "id": "0-c3NI0fCfqg", "vscode": { @@ -1397,7 +1397,7 @@ }, { "cell_type": "markdown", - "id": "59", + "id": "60", "metadata": { "id": "01de9f24", "papermill": { @@ -1415,7 +1415,7 @@ }, { "cell_type": "markdown", - "id": "60", + "id": "61", "metadata": { "id": "6cdc366f", "papermill": { @@ -1438,7 +1438,7 @@ { "cell_type": "code", "execution_count": null, - "id": "61", + "id": "62", "metadata": { "id": "2846a36a", "papermill": { @@ -1473,7 +1473,7 @@ { "cell_type": "code", "execution_count": null, - "id": "62", + "id": "63", "metadata": { "id": "2a141248", "papermill": { @@ -1503,7 +1503,7 @@ }, { "cell_type": "markdown", - "id": "63", + "id": "64", "metadata": { "id": "2e1c9339", "papermill": { @@ -1521,7 +1521,7 @@ }, { "cell_type": "markdown", - "id": "64", + "id": "65", "metadata": { "id": "Cwmd7ELXKeIg" }, @@ -1532,7 +1532,7 @@ { "cell_type": "code", "execution_count": null, - "id": "65", + "id": "66", "metadata": { "id": "a48e367d", "papermill": { @@ -1563,7 +1563,7 @@ }, { "cell_type": "markdown", - "id": "66", + "id": "67", "metadata": { "id": "135275dc", 
"papermill": { @@ -1582,7 +1582,7 @@ { "cell_type": "code", "execution_count": null, - "id": "67", + "id": "68", "metadata": { "id": "e6d83bbe", "papermill": { @@ -1607,7 +1607,7 @@ { "cell_type": "code", "execution_count": null, - "id": "68", + "id": "69", "metadata": { "id": "32c894fa", "papermill": { @@ -1636,7 +1636,7 @@ { "cell_type": "code", "execution_count": null, - "id": "69", + "id": "70", "metadata": { "id": "da5b9334", "papermill": { @@ -1664,7 +1664,7 @@ }, { "cell_type": "markdown", - "id": "70", + "id": "71", "metadata": { "id": "c1481527", "papermill": { @@ -1683,7 +1683,7 @@ { "cell_type": "code", "execution_count": null, - "id": "71", + "id": "72", "metadata": { "id": "dac2d0fc", "papermill": { @@ -1714,7 +1714,7 @@ }, { "cell_type": "markdown", - "id": "72", + "id": "73", "metadata": { "id": "c7c614e6", "papermill": { @@ -1733,7 +1733,7 @@ { "cell_type": "code", "execution_count": null, - "id": "73", + "id": "74", "metadata": { "id": "f8af1a74", "papermill": { @@ -1769,7 +1769,7 @@ { "cell_type": "code", "execution_count": null, - "id": "74", + "id": "75", "metadata": { "id": "61a94dff", "papermill": { @@ -1817,7 +1817,7 @@ { "cell_type": "code", "execution_count": null, - "id": "75", + "id": "76", "metadata": { "id": "885c94eb", "papermill": { @@ -1845,7 +1845,7 @@ { "cell_type": "code", "execution_count": null, - "id": "76", + "id": "77", "metadata": { "id": "0372eefe", "papermill": { @@ -1890,7 +1890,7 @@ }, { "cell_type": "markdown", - "id": "77", + "id": "78", "metadata": { "id": "ffa1e35a", "papermill": { @@ -1909,7 +1909,7 @@ { "cell_type": "code", "execution_count": null, - "id": "78", + "id": "79", "metadata": { "id": "d322c48a", "papermill": { @@ -1939,7 +1939,7 @@ }, { "cell_type": "markdown", - "id": "79", + "id": "80", "metadata": { "id": "e8e9ffc8", "papermill": { @@ -1958,7 +1958,7 @@ { "cell_type": "code", "execution_count": null, - "id": "80", + "id": "81", "metadata": { "id": "33fcc2b4", "papermill": { @@ -1980,7 +1980,7 @@ }, { "cell_type": "markdown", - "id": "81", + "id": "82", "metadata": { "id": "Ebrv1spfKWxH" }, @@ -1991,7 +1991,7 @@ { "cell_type": "code", "execution_count": null, - "id": "82", + "id": "83", "metadata": { "id": "9a7410a9", "papermill": { @@ -2031,7 +2031,7 @@ { "cell_type": "code", "execution_count": null, - "id": "83", + "id": "84", "metadata": { "id": "1a34a9e8", "papermill": { @@ -2065,7 +2065,7 @@ { "cell_type": "code", "execution_count": null, - "id": "84", + "id": "85", "metadata": { "id": "d0c93355", "papermill": { @@ -2178,7 +2178,7 @@ { "cell_type": "code", "execution_count": null, - "id": "85", + "id": "86", "metadata": { "id": "bf344442", "papermill": { @@ -2208,7 +2208,7 @@ }, { "cell_type": "markdown", - "id": "86", + "id": "87", "metadata": { "id": "cddc45ff", "papermill": { @@ -2227,7 +2227,7 @@ { "cell_type": "code", "execution_count": null, - "id": "87", + "id": "88", "metadata": { "id": "9d4b8690", "papermill": { @@ -2258,7 +2258,7 @@ }, { "cell_type": "markdown", - "id": "88", + "id": "89", "metadata": { "id": "92a77dd6", "papermill": { diff --git a/PM4/r-identification-analysis-of-401-k-example-w-dags.Rmd b/PM4/r-identification-analysis-of-401-k-example-w-dags.Rmd index bf3f40c1..e721fe31 100644 --- a/PM4/r-identification-analysis-of-401-k-example-w-dags.Rmd +++ b/PM4/r-identification-analysis-of-401-k-example-w-dags.Rmd @@ -6,9 +6,12 @@ output: html_document # Using Dagitty in the Analysis of Impact of 401(k) on Net Financial Wealth ```{r} -#install and load package +# install and load package 
install.packages("dagitty") install.packages("ggdag") +``` + +```{r} library(dagitty) library(ggdag) ``` @@ -24,9 +27,9 @@ Here we have **One graph (where F determines X):** ```{r} -#generate a DAGs and plot them +# generate a DAGs and plot them -G1 = dagitty('dag{ +G1 <- dagitty('dag{ Y [outcome,pos="4, 0"] D [exposure,pos="0, 0"] X [confounder, pos="2,-2"] @@ -38,14 +41,14 @@ F -> D X -> Y}') -ggdag(G1)+ theme_dag() +ggdag(G1) + theme_dag() ``` **List minimal adjustment sets to identify causal effects $D \to Y$** ```{r} -adjustmentSets( G1, "D", "Y",effect="total" ) +adjustmentSets(G1, "D", "Y", effect = "total") ``` **What is the underlying principle?** @@ -55,9 +58,9 @@ Here conditioning on X blocks backdoor paths from Y to D (Pearl). Dagitty corre **Another Graph (wherere $X$ determines $F$):** ```{r} -#generate a couple of DAGs and plot them +# generate a couple of DAGs and plot them -G2 = dagitty('dag{ +G2 <- dagitty('dag{ Y [outcome,pos="4, 0"] D [exposure,pos="0, 0"] X [confounder, pos="2,-2"] @@ -69,11 +72,11 @@ F -> D X -> Y}') -ggdag(G2)+ theme_dag() +ggdag(G2) + theme_dag() ``` ```{r} -adjustmentSets( G2, "D", "Y", effect="total" ) +adjustmentSets(G2, "D", "Y", effect = "total") ``` **One more graph (encompassing the previous ones), where (F, X) are jointly determined by latent factors $A$.** @@ -83,7 +86,7 @@ We can allow in fact the whole triple (D, F, X) to be jointly determined by late This is much more realistic graph to consider. ```{r} -G3 = dagitty('dag{ +G3 <- dagitty('dag{ Y [outcome,pos="4, 0"] D [exposure,pos="0, 0"] X [confounder, pos="2,-2"] @@ -97,9 +100,9 @@ A -> X A -> D X -> Y}') -adjustmentSets( G3, "D", "Y", effect="total" ) +adjustmentSets(G3, "D", "Y", effect = "total") -ggdag(G3)+ theme_dag() +ggdag(G3) + theme_dag() ``` # Threat to Identification: @@ -107,7 +110,7 @@ ggdag(G3)+ theme_dag() What if $F$ also directly affects $Y$? (Note that there are no valid adjustment sets in this case.) ```{r} -G4 = dagitty('dag{ +G4 <- dagitty('dag{ Y [outcome,pos="4, 0"] D [exposure,pos="0, 0"] X [confounder, pos="2,-2"] @@ -123,11 +126,11 @@ F -> Y X -> Y}') -ggdag(G4)+ theme_dag() +ggdag(G4) + theme_dag() ``` ```{r} -adjustmentSets( G4, "D", "Y",effect="total" ) +adjustmentSets(G4, "D", "Y", effect = "total") ``` **Note that no output means that there is no valid adustment set (among observed variables).** @@ -137,7 +140,7 @@ adjustmentSets( G4, "D", "Y",effect="total" ) Introduce Match Amount $M$. The match amount is a potential important mediator (why mediator?). $M$ is not observed. Luckily, adjusting for $X$ still works if there is no arrow $F \to M$. ```{r} -G5 = dagitty('dag{ +G5 <- dagitty('dag{ Y [outcome,pos="4, 0"] D [exposure,pos="0, 0"] X [confounder, pos="2,-2"] @@ -155,15 +158,15 @@ M -> Y X -> M X -> Y}') -print( adjustmentSets( G5, "D", "Y",effect="total" ) ) +print(adjustmentSets(G5, "D", "Y", effect = "total")) -ggdag(G5)+ theme_dag() +ggdag(G5) + theme_dag() ``` If there is an $F \to M$ arrow, then adjusting for $X$ is not sufficient. ```{r} -G6 = dagitty('dag{ +G6 <- dagitty('dag{ Y [outcome,pos="4, 0"] D [exposure,pos="0, 0"] X [confounder, pos="2,-2"] @@ -182,9 +185,9 @@ M -> Y X -> M X -> Y}') -print( adjustmentSets( G6, "D", "Y" ),effect="total" ) +print(adjustmentSets(G6, "D", "Y"), effect = "total") -ggdag(G6)+ theme_dag() +ggdag(G6) + theme_dag() ``` Again, note that no output was returned for the adjustment set. There is no valid adjustment set here. 
diff --git a/PM4/r_debiased_ml_for_partially_linear_model_growth.Rmd b/PM4/r_debiased_ml_for_partially_linear_model_growth.Rmd index 406f0f9d..d35fbafb 100644 --- a/PM4/r_debiased_ml_for_partially_linear_model_growth.Rmd +++ b/PM4/r_debiased_ml_for_partially_linear_model_growth.Rmd @@ -21,7 +21,9 @@ install.packages("hdm") install.packages("randomForest") install.packages("glmnet") install.packages("sandwich") +``` +```{r} library(xtable) library(randomForest) library(hdm) @@ -40,16 +42,17 @@ dim(data) ``` ```{r} -y = as.matrix(data[,1]) # outcome: growth rate -d = as.matrix(data[,3]) # treatment: initial wealth -x = as.matrix(data[,-c(1,2,3)]) # controls: country characteristics +y <- as.matrix(data[, 1]) # outcome: growth rate +d <- as.matrix(data[, 3]) # treatment: initial wealth +x <- as.matrix(data[, -c(1, 2, 3)]) # controls: country characteristics # some summary statistics -cat(sprintf("\nThe length of y is %g \n", length(y) )) -cat(sprintf("\nThe number of features in x is %g \n", dim(x)[2] )) +cat(sprintf("\nThe length of y is %g \n", length(y))) +cat(sprintf("\nThe number of features in x is %g \n", dim(x)[2])) -lres=summary(lm(y~d +x))$coef[2,1:2] -cat(sprintf("\nNaive OLS that uses all features w/o cross-fitting Y ~ D+X yields: \ncoef (se) = %g (%g)\n", lres[1] , lres[2])) +lres <- summary(lm(y ~ d + x))$coef[2, 1:2] +cat(sprintf("\nNaive OLS that uses all features w/o cross-fitting Y ~ D+X yields: \ncoef (se) = %g (%g)\n", + lres[1], lres[2])) ``` # DML algorithm @@ -72,26 +75,26 @@ Y(d) = d\alpha + g(X) + U(d),\quad U(d) \text{ indep } D |X, \quad Y = Y(D), \q $$ ```{r} -DML2.for.PLM <- function(x, d, y, dreg, yreg, nfold=2) { - nobs <- nrow(x) #number of observations - foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices - I <- split(1:nobs, foldid) #split observation indices into folds +dml2_for_plm <- function(x, d, y, dreg, yreg, nfold = 2) { + nobs <- nrow(x) # number of observations + foldid <- rep.int(1:nfold, times = ceiling(nobs / nfold))[sample.int(nobs)] # define folds indices + I <- split(1:nobs, foldid) # split observation indices into folds ytil <- dtil <- rep(NA, nobs) cat("fold: ") - for(b in 1:length(I)){ - dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out - yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a foldt out - dhat <- predict(dfit, x[I[[b]],], type="response") #predict the left-out fold - yhat <- predict(yfit, x[I[[b]],], type="response") #predict the left-out fold - dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold - ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold - cat(b," ") - } - rfit <- lm(ytil ~ dtil) #estimate the main parameter by regressing one residual on the other - coef.est <- coef(rfit)[2] #extract coefficient - se <- sqrt(vcovHC(rfit)[2,2]) #record robust standard error - cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) #printing output - return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil) ) #save output and residuals + for (b in seq_along(I)) { + dfit <- dreg(x[-I[[b]], ], d[-I[[b]]]) # take a fold out + yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a foldt out + dhat <- predict(dfit, x[I[[b]], ], type = "response") # predict the left-out fold + yhat <- predict(yfit, x[I[[b]], ], type = "response") # predict the left-out fold + dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold + ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold + cat(b, " ") + } + rfit <- lm(ytil ~ dtil) # 
estimate the main parameter by regressing one residual on the other + coef.est <- coef(rfit)[2] # extract coefficient + se <- sqrt(vcovHC(rfit)[2, 2]) # record robust standard error + cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est, se)) # printing output + return(list(coef.est = coef.est, se = se, dtil = dtil, ytil = ytil)) # save output and residuals } ``` @@ -102,84 +105,100 @@ We now run through DML using as first stage models: 4. Mix of Random Forest and Lasso ```{r} -#DML with OLS +# DML with OLS cat(sprintf("\nDML with OLS w/o feature selection \n")) -dreg <- function(x,d){ glmnet(x, d, lambda = 0) } #ML method= OLS using glmnet; using lm gives bugs -yreg <- function(x,y){ glmnet(x, y, lambda = 0) } #ML method = OLS -DML2.OLS = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10) +dreg <- function(x, d) { + glmnet(x, d, lambda = 0) +} # ML method= OLS using glmnet; using lm gives bugs +yreg <- function(x, y) { + glmnet(x, y, lambda = 0) +} # ML method = OLS +dml2_ols <- dml2_for_plm(x, d, y, dreg, yreg, nfold = 10) -#DML with Lasso: +# DML with Lasso: cat(sprintf("\nDML with Lasso \n")) -dreg <- function(x,d){ rlasso(x,d, post=FALSE) } #ML method= lasso from hdm -yreg <- function(x,y){ rlasso(x,y, post=FALSE) } #ML method = lasso from hdm -DML2.lasso = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10) +dreg <- function(x, d) { + rlasso(x, d, post = FALSE) +} # ML method= lasso from hdm +yreg <- function(x, y) { + rlasso(x, y, post = FALSE) +} # ML method = lasso from hdm +dml2_lasso <- dml2_for_plm(x, d, y, dreg, yreg, nfold = 10) -#DML with Random Forest: +# DML with Random Forest: cat(sprintf("\nDML with Random Forest \n")) -dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest -yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest -DML2.RF = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10) - -#DML MIX: +dreg <- function(x, d) { + randomForest(x, d) +} # ML method=Forest +yreg <- function(x, y) { + randomForest(x, y) +} # ML method=Forest +dml2_rf <- dml2_for_plm(x, d, y, dreg, yreg, nfold = 10) + +# DML MIX: cat(sprintf("\nDML with Lasso for D and Random Forest for Y \n")) -dreg <- function(x,d){ rlasso(x,d, post=FALSE) } #ML method=Forest -yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest -DML2.mix = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10) +dreg <- function(x, d) { + rlasso(x, d, post = FALSE) +} # ML method=Forest +yreg <- function(x, y) { + randomForest(x, y) +} # ML method=Forest +dml2_mix <- dml2_for_plm(x, d, y, dreg, yreg, nfold = 10) ``` Now we examine the RMSE of D and Y to see which method performs well in the first-stage. 
We print all results below in the following table: ```{r} -prRes.D<- c( mean((DML2.OLS$dtil)^2), mean((DML2.lasso$dtil)^2), mean((DML2.RF$dtil)^2), mean((DML2.mix$dtil)^2)); -prRes.Y<- c(mean((DML2.OLS$ytil)^2), mean((DML2.lasso$ytil)^2),mean((DML2.RF$ytil)^2),mean((DML2.mix$ytil)^2)); -prRes<- rbind(sqrt(prRes.D), sqrt(prRes.Y)); -rownames(prRes)<- c("RMSE D", "RMSE Y"); -colnames(prRes)<- c("OLS", "Lasso", "RF", "Mix") +pr_res_d <- c(mean((dml2_ols$dtil)^2), mean((dml2_lasso$dtil)^2), mean((dml2_rf$dtil)^2), mean((dml2_mix$dtil)^2)) +pr_res_y <- c(mean((dml2_ols$ytil)^2), mean((dml2_lasso$ytil)^2), mean((dml2_rf$ytil)^2), mean((dml2_mix$ytil)^2)) +pr_res <- rbind(sqrt(pr_res_d), sqrt(pr_res_y)) +rownames(pr_res) <- c("RMSE D", "RMSE Y") +colnames(pr_res) <- c("OLS", "Lasso", "RF", "Mix") ``` ```{r} -table <- matrix(0,4,4) +table <- matrix(0, 4, 4) # Point Estimate -table[1,1] <- as.numeric(DML2.OLS$coef.est) -table[2,1] <- as.numeric(DML2.lasso$coef.est) -table[3,1] <- as.numeric(DML2.RF$coef.est) -table[4,1] <- as.numeric(DML2.mix$coef.est) +table[1, 1] <- as.numeric(dml2_ols$coef.est) +table[2, 1] <- as.numeric(dml2_lasso$coef.est) +table[3, 1] <- as.numeric(dml2_rf$coef.est) +table[4, 1] <- as.numeric(dml2_mix$coef.est) # SE -table[1,2] <- as.numeric(DML2.OLS$se) -table[2,2] <- as.numeric(DML2.lasso$se) -table[3,2] <- as.numeric(DML2.RF$se) -table[4,2] <- as.numeric(DML2.mix$se) +table[1, 2] <- as.numeric(dml2_ols$se) +table[2, 2] <- as.numeric(dml2_lasso$se) +table[3, 2] <- as.numeric(dml2_rf$se) +table[4, 2] <- as.numeric(dml2_mix$se) # RMSE Y -table[1,3] <- as.numeric(prRes[2,1]) -table[2,3] <- as.numeric(prRes[2,2]) -table[3,3] <- as.numeric(prRes[2,3]) -table[4,3] <- as.numeric(prRes[2,4]) +table[1, 3] <- as.numeric(pr_res[2, 1]) +table[2, 3] <- as.numeric(pr_res[2, 2]) +table[3, 3] <- as.numeric(pr_res[2, 3]) +table[4, 3] <- as.numeric(pr_res[2, 4]) # RMSE D -table[1,4] <- as.numeric(prRes[1,1]) -table[2,4] <- as.numeric(prRes[1,2]) -table[3,4] <- as.numeric(prRes[1,3]) -table[4,4] <- as.numeric(prRes[1,4]) +table[1, 4] <- as.numeric(pr_res[1, 1]) +table[2, 4] <- as.numeric(pr_res[1, 2]) +table[3, 4] <- as.numeric(pr_res[1, 3]) +table[4, 4] <- as.numeric(pr_res[1, 4]) # print results -colnames(table) <- c("Estimate","Standard Error", "RMSE Y", "RMSE D") +colnames(table) <- c("Estimate", "Standard Error", "RMSE Y", "RMSE D") rownames(table) <- c("OLS", "Lasso", "RF", "RF/Lasso Mix") table ``` ```{r} -print(table, digit=3) +print(table, digit = 3) ``` ```{r} -tab<- xtable(table, digits=3) -print(tab, type="latex") +tab <- xtable(table, digits = 3) +print(tab, type = "latex") ``` diff --git a/PM4/r_dml_inference_for_gun_ownership.Rmd b/PM4/r_dml_inference_for_gun_ownership.Rmd index 3a60890b..512479d3 100644 --- a/PM4/r_dml_inference_for_gun_ownership.Rmd +++ b/PM4/r_dml_inference_for_gun_ownership.Rmd @@ -30,7 +30,9 @@ install.packages("tensorflow") install.packages("xtable") install.packages("dplyr") install.packages("sandwich") +``` +```{r} library(glmnet) library(randomForest) library(xgboost) @@ -66,96 +68,99 @@ We first reweight time and county variables as the data are population weighted. # enter nonlinearly and flexibly. 
## County FE -county.vars <- select(data, starts_with('X_J')) +county_vars <- select(data, starts_with("X_J")) ## Time variables and population weights # Pull out time variables -time.vars <- select(data, starts_with('X_T')) +time_vars <- select(data, starts_with("X_T")) # Use these to construct population weights -popweights <- rowSums(time.vars) +pop_weights <- rowSums(time_vars) # Unweighted time variables -time.vars <- time.vars/popweights +time_vars <- time_vars / pop_weights # For any columns with only zero (like the first one), just drop -time.vars <- time.vars[, colSums(time.vars != 0) > 0] +time_vars <- time_vars[, colSums(time_vars != 0) > 0] # Create time index -time.ind <- rowSums(time.vars*(seq(1:20))) +time_ind <- rowSums(time_vars * (seq(1:20))) ``` Now we create initial conditions, county-level averages, and time period averages. ```{r} - ###### Create new data frame with variables we'll use +###### Create new data frame with variables we'll use # Function to find variable names -varlist <- function (df=NULL,type=c("numeric","factor","character"), pattern="", exclude=NULL) { +var_list <- function(df = NULL, type = c("numeric", "factor", "character"), pattern = "", exclude = NULL) { vars <- character(0) if (any(type %in% "numeric")) { - vars <- c(vars,names(df)[sapply(df,is.numeric)]) + vars <- c(vars, names(df)[sapply(df, is.numeric)]) } if (any(type %in% "factor")) { - vars <- c(vars,names(df)[sapply(df,is.factor)]) + vars <- c(vars, names(df)[sapply(df, is.factor)]) } if (any(type %in% "character")) { - vars <- c(vars,names(df)[sapply(df,is.character)]) + vars <- c(vars, names(df)[sapply(df, is.character)]) } - vars[(!vars %in% exclude) & grepl(vars,pattern=pattern)] + vars[(!vars %in% exclude) & grepl(vars, pattern = pattern)] } # census control variables -census <- NULL -census_var <- c("^AGE", "^BN", "^BP", "^BZ", "^ED", "^EL","^HI", "^HS", "^INC", "^LF", "^LN", "^PI", "^PO", "^PP", "^PV", "^SPR", "^VS") +census <- NULL +census_var <- c("^AGE", "^BN", "^BP", "^BZ", "^ED", "^EL", "^HI", "^HS", "^INC", "^LF", "^LN", + "^PI", "^PO", "^PP", "^PV", "^SPR", "^VS") -for(i in 1:length(census_var)){ - census <- append(census, varlist(data, pattern=census_var[i])) +for (i in seq_along(census_var)) { + census <- append(census, var_list(data, pattern = census_var[i])) } # other control variables -X1 <- c("logrobr", "logburg", "burg_missing", "robrate_missing") -X2 <- c("newblack", "newfhh", "newmove", "newdens", "newmal") +X1 <- c("logrobr", "logburg", "burg_missing", "robrate_missing") +X2 <- c("newblack", "newfhh", "newmove", "newdens", "newmal") # "treatment" variable -d <- "logfssl" +d <- "logfssl" # outcome variable -y <- "logghomr" +y <- "logghomr" # new data frame for time index -usedata <- as.data.frame(time.ind) -colnames(usedata) <- "time.ind" -usedata[,"weights"] <- popweights +usedata <- as.data.frame(time_ind) +colnames(usedata) <- "time_ind" +usedata[, "weights"] <- pop_weights -varlist <- c(y,d,X1,X2,census) -for(i in 1:length(varlist)){ - usedata[, varlist[i]] <- data[,varlist[i]] +var_list <- c(y, d, X1, X2, census) +for (i in seq_along(var_list)) { + usedata[, var_list[i]] <- data[, var_list[i]] } ####################### Construct county specific means, # time specific means, initial conditions # Initial conditions -varlist0 <- c(y,X1,X2,census) -for(i in 1:length(varlist0)){ - usedata[, paste(varlist0[i],"0" , sep="")] <- kronecker(usedata[time.ind == 1,varlist0[i]], - rep(1,20)) +var_list0 <- c(y, X1, X2, census) +for (i in seq_along(var_list0)) { + usedata[, 
paste(var_list0[i], "0", sep = "")] <- kronecker( + usedata[time_ind == 1, var_list0[i]], + rep(1, 20) + ) } # County means -varlistJ <- c(X1,X2,census) -county.vars <- as.matrix(county.vars) -for(i in 1:length(varlistJ)){ - usedata[, paste(varlistJ[i],"J" , sep="")] <- - county.vars%*%qr.solve(county.vars,as.matrix(usedata[,varlistJ[i]])) +var_list_j <- c(X1, X2, census) +county_vars <- as.matrix(county_vars) +for (i in seq_along(var_list_j)) { + usedata[, paste(var_list_j[i], "J", sep = "")] <- + county_vars %*% qr.solve(county_vars, as.matrix(usedata[, var_list_j[i]])) } # Time means -time.vars <- as.matrix(time.vars) -for(i in 1:length(varlistJ)){ - usedata[, paste(varlistJ[i],"T" , sep="")] <- - time.vars%*%qr.solve(time.vars,as.matrix(usedata[,varlistJ[i]])) +time_vars <- as.matrix(time_vars) +for (i in seq_along(var_list_j)) { + usedata[, paste(var_list_j[i], "T", sep = "")] <- + time_vars %*% qr.solve(time_vars, as.matrix(usedata[, var_list_j[i]])) } ``` @@ -170,12 +175,12 @@ After preprocessing the data, as a baseline model, we first look at simple regre # Simple regression lm0 <- lm(logghomr ~ logfssl, data = usedata) vc0 <- vcovHC(lm0) -cat("Baseline OLS:",lm0$coefficients[2]," (",sqrt(vc0[2,2]),")\n") +cat("Baseline OLS:", lm0$coefficients[2], " (", sqrt(vc0[2, 2]), ")\n") # Confidence Interval with HC3 covariance -tt <- qt(c(0.025,0.975),summary(lm0)$df[2]) +tt <- qt(c(0.025, 0.975), summary(lm0)$df[2]) se <- sqrt(diag(vc0)) ci <- coef(lm0) + se %o% tt -cat("2.5%: ", ci[2,1],"97.5%: ", ci[2,2]) +cat("2.5%: ", ci[2, 1], "97.5%: ", ci[2, 2]) ``` The point estimate is $0.302$ with the confidence interval ranging from 0.277 to 0.327. This @@ -186,10 +191,10 @@ Next we estimate with the baseline set of controls. ```{r} # Regression on baseline controls -varlist <- c(d,X1,X2,census) -lmC <- lm(paste("logghomr ~",paste(varlist, collapse = "+")), data = usedata) +var_list <- c(d, X1, X2, census) +lmC <- lm(paste("logghomr ~", paste(var_list, collapse = "+")), data = usedata) vcC <- vcovHC(lmC) -cat("OLS with Controls:",lmC$coefficients["logfssl"]," (",sqrt(vcC["logfssl","logfssl"]),")\n") +cat("OLS with Controls:", lmC$coefficients["logfssl"], " (", sqrt(vcC["logfssl", "logfssl"]), ")\n") ``` @@ -198,17 +203,17 @@ We can also run our regression with time and space averages as controls. ```{r} # Regression on time and cross sectional averages -varlistX <- c(X1,X2,census) -varlistMeans <- c(d,X1,X2,census) -for(i in 1:length(varlistX)){ - varlistMeans <- c(varlistMeans,paste(varlistX[i],"J" , sep="")) +var_list_x <- c(X1, X2, census) +var_list_means <- c(d, X1, X2, census) +for (i in seq_along(var_list_x)) { + var_list_means <- c(var_list_means, paste(var_list_x[i], "J", sep = "")) } -for(i in 1:length(varlistX)){ - varlistMeans <- c(varlistMeans,paste(varlistX[i],"T" , sep="")) +for (i in seq_along(var_list_x)) { + var_list_means <- c(var_list_means, paste(var_list_x[i], "T", sep = "")) } -lmM <- lm(paste("logghomr ~",paste(varlistMeans, collapse = "+")), data = usedata) +lmM <- lm(paste("logghomr ~", paste(var_list_means, collapse = "+")), data = usedata) vcM <- vcovHC(lmM) -cat("OLS with Averages:",lmM$coefficients["logfssl"]," (",sqrt(vcM["logfssl","logfssl"]),")\n") +cat("OLS with Averages:", lmM$coefficients["logfssl"], " (", sqrt(vcM["logfssl", "logfssl"]), ")\n") ``` Since our goal is to estimate the effect of gun ownership after controlling for a rich set county characteristics, we now include all controls. 
@@ -217,7 +222,7 @@ Since our goal is to estimate the effect of gun ownership after controlling for # Regression on all controls lmA <- lm(logghomr ~ ., data = usedata) vcA <- vcovHC(lmA) -cat("OLS All:",lmA$coefficients["logfssl"]," (",sqrt(vcA["logfssl","logfssl"]),")\n") +cat("OLS All:", lmA$coefficients["logfssl"], " (", sqrt(vcA["logfssl", "logfssl"]), ")\n") ``` After controlling for a rich set of characteristics, the point estimate of gun ownership attenuates to 0.179. @@ -259,109 +264,129 @@ set.seed(123) # Cross-fitting n <- nrow(usedata) Kf <- 5 # Number of cross-fitting folds -sampleframe <- rep(1:Kf, ceiling(n/Kf)) -cvgroup <- sample(sampleframe, size=n, replace = FALSE) # Cross-fitting groups +sampleframe <- rep(1:Kf, ceiling(n / Kf)) +cvgroup <- sample(sampleframe, size = n, replace = FALSE) # Cross-fitting groups # Initialize variables for cross-fit predictions -yhat.r <- matrix(NA,n,10) # Going to consider 10 learners -dhat.r <- matrix(NA,n,10) +yhat_r <- matrix(NA, n, 10) # Going to consider 10 learners +dhat_r <- matrix(NA, n, 10) # Cross-fitting loop -for(k in 1:Kf) { - cat("fold: ", k,"\n") +for (k in 1:Kf) { + cat("fold: ", k, "\n") indk <- cvgroup == k - ktrain <- usedata[!indk,] - ktest <- usedata[indk,] + ktrain <- usedata[!indk, ] + ktest <- usedata[indk, ] #### Simple regression models #### # Simple regression - yhat.r[indk,1] <- ktest$logghomr - mean(ktrain$logghomr) - dhat.r[indk,1] <- ktest$logfssl - mean(ktrain$logfssl) + yhat_r[indk, 1] <- ktest$logghomr - mean(ktrain$logghomr) + dhat_r[indk, 1] <- ktest$logfssl - mean(ktrain$logfssl) # Baseline controls - varlist <- c(X1,X2,census) - lmyk.C <- lm(paste("logghomr ~",paste(varlist, collapse = "+")), data = ktrain) - yhat.r[indk,2] <- ktest$logghomr - predict(lmyk.C, ktest) - lmdk.C <- lm(paste("logfssl ~",paste(varlist, collapse = "+")), data = ktrain) - dhat.r[indk,2] <- ktest$logfssl - predict(lmdk.C, ktest) + var_list <- c(X1, X2, census) + lmyk_c <- lm(paste("logghomr ~", paste(var_list, collapse = "+")), data = ktrain) + yhat_r[indk, 2] <- ktest$logghomr - predict(lmyk_c, ktest) + lmdk_c <- lm(paste("logfssl ~", paste(var_list, collapse = "+")), data = ktrain) + dhat_r[indk, 2] <- ktest$logfssl - predict(lmdk_c, ktest) # All controls - lmyk.A <- lm(logghomr ~ .-logfssl, data = ktrain) - yhat.r[indk,3] <- ktest$logghomr - predict(lmyk.A, ktest) - lmdk.A <- lm(logfssl ~ .-logghomr, data = ktrain) - dhat.r[indk,3] <- ktest$logfssl - predict(lmdk.A, ktest) + lmyk_a <- lm(logghomr ~ . - logfssl, data = ktrain) + yhat_r[indk, 3] <- ktest$logghomr - predict(lmyk_a, ktest) + lmdk_a <- lm(logfssl ~ . 
- logghomr, data = ktrain) + dhat_r[indk, 3] <- ktest$logfssl - predict(lmdk_a, ktest) #### Penalized Linear Models #### # Lasso - default CV tuning - ytrain <- as.matrix(usedata[!indk,"logghomr"]) - dtrain <- as.matrix(usedata[!indk,"logfssl"]) - xtrain <- as.matrix(usedata[!indk,!names(usedata) %in% - c("logghomr", "logfssl")]) - ytest <- as.matrix(usedata[indk,"logghomr"]) - dtest <- as.matrix(usedata[indk,"logfssl"]) - xtest <- as.matrix(usedata[indk,!names(usedata) %in% - c("logghomr", "logfssl")]) + ytrain <- as.matrix(usedata[!indk, "logghomr"]) + dtrain <- as.matrix(usedata[!indk, "logfssl"]) + xtrain <- as.matrix(usedata[!indk, !names(usedata) %in% + c("logghomr", "logfssl")]) + ytest <- as.matrix(usedata[indk, "logghomr"]) + dtest <- as.matrix(usedata[indk, "logfssl"]) + xtest <- as.matrix(usedata[indk, !names(usedata) %in% + c("logghomr", "logfssl")]) - lassoyk <- cv.glmnet(xtrain,ytrain) - yhat.r[indk,4] <- ytest - predict(lassoyk, newx = xtest, s = "lambda.min") + lassoyk <- cv.glmnet(xtrain, ytrain) + yhat_r[indk, 4] <- ytest - predict(lassoyk, newx = xtest, s = "lambda.min") - lassodk <- cv.glmnet(xtrain,dtrain) - dhat.r[indk,4] <- dtest - predict(lassodk, newx = xtest, s = "lambda.min") + lassodk <- cv.glmnet(xtrain, dtrain) + dhat_r[indk, 4] <- dtest - predict(lassodk, newx = xtest, s = "lambda.min") # Ridge - ridgeyk <- cv.glmnet(xtrain,ytrain,alpha = 0) - yhat.r[indk,5] <- ytest - predict(ridgeyk, newx = xtest, s = "lambda.min") + ridgeyk <- cv.glmnet(xtrain, ytrain, alpha = 0) + yhat_r[indk, 5] <- ytest - predict(ridgeyk, newx = xtest, s = "lambda.min") - ridgedk <- cv.glmnet(xtrain,dtrain, alpha = 0) - dhat.r[indk,5] <- dtest - predict(ridgedk, newx = xtest, s = "lambda.min") + ridgedk <- cv.glmnet(xtrain, dtrain, alpha = 0) + dhat_r[indk, 5] <- dtest - predict(ridgedk, newx = xtest, s = "lambda.min") # EN, .5 - no cv over alpha - enyk <- cv.glmnet(xtrain,ytrain,alpha = .5) - yhat.r[indk,6] <- ytest - predict(enyk, newx = xtest, s = "lambda.min") + enyk <- cv.glmnet(xtrain, ytrain, alpha = .5) + yhat_r[indk, 6] <- ytest - predict(enyk, newx = xtest, s = "lambda.min") - endk <- cv.glmnet(xtrain,dtrain, alpha = .5) - dhat.r[indk,6] <- dtest - predict(endk, newx = xtest, s = "lambda.min") + endk <- cv.glmnet(xtrain, dtrain, alpha = .5) + dhat_r[indk, 6] <- dtest - predict(endk, newx = xtest, s = "lambda.min") #### Flexible regression models #### # Random forest - rfyk <- randomForest(logghomr ~ .-logfssl, data = ktrain) - yhat.r[indk,7] <- ktest$logghomr - predict(rfyk, ktest) - rfdk <- randomForest(logfssl ~ .-logghomr, data = ktrain) - dhat.r[indk,7] <- ktest$logfssl - predict(rfdk, ktest) + rfyk <- randomForest(logghomr ~ . - logfssl, data = ktrain) + yhat_r[indk, 7] <- ktest$logghomr - predict(rfyk, ktest) + rfdk <- randomForest(logfssl ~ . 
- logghomr, data = ktrain) + dhat_r[indk, 7] <- ktest$logfssl - predict(rfdk, ktest) # Boosted tree - depth 4 - xgb_train.y = xgb.DMatrix(data = as.matrix(usedata[!indk,!names(usedata) %in% - c("logghomr", "logfssl")]), - label = as.matrix(usedata[!indk,"logghomr"])) - xgb_test.y = xgb.DMatrix(data = as.matrix(usedata[indk,!names(usedata) %in% - c("logghomr", "logfssl")]), - label = as.matrix(usedata[indk,"logghomr"])) - xgb_train.d = xgb.DMatrix(data = as.matrix(usedata[!indk,!names(usedata) %in% - c("logghomr", "logfssl")]), - label = as.matrix(usedata[!indk,"logfssl"])) - xgb_test.d = xgb.DMatrix(data = as.matrix(usedata[indk,!names(usedata) %in% - c("logghomr", "logfssl")]), - label = as.matrix(usedata[indk,"logfssl"])) - - byk = xgb.cv(data = xgb_train.y, - nrounds = 1000, verbose = 0, eta = .1, max_depth = 4, nfold = 5) - best.iter = which.min(as.matrix(byk$evaluation_log[,4])) - byk = xgboost(data = xgb_train.y, - nrounds = 1000, verbose = 0, eta = .1, max_depth = 4) - yhat.r[indk,8] = ktest$logghomr - predict(byk, newdata = xgb_test.y, - iterationrange = c(1,(best.iter+1))) - - bdk = xgb.cv(data = xgb_train.d, - nrounds = 1000, verbose = 0, eta = .1, max_depth = 4, nfold = 5) - best.iter = which.min(as.matrix(bdk$evaluation_log[,4])) - bdk = xgboost(data = xgb_train.d, - nrounds = 1000, verbose = 0, eta = .1, max_depth = 4) - dhat.r[indk,8] = ktest$logfssl - predict(bdk, newdata = xgb_test.d, - iterationrange = c(1,(best.iter+1))) + xgb_train_y <- xgb.DMatrix( + data = as.matrix(usedata[!indk, !names(usedata) %in% + c("logghomr", "logfssl")]), + label = as.matrix(usedata[!indk, "logghomr"]) + ) + xgb_test_y <- xgb.DMatrix( + data = as.matrix(usedata[indk, !names(usedata) %in% + c("logghomr", "logfssl")]), + label = as.matrix(usedata[indk, "logghomr"]) + ) + xgb_train_d <- xgb.DMatrix( + data = as.matrix(usedata[!indk, !names(usedata) %in% + c("logghomr", "logfssl")]), + label = as.matrix(usedata[!indk, "logfssl"]) + ) + xgb_test_d <- xgb.DMatrix( + data = as.matrix(usedata[indk, !names(usedata) %in% + c("logghomr", "logfssl")]), + label = as.matrix(usedata[indk, "logfssl"]) + ) + + byk <- xgb.cv( + data = xgb_train_y, + nrounds = 1000, verbose = 0, eta = .1, max_depth = 4, nfold = 5 + ) + best_iter <- which.min(as.matrix(byk$evaluation_log[, 4])) + byk <- xgboost( + data = xgb_train_y, + nrounds = 1000, verbose = 0, eta = .1, max_depth = 4 + ) + yhat_r[indk, 8] <- ktest$logghomr - predict(byk, + newdata = xgb_test_y, + iterationrange = c(1, (best_iter + 1)) + ) + + bdk <- xgb.cv( + data = xgb_train_d, + nrounds = 1000, verbose = 0, eta = .1, max_depth = 4, nfold = 5 + ) + best_iter <- which.min(as.matrix(bdk$evaluation_log[, 4])) + bdk <- xgboost( + data = xgb_train_d, + nrounds = 1000, verbose = 0, eta = .1, max_depth = 4 + ) + dhat_r[indk, 8] <- ktest$logfssl - predict(bdk, + newdata = xgb_test_d, + iterationrange = c(1, (best_iter + 1)) + ) #### Neural Networks #### @@ -371,163 +396,178 @@ for(k in 1:Kf) { xtrainNN <- scale(xtrain, center = mean, scale = std) xtestNN <- scale(xtest, center = mean, scale = std) - xtestNN <- xtestNN[,which(!is.nan(colMeans(xtrainNN)))] - xtrainNN <- xtrainNN[,which(!is.nan(colMeans(xtrainNN)))] + xtestNN <- xtestNN[, which(!is.nan(colMeans(xtrainNN)))] + xtrainNN <- xtrainNN[, which(!is.nan(colMeans(xtrainNN)))] # DNN 50/50/50/50, .5 dropout NNmodely <- keras_model_sequential() - NNmodely %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>% + NNmodely %>% + layer_dense(units = 50, activation = "relu", input_shape = 
c(ncol(xtrainNN))) %>% layer_dropout(rate = .5) %>% - layer_dense(units = 50, activation = 'relu') %>% + layer_dense(units = 50, activation = "relu") %>% layer_dropout(rate = .5) %>% - layer_dense(units = 50, activation = 'relu') %>% + layer_dense(units = 50, activation = "relu") %>% layer_dropout(rate = .5) %>% - layer_dense(units = 50, activation = 'relu') %>% + layer_dense(units = 50, activation = "relu") %>% layer_dropout(rate = .5) %>% layer_dense(units = 1) NNmodely %>% compile( loss = "mse", - optimizer = optimizer_rmsprop()) + optimizer = optimizer_rmsprop() + ) - fit.NNmodely <- NNmodely %>% fit( + fit_nn_model_y <- NNmodely %>% fit( xtrainNN, ytrain, epochs = 200, batch_size = 200, validation_split = .2, verbose = 0 ) - yhat.r[indk,9] <- ktest$logghomr - predict(NNmodely, xtestNN) + yhat_r[indk, 9] <- ktest$logghomr - predict(NNmodely, xtestNN) NNmodeld <- keras_model_sequential() - NNmodeld %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>% + NNmodeld %>% + layer_dense(units = 50, activation = "relu", input_shape = c(ncol(xtrainNN))) %>% layer_dropout(rate = .5) %>% - layer_dense(units = 50, activation = 'relu') %>% + layer_dense(units = 50, activation = "relu") %>% layer_dropout(rate = .5) %>% - layer_dense(units = 50, activation = 'relu') %>% + layer_dense(units = 50, activation = "relu") %>% layer_dropout(rate = .5) %>% - layer_dense(units = 50, activation = 'relu') %>% + layer_dense(units = 50, activation = "relu") %>% layer_dropout(rate = .5) %>% layer_dense(units = 1) NNmodeld %>% compile( loss = "mse", - optimizer = optimizer_rmsprop()) + optimizer = optimizer_rmsprop() + ) - fit.NNmodeld <- NNmodeld %>% fit( + fit_nn_model_d <- NNmodeld %>% fit( xtrainNN, dtrain, epochs = 200, batch_size = 200, validation_split = .2, verbose = 0 ) - dhat.r[indk,9] <- ktest$logfssl - predict(NNmodeld, xtestNN) + dhat_r[indk, 9] <- ktest$logfssl - predict(NNmodeld, xtestNN) # DNN 50/50/50/50, early stopping NNmodely <- keras_model_sequential() - NNmodely %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>% - layer_dense(units = 50, activation = 'relu') %>% - layer_dense(units = 50, activation = 'relu') %>% - layer_dense(units = 50, activation = 'relu') %>% + NNmodely %>% + layer_dense(units = 50, activation = "relu", input_shape = c(ncol(xtrainNN))) %>% + layer_dense(units = 50, activation = "relu") %>% + layer_dense(units = 50, activation = "relu") %>% + layer_dense(units = 50, activation = "relu") %>% layer_dense(units = 1) NNmodely %>% compile( loss = "mse", - optimizer = optimizer_rmsprop()) + optimizer = optimizer_rmsprop() + ) - early.stop <- callback_early_stopping(monitor = "val_loss", patience = 25, - restore_best_weights = TRUE) + early_stop <- callback_early_stopping( + monitor = "val_loss", patience = 25, + restore_best_weights = TRUE + ) - fit.NNmodely <- NNmodely %>% fit( + fit_nn_model_y <- NNmodely %>% fit( xtrainNN, ytrain, epochs = 200, batch_size = 200, validation_split = .2, verbose = 0, - callbacks = list(early.stop) + callbacks = list(early_stop) ) - yhat.r[indk,10] <- ktest$logghomr - predict(NNmodely, xtestNN) + yhat_r[indk, 10] <- ktest$logghomr - predict(NNmodely, xtestNN) NNmodeld <- keras_model_sequential() - NNmodeld %>% layer_dense(units = 50, activation = 'relu', input_shape = c(ncol(xtrainNN))) %>% - layer_dense(units = 50, activation = 'relu') %>% - layer_dense(units = 50, activation = 'relu') %>% - layer_dense(units = 50, activation = 'relu') %>% + NNmodeld %>% + layer_dense(units = 50, 
activation = "relu", input_shape = c(ncol(xtrainNN))) %>% + layer_dense(units = 50, activation = "relu") %>% + layer_dense(units = 50, activation = "relu") %>% + layer_dense(units = 50, activation = "relu") %>% layer_dense(units = 1) NNmodeld %>% compile( loss = "mse", - optimizer = optimizer_rmsprop()) + optimizer = optimizer_rmsprop() + ) - early.stop <- callback_early_stopping(monitor = "val_loss", patience = 25, - restore_best_weights = TRUE) + early_stop <- callback_early_stopping( + monitor = "val_loss", patience = 25, + restore_best_weights = TRUE + ) - fit.NNmodeld <- NNmodeld %>% fit( + fit_nn_model_d <- NNmodeld %>% fit( xtrainNN, dtrain, epochs = 200, batch_size = 200, validation_split = .2, verbose = 0, - callbacks = list(early.stop) + callbacks = list(early_stop) ) - dhat.r[indk,10] <- ktest$logfssl - predict(NNmodeld, xtestNN) - + dhat_r[indk, 10] <- ktest$logfssl - predict(NNmodeld, xtestNN) } ################################################################################ # Predictions done, now DML -RMSE.y <- sqrt(colMeans(yhat.r^2)) -RMSE.d <- sqrt(colMeans(dhat.r^2)) +rmse_y <- sqrt(colMeans(yhat_r^2)) +rmse_d <- sqrt(colMeans(dhat_r^2)) # dml coefficient estimates -b.dml <- rep(NA,10) -s.dml <- rep(NA,10) -for(k in 1:10){ - lm.k <- lm(yhat.r[,k] ~ dhat.r[,k]-1) - v.k <- vcovHC(lm.k) - b.dml[k] <- lm.k$coefficients - s.dml[k] <- sqrt(v.k) +b_dml <- rep(NA, 10) +s_dml <- rep(NA, 10) +for (k in 1:10) { + lm_k <- lm(yhat_r[, k] ~ dhat_r[, k] - 1) + v_k <- vcovHC(lm_k) + b_dml[k] <- lm_k$coefficients + s_dml[k] <- sqrt(v_k) } # "best" coefficient estimate -lm.k <- lm(yhat.r[,which.min(RMSE.y)] ~ dhat.r[,which.min(RMSE.d)]-1) -v.k <- vcovHC(lm.k) -b.dml[11] <- lm.k$coefficients -s.dml[11] <- sqrt(v.k) +lm_k <- lm(yhat_r[, which.min(rmse_y)] ~ dhat_r[, which.min(rmse_d)] - 1) +v_k <- vcovHC(lm_k) +b_dml[11] <- lm_k$coefficients +s_dml[11] <- sqrt(v_k) # ls model average -yhat <- usedata$logghomr - yhat.r -dhat <- usedata$logfssl - dhat.r - -ma.y <- lm(usedata$logghomr ~ yhat-1) -ma.d <- lm(usedata$logfssl ~ dhat-1) -weights.y <- ma.y$coefficients -weights.d <- ma.d$coefficients -lm.k <- lm(ma.y$residuals ~ ma.d$residuals-1) -v.k <- vcovHC(lm.k) -b.dml[12] <- lm.k$coefficients -s.dml[12] <- sqrt(v.k) +yhat <- usedata$logghomr - yhat_r +dhat <- usedata$logfssl - dhat_r + +ma_y <- lm(usedata$logghomr ~ yhat - 1) +ma_d <- lm(usedata$logfssl ~ dhat - 1) +weights_y <- ma_y$coefficients +weights_d <- ma_d$coefficients +lm_k <- lm(ma_y$residuals ~ ma_d$residuals - 1) +v_k <- vcovHC(lm_k) +b_dml[12] <- lm_k$coefficients +s_dml[12] <- sqrt(v_k) ## Display results table1 <- matrix(0, 10, 2) -table1[,1] <- RMSE.y -table1[,2] <- RMSE.d -colnames(table1)<- c("RMSE Y","RMSE D") -rownames(table1)<- c("OLS - No Controls", "OLS - Basic", "OLS - All", - "Lasso (CV)", "Ridge (CV)", "Elastic Net (.5,CV)", - "Random Forest","Boosted trees - depth 4", - "DNN - 50/50/50/50, dropout", "DNN - 50/50/50/50, early stopping") -tab1<- xtable(table1, digits =c(0,4,4)) +table1[, 1] <- rmse_y +table1[, 2] <- rmse_d +colnames(table1) <- c("RMSE Y", "RMSE D") +rownames(table1) <- c( + "OLS - No Controls", "OLS - Basic", "OLS - All", + "Lasso (CV)", "Ridge (CV)", "Elastic Net (.5,CV)", + "Random Forest", "Boosted trees - depth 4", + "DNN - 50/50/50/50, dropout", "DNN - 50/50/50/50, early stopping" +) +tab1 <- xtable(table1, digits = c(0, 4, 4)) tab1 table2 <- matrix(0, 12, 2) -table2[,1] <- b.dml -table2[,2] <- s.dml -colnames(table2)<- c("Point Estimate","Std. 
Error") -rownames(table2)<- c("OLS - No Controls", "OLS - Basic", "OLS - All", - "Lasso (CV)", "Ridge (CV)", "Elastic Net (.5,CV)", - "Random Forest","Boosted trees - depth 4", - "DNN - 50/50/50/50, dropout", "DNN - 50/50/50/50, early stopping", - "Best","Least Squares Model Average") -tab2<- xtable(table2, digits =c(0,4,4)) +table2[, 1] <- b_dml +table2[, 2] <- s_dml +colnames(table2) <- c("Point Estimate", "Std. Error") +rownames(table2) <- c( + "OLS - No Controls", "OLS - Basic", "OLS - All", + "Lasso (CV)", "Ridge (CV)", "Elastic Net (.5,CV)", + "Random Forest", "Boosted trees - depth 4", + "DNN - 50/50/50/50, dropout", "DNN - 50/50/50/50, early stopping", + "Best", "Least Squares Model Average" +) +tab2 <- xtable(table2, digits = c(0, 4, 4)) tab2 ``` ```{r} -print(xtable(table1,type="latex")) -print(xtable(table2,type="latex")) +print(xtable(table1, type = "latex")) +print(xtable(table2, type = "latex")) ``` From 3a5f3eaebc623a20a4228ed91d9b5db5ed5c9a6a Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Fri, 19 Jul 2024 23:48:47 -0700 Subject: [PATCH 163/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 52 +++++++++++++++++++++++- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 71ed8d06..7cf9f1f2 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -1,7 +1,10 @@ -name: Convert and Lint R Notebooks +name: Check and Transform R Notebooks on: push: + pull_request: + branches: + - main schedule: - cron: '0 12 * * 0' # Runs every Sunday at 12 PM UTC @@ -19,17 +22,51 @@ jobs: - name: Checkout repository uses: actions/checkout@v2 + - name: Find changed notebooks in PR + if: "(github.event_name == 'pull_request')" + id: find_notebooks_pr + run: | + git fetch origin ${{ github.event.pull_request.base.ref }} ${{ github.event.pull_request.head.ref }} + git diff --name-only origin/${{ github.event.pull_request.base.ref }}...origin/${{ github.event.pull_request.head.ref }} > changed_files.txt + grep '\.irnb$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt + + - name: Find changed notebooks in Push + if: "(github.event_name == 'push')" + id: find_notebooks_push + run: | + git diff --name-only HEAD^ HEAD > changed_files.txt + grep '\.irnb$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt + + - name: Check if any notebooks changed + id: check_notebooks + run: | + if [${{ github.event_name }} == 'pull_request'] || [${{ github.event_name }} == 'push']; then + grep -q '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt + if grep -q '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt; then + echo "notebooks_changed=true" >> $GITHUB_ENV + else + echo "notebooks_changed=false" >> $GITHUB_ENV + echo "No R notebooks changed in folder ${{ matrix.directory }} in this PR." 
+ fi + else + # we run all folders if it is the weekly scheduled run + echo "notebooks_changed=true" >> $GITHUB_ENV + fi + - name: Install system dependencies + if: env.notebooks_changed == 'true' run: | sudo apt-get update sudo apt-get install -y libcurl4-openssl-dev - name: Set up Python + if: env.notebooks_changed == 'true' uses: actions/setup-python@v2 with: python-version: '3.10' # Specify your Python version here - name: Install Python dependencies + if: env.notebooks_changed == 'true' run: | python -m pip install --upgrade pip pip install nbstripout @@ -37,13 +74,16 @@ jobs: shell: bash - name: Set up R + if: env.notebooks_changed == 'true' uses: r-lib/actions/setup-r@v2 - name: Install rmarkdown, knitr, and lintr packages + if: env.notebooks_changed == 'true' run: | R -e 'install.packages(c("rmarkdown", "knitr", "lintr", "xfun", "remotes"), repos="https://cloud.r-project.org")' - name: Strip outputs from .irnb files + if: env.notebooks_changed == 'true' run: | for notebook in ${{ matrix.directory }}/*.irnb; do ipynb_notebook="${notebook%.irnb}.ipynb" @@ -53,6 +93,7 @@ jobs: done - name: Convert .irnb to .Rmd and .R + if: env.notebooks_changed == 'true' run: | R -e ' files <- list.files(path = "${{ matrix.directory }}", pattern = "\\.irnb$", full.names = TRUE, recursive = FALSE) @@ -64,6 +105,7 @@ jobs: ' - name: Lint .Rmd files + if: env.notebooks_changed == 'true' id: lint run: | R -e ' @@ -83,6 +125,7 @@ jobs: ' - name: Execute R scripts and log output + if: env.notebooks_changed == 'true' id: execute run: | log_file="${{ matrix.directory }}_r_script_execution.log" @@ -125,36 +168,41 @@ jobs: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - name: Upload execution log + if: env.notebooks_changed == 'true' uses: actions/upload-artifact@v2 with: name: ${{ matrix.directory }}-r-script-execution-log path: ${{ matrix.directory }}_r_script_execution.log - name: Zip .R files + if: env.notebooks_changed == 'true' run: | mkdir r_scripts mv ${{ matrix.directory }}/*.R r_scripts/ zip -r ${{ matrix.directory }}_r_scripts.zip r_scripts - name: Upload artifact + if: env.notebooks_changed == 'true' uses: actions/upload-artifact@v2 with: name: ${{ matrix.directory }}-r-scripts path: ${{ matrix.directory }}_r_scripts.zip - name: Delete .R files and zip + if: env.notebooks_changed == 'true' run: | rm -rf r_scripts rm ${{ matrix.directory }}_r_scripts.zip - name: Check if there are any changes + if: env.notebooks_changed == 'true' id: verify_diff run: | git pull git diff --quiet ${{ matrix.directory }}/*.irnb ${{ matrix.directory }}/*.Rmd || echo "changed=true" >> $GITHUB_OUTPUT - name: Commit and push stripped .irnb and .Rmd files - if: steps.verify_diff.outputs.changed == 'true' + if: "(env.notebooks_changed == 'true') && (steps.verify_diff.outputs.changed == 'true')" run: | git config --global user.name 'github-actions[bot]' git config --global user.email 'github-actions[bot]@users.noreply.github.com' From ce79cece3a0f063ddc0c0667b05cd9eea4e5bf73 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Fri, 19 Jul 2024 23:53:48 -0700 Subject: [PATCH 164/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 7cf9f1f2..6168fce2 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -34,7 +34,12 @@ jobs: if: "(github.event_name == 'push')" id: find_notebooks_push run: | - git diff 
--name-only HEAD^ HEAD > changed_files.txt + if git rev-parse --verify HEAD^ >/dev/null 2>&1; then + git diff --name-only HEAD^ HEAD > changed_files.txt + else + # For initial commit or shallow clone + git diff --name-only HEAD > changed_files.txt + fi grep '\.irnb$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - name: Check if any notebooks changed From 55fef5f603488573c03ddf2dfef50cc89ddfb283 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Fri, 19 Jul 2024 23:57:38 -0700 Subject: [PATCH 165/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 6168fce2..83fa246b 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -20,7 +20,9 @@ jobs: directory: ['PM1', 'PM2', 'PM3', 'PM4', 'PM5', 'CM1', 'CM2', 'CM3', 'AC1', 'AC2', 'T'] steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 + with: + fetch-depth: ${{ github.event_name == 'pull_request' && 2 || 0 }} - name: Find changed notebooks in PR if: "(github.event_name == 'pull_request')" @@ -34,12 +36,7 @@ jobs: if: "(github.event_name == 'push')" id: find_notebooks_push run: | - if git rev-parse --verify HEAD^ >/dev/null 2>&1; then - git diff --name-only HEAD^ HEAD > changed_files.txt - else - # For initial commit or shallow clone - git diff --name-only HEAD > changed_files.txt - fi + git diff --name-only ${{ github.event.before }} ${{ github.event.after }} > changed_files.txt grep '\.irnb$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - name: Check if any notebooks changed From 6d2ea3e6debaae77aca5d2949d9f1220c446e8d6 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 00:00:49 -0700 Subject: [PATCH 166/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 83fa246b..5b5eac42 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -42,7 +42,7 @@ jobs: - name: Check if any notebooks changed id: check_notebooks run: | - if [${{ github.event_name }} == 'pull_request'] || [${{ github.event_name }} == 'push']; then + if ['${{ github.event_name }}' == 'pull_request'] || ['${{ github.event_name }}' == 'push']; then grep -q '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt if grep -q '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt; then echo "notebooks_changed=true" >> $GITHUB_ENV From d544e3eba3b38d49b2955588e672dbbbea101af6 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 00:03:04 -0700 Subject: [PATCH 167/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 5b5eac42..1095aa23 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -42,7 +42,7 @@ jobs: - name: Check if any notebooks changed id: check_notebooks run: | - if ['${{ github.event_name }}' == 'pull_request'] || ['${{ github.event_name }}' == 'push']; then + if [$GITHUB_EVENT_NAME == 'pull_request'] || [$GITHUB_EVENT_NAME 
== 'push']; then grep -q '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt if grep -q '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt; then echo "notebooks_changed=true" >> $GITHUB_ENV From 7f2851ad45cefd3e1c8238fbb00b6e6bceae4883 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 00:05:58 -0700 Subject: [PATCH 168/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 29 +++++++++++++----------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 1095aa23..2574f3e6 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -25,7 +25,7 @@ jobs: fetch-depth: ${{ github.event_name == 'pull_request' && 2 || 0 }} - name: Find changed notebooks in PR - if: "(github.event_name == 'pull_request')" + if: github.event_name == 'pull_request' id: find_notebooks_pr run: | git fetch origin ${{ github.event.pull_request.base.ref }} ${{ github.event.pull_request.head.ref }} @@ -33,28 +33,31 @@ jobs: grep '\.irnb$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - name: Find changed notebooks in Push - if: "(github.event_name == 'push')" + if: github.event_name == 'push' id: find_notebooks_push run: | git diff --name-only ${{ github.event.before }} ${{ github.event.after }} > changed_files.txt grep '\.irnb$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - - name: Check if any notebooks changed + - name: Check if any notebooks changed in PR or Push + if: (github.event_name == 'push') || (github.event_name == 'pull_request') id: check_notebooks run: | - if [$GITHUB_EVENT_NAME == 'pull_request'] || [$GITHUB_EVENT_NAME == 'push']; then - grep -q '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt - if grep -q '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt; then - echo "notebooks_changed=true" >> $GITHUB_ENV - else - echo "notebooks_changed=false" >> $GITHUB_ENV - echo "No R notebooks changed in folder ${{ matrix.directory }} in this PR." - fi - else - # we run all folders if it is the weekly scheduled run + grep -q '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt + if grep -q '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt; then echo "notebooks_changed=true" >> $GITHUB_ENV + else + echo "notebooks_changed=false" >> $GITHUB_ENV + echo "No R notebooks changed in folder ${{ matrix.directory }} in this PR." 
fi + - name: Set notebooks changed to true for schedule + if: github.event_name == 'schedule' + id: set_check_notebooks_true + run: | + # we run all folders if it is the weekly scheduled run + echo "notebooks_changed=true" >> $GITHUB_ENV + - name: Install system dependencies if: env.notebooks_changed == 'true' run: | From 7260a2b6f822e402858292ef5d60737194eee552 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 00:09:32 -0700 Subject: [PATCH 169/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 2574f3e6..21c7f74b 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -43,7 +43,7 @@ jobs: if: (github.event_name == 'push') || (github.event_name == 'pull_request') id: check_notebooks run: | - grep -q '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt + grep '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt if grep -q '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt; then echo "notebooks_changed=true" >> $GITHUB_ENV else From 8137ef4d9fde28f14c05accb6850f6fa7908f515 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 05:00:47 -0700 Subject: [PATCH 170/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 21c7f74b..6182267e 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -44,12 +44,12 @@ jobs: id: check_notebooks run: | grep '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt - if grep -q '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt; then - echo "notebooks_changed=true" >> $GITHUB_ENV - else - echo "notebooks_changed=false" >> $GITHUB_ENV - echo "No R notebooks changed in folder ${{ matrix.directory }} in this PR." - fi + # if grep -q '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt; then + # echo "notebooks_changed=true" >> $GITHUB_ENV + # else + # echo "notebooks_changed=false" >> $GITHUB_ENV + # echo "No R notebooks changed in folder ${{ matrix.directory }} in this PR." 
+ # fi - name: Set notebooks changed to true for schedule if: github.event_name == 'schedule' From 32eb9b9204560d80f9fdb056110ed3761886f70d Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 05:03:35 -0700 Subject: [PATCH 171/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 6182267e..bcc09f6e 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -43,6 +43,7 @@ jobs: if: (github.event_name == 'push') || (github.event_name == 'pull_request') id: check_notebooks run: | + cat changed_notebooks.txt grep '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt # if grep -q '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt; then # echo "notebooks_changed=true" >> $GITHUB_ENV From ef4d9d8bdf11a908ea1986c7caf937b95d083f41 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 05:07:07 -0700 Subject: [PATCH 172/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index bcc09f6e..54ccf30a 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -44,13 +44,13 @@ jobs: id: check_notebooks run: | cat changed_notebooks.txt - grep '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt - # if grep -q '${{ matrix.directory }}/*.irnb$' changed_notebooks.txt; then - # echo "notebooks_changed=true" >> $GITHUB_ENV - # else - # echo "notebooks_changed=false" >> $GITHUB_ENV - # echo "No R notebooks changed in folder ${{ matrix.directory }} in this PR." - # fi + grep '^${{ matrix.directory }}/.*\.irnb$' changed_notebooks.txt + if grep -q '^${{ matrix.directory }}/.*\.irnb$' changed_notebooks.txt; then + echo "notebooks_changed=true" >> $GITHUB_ENV + else + echo "notebooks_changed=false" >> $GITHUB_ENV + echo "No R notebooks changed in folder ${{ matrix.directory }} in this PR." 
+ fi - name: Set notebooks changed to true for schedule if: github.event_name == 'schedule' From d3ca4056d2fdc7a4aeb680ec09ce920aee8d4ae4 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 05:31:15 -0700 Subject: [PATCH 173/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index 54ccf30a..c22969f7 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -44,7 +44,6 @@ jobs: id: check_notebooks run: | cat changed_notebooks.txt - grep '^${{ matrix.directory }}/.*\.irnb$' changed_notebooks.txt if grep -q '^${{ matrix.directory }}/.*\.irnb$' changed_notebooks.txt; then echo "notebooks_changed=true" >> $GITHUB_ENV else From d965c445595d7413afe3444c90ba5269d8ba4dc0 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 05:33:57 -0700 Subject: [PATCH 174/261] Update r-colliderbias-hollywood.irnb --- CM2/r-colliderbias-hollywood.irnb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CM2/r-colliderbias-hollywood.irnb b/CM2/r-colliderbias-hollywood.irnb index c032372c..536ba9a5 100644 --- a/CM2/r-colliderbias-hollywood.irnb +++ b/CM2/r-colliderbias-hollywood.irnb @@ -31,7 +31,7 @@ }, "outputs": [], "source": [ - "install.packages(\"dagitty\")" + "install.packages(\"dagitty\");" ] }, { From 62716a190094d675f4d8d615a35e9af4a533bf5e Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 05:36:09 -0700 Subject: [PATCH 175/261] Update r-colliderbias-hollywood.irnb --- CM2/r-colliderbias-hollywood.irnb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CM2/r-colliderbias-hollywood.irnb b/CM2/r-colliderbias-hollywood.irnb index 536ba9a5..c032372c 100644 --- a/CM2/r-colliderbias-hollywood.irnb +++ b/CM2/r-colliderbias-hollywood.irnb @@ -31,7 +31,7 @@ }, "outputs": [], "source": [ - "install.packages(\"dagitty\");" + "install.packages(\"dagitty\")" ] }, { From d1df00dfd92da1e745aaf31082b1822b78857e8f Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 05:38:53 -0700 Subject: [PATCH 176/261] Update transform-R-to-Rmd.yml --- .github/workflows/transform-R-to-Rmd.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/transform-R-to-Rmd.yml index c22969f7..5ce200f9 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/transform-R-to-Rmd.yml @@ -55,7 +55,8 @@ jobs: if: github.event_name == 'schedule' id: set_check_notebooks_true run: | - # we run all folders if it is the weekly scheduled run + # we run all folders if it is the weekly scheduled run to + # check if something broke due to changes in dependencies echo "notebooks_changed=true" >> $GITHUB_ENV - name: Install system dependencies From 9e994435b26f352b907b60ba372b475f072db37a Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 05:47:50 -0700 Subject: [PATCH 177/261] Update and rename transform-R-to-Rmd.yml to check-and-transform-R-notebooks.yml --- ...orm-R-to-Rmd.yml => check-and-transform-R-notebooks.yml} | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) rename .github/workflows/{transform-R-to-Rmd.yml => check-and-transform-R-notebooks.yml} (94%) diff --git a/.github/workflows/transform-R-to-Rmd.yml b/.github/workflows/check-and-transform-R-notebooks.yml similarity index 94% rename from .github/workflows/transform-R-to-Rmd.yml rename to 
.github/workflows/check-and-transform-R-notebooks.yml index 5ce200f9..4d6bc3d5 100644 --- a/.github/workflows/transform-R-to-Rmd.yml +++ b/.github/workflows/check-and-transform-R-notebooks.yml @@ -30,21 +30,21 @@ jobs: run: | git fetch origin ${{ github.event.pull_request.base.ref }} ${{ github.event.pull_request.head.ref }} git diff --name-only origin/${{ github.event.pull_request.base.ref }}...origin/${{ github.event.pull_request.head.ref }} > changed_files.txt - grep '\.irnb$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt + grep -E '\.irnb$|check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - name: Find changed notebooks in Push if: github.event_name == 'push' id: find_notebooks_push run: | git diff --name-only ${{ github.event.before }} ${{ github.event.after }} > changed_files.txt - grep '\.irnb$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt + grep -E '\.irnb$|check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - name: Check if any notebooks changed in PR or Push if: (github.event_name == 'push') || (github.event_name == 'pull_request') id: check_notebooks run: | cat changed_notebooks.txt - if grep -q '^${{ matrix.directory }}/.*\.irnb$' changed_notebooks.txt; then + if grep -q -E '^${{ matrix.directory }}/.*\.irnb$|check-and-transform-R-notebooks.yml$' changed_notebooks.txt; then echo "notebooks_changed=true" >> $GITHUB_ENV else echo "notebooks_changed=false" >> $GITHUB_ENV From db62573cc636e6eb966894308fbec7b0a8e8eebb Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 05:55:18 -0700 Subject: [PATCH 178/261] Update check-and-transform-R-notebooks.yml --- .github/workflows/check-and-transform-R-notebooks.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/check-and-transform-R-notebooks.yml b/.github/workflows/check-and-transform-R-notebooks.yml index 4d6bc3d5..2858d063 100644 --- a/.github/workflows/check-and-transform-R-notebooks.yml +++ b/.github/workflows/check-and-transform-R-notebooks.yml @@ -30,21 +30,21 @@ jobs: run: | git fetch origin ${{ github.event.pull_request.base.ref }} ${{ github.event.pull_request.head.ref }} git diff --name-only origin/${{ github.event.pull_request.base.ref }}...origin/${{ github.event.pull_request.head.ref }} > changed_files.txt - grep -E '\.irnb$|check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt + grep -E '\.irnb$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - name: Find changed notebooks in Push if: github.event_name == 'push' id: find_notebooks_push run: | git diff --name-only ${{ github.event.before }} ${{ github.event.after }} > changed_files.txt - grep -E '\.irnb$|check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt + grep -E '\.irnb$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - name: Check if any notebooks changed in PR or Push if: (github.event_name == 'push') || (github.event_name == 'pull_request') id: check_notebooks run: | cat 
changed_notebooks.txt - if grep -q -E '^${{ matrix.directory }}/.*\.irnb$|check-and-transform-R-notebooks.yml$' changed_notebooks.txt; then + if grep -q -E '^${{ matrix.directory }}/.*\.irnb$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_notebooks.txt; then echo "notebooks_changed=true" >> $GITHUB_ENV else echo "notebooks_changed=false" >> $GITHUB_ENV From 5ad0ed9a5e54ab4bb396495d70cc7956c20e94ca Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 06:07:50 -0700 Subject: [PATCH 179/261] Merged python actions into one for both PR and schedule --- .../check-and-transform-R-notebooks.yml | 2 +- .github/workflows/python-notebooks.yml | 64 +++++++++++++++---- 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/.github/workflows/check-and-transform-R-notebooks.yml b/.github/workflows/check-and-transform-R-notebooks.yml index 2858d063..f915a70e 100644 --- a/.github/workflows/check-and-transform-R-notebooks.yml +++ b/.github/workflows/check-and-transform-R-notebooks.yml @@ -9,7 +9,7 @@ on: - cron: '0 12 * * 0' # Runs every Sunday at 12 PM UTC concurrency: - group: convert-lint-notebooks + group: convert-lint-notebooks-${{ github.ref }} cancel-in-progress: true jobs: diff --git a/.github/workflows/python-notebooks.yml b/.github/workflows/python-notebooks.yml index 5a1b0fff..13f9a371 100644 --- a/.github/workflows/python-notebooks.yml +++ b/.github/workflows/python-notebooks.yml @@ -2,6 +2,7 @@ name: Run Jupyter Notebooks on: push: + pull_request: branches: - main schedule: @@ -23,10 +24,47 @@ jobs: steps: - name: Checkout repository if: "! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest'))" - uses: actions/checkout@v2 + uses: actions/checkout@v3 + with: + fetch-depth: ${{ github.event_name == 'pull_request' && 2 || 0 }} + + - name: Find changed notebooks in PR + if: github.event_name == 'pull_request' + id: find_notebooks_pr + run: | + git fetch origin ${{ github.event.pull_request.base.ref }} ${{ github.event.pull_request.head.ref }} + git diff --name-only origin/${{ github.event.pull_request.base.ref }}...origin/${{ github.event.pull_request.head.ref }} > changed_files.txt + grep -E '\.ipynb$|\.github/workflows/python-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt + - name: Find changed notebooks in Push + if: github.event_name == 'push' + id: find_notebooks_push + run: | + git diff --name-only ${{ github.event.before }} ${{ github.event.after }} > changed_files.txt + grep -E '\.ipynb$|\.github/workflows/python-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt + + - name: Check if any notebooks changed in PR or Push + if: (github.event_name == 'push') || (github.event_name == 'pull_request') + id: check_notebooks + run: | + cat changed_notebooks.txt + if grep -q -E '^${{ matrix.directory }}/.*\.ipynb$|\.github/workflows/python-notebooks.yml$' changed_notebooks.txt; then + echo "notebooks_changed=true" >> $GITHUB_ENV + else + echo "notebooks_changed=false" >> $GITHUB_ENV + echo "No Python notebooks changed in folder ${{ matrix.directory }} in this PR." + fi + + - name: Set notebooks changed to true for schedule + if: github.event_name == 'schedule' + id: set_check_notebooks_true + run: | + # we run all folders if it is the weekly scheduled run to + # check if something broke due to changes in dependencies + echo "notebooks_changed=true" >> $GITHUB_ENV + - name: Install git - if: "! 
(matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest'))" + if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" run: | if [ "$RUNNER_OS" == "Linux" ]; then sudo apt-get update @@ -39,25 +77,25 @@ jobs: shell: bash - name: Install libgraphviz-dev if folder is CM3 - if: matrix.folder == 'CM3' && matrix.os == 'ubuntu-latest' + if: "(env.notebooks_changed == 'true') && (matrix.folder == 'CM3' && matrix.os == 'ubuntu-latest')" run: sudo apt-get install -y libgraphviz-dev shell: bash - name: Install R if folder is CM3 - if: matrix.folder == 'CM3' && matrix.os == 'ubuntu-latest' + if: "(env.notebooks_changed == 'true') && (matrix.folder == 'CM3' && matrix.os == 'ubuntu-latest')" run: | sudo apt-get update sudo apt-get install -y r-base shell: bash - name: Set up Python - if: "! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest'))" + if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" uses: actions/setup-python@v2 with: python-version: '3.10' # Use Python 3.10 - name: Install dependencies - if: "! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest'))" + if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" run: | python -m pip install --upgrade pip if [ -f requirements.txt ]; then pip install -r requirements.txt; fi @@ -65,7 +103,7 @@ jobs: shell: bash - name: Run Flake8 linting on notebooks - if: "! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest'))" + if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" id: lint run: | mkdir -p linting_logs @@ -78,7 +116,7 @@ jobs: shell: bash - name: Convert Jupyter notebooks to Python scripts - if: "! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest'))" + if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" id: convert run: | mkdir -p converted_scripts @@ -90,7 +128,7 @@ jobs: shell: bash - name: Run converted Python scripts with IPython - if: "! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest'))" + if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" id: execute run: | mkdir -p logs @@ -104,7 +142,7 @@ jobs: shell: bash - name: Aggregate and report errors and warnings - if: "! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest'))" + if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" run: | echo "Aggregating errors and warnings..." grep -E "Traceback|Error:|Exception:|ModuleNotFoundError|FutureWarning|TypeError" logs/*.txt linting_logs/*.txt > logs/errors_and_warnings.txt || true @@ -113,19 +151,19 @@ jobs: shell: bash - name: Upload linting logs - if: "! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest'))" + if: "(env.notebooks_changed == 'true') && (! 
(matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" uses: actions/upload-artifact@v2 with: name: linting-logs-${{ matrix.folder }}-${{ matrix.os }} path: linting_logs - name: Upload execution logs - if: "! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest'))" + if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" uses: actions/upload-artifact@v2 with: name: execution-logs-${{ matrix.folder }}-${{ matrix.os }} path: logs - name: Check for errors - if: "! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')) && (env.flake8_errors != '0' || env.script_errors != '0')" + if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')) && (env.flake8_errors != '0' || env.script_errors != '0'))" run: exit 1 From ce920f2da957057c5cf14b153e5ffc26fd1ec8fe Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 06:18:17 -0700 Subject: [PATCH 180/261] Update python-notebooks.yml --- .github/workflows/python-notebooks.yml | 34 +++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python-notebooks.yml b/.github/workflows/python-notebooks.yml index 13f9a371..c3a22a09 100644 --- a/.github/workflows/python-notebooks.yml +++ b/.github/workflows/python-notebooks.yml @@ -1,4 +1,4 @@ -name: Run Jupyter Notebooks +name: Run and Test Jupyter Notebooks on: push: @@ -9,11 +9,11 @@ on: - cron: '0 0 * * 0' # Runs once a week on Sunday at midnight concurrency: - group: run-notebooks-${{ github.ref }} + group: test-notebooks-${{ github.ref }} cancel-in-progress: true jobs: - run-notebooks: + test-notebooks: runs-on: ${{ matrix.os }} strategy: matrix: @@ -99,8 +99,15 @@ jobs: run: | python -m pip install --upgrade pip if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - pip install jupyter nbconvert flake8 flake8-nb ipython + pip install jupyter nbconvert flake8 flake8-nb ipython nbstripout shell: bash + + - name: Strip outputs from .ipynb files + if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + run: | + for notebook in ${{ matrix.directory }}/*.ipynb; do + nbstripout "$notebook" + done - name: Run Flake8 linting on notebooks if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" @@ -167,3 +174,22 @@ jobs: - name: Check for errors if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')) && (env.flake8_errors != '0' || env.script_errors != '0'))" run: exit 1 + + - name: Check if there are any changes (e.g. stripped outputs) + if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + id: verify_diff + run: | + git pull + git diff --quiet ${{ matrix.directory }}/*.ipynb || echo "changed=true" >> $GITHUB_OUTPUT + + - name: Commit and push stripped .ipynb files + if: "(env.notebooks_changed == 'true') && (! 
(matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest'))) && (steps.verify_diff.outputs.changed == 'true')" + run: | + git config --global user.name 'github-actions[bot]' + git config --global user.email 'github-actions[bot]@users.noreply.github.com' + git pull + git add ${{ matrix.directory }}/*.ipynb + git commit -m 'Strip outputs from .ipynb files in ${{ matrix.directory }}' + git push + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From bc63d5c3526764268eaba235fd66f79f420f4dba Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 06:25:43 -0700 Subject: [PATCH 181/261] updated the two test jobs --- .../check-and-transform-R-notebooks.yml | 2 +- .github/workflows/pr-check.yml | 143 ------------------ .github/workflows/python-notebooks.yml | 4 +- 3 files changed, 3 insertions(+), 146 deletions(-) delete mode 100644 .github/workflows/pr-check.yml diff --git a/.github/workflows/check-and-transform-R-notebooks.yml b/.github/workflows/check-and-transform-R-notebooks.yml index f915a70e..470b979d 100644 --- a/.github/workflows/check-and-transform-R-notebooks.yml +++ b/.github/workflows/check-and-transform-R-notebooks.yml @@ -52,7 +52,7 @@ jobs: fi - name: Set notebooks changed to true for schedule - if: github.event_name == 'schedule' + if: "! ((github.event_name == 'push') || (github.event_name == 'pull_request'))" id: set_check_notebooks_true run: | # we run all folders if it is the weekly scheduled run to diff --git a/.github/workflows/pr-check.yml b/.github/workflows/pr-check.yml deleted file mode 100644 index 0ad3941b..00000000 --- a/.github/workflows/pr-check.yml +++ /dev/null @@ -1,143 +0,0 @@ -name: Test Changed Jupyter Notebooks - -on: - pull_request: - branches: - - main - -jobs: - test-notebooks: - runs-on: ubuntu-latest - - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: '3.10' # Use Python 3.10 - - - name: Find changed notebooks - id: find_notebooks - run: | - git fetch origin ${{ github.event.pull_request.base.ref }} ${{ github.event.pull_request.head.ref }} - git diff --name-only origin/${{ github.event.pull_request.base.ref }}...origin/${{ github.event.pull_request.head.ref }} > changed_files.txt - grep '\.ipynb$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - - - name: Check if any notebooks changed - id: check_notebooks - run: | - if grep -q '\.ipynb$' changed_notebooks.txt; then - echo "notebooks_changed=true" >> $GITHUB_ENV - else - echo "notebooks_changed=false" >> $GITHUB_ENV - echo "No Jupyter notebooks changed in this PR." 
- fi - - - name: Install flake8 - if: env.notebooks_changed == 'true' - run: | - python -m pip install --upgrade pip - pip install flake8 flake8-nb - shell: bash - - - name: Run Flake8 linting on changed notebooks - if: env.notebooks_changed == 'true' - id: lint - run: | - mkdir -p linting_logs - flake8_errors=0 - while IFS= read -r notebook; do - [ "$(basename "$notebook")" = "python-dosearch.ipynb" ] && continue - echo "Linting $notebook" - flake8-nb --config=.flake8 "$notebook" > "linting_logs/$(basename "$notebook" .ipynb)_linting.txt" 2>&1 || flake8_errors=$((flake8_errors+1)) - cat "linting_logs/$(basename "$notebook" .ipynb)_linting.txt" - done < changed_notebooks.txt - echo "flake8_errors=$flake8_errors" >> $GITHUB_ENV - shell: bash - - - name: Upload linting logs - if: env.notebooks_changed == 'true' - uses: actions/upload-artifact@v2 - with: - name: linting-logs - path: linting_logs - - - name: Check for linting errors - if: "(env.notebooks_changed == 'true') && (env.flake8_errors != '0')" - run: exit 1 - - - name: Install git - if: env.notebooks_changed == 'true' - run: | - sudo apt-get update - sudo apt-get install -y git - shell: bash - - - name: Install libgraphviz-dev if folder - if: env.notebooks_changed == 'true' - run: sudo apt-get install -y libgraphviz-dev - shell: bash - - - name: Install R if folder - if: env.notebooks_changed == 'true' - run: | - sudo apt-get update - sudo apt-get install -y r-base - shell: bash - - - name: Install dependencies - if: env.notebooks_changed == 'true' - run: | - python -m pip install --upgrade pip - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - pip install jupyter nbconvert ipython - shell: bash - - - name: Convert changed Jupyter notebooks to Python scripts - if: env.notebooks_changed == 'true' - id: convert - run: | - mkdir -p converted_scripts - while IFS= read -r notebook; do - echo "Processing $notebook" - [ -e "$notebook" ] || continue - [ "$(basename "$notebook")" = "DoubleML_and_Feature_Engineering_with_BERT.ipynb" ] && continue - jupyter nbconvert --to script "$notebook" --output-dir converted_scripts - done < changed_notebooks.txt - shell: bash - - - name: Run converted Python scripts with IPython - if: env.notebooks_changed == 'true' - id: execute - run: | - mkdir -p logs - script_errors=0 - for script in converted_scripts/*.py; do - [ -e "$script" ] || continue - echo "Running $script" - ipython "$script" > "logs/$(basename "$script" .py).txt" 2>&1 || script_errors=$((script_errors+1)) - done - echo "script_errors=$script_errors" >> $GITHUB_ENV - shell: bash - - - name: Aggregate and report errors and warnings - if: env.notebooks_changed == 'true' - run: | - echo "Aggregating errors and warnings..." 
- grep -E "Traceback|Error:|Exception:|ModuleNotFoundError|FutureWarning|TypeError" logs/*.txt linting_logs/*.txt > logs/errors_and_warnings.txt || true - echo "Errors and Warnings:" - cat logs/errors_and_warnings.txt - shell: bash - - - name: Upload execution logs - if: env.notebooks_changed == 'true' - uses: actions/upload-artifact@v2 - with: - name: execution-logs - path: logs - - - name: Check for errors - if: "(env.notebooks_changed == 'true') && (env.flake8_errors != '0' || env.script_errors != '0')" - run: exit 1 diff --git a/.github/workflows/python-notebooks.yml b/.github/workflows/python-notebooks.yml index c3a22a09..c5ba4e6b 100644 --- a/.github/workflows/python-notebooks.yml +++ b/.github/workflows/python-notebooks.yml @@ -55,8 +55,8 @@ jobs: echo "No Python notebooks changed in folder ${{ matrix.directory }} in this PR." fi - - name: Set notebooks changed to true for schedule - if: github.event_name == 'schedule' + - name: Set notebooks changed to true for schedule (or other ways of triggering the job) + if: "! ((github.event_name == 'push') || (github.event_name == 'pull_request'))" id: set_check_notebooks_true run: | # we run all folders if it is the weekly scheduled run to From 9f8a38feec66a2be8ce04ddc6188308e92554d56 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 06:28:30 -0700 Subject: [PATCH 182/261] updated name of jobs --- .github/workflows/check-and-transform-R-notebooks.yml | 4 ++-- .github/workflows/python-notebooks.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/check-and-transform-R-notebooks.yml b/.github/workflows/check-and-transform-R-notebooks.yml index 470b979d..ef6a2f75 100644 --- a/.github/workflows/check-and-transform-R-notebooks.yml +++ b/.github/workflows/check-and-transform-R-notebooks.yml @@ -9,11 +9,11 @@ on: - cron: '0 12 * * 0' # Runs every Sunday at 12 PM UTC concurrency: - group: convert-lint-notebooks-${{ github.ref }} + group: test-R-notebooks-${{ github.ref }} cancel-in-progress: true jobs: - convert-lint-notebooks: + test-R-notebooks: runs-on: ubuntu-latest strategy: matrix: diff --git a/.github/workflows/python-notebooks.yml b/.github/workflows/python-notebooks.yml index c5ba4e6b..e27b463d 100644 --- a/.github/workflows/python-notebooks.yml +++ b/.github/workflows/python-notebooks.yml @@ -9,11 +9,11 @@ on: - cron: '0 0 * * 0' # Runs once a week on Sunday at midnight concurrency: - group: test-notebooks-${{ github.ref }} + group: test-python-notebooks-${{ github.ref }} cancel-in-progress: true jobs: - test-notebooks: + test-python-notebooks: runs-on: ${{ matrix.os }} strategy: matrix: From cac2379f27dc245510ec17edba502cb4a74920ac Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 06:31:24 -0700 Subject: [PATCH 183/261] folder->directory in yaml file --- .github/workflows/python-notebooks.yml | 42 +++++++++++++------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/workflows/python-notebooks.yml b/.github/workflows/python-notebooks.yml index e27b463d..336de661 100644 --- a/.github/workflows/python-notebooks.yml +++ b/.github/workflows/python-notebooks.yml @@ -19,11 +19,11 @@ jobs: matrix: # Removed: {windows-latest, macos-latest} as they were taking too long. Consider adding back in the future os: [ubuntu-latest] - folder: [PM1, PM2, PM3, PM4, PM5, CM1, CM2, CM3, AC1, AC2, T] + directory: [PM1, PM2, PM3, PM4, PM5, CM1, CM2, CM3, AC1, AC2, T] steps: - name: Checkout repository - if: "! 
(matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest'))" + if: "! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest'))" uses: actions/checkout@v3 with: fetch-depth: ${{ github.event_name == 'pull_request' && 2 || 0 }} @@ -64,7 +64,7 @@ jobs: echo "notebooks_changed=true" >> $GITHUB_ENV - name: Install git - if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" run: | if [ "$RUNNER_OS" == "Linux" ]; then sudo apt-get update @@ -77,25 +77,25 @@ jobs: shell: bash - name: Install libgraphviz-dev if folder is CM3 - if: "(env.notebooks_changed == 'true') && (matrix.folder == 'CM3' && matrix.os == 'ubuntu-latest')" + if: "(env.notebooks_changed == 'true') && (matrix.directory == 'CM3' && matrix.os == 'ubuntu-latest')" run: sudo apt-get install -y libgraphviz-dev shell: bash - name: Install R if folder is CM3 - if: "(env.notebooks_changed == 'true') && (matrix.folder == 'CM3' && matrix.os == 'ubuntu-latest')" + if: "(env.notebooks_changed == 'true') && (matrix.directory == 'CM3' && matrix.os == 'ubuntu-latest')" run: | sudo apt-get update sudo apt-get install -y r-base shell: bash - name: Set up Python - if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" uses: actions/setup-python@v2 with: python-version: '3.10' # Use Python 3.10 - name: Install dependencies - if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" run: | python -m pip install --upgrade pip if [ -f requirements.txt ]; then pip install -r requirements.txt; fi @@ -103,19 +103,19 @@ jobs: shell: bash - name: Strip outputs from .ipynb files - if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" run: | for notebook in ${{ matrix.directory }}/*.ipynb; do nbstripout "$notebook" done - name: Run Flake8 linting on notebooks - if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" id: lint run: | mkdir -p linting_logs flake8_errors=0 - for notebook in ${{ matrix.folder }}/*.ipynb; do + for notebook in ${{ matrix.directory }}/*.ipynb; do [ "$(basename "$notebook")" = "python-dosearch.ipynb" ] && continue flake8-nb --config=.flake8 "$notebook" > "linting_logs/$(basename "$notebook" .ipynb)_linting.txt" 2>&1 || flake8_errors=$((flake8_errors+1)) done @@ -123,11 +123,11 @@ jobs: shell: bash - name: Convert Jupyter notebooks to Python scripts - if: "(env.notebooks_changed == 'true') && (! 
(matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" id: convert run: | mkdir -p converted_scripts - for notebook in ${{ matrix.folder }}/*.ipynb; do + for notebook in ${{ matrix.directory }}/*.ipynb; do [ -e "$notebook" ] || continue [ "$(basename "$notebook")" = "DoubleML_and_Feature_Engineering_with_BERT.ipynb" ] && continue jupyter nbconvert --to script "$notebook" --output-dir converted_scripts @@ -135,7 +135,7 @@ jobs: shell: bash - name: Run converted Python scripts with IPython - if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" id: execute run: | mkdir -p logs @@ -149,7 +149,7 @@ jobs: shell: bash - name: Aggregate and report errors and warnings - if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" run: | echo "Aggregating errors and warnings..." grep -E "Traceback|Error:|Exception:|ModuleNotFoundError|FutureWarning|TypeError" logs/*.txt linting_logs/*.txt > logs/errors_and_warnings.txt || true @@ -158,32 +158,32 @@ jobs: shell: bash - name: Upload linting logs - if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" uses: actions/upload-artifact@v2 with: - name: linting-logs-${{ matrix.folder }}-${{ matrix.os }} + name: linting-logs-${{ matrix.directory }}-${{ matrix.os }} path: linting_logs - name: Upload execution logs - if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" uses: actions/upload-artifact@v2 with: - name: execution-logs-${{ matrix.folder }}-${{ matrix.os }} + name: execution-logs-${{ matrix.directory }}-${{ matrix.os }} path: logs - name: Check for errors - if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')) && (env.flake8_errors != '0' || env.script_errors != '0'))" + if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')) && (env.flake8_errors != '0' || env.script_errors != '0'))" run: exit 1 - name: Check if there are any changes (e.g. stripped outputs) - if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + if: "(env.notebooks_changed == 'true') && (! 
(matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" id: verify_diff run: | git pull git diff --quiet ${{ matrix.directory }}/*.ipynb || echo "changed=true" >> $GITHUB_OUTPUT - name: Commit and push stripped .ipynb files - if: "(env.notebooks_changed == 'true') && (! (matrix.folder == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest'))) && (steps.verify_diff.outputs.changed == 'true')" + if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest'))) && (steps.verify_diff.outputs.changed == 'true')" run: | git config --global user.name 'github-actions[bot]' git config --global user.email 'github-actions[bot]@users.noreply.github.com' From 031bc42616ecc4d82f752fd04177ffdec5c80cc7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 20 Jul 2024 13:37:58 +0000 Subject: [PATCH 184/261] Strip outputs from .ipynb files in AC1 --- ...alysis-with-sensmakr-and-debiased-ml.ipynb | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/AC1/python-sensitivity-analysis-with-sensmakr-and-debiased-ml.ipynb b/AC1/python-sensitivity-analysis-with-sensmakr-and-debiased-ml.ipynb index c9a69f20..b74be456 100644 --- a/AC1/python-sensitivity-analysis-with-sensmakr-and-debiased-ml.ipynb +++ b/AC1/python-sensitivity-analysis-with-sensmakr-and-debiased-ml.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "martial-negative", + "id": "0", "metadata": { "id": "martial-negative", "papermill": { @@ -22,7 +22,7 @@ }, { "cell_type": "markdown", - "id": "criminal-workplace", + "id": "1", "metadata": { "id": "criminal-workplace", "papermill": { @@ -101,7 +101,7 @@ }, { "cell_type": "markdown", - "id": "continuous-marshall", + "id": "2", "metadata": { "id": "continuous-marshall", "papermill": { @@ -129,7 +129,7 @@ }, { "cell_type": "markdown", - "id": "oW_mOo_wcpmV", + "id": "3", "metadata": { "id": "oW_mOo_wcpmV" }, @@ -145,7 +145,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aPAq16YXlcNx", + "id": "4", "metadata": { "id": "aPAq16YXlcNx" }, @@ -170,7 +170,7 @@ { "cell_type": "code", "execution_count": null, - "id": "qOOPGSB0lWM9", + "id": "5", "metadata": { "id": "qOOPGSB0lWM9" }, @@ -183,7 +183,7 @@ }, { "cell_type": "markdown", - "id": "hidden-packing", + "id": "6", "metadata": { "id": "hidden-packing", "papermill": { @@ -203,7 +203,7 @@ { "cell_type": "code", "execution_count": null, - "id": "authorized-transformation", + "id": "7", "metadata": { "id": "authorized-transformation", "papermill": { @@ -244,7 +244,7 @@ }, { "cell_type": "markdown", - "id": "careful-dollar", + "id": "8", "metadata": { "id": "careful-dollar", "papermill": { @@ -262,7 +262,7 @@ }, { "cell_type": "markdown", - "id": "YHtKZ44_inRb", + "id": "9", "metadata": { "id": "YHtKZ44_inRb" }, @@ -278,7 +278,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8TpwVjQeilSu", + "id": "10", "metadata": { "id": "8TpwVjQeilSu" }, @@ -291,7 +291,7 @@ { "cell_type": "code", "execution_count": null, - "id": "EgDePhZViw2-", + "id": "11", "metadata": { "id": "EgDePhZViw2-" }, @@ -320,7 +320,7 @@ { "cell_type": "code", "execution_count": null, - "id": "TFKiFOk2ILI-", + "id": "12", "metadata": { "id": "TFKiFOk2ILI-" }, @@ -336,7 +336,7 @@ { "cell_type": "code", "execution_count": null, - "id": "hdQ9lV5eF5cD", + "id": "13", "metadata": { "id": "hdQ9lV5eF5cD" }, @@ -357,7 +357,7 @@ }, { "cell_type": "markdown", - "id": "built-enlargement", + "id": 
"14", "metadata": { "id": "built-enlargement", "papermill": { @@ -376,7 +376,7 @@ { "cell_type": "code", "execution_count": null, - "id": "respective-sister", + "id": "15", "metadata": { "id": "respective-sister", "papermill": { @@ -421,7 +421,7 @@ }, { "cell_type": "markdown", - "id": "sorted-hands", + "id": "16", "metadata": { "id": "sorted-hands", "papermill": { @@ -442,7 +442,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3v-cR2kWpUQN", + "id": "17", "metadata": { "id": "3v-cR2kWpUQN" }, @@ -521,7 +521,7 @@ { "cell_type": "code", "execution_count": null, - "id": "77qZoF5NpbA8", + "id": "18", "metadata": { "id": "77qZoF5NpbA8" }, @@ -544,7 +544,7 @@ { "cell_type": "code", "execution_count": null, - "id": "t-g4IiwQi_x0", + "id": "19", "metadata": { "id": "t-g4IiwQi_x0" }, @@ -610,7 +610,7 @@ { "cell_type": "code", "execution_count": null, - "id": "SoSr-sqVpc4d", + "id": "20", "metadata": { "id": "SoSr-sqVpc4d" }, @@ -636,7 +636,7 @@ { "cell_type": "code", "execution_count": null, - "id": "HH51ejTcqgT2", + "id": "21", "metadata": { "id": "HH51ejTcqgT2" }, @@ -648,7 +648,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aW1ySANwjylk", + "id": "22", "metadata": { "id": "aW1ySANwjylk" }, @@ -660,7 +660,7 @@ { "cell_type": "code", "execution_count": null, - "id": "BO__qNSje6lS", + "id": "23", "metadata": { "id": "BO__qNSje6lS" }, @@ -674,7 +674,7 @@ { "cell_type": "code", "execution_count": null, - "id": "Z972yBSSk6UX", + "id": "24", "metadata": { "id": "Z972yBSSk6UX" }, @@ -689,7 +689,7 @@ }, { "cell_type": "markdown", - "id": "charged-mauritius", + "id": "25", "metadata": { "id": "charged-mauritius", "papermill": { @@ -707,7 +707,7 @@ }, { "cell_type": "markdown", - "id": "charitable-placement", + "id": "26", "metadata": { "id": "charitable-placement", "papermill": { @@ -726,7 +726,7 @@ { "cell_type": "code", "execution_count": null, - "id": "U7XnplWkRd_P", + "id": "27", "metadata": { "id": "U7XnplWkRd_P" }, @@ -751,7 +751,7 @@ }, { "cell_type": "markdown", - "id": "cUxDc1mYdMHH", + "id": "28", "metadata": { "id": "cUxDc1mYdMHH" }, @@ -764,7 +764,7 @@ { "cell_type": "code", "execution_count": null, - "id": "V1tYIFMCeGbQ", + "id": "29", "metadata": { "id": "V1tYIFMCeGbQ" }, @@ -812,7 +812,7 @@ { "cell_type": "code", "execution_count": null, - "id": "EImLRgnb0peq", + "id": "30", "metadata": { "id": "EImLRgnb0peq" }, @@ -834,7 +834,7 @@ { "cell_type": "code", "execution_count": null, - "id": "mNk1o3xBTTwr", + "id": "31", "metadata": { "id": "mNk1o3xBTTwr" }, @@ -848,7 +848,7 @@ { "cell_type": "code", "execution_count": null, - "id": "obvious-there", + "id": "32", "metadata": { "id": "obvious-there", "papermill": { @@ -874,7 +874,7 @@ { "cell_type": "code", "execution_count": null, - "id": "JihrkBjEYcOG", + "id": "33", "metadata": { "id": "JihrkBjEYcOG" }, From c36de39888902eaf2b6b4097deef9d451535fc40 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 20 Jul 2024 13:41:10 +0000 Subject: [PATCH 185/261] Strip outputs from .ipynb files in PM3 --- ...unctional-approximation-by-nn-and-rf.ipynb | 72 +++++++++---------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/PM3/python-functional-approximation-by-nn-and-rf.ipynb b/PM3/python-functional-approximation-by-nn-and-rf.ipynb index 1bf61b86..bad75217 100644 --- a/PM3/python-functional-approximation-by-nn-and-rf.ipynb +++ b/PM3/python-functional-approximation-by-nn-and-rf.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "90a71fe6", + "id": "0", "metadata": { "id": 
"90a71fe6" }, @@ -18,7 +18,7 @@ }, { "cell_type": "markdown", - "id": "elegant-proxy", + "id": "1", "metadata": { "id": "elegant-proxy", "papermill": { @@ -43,7 +43,7 @@ { "cell_type": "code", "execution_count": null, - "id": "376635dd", + "id": "2", "metadata": { "id": "376635dd" }, @@ -59,7 +59,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d608d3dd", + "id": "3", "metadata": { "id": "d608d3dd" }, @@ -71,7 +71,7 @@ }, { "cell_type": "markdown", - "id": "widespread-mention", + "id": "4", "metadata": { "id": "widespread-mention", "papermill": { @@ -90,7 +90,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c39fd891", + "id": "5", "metadata": { "id": "c39fd891" }, @@ -103,7 +103,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1d16f253", + "id": "6", "metadata": { "id": "1d16f253" }, @@ -118,7 +118,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0ecd8e8f", + "id": "7", "metadata": { "id": "0ecd8e8f" }, @@ -131,7 +131,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d725ab29", + "id": "8", "metadata": { "id": "d725ab29" }, @@ -145,7 +145,7 @@ }, { "cell_type": "markdown", - "id": "local-saturn", + "id": "9", "metadata": { "id": "local-saturn", "papermill": { @@ -163,7 +163,7 @@ }, { "cell_type": "markdown", - "id": "international-serum", + "id": "10", "metadata": { "id": "international-serum", "papermill": { @@ -186,7 +186,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2c89a0c5", + "id": "11", "metadata": { "id": "2c89a0c5" }, @@ -199,7 +199,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fcfc51f2", + "id": "12", "metadata": { "id": "fcfc51f2" }, @@ -213,7 +213,7 @@ }, { "cell_type": "markdown", - "id": "infrared-belgium", + "id": "13", "metadata": { "id": "infrared-belgium", "papermill": { @@ -232,7 +232,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f5118cea", + "id": "14", "metadata": { "id": "f5118cea" }, @@ -245,7 +245,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6c4f42fc", + "id": "15", "metadata": { "id": "6c4f42fc" }, @@ -260,7 +260,7 @@ { "cell_type": "code", "execution_count": null, - "id": "137e44e0", + "id": "16", "metadata": { "id": "137e44e0" }, @@ -273,7 +273,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1649101f", + "id": "17", "metadata": { "id": "1649101f" }, @@ -287,7 +287,7 @@ }, { "cell_type": "markdown", - "id": "psychological-venice", + "id": "18", "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", @@ -308,7 +308,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9c061994", + "id": "19", "metadata": { "id": "9c061994" }, @@ -323,7 +323,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2a1d7cc0", + "id": "20", "metadata": { "id": "2a1d7cc0" }, @@ -336,7 +336,7 @@ { "cell_type": "code", "execution_count": null, - "id": "acf2ce5b", + "id": "21", "metadata": { "id": "acf2ce5b" }, @@ -351,7 +351,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a9db1d51", + "id": "22", "metadata": { "id": "a9db1d51" }, @@ -365,7 +365,7 @@ { "cell_type": "code", "execution_count": null, - "id": "539ebb01", + "id": "23", "metadata": { "id": "539ebb01" }, @@ -379,7 +379,7 @@ }, { "cell_type": "markdown", - "id": "f261e981", + "id": "24", "metadata": { "id": "f261e981" }, @@ -392,7 +392,7 @@ { "cell_type": "code", "execution_count": null, - "id": "85047mypXqrD", + "id": "25", "metadata": { "id": "85047mypXqrD" }, @@ -404,7 +404,7 @@ { "cell_type": 
"code", "execution_count": null, - "id": "49e3b46e", + "id": "26", "metadata": { "id": "49e3b46e" }, @@ -419,7 +419,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7defdcf2", + "id": "27", "metadata": { "id": "7defdcf2" }, @@ -436,7 +436,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0f25a6ff", + "id": "28", "metadata": { "id": "0f25a6ff" }, @@ -449,7 +449,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d25c8bfe", + "id": "29", "metadata": { "id": "d25c8bfe" }, @@ -464,7 +464,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cf4b2ce6", + "id": "30", "metadata": { "id": "cf4b2ce6" }, @@ -478,7 +478,7 @@ { "cell_type": "code", "execution_count": null, - "id": "74d5b869", + "id": "31", "metadata": { "id": "74d5b869" }, @@ -493,7 +493,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ff746610", + "id": "32", "metadata": { "id": "ff746610" }, @@ -507,7 +507,7 @@ { "cell_type": "code", "execution_count": null, - "id": "510bfdb9", + "id": "33", "metadata": { "id": "510bfdb9" }, @@ -520,7 +520,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8678cba2", + "id": "34", "metadata": { "id": "8678cba2" }, @@ -535,7 +535,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1907bf04", + "id": "35", "metadata": { "id": "1907bf04" }, From d0e5e66c01217ed0b19412c60bb0aaa8d86a4307 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 20 Jul 2024 13:45:32 +0000 Subject: [PATCH 186/261] Strip outputs from .ipynb files in T --- T/CATE-estimation.ipynb | 12 ++++-------- ...4_Regression_Discontinuity_on_Progresa_Data.ipynb | 3 +-- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/T/CATE-estimation.ipynb b/T/CATE-estimation.ipynb index 0fb1a30f..01597d11 100644 --- a/T/CATE-estimation.ipynb +++ b/T/CATE-estimation.ipynb @@ -756,8 +756,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "rK9X06wY3DBq", - "scrolled": true + "id": "rK9X06wY3DBq" }, "outputs": [], "source": [ @@ -784,8 +783,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "IbMnTqwY3DBq", - "scrolled": true + "id": "IbMnTqwY3DBq" }, "outputs": [], "source": [ @@ -864,8 +862,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "rJFgOCA_3DBq", - "scrolled": true + "id": "rJFgOCA_3DBq" }, "outputs": [], "source": [ @@ -923,8 +920,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "4Xe2bn2R3DBq", - "scrolled": true + "id": "4Xe2bn2R3DBq" }, "outputs": [], "source": [ diff --git a/T/T_4_Regression_Discontinuity_on_Progresa_Data.ipynb b/T/T_4_Regression_Discontinuity_on_Progresa_Data.ipynb index f4b3b547..4290677d 100644 --- a/T/T_4_Regression_Discontinuity_on_Progresa_Data.ipynb +++ b/T/T_4_Regression_Discontinuity_on_Progresa_Data.ipynb @@ -611,8 +611,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "bzp90cR4I6RL", - "scrolled": true + "id": "bzp90cR4I6RL" }, "outputs": [], "source": [ From 50afb74582131bd5aaaff515f306f3eedea7d482 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 20 Jul 2024 13:46:16 +0000 Subject: [PATCH 187/261] Strip outputs from .ipynb files in AC2 --- ...sed-ml-for-partially-linear-iv-model.ipynb | 24 -- AC2/python-dml-401k-IV.ipynb | 222 +++++++++--------- 2 files changed, 111 insertions(+), 135 deletions(-) diff --git a/AC2/python-debiased-ml-for-partially-linear-iv-model.ipynb b/AC2/python-debiased-ml-for-partially-linear-iv-model.ipynb index b882ddb6..d1383f4f 100644 --- 
a/AC2/python-debiased-ml-for-partially-linear-iv-model.ipynb +++ b/AC2/python-debiased-ml-for-partially-linear-iv-model.ipynb @@ -130,12 +130,6 @@ "execution_count": null, "metadata": { "_kg_hide-output": true, - "execution": { - "iopub.execute_input": "2021-04-23T10:41:33.973149Z", - "iopub.status.busy": "2021-04-23T10:41:33.971090Z", - "iopub.status.idle": "2021-04-23T10:42:08.602961Z", - "shell.execute_reply": "2021-04-23T10:42:08.601638Z" - }, "id": "yGW6JhG5zcL5", "papermill": { "duration": 34.670095, @@ -169,12 +163,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "execution": { - "iopub.execute_input": "2021-04-23T10:42:08.664371Z", - "iopub.status.busy": "2021-04-23T10:42:08.629661Z", - "iopub.status.idle": "2021-04-23T10:42:10.458175Z", - "shell.execute_reply": "2021-04-23T10:42:10.456976Z" - }, "id": "j2WVUbBDzcL-", "papermill": { "duration": 1.846109, @@ -444,12 +432,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "execution": { - "iopub.execute_input": "2021-04-23T10:42:16.301968Z", - "iopub.status.busy": "2021-04-23T10:42:16.300698Z", - "iopub.status.idle": "2021-04-23T10:42:32.390351Z", - "shell.execute_reply": "2021-04-23T10:42:32.388488Z" - }, "id": "rsUnPDfpzcMB", "papermill": { "duration": 16.107321, @@ -572,12 +554,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "execution": { - "iopub.execute_input": "2021-04-23T10:42:32.545479Z", - "iopub.status.busy": "2021-04-23T10:42:32.543976Z", - "iopub.status.idle": "2021-04-23T10:42:33.002321Z", - "shell.execute_reply": "2021-04-23T10:42:33.001039Z" - }, "id": "k9bB2O13zcME", "papermill": { "duration": 0.478528, diff --git a/AC2/python-dml-401k-IV.ipynb b/AC2/python-dml-401k-IV.ipynb index f9ce5bcd..6c8f52f0 100644 --- a/AC2/python-dml-401k-IV.ipynb +++ b/AC2/python-dml-401k-IV.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "narrative-sailing", + "id": "0", "metadata": { "id": "narrative-sailing", "papermill": { @@ -20,7 +20,7 @@ }, { "cell_type": "markdown", - "id": "ready-appearance", + "id": "1", "metadata": { "id": "ready-appearance", "papermill": { @@ -43,7 +43,7 @@ }, { "cell_type": "markdown", - "id": "divine-phoenix", + "id": "2", "metadata": { "id": "divine-phoenix", "papermill": { @@ -64,7 +64,7 @@ { "cell_type": "code", "execution_count": null, - "id": "teq-s8pb3a82", + "id": "3", "metadata": { "id": "teq-s8pb3a82" }, @@ -76,7 +76,7 @@ { "cell_type": "code", "execution_count": null, - "id": "75662e18", + "id": "4", "metadata": { "id": "75662e18" }, @@ -106,7 +106,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6e66e77f", + "id": "5", "metadata": { "id": "6e66e77f" }, @@ -119,7 +119,7 @@ { "cell_type": "code", "execution_count": null, - "id": "65a2d086", + "id": "6", "metadata": { "id": "65a2d086" }, @@ -131,7 +131,7 @@ { "cell_type": "code", "execution_count": null, - "id": "00884061", + "id": "7", "metadata": { "id": "00884061" }, @@ -143,7 +143,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1d3dc85c", + "id": "8", "metadata": { "id": "1d3dc85c" }, @@ -156,7 +156,7 @@ }, { "cell_type": "markdown", - "id": "looking-invention", + "id": "9", "metadata": { "id": "looking-invention", "papermill": { @@ -174,7 +174,7 @@ }, { "cell_type": "markdown", - "id": "received-nutrition", + "id": "10", "metadata": { "id": "received-nutrition", "papermill": { @@ -193,7 +193,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0d26d552", + "id": "11", "metadata": { "id": "0d26d552" }, @@ -205,7 +205,7 @@ }, { "cell_type": 
"markdown", - "id": "material-sending", + "id": "12", "metadata": { "id": "material-sending", "papermill": { @@ -224,7 +224,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a7741bb6", + "id": "13", "metadata": { "id": "a7741bb6" }, @@ -236,7 +236,7 @@ }, { "cell_type": "markdown", - "id": "awful-antigua", + "id": "14", "metadata": { "id": "awful-antigua", "papermill": { @@ -255,7 +255,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ef070d79", + "id": "15", "metadata": { "id": "ef070d79" }, @@ -268,7 +268,7 @@ }, { "cell_type": "markdown", - "id": "cross-priority", + "id": "16", "metadata": { "id": "cross-priority", "papermill": { @@ -287,7 +287,7 @@ { "cell_type": "code", "execution_count": null, - "id": "33cd014e", + "id": "17", "metadata": { "id": "33cd014e" }, @@ -300,7 +300,7 @@ }, { "cell_type": "markdown", - "id": "suitable-vulnerability", + "id": "18", "metadata": { "id": "suitable-vulnerability", "papermill": { @@ -319,7 +319,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a9354414", + "id": "19", "metadata": { "id": "a9354414" }, @@ -339,7 +339,7 @@ }, { "cell_type": "markdown", - "id": "c4c3e489", + "id": "20", "metadata": { "id": "c4c3e489" }, @@ -350,7 +350,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a2cfdf4c", + "id": "21", "metadata": { "id": "a2cfdf4c" }, @@ -362,7 +362,7 @@ { "cell_type": "code", "execution_count": null, - "id": "53b1283d", + "id": "22", "metadata": { "id": "53b1283d" }, @@ -391,7 +391,7 @@ { "cell_type": "code", "execution_count": null, - "id": "824ee320", + "id": "23", "metadata": { "id": "824ee320" }, @@ -405,7 +405,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b35bbd34", + "id": "24", "metadata": { "id": "b35bbd34" }, @@ -417,7 +417,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8c5e20e7", + "id": "25", "metadata": { "id": "8c5e20e7" }, @@ -430,7 +430,7 @@ }, { "cell_type": "markdown", - "id": "de8e13e7", + "id": "26", "metadata": { "id": "de8e13e7" }, @@ -440,7 +440,7 @@ }, { "cell_type": "markdown", - "id": "61e02e77", + "id": "27", "metadata": { "id": "61e02e77" }, @@ -450,7 +450,7 @@ }, { "cell_type": "markdown", - "id": "83869c99", + "id": "28", "metadata": { "id": "83869c99" }, @@ -479,7 +479,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0d743ebe", + "id": "29", "metadata": { "id": "0d743ebe" }, @@ -493,7 +493,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e7bd792d", + "id": "30", "metadata": { "id": "e7bd792d" }, @@ -507,7 +507,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7ebedcb0", + "id": "31", "metadata": { "id": "7ebedcb0" }, @@ -518,7 +518,7 @@ }, { "cell_type": "markdown", - "id": "cH3YkvuSZfP0", + "id": "32", "metadata": { "id": "cH3YkvuSZfP0" }, @@ -528,7 +528,7 @@ }, { "cell_type": "markdown", - "id": "b218c25e", + "id": "33", "metadata": { "id": "b218c25e" }, @@ -539,7 +539,7 @@ { "cell_type": "code", "execution_count": null, - "id": "333c19ed", + "id": "34", "metadata": { "id": "333c19ed" }, @@ -598,7 +598,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7d01b969", + "id": "35", "metadata": { "id": "7d01b969" }, @@ -623,7 +623,7 @@ }, { "cell_type": "markdown", - "id": "50821e2a", + "id": "36", "metadata": { "id": "50821e2a" }, @@ -634,7 +634,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3245907f", + "id": "37", "metadata": { "id": "3245907f" }, @@ -650,7 +650,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cb8e6883", + "id": "38", "metadata": { "id": 
"cb8e6883" }, @@ -662,7 +662,7 @@ }, { "cell_type": "markdown", - "id": "30317504", + "id": "39", "metadata": { "id": "30317504" }, @@ -672,7 +672,7 @@ }, { "cell_type": "markdown", - "id": "TCnwbZAk3hPd", + "id": "40", "metadata": { "id": "TCnwbZAk3hPd" }, @@ -683,7 +683,7 @@ { "cell_type": "code", "execution_count": null, - "id": "24237b0c", + "id": "41", "metadata": { "id": "24237b0c" }, @@ -699,7 +699,7 @@ { "cell_type": "code", "execution_count": null, - "id": "de68f123", + "id": "42", "metadata": { "id": "de68f123" }, @@ -711,7 +711,7 @@ }, { "cell_type": "markdown", - "id": "1b45a835", + "id": "43", "metadata": { "id": "1b45a835" }, @@ -722,7 +722,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4007ff4e", + "id": "44", "metadata": { "id": "4007ff4e" }, @@ -737,7 +737,7 @@ { "cell_type": "code", "execution_count": null, - "id": "801cf22e", + "id": "45", "metadata": { "id": "801cf22e" }, @@ -749,7 +749,7 @@ }, { "cell_type": "markdown", - "id": "f831c007", + "id": "46", "metadata": { "id": "f831c007" }, @@ -760,7 +760,7 @@ { "cell_type": "code", "execution_count": null, - "id": "36d6e07a", + "id": "47", "metadata": { "id": "36d6e07a" }, @@ -775,7 +775,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2cdb226d", + "id": "48", "metadata": { "id": "2cdb226d" }, @@ -787,7 +787,7 @@ }, { "cell_type": "markdown", - "id": "58ccc781", + "id": "49", "metadata": { "id": "58ccc781" }, @@ -798,7 +798,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cedPovUlUcJt", + "id": "50", "metadata": { "id": "cedPovUlUcJt" }, @@ -813,7 +813,7 @@ { "cell_type": "code", "execution_count": null, - "id": "76fd3e93", + "id": "51", "metadata": { "id": "76fd3e93" }, @@ -825,7 +825,7 @@ }, { "cell_type": "markdown", - "id": "911a6099", + "id": "52", "metadata": { "id": "911a6099" }, @@ -836,7 +836,7 @@ { "cell_type": "code", "execution_count": null, - "id": "EEaB50ly5FhE", + "id": "53", "metadata": { "id": "EEaB50ly5FhE" }, @@ -848,7 +848,7 @@ { "cell_type": "code", "execution_count": null, - "id": "32bfd161", + "id": "54", "metadata": { "id": "32bfd161" }, @@ -867,7 +867,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e6bf41b6", + "id": "55", "metadata": { "id": "e6bf41b6" }, @@ -880,7 +880,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0841262d", + "id": "56", "metadata": { "id": "0841262d" }, @@ -893,7 +893,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d0a3a5fd", + "id": "57", "metadata": { "id": "d0a3a5fd" }, @@ -906,7 +906,7 @@ { "cell_type": "code", "execution_count": null, - "id": "87e10995", + "id": "58", "metadata": { "id": "87e10995" }, @@ -918,7 +918,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dd2fd52a", + "id": "59", "metadata": { "id": "dd2fd52a" }, @@ -930,7 +930,7 @@ }, { "cell_type": "markdown", - "id": "a14f6004", + "id": "60", "metadata": { "id": "a14f6004" }, @@ -941,7 +941,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3622cc4d", + "id": "61", "metadata": { "id": "3622cc4d" }, @@ -967,7 +967,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9f668209", + "id": "62", "metadata": { "id": "9f668209" }, @@ -979,7 +979,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f6eb57a1", + "id": "63", "metadata": { "id": "f6eb57a1" }, @@ -990,7 +990,7 @@ }, { "cell_type": "markdown", - "id": "8326cd85", + "id": "64", "metadata": { "id": "8326cd85" }, @@ -1001,7 +1001,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e0e7e6d9", + "id": "65", "metadata": { "id": 
"e0e7e6d9" }, @@ -1015,7 +1015,7 @@ }, { "cell_type": "markdown", - "id": "74e607de", + "id": "66", "metadata": { "id": "74e607de" }, @@ -1026,7 +1026,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7be6a610", + "id": "67", "metadata": { "id": "7be6a610" }, @@ -1038,7 +1038,7 @@ }, { "cell_type": "markdown", - "id": "db39aa34", + "id": "68", "metadata": { "id": "db39aa34" }, @@ -1048,7 +1048,7 @@ }, { "cell_type": "markdown", - "id": "7255c0a8", + "id": "69", "metadata": { "id": "7255c0a8" }, @@ -1086,7 +1086,7 @@ { "cell_type": "code", "execution_count": null, - "id": "efc66d4b", + "id": "70", "metadata": { "id": "efc66d4b" }, @@ -1163,7 +1163,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7d65020c", + "id": "71", "metadata": { "id": "7d65020c" }, @@ -1189,7 +1189,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7d894f58", + "id": "72", "metadata": { "id": "7d894f58" }, @@ -1205,7 +1205,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c663299a", + "id": "73", "metadata": { "id": "c663299a" }, @@ -1218,7 +1218,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8617137d", + "id": "74", "metadata": { "id": "8617137d" }, @@ -1233,7 +1233,7 @@ { "cell_type": "code", "execution_count": null, - "id": "365ff0f4", + "id": "75", "metadata": { "id": "365ff0f4" }, @@ -1246,7 +1246,7 @@ { "cell_type": "code", "execution_count": null, - "id": "571112a4", + "id": "76", "metadata": { "id": "571112a4" }, @@ -1261,7 +1261,7 @@ { "cell_type": "code", "execution_count": null, - "id": "04228398", + "id": "77", "metadata": { "id": "04228398" }, @@ -1274,7 +1274,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cba4c9e9", + "id": "78", "metadata": { "id": "cba4c9e9" }, @@ -1289,7 +1289,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b221dba2", + "id": "79", "metadata": { "id": "b221dba2" }, @@ -1301,7 +1301,7 @@ }, { "cell_type": "markdown", - "id": "9c96e854", + "id": "80", "metadata": { "id": "9c96e854" }, @@ -1312,7 +1312,7 @@ { "cell_type": "code", "execution_count": null, - "id": "457b513f", + "id": "81", "metadata": { "id": "457b513f" }, @@ -1332,7 +1332,7 @@ { "cell_type": "code", "execution_count": null, - "id": "621d086a", + "id": "82", "metadata": { "id": "621d086a" }, @@ -1345,7 +1345,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a348199a", + "id": "83", "metadata": { "id": "a348199a" }, @@ -1358,7 +1358,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4b8c3648", + "id": "84", "metadata": { "id": "4b8c3648" }, @@ -1371,7 +1371,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e7326607", + "id": "85", "metadata": { "id": "e7326607" }, @@ -1384,7 +1384,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0cad169d", + "id": "86", "metadata": { "id": "0cad169d" }, @@ -1397,7 +1397,7 @@ { "cell_type": "code", "execution_count": null, - "id": "50a9df7f", + "id": "87", "metadata": { "id": "50a9df7f" }, @@ -1409,7 +1409,7 @@ { "cell_type": "code", "execution_count": null, - "id": "788a551b", + "id": "88", "metadata": { "id": "788a551b" }, @@ -1421,7 +1421,7 @@ }, { "cell_type": "markdown", - "id": "062e99e8", + "id": "89", "metadata": { "id": "062e99e8" }, @@ -1432,7 +1432,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7f36700c", + "id": "90", "metadata": { "id": "7f36700c" }, @@ -1443,7 +1443,7 @@ }, { "cell_type": "markdown", - "id": "a1ecb257", + "id": "91", "metadata": { "id": "a1ecb257" }, @@ -1454,7 +1454,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "e2b38fb1", + "id": "92", "metadata": { "id": "e2b38fb1" }, @@ -1481,7 +1481,7 @@ { "cell_type": "code", "execution_count": null, - "id": "60ace0a2", + "id": "93", "metadata": { "id": "60ace0a2" }, @@ -1493,7 +1493,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f7bb488d", + "id": "94", "metadata": { "id": "f7bb488d" }, @@ -1504,7 +1504,7 @@ }, { "cell_type": "markdown", - "id": "225cb858", + "id": "95", "metadata": { "id": "225cb858" }, @@ -1514,7 +1514,7 @@ }, { "cell_type": "markdown", - "id": "8cda6326", + "id": "96", "metadata": { "id": "8cda6326" }, @@ -1527,7 +1527,7 @@ { "cell_type": "code", "execution_count": null, - "id": "257b9330", + "id": "97", "metadata": { "id": "257b9330" }, @@ -1539,7 +1539,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6108d89d", + "id": "98", "metadata": { "id": "6108d89d" }, @@ -1551,7 +1551,7 @@ { "cell_type": "code", "execution_count": null, - "id": "49587174", + "id": "99", "metadata": { "id": "49587174" }, @@ -1569,7 +1569,7 @@ { "cell_type": "code", "execution_count": null, - "id": "54524cc0", + "id": "100", "metadata": { "id": "54524cc0" }, @@ -1581,7 +1581,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fa3f1497", + "id": "101", "metadata": { "id": "fa3f1497" }, @@ -1592,7 +1592,7 @@ }, { "cell_type": "markdown", - "id": "216ed55d", + "id": "102", "metadata": { "id": "216ed55d" }, @@ -1619,7 +1619,7 @@ { "cell_type": "code", "execution_count": null, - "id": "122ab658", + "id": "103", "metadata": { "id": "122ab658" }, @@ -1639,7 +1639,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7e878a86", + "id": "104", "metadata": { "id": "7e878a86" }, @@ -1651,7 +1651,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dd8e664a", + "id": "105", "metadata": { "id": "dd8e664a" }, @@ -1662,7 +1662,7 @@ }, { "cell_type": "markdown", - "id": "a629b091", + "id": "106", "metadata": { "id": "a629b091" }, @@ -1677,7 +1677,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4f29d8e7", + "id": "107", "metadata": { "id": "4f29d8e7" }, @@ -1689,7 +1689,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ff0bca33", + "id": "108", "metadata": { "id": "ff0bca33" }, @@ -1703,7 +1703,7 @@ { "cell_type": "code", "execution_count": null, - "id": "47f735fb", + "id": "109", "metadata": { "id": "47f735fb" }, @@ -1736,7 +1736,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bdd0ac4d", + "id": "110", "metadata": { "id": "bdd0ac4d" }, From efe6fff005e0d2a593bf05d729df81cc9a10bd81 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 06:57:22 -0700 Subject: [PATCH 188/261] Limit on push to only main branch Now the tests wont run every time you push on your own branch. They will only run on PRs and if for any reason someone managed to sneak a push on the main branch. 
--- .github/workflows/check-and-transform-R-notebooks.yml | 2 ++ .github/workflows/python-notebooks.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.github/workflows/check-and-transform-R-notebooks.yml b/.github/workflows/check-and-transform-R-notebooks.yml index ef6a2f75..5df5aee0 100644 --- a/.github/workflows/check-and-transform-R-notebooks.yml +++ b/.github/workflows/check-and-transform-R-notebooks.yml @@ -2,6 +2,8 @@ name: Check and Transform R Notebooks on: push: + branches: + - main pull_request: branches: - main diff --git a/.github/workflows/python-notebooks.yml b/.github/workflows/python-notebooks.yml index 336de661..c8d7ae8b 100644 --- a/.github/workflows/python-notebooks.yml +++ b/.github/workflows/python-notebooks.yml @@ -2,6 +2,8 @@ name: Run and Test Jupyter Notebooks on: push: + branches: + - main pull_request: branches: - main From a8ba9f8e7c668a5c3591115e0aba009f93c3218e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 20 Jul 2024 13:59:17 +0000 Subject: [PATCH 189/261] Strip outputs from .ipynb files in PM4 --- PM4/python-dml-401k.ipynb | 225 +++++++++--------- ...ion-analysis-of-401-k-example-w-dags.ipynb | 48 ---- 2 files changed, 112 insertions(+), 161 deletions(-) diff --git a/PM4/python-dml-401k.ipynb b/PM4/python-dml-401k.ipynb index 2c4f8f2d..6da0a927 100644 --- a/PM4/python-dml-401k.ipynb +++ b/PM4/python-dml-401k.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "narrative-sailing", + "id": "0", "metadata": { "id": "narrative-sailing" }, @@ -12,7 +12,7 @@ }, { "cell_type": "markdown", - "id": "ready-appearance", + "id": "1", "metadata": { "id": "ready-appearance" }, @@ -27,7 +27,7 @@ }, { "cell_type": "markdown", - "id": "divine-phoenix", + "id": "2", "metadata": { "id": "divine-phoenix" }, @@ -40,7 +40,7 @@ { "cell_type": "code", "execution_count": null, - "id": "lj0WLB1_EwxI", + "id": "3", "metadata": { "id": "lj0WLB1_EwxI" }, @@ -53,7 +53,7 @@ { "cell_type": "code", "execution_count": null, - "id": "75662e18", + "id": "4", "metadata": { "id": "75662e18" }, @@ -83,7 +83,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6e66e77f", + "id": "5", "metadata": { "id": "6e66e77f" }, @@ -96,7 +96,7 @@ { "cell_type": "code", "execution_count": null, - "id": "65a2d086", + "id": "6", "metadata": { "id": "65a2d086" }, @@ -108,7 +108,7 @@ { "cell_type": "code", "execution_count": null, - "id": "00884061", + "id": "7", "metadata": { "id": "00884061" }, @@ -120,10 +120,9 @@ { "cell_type": "code", "execution_count": null, - "id": "1d3dc85c", + "id": "8", "metadata": { - "id": "1d3dc85c", - "scrolled": true + "id": "1d3dc85c" }, "outputs": [], "source": [ @@ -134,7 +133,7 @@ }, { "cell_type": "markdown", - "id": "looking-invention", + "id": "9", "metadata": { "id": "looking-invention" }, @@ -144,7 +143,7 @@ }, { "cell_type": "markdown", - "id": "received-nutrition", + "id": "10", "metadata": { "id": "received-nutrition" }, @@ -155,7 +154,7 @@ { "cell_type": "code", "execution_count": null, - "id": "MaLLqrNdcK09", + "id": "11", "metadata": { "id": "MaLLqrNdcK09" }, @@ -167,7 +166,7 @@ }, { "cell_type": "markdown", - "id": "material-sending", + "id": "12", "metadata": { "id": "material-sending" }, @@ -178,7 +177,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a7741bb6", + "id": "13", "metadata": { "id": "a7741bb6" }, @@ -190,7 +189,7 @@ }, { "cell_type": "markdown", - "id": "awful-antigua", + "id": "14", "metadata": { "id": "awful-antigua" }, @@ -201,7 +200,7 @@ { "cell_type": "code", "execution_count": null, - 
"id": "ef070d79", + "id": "15", "metadata": { "id": "ef070d79" }, @@ -214,7 +213,7 @@ }, { "cell_type": "markdown", - "id": "cross-priority", + "id": "16", "metadata": { "id": "cross-priority" }, @@ -225,7 +224,7 @@ { "cell_type": "code", "execution_count": null, - "id": "33cd014e", + "id": "17", "metadata": { "id": "33cd014e" }, @@ -238,7 +237,7 @@ }, { "cell_type": "markdown", - "id": "suitable-vulnerability", + "id": "18", "metadata": { "id": "suitable-vulnerability" }, @@ -249,7 +248,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a9354414", + "id": "19", "metadata": { "id": "a9354414" }, @@ -270,7 +269,7 @@ }, { "cell_type": "markdown", - "id": "c4c3e489", + "id": "20", "metadata": { "id": "c4c3e489" }, @@ -281,7 +280,7 @@ { "cell_type": "code", "execution_count": null, - "id": "53b1283d", + "id": "21", "metadata": { "id": "53b1283d" }, @@ -306,7 +305,7 @@ { "cell_type": "code", "execution_count": null, - "id": "824ee320", + "id": "22", "metadata": { "id": "824ee320" }, @@ -320,7 +319,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b35bbd34", + "id": "23", "metadata": { "id": "b35bbd34" }, @@ -332,7 +331,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8c5e20e7", + "id": "24", "metadata": { "id": "8c5e20e7" }, @@ -345,7 +344,7 @@ }, { "cell_type": "markdown", - "id": "a745a6f2", + "id": "25", "metadata": { "id": "a745a6f2" }, @@ -355,7 +354,7 @@ }, { "cell_type": "markdown", - "id": "10538ad3", + "id": "26", "metadata": { "id": "10538ad3" }, @@ -365,7 +364,7 @@ }, { "cell_type": "markdown", - "id": "6fcf3165", + "id": "27", "metadata": { "id": "6fcf3165" }, @@ -378,7 +377,7 @@ }, { "cell_type": "markdown", - "id": "d198da92", + "id": "28", "metadata": { "id": "d198da92" }, @@ -388,7 +387,7 @@ }, { "cell_type": "markdown", - "id": "045487a5", + "id": "29", "metadata": { "id": "045487a5" }, @@ -399,7 +398,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1ee13ea8", + "id": "30", "metadata": { "id": "1ee13ea8" }, @@ -453,7 +452,7 @@ { "cell_type": "code", "execution_count": null, - "id": "35c70f74", + "id": "31", "metadata": { "id": "35c70f74" }, @@ -476,7 +475,7 @@ }, { "cell_type": "markdown", - "id": "b0e6276a", + "id": "32", "metadata": { "id": "b0e6276a" }, @@ -487,7 +486,7 @@ { "cell_type": "code", "execution_count": null, - "id": "21f57b4e", + "id": "33", "metadata": { "id": "21f57b4e" }, @@ -502,7 +501,7 @@ { "cell_type": "code", "execution_count": null, - "id": "265fb305", + "id": "34", "metadata": { "id": "265fb305" }, @@ -514,7 +513,7 @@ }, { "cell_type": "markdown", - "id": "30cb77e8", + "id": "35", "metadata": { "id": "30cb77e8" }, @@ -527,7 +526,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ba3ba66e", + "id": "36", "metadata": { "id": "ba3ba66e" }, @@ -542,7 +541,7 @@ { "cell_type": "code", "execution_count": null, - "id": "WFODGyZX4io9", + "id": "37", "metadata": { "id": "WFODGyZX4io9" }, @@ -554,7 +553,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cbbaa344", + "id": "38", "metadata": { "id": "cbbaa344" }, @@ -566,7 +565,7 @@ }, { "cell_type": "markdown", - "id": "fluid-gregory", + "id": "39", "metadata": { "id": "fluid-gregory" }, @@ -576,7 +575,7 @@ }, { "cell_type": "markdown", - "id": "c4791084", + "id": "40", "metadata": { "id": "c4791084" }, @@ -587,7 +586,7 @@ { "cell_type": "code", "execution_count": null, - "id": "de0d4030", + "id": "41", "metadata": { "id": "de0d4030" }, @@ -601,7 +600,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e5073e3a", + "id": "42", 
"metadata": { "id": "e5073e3a" }, @@ -613,7 +612,7 @@ }, { "cell_type": "markdown", - "id": "1adab609", + "id": "43", "metadata": { "id": "1adab609" }, @@ -624,7 +623,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e3a3266c", + "id": "44", "metadata": { "id": "e3a3266c" }, @@ -638,7 +637,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e82ddc9f", + "id": "45", "metadata": { "id": "e82ddc9f" }, @@ -650,7 +649,7 @@ }, { "cell_type": "markdown", - "id": "415d5b98", + "id": "46", "metadata": { "id": "415d5b98" }, @@ -661,7 +660,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9016b1a8", + "id": "47", "metadata": { "id": "9016b1a8" }, @@ -675,7 +674,7 @@ { "cell_type": "code", "execution_count": null, - "id": "10f4e992", + "id": "48", "metadata": { "id": "10f4e992" }, @@ -687,7 +686,7 @@ }, { "cell_type": "markdown", - "id": "identical-smith", + "id": "49", "metadata": { "id": "identical-smith" }, @@ -697,7 +696,7 @@ }, { "cell_type": "markdown", - "id": "9c8dc0c4", + "id": "50", "metadata": { "id": "9c8dc0c4" }, @@ -708,7 +707,7 @@ { "cell_type": "code", "execution_count": null, - "id": "r_kFRdEyFjQc", + "id": "51", "metadata": { "id": "r_kFRdEyFjQc" }, @@ -720,7 +719,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2d0d2b4e", + "id": "52", "metadata": { "id": "2d0d2b4e" }, @@ -737,7 +736,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6e2ba772", + "id": "53", "metadata": { "id": "6e2ba772" }, @@ -749,7 +748,7 @@ }, { "cell_type": "markdown", - "id": "9bc171ce", + "id": "54", "metadata": { "id": "9bc171ce" }, @@ -759,7 +758,7 @@ }, { "cell_type": "markdown", - "id": "fe9d4e9e", + "id": "55", "metadata": { "id": "fe9d4e9e" }, @@ -770,7 +769,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b6f64eba", + "id": "56", "metadata": { "id": "b6f64eba" }, @@ -785,7 +784,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2cffa502", + "id": "57", "metadata": { "id": "2cffa502" }, @@ -798,7 +797,7 @@ { "cell_type": "code", "execution_count": null, - "id": "21c2a08a", + "id": "58", "metadata": { "id": "21c2a08a" }, @@ -811,7 +810,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6b5a1b0b", + "id": "59", "metadata": { "id": "6b5a1b0b" }, @@ -823,7 +822,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9855e83e", + "id": "60", "metadata": { "id": "9855e83e" }, @@ -835,7 +834,7 @@ }, { "cell_type": "markdown", - "id": "b97cb96c", + "id": "61", "metadata": { "id": "b97cb96c" }, @@ -846,7 +845,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f5214c9d", + "id": "62", "metadata": { "id": "f5214c9d" }, @@ -903,7 +902,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6a671f5e", + "id": "63", "metadata": { "id": "6a671f5e" }, @@ -916,7 +915,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f6bfa361", + "id": "64", "metadata": { "id": "f6bfa361" }, @@ -928,7 +927,7 @@ }, { "cell_type": "markdown", - "id": "comprehensive-graphics", + "id": "65", "metadata": { "id": "comprehensive-graphics" }, @@ -938,7 +937,7 @@ }, { "cell_type": "markdown", - "id": "square-craps", + "id": "66", "metadata": { "id": "square-craps" }, @@ -948,7 +947,7 @@ }, { "cell_type": "markdown", - "id": "large-welcome", + "id": "67", "metadata": { "id": "large-welcome" }, @@ -961,7 +960,7 @@ }, { "cell_type": "markdown", - "id": "perfect-reliance", + "id": "68", "metadata": { "id": "perfect-reliance" }, @@ -973,7 +972,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5a0b5d47", + "id": 
"69", "metadata": { "id": "5a0b5d47" }, @@ -1029,7 +1028,7 @@ }, { "cell_type": "markdown", - "id": "MRgN7S5TXlfF", + "id": "70", "metadata": { "id": "MRgN7S5TXlfF" }, @@ -1040,7 +1039,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ZpHWxbk7L7Sl", + "id": "71", "metadata": { "id": "ZpHWxbk7L7Sl" }, @@ -1064,7 +1063,7 @@ }, { "cell_type": "markdown", - "id": "2szrjAObipxP", + "id": "72", "metadata": { "id": "2szrjAObipxP" }, @@ -1075,7 +1074,7 @@ { "cell_type": "code", "execution_count": null, - "id": "zQnSP-hDlVPw", + "id": "73", "metadata": { "id": "zQnSP-hDlVPw" }, @@ -1096,7 +1095,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1b0fe147", + "id": "74", "metadata": { "id": "1b0fe147" }, @@ -1110,7 +1109,7 @@ { "cell_type": "code", "execution_count": null, - "id": "259efb1c", + "id": "75", "metadata": { "id": "259efb1c" }, @@ -1123,7 +1122,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5335633c", + "id": "76", "metadata": { "id": "5335633c" }, @@ -1137,7 +1136,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3c5d5a12", + "id": "77", "metadata": { "id": "3c5d5a12" }, @@ -1150,7 +1149,7 @@ { "cell_type": "code", "execution_count": null, - "id": "69f9338c", + "id": "78", "metadata": { "id": "69f9338c" }, @@ -1164,7 +1163,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8d275ed3", + "id": "79", "metadata": { "id": "8d275ed3" }, @@ -1176,7 +1175,7 @@ }, { "cell_type": "markdown", - "id": "9a6e0994", + "id": "80", "metadata": { "id": "9a6e0994" }, @@ -1186,7 +1185,7 @@ }, { "cell_type": "markdown", - "id": "a8eadf06", + "id": "81", "metadata": { "id": "a8eadf06" }, @@ -1197,7 +1196,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fee26e18", + "id": "82", "metadata": { "id": "fee26e18" }, @@ -1215,7 +1214,7 @@ { "cell_type": "code", "execution_count": null, - "id": "22518880", + "id": "83", "metadata": { "id": "22518880" }, @@ -1228,7 +1227,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dd04a3cd", + "id": "84", "metadata": { "id": "dd04a3cd" }, @@ -1241,7 +1240,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5088756e", + "id": "85", "metadata": { "id": "5088756e" }, @@ -1254,7 +1253,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9a415bd9", + "id": "86", "metadata": { "id": "9a415bd9" }, @@ -1266,7 +1265,7 @@ { "cell_type": "code", "execution_count": null, - "id": "30800bbc", + "id": "87", "metadata": { "id": "30800bbc" }, @@ -1279,7 +1278,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9325941a", + "id": "88", "metadata": { "id": "9325941a" }, @@ -1352,7 +1351,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5401f09d", + "id": "89", "metadata": { "id": "5401f09d" }, @@ -1364,7 +1363,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1606f1bb", + "id": "90", "metadata": { "id": "1606f1bb" }, @@ -1376,7 +1375,7 @@ }, { "cell_type": "markdown", - "id": "743cde38", + "id": "91", "metadata": { "id": "743cde38" }, @@ -1387,7 +1386,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0ce30440", + "id": "92", "metadata": { "id": "0ce30440" }, @@ -1399,7 +1398,7 @@ { "cell_type": "code", "execution_count": null, - "id": "93fcee1c", + "id": "93", "metadata": { "id": "93fcee1c" }, @@ -1410,7 +1409,7 @@ }, { "cell_type": "markdown", - "id": "af5867dc", + "id": "94", "metadata": { "id": "af5867dc" }, @@ -1421,7 +1420,7 @@ }, { "cell_type": "markdown", - "id": "MCC8KTmCoLOZ", + "id": "95", "metadata": { "id": "MCC8KTmCoLOZ" }, @@ -1434,7 
+1433,7 @@ { "cell_type": "code", "execution_count": null, - "id": "401bc6b6", + "id": "96", "metadata": { "id": "401bc6b6" }, @@ -1446,7 +1445,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ebd88f6b", + "id": "97", "metadata": { "id": "ebd88f6b" }, @@ -1459,7 +1458,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fe3ba9f4", + "id": "98", "metadata": { "id": "fe3ba9f4" }, @@ -1474,7 +1473,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a52d86c5", + "id": "99", "metadata": { "id": "a52d86c5" }, @@ -1486,7 +1485,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4d6c5b24", + "id": "100", "metadata": { "id": "4d6c5b24" }, @@ -1501,7 +1500,7 @@ { "cell_type": "code", "execution_count": null, - "id": "473f350b", + "id": "101", "metadata": { "id": "473f350b" }, @@ -1519,7 +1518,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ea53e99a", + "id": "102", "metadata": { "id": "ea53e99a" }, @@ -1531,7 +1530,7 @@ { "cell_type": "code", "execution_count": null, - "id": "519426af", + "id": "103", "metadata": { "id": "519426af" }, @@ -1552,7 +1551,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e3794606", + "id": "104", "metadata": { "id": "e3794606" }, @@ -1563,7 +1562,7 @@ }, { "cell_type": "markdown", - "id": "3ca94a11", + "id": "105", "metadata": { "id": "3ca94a11" }, @@ -1573,7 +1572,7 @@ }, { "cell_type": "markdown", - "id": "qj9BrMhWpk70", + "id": "106", "metadata": { "id": "qj9BrMhWpk70" }, @@ -1584,7 +1583,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ea90e4e1", + "id": "107", "metadata": { "id": "ea90e4e1" }, @@ -1596,7 +1595,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9930faf2", + "id": "108", "metadata": { "id": "9930faf2" }, @@ -1610,7 +1609,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c146dd15", + "id": "109", "metadata": { "id": "c146dd15" }, @@ -1625,7 +1624,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9e7b7b7c", + "id": "110", "metadata": { "id": "9e7b7b7c" }, diff --git a/PM4/python-identification-analysis-of-401-k-example-w-dags.ipynb b/PM4/python-identification-analysis-of-401-k-example-w-dags.ipynb index d16d471f..ef0a5e9f 100644 --- a/PM4/python-identification-analysis-of-401-k-example-w-dags.ipynb +++ b/PM4/python-identification-analysis-of-401-k-example-w-dags.ipynb @@ -23,12 +23,6 @@ "metadata": { "_execution_state": "idle", "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a", - "execution": { - "iopub.execute_input": "2021-04-20T21:06:30.088435Z", - "iopub.status.busy": "2021-04-20T21:06:30.086566Z", - "iopub.status.idle": "2021-04-20T21:06:55.456113Z", - "shell.execute_reply": "2021-04-20T21:06:55.454477Z" - }, "id": "t1xb29BvPIE4", "papermill": { "duration": 25.408317, @@ -153,12 +147,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "execution": { - "iopub.execute_input": "2021-04-20T21:06:55.576427Z", - "iopub.status.busy": "2021-04-20T21:06:55.547730Z", - "iopub.status.idle": "2021-04-20T21:06:56.492498Z", - "shell.execute_reply": "2021-04-20T21:06:56.492869Z" - }, "id": "WmA30w14PIE7", "papermill": { "duration": 0.960286, @@ -241,12 +229,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "execution": { - "iopub.execute_input": "2021-04-20T21:06:56.654884Z", - "iopub.status.busy": "2021-04-20T21:06:56.653684Z", - "iopub.status.idle": "2021-04-20T21:06:57.016513Z", - "shell.execute_reply": "2021-04-20T21:06:57.017098Z" - }, "id": "6l83sAd8PIE8", "papermill": { "duration": 0.380125, @@ -284,12 
+266,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "execution": { - "iopub.execute_input": "2021-04-20T21:06:57.060217Z", - "iopub.status.busy": "2021-04-20T21:06:57.058501Z", - "iopub.status.idle": "2021-04-20T21:06:57.094192Z", - "shell.execute_reply": "2021-04-20T21:06:57.092241Z" - }, "id": "qXhEMpdXPIE9", "papermill": { "duration": 0.056051, @@ -348,12 +324,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "execution": { - "iopub.execute_input": "2021-04-20T21:06:57.194754Z", - "iopub.status.busy": "2021-04-20T21:06:57.192851Z", - "iopub.status.idle": "2021-04-20T21:06:57.567286Z", - "shell.execute_reply": "2021-04-20T21:06:57.565727Z" - }, "id": "7O0MbTDCPIE-", "papermill": { "duration": 0.393034, @@ -417,12 +387,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "execution": { - "iopub.execute_input": "2021-04-20T21:06:57.637392Z", - "iopub.status.busy": "2021-04-20T21:06:57.635609Z", - "iopub.status.idle": "2021-04-20T21:06:58.024479Z", - "shell.execute_reply": "2021-04-20T21:06:58.022247Z" - }, "id": "NjqrWLhXPIE-", "papermill": { "duration": 0.40878, @@ -453,12 +417,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "execution": { - "iopub.execute_input": "2021-04-20T21:06:58.065152Z", - "iopub.status.busy": "2021-04-20T21:06:58.063890Z", - "iopub.status.idle": "2021-04-20T21:06:58.089473Z", - "shell.execute_reply": "2021-04-20T21:06:58.088587Z" - }, "id": "BfQZaKqFPIE_", "papermill": { "duration": 0.046811, @@ -589,12 +547,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "execution": { - "iopub.execute_input": "2021-04-20T21:06:58.679151Z", - "iopub.status.busy": "2021-04-20T21:06:58.677648Z", - "iopub.status.idle": "2021-04-20T21:06:59.019377Z", - "shell.execute_reply": "2021-04-20T21:06:59.018555Z" - }, "id": "IGI7piKFPIFA", "papermill": { "duration": 0.365274, From b072d9447e6338e83f2baaba97cb65ee26c22bf1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 20 Jul 2024 14:11:27 +0000 Subject: [PATCH 190/261] Strip outputs from .ipynb files in PM2 --- PM2/python_experiment_non_orthogonal.ipynb | 106 ++++++++++----------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/PM2/python_experiment_non_orthogonal.ipynb b/PM2/python_experiment_non_orthogonal.ipynb index a57e8ee6..7d0948d2 100644 --- a/PM2/python_experiment_non_orthogonal.ipynb +++ b/PM2/python_experiment_non_orthogonal.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "da11e5a8", + "id": "0", "metadata": { "id": "da11e5a8" }, @@ -18,7 +18,7 @@ }, { "cell_type": "markdown", - "id": "f815a26a", + "id": "1", "metadata": { "id": "f815a26a" }, @@ -34,7 +34,7 @@ { "cell_type": "code", "execution_count": null, - "id": "q9tTqKaPflDN", + "id": "2", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -52,7 +52,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d465905f", + "id": "3", "metadata": { "id": "d465905f" }, @@ -66,7 +66,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d5d475d7", + "id": "4", "metadata": { "id": "d5d475d7" }, @@ -95,7 +95,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b8c3a418", + "id": "5", "metadata": { "id": "b8c3a418" }, @@ -114,7 +114,7 @@ { "cell_type": "code", "execution_count": null, - "id": "258d88f2", + "id": "6", "metadata": { "id": "258d88f2" }, @@ -129,7 +129,7 @@ { "cell_type": "code", "execution_count": null, - "id": "69d66c15", + "id": "7", "metadata": { "id": "69d66c15" }, @@ -149,7 +149,7 @@ { 
"cell_type": "code", "execution_count": null, - "id": "db73000b", + "id": "8", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -167,7 +167,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0f4535a9", + "id": "9", "metadata": { "id": "0f4535a9" }, @@ -189,7 +189,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6f718a18", + "id": "10", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -205,7 +205,7 @@ { "cell_type": "code", "execution_count": null, - "id": "84e660b2", + "id": "11", "metadata": { "id": "84e660b2" }, @@ -225,7 +225,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c8b1cfd5", + "id": "12", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -241,7 +241,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3e58c0e9", + "id": "13", "metadata": { "id": "3e58c0e9" }, @@ -264,7 +264,7 @@ { "cell_type": "code", "execution_count": null, - "id": "67f2a63d", + "id": "14", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -280,7 +280,7 @@ { "cell_type": "code", "execution_count": null, - "id": "31e49a00", + "id": "15", "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -299,7 +299,7 @@ { "cell_type": "code", "execution_count": null, - "id": "81766f99", + "id": "16", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -316,7 +316,7 @@ { "cell_type": "code", "execution_count": null, - "id": "174f80f2", + "id": "17", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -333,7 +333,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2789f752", + "id": "18", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -349,7 +349,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a3a5c66c", + "id": "19", "metadata": { "id": "a3a5c66c" }, @@ -370,7 +370,7 @@ { "cell_type": "code", "execution_count": null, - "id": "535ff01b", + "id": "20", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -386,7 +386,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e1a42254", + "id": "21", "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -404,7 +404,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7c1cfa87", + "id": "22", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -420,7 +420,7 @@ { "cell_type": "code", "execution_count": null, - "id": "442740a1", + "id": "23", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -436,7 +436,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cc4d40e2", + "id": "24", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -453,7 +453,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9e0a09f9", + "id": "25", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -487,7 +487,7 @@ { "cell_type": "code", "execution_count": null, - "id": "45cdfbba", + "id": "26", "metadata": { "id": "45cdfbba" }, @@ -501,7 +501,7 @@ { "cell_type": "code", "execution_count": null, - "id": "48544788", + "id": "27", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -517,7 +517,7 @@ { "cell_type": "code", "execution_count": null, - "id": "58334e8e", + "id": "28", "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -535,7 +535,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1fc08ca4", + "id": "29", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -551,7 +551,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2194a541", + 
"id": "30", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -567,7 +567,7 @@ { "cell_type": "code", "execution_count": null, - "id": "be61450d", + "id": "31", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -583,7 +583,7 @@ { "cell_type": "code", "execution_count": null, - "id": "394859c2", + "id": "32", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -610,7 +610,7 @@ { "cell_type": "code", "execution_count": null, - "id": "893d95ba", + "id": "33", "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -628,7 +628,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f1711931", + "id": "34", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -646,7 +646,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3fb42054", + "id": "35", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -680,7 +680,7 @@ { "cell_type": "code", "execution_count": null, - "id": "25303e2a", + "id": "36", "metadata": { "id": "25303e2a" }, @@ -692,7 +692,7 @@ { "cell_type": "code", "execution_count": null, - "id": "68b51670", + "id": "37", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -708,7 +708,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f538442b", + "id": "38", "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -726,7 +726,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3c71cd7d", + "id": "39", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -742,7 +742,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b74cf552", + "id": "40", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -761,7 +761,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b0eeedd3", + "id": "41", "metadata": { "id": "b0eeedd3" }, @@ -778,7 +778,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dd843f2e", + "id": "42", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -807,7 +807,7 @@ { "cell_type": "code", "execution_count": null, - "id": "26f93c1e", + "id": "43", "metadata": { "id": "26f93c1e" }, @@ -819,7 +819,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b30e1d54", + "id": "44", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -835,7 +835,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ea9e640f", + "id": "45", "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -853,7 +853,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a725e335", + "id": "46", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -869,7 +869,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5d52c197", + "id": "47", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -898,7 +898,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3c289eb3", + "id": "48", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -914,7 +914,7 @@ { "cell_type": "code", "execution_count": null, - "id": "add24767", + "id": "49", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -930,7 +930,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8a89f4c7", + "id": "50", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -946,7 +946,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d33463a5", + "id": "51", "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -965,7 +965,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e3ae78ae", + "id": 
"52", "metadata": { "id": "e3ae78ae" }, From f915914218954fe3a8f1cb7fd9a5a929eefd6eca Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 07:20:46 -0700 Subject: [PATCH 191/261] Added check for changes to .Rmd files. --- .github/workflows/check-and-transform-R-notebooks.yml | 8 +++----- PM3/r_functional_approximation_by_nn_and_rf.Rmd | 1 + README.md | 4 ++++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/check-and-transform-R-notebooks.yml b/.github/workflows/check-and-transform-R-notebooks.yml index 5df5aee0..83d608b4 100644 --- a/.github/workflows/check-and-transform-R-notebooks.yml +++ b/.github/workflows/check-and-transform-R-notebooks.yml @@ -2,8 +2,6 @@ name: Check and Transform R Notebooks on: push: - branches: - - main pull_request: branches: - main @@ -32,21 +30,21 @@ jobs: run: | git fetch origin ${{ github.event.pull_request.base.ref }} ${{ github.event.pull_request.head.ref }} git diff --name-only origin/${{ github.event.pull_request.base.ref }}...origin/${{ github.event.pull_request.head.ref }} > changed_files.txt - grep -E '\.irnb$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt + grep -E '\.irnb$|\.Rmd$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - name: Find changed notebooks in Push if: github.event_name == 'push' id: find_notebooks_push run: | git diff --name-only ${{ github.event.before }} ${{ github.event.after }} > changed_files.txt - grep -E '\.irnb$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt + grep -E '\.irnb$|\.Rmd$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - name: Check if any notebooks changed in PR or Push if: (github.event_name == 'push') || (github.event_name == 'pull_request') id: check_notebooks run: | cat changed_notebooks.txt - if grep -q -E '^${{ matrix.directory }}/.*\.irnb$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_notebooks.txt; then + if grep -q -E '^${{ matrix.directory }}/.*\.irnb$|^${{ matrix.directory }}/.*\.Rmd$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_notebooks.txt; then echo "notebooks_changed=true" >> $GITHUB_ENV else echo "notebooks_changed=false" >> $GITHUB_ENV diff --git a/PM3/r_functional_approximation_by_nn_and_rf.Rmd b/PM3/r_functional_approximation_by_nn_and_rf.Rmd index aca4a796..2a55f428 100644 --- a/PM3/r_functional_approximation_by_nn_and_rf.Rmd +++ b/PM3/r_functional_approximation_by_nn_and_rf.Rmd @@ -16,6 +16,7 @@ install.packages("randomForest") install.packages("rpart") install.packages("gbm") install.packages("keras") + ``` ```{r} diff --git a/README.md b/README.md index 24ea6471..ef93d9be 100644 --- a/README.md +++ b/README.md @@ -6,3 +6,7 @@ If you are facing difficulties running a notebook on your environment, try insta pip install -r requirements.txt ``` + +# For Contributors + +The .Rmd files are auto-generated by a Github Action, whenever one pushes a .irnb (R Jupyter notebook) to one of the main folders of the repo. So .Rmd files, should never be altered directly. Only changes to .irnb files should be made. 
Any change to a .Rmd file will be over-written by the corresponding .irnb file and will not survive the Github Action. From 46fb4e2352e9fdb7d8c6585552cefd918f6fbcbc Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 07:30:34 -0700 Subject: [PATCH 192/261] Update python-notebooks.yml --- .github/workflows/python-notebooks.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-notebooks.yml b/.github/workflows/python-notebooks.yml index c8d7ae8b..77c670ec 100644 --- a/.github/workflows/python-notebooks.yml +++ b/.github/workflows/python-notebooks.yml @@ -34,8 +34,9 @@ jobs: if: github.event_name == 'pull_request' id: find_notebooks_pr run: | - git fetch origin ${{ github.event.pull_request.base.ref }} ${{ github.event.pull_request.head.ref }} - git diff --name-only origin/${{ github.event.pull_request.base.ref }}...origin/${{ github.event.pull_request.head.ref }} > changed_files.txt + # git fetch origin ${{ github.event.pull_request.base.ref }} ${{ github.event.pull_request.head.ref }} + # git diff --name-only origin/${{ github.event.pull_request.base.ref }}...origin/${{ github.event.pull_request.head.ref }} > changed_files.txt + git diff --name-only -r HEAD^1 HEAD > changed_files.txt grep -E '\.ipynb$|\.github/workflows/python-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - name: Find changed notebooks in Push From 666d0633bfb9a7025386c63aa19c72876f3f9b69 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 07:32:57 -0700 Subject: [PATCH 193/261] Update check-and-transform-R-notebooks.yml --- .github/workflows/check-and-transform-R-notebooks.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/check-and-transform-R-notebooks.yml b/.github/workflows/check-and-transform-R-notebooks.yml index 83d608b4..47b39071 100644 --- a/.github/workflows/check-and-transform-R-notebooks.yml +++ b/.github/workflows/check-and-transform-R-notebooks.yml @@ -28,8 +28,9 @@ jobs: if: github.event_name == 'pull_request' id: find_notebooks_pr run: | - git fetch origin ${{ github.event.pull_request.base.ref }} ${{ github.event.pull_request.head.ref }} - git diff --name-only origin/${{ github.event.pull_request.base.ref }}...origin/${{ github.event.pull_request.head.ref }} > changed_files.txt + # git fetch origin ${{ github.event.pull_request.base.ref }} ${{ github.event.pull_request.head.ref }} + # git diff --name-only origin/${{ github.event.pull_request.base.ref }}...origin/${{ github.event.pull_request.head.ref }} > changed_files.txt + git diff --name-only -r HEAD^1 HEAD > changed_files.txt grep -E '\.irnb$|\.Rmd$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - name: Find changed notebooks in Push From 3dd8654ac9a7761a9719af0e63f018ec117eadb5 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 07:37:53 -0700 Subject: [PATCH 194/261] Update check-and-transform-R-notebooks.yml --- .github/workflows/check-and-transform-R-notebooks.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/check-and-transform-R-notebooks.yml b/.github/workflows/check-and-transform-R-notebooks.yml index 47b39071..7552b629 100644 --- a/.github/workflows/check-and-transform-R-notebooks.yml +++ b/.github/workflows/check-and-transform-R-notebooks.yml @@ -2,6 +2,8 @@ name: Check and Transform R Notebooks on: push: + branches: + - 
main pull_request: branches: - main From afa4dcffab255e21da6947025f9a6cc980ff0a01 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 07:46:46 -0700 Subject: [PATCH 195/261] checkout branch when in PR --- .github/workflows/check-and-transform-R-notebooks.yml | 5 +++++ .github/workflows/python-notebooks.yml | 7 ++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/check-and-transform-R-notebooks.yml b/.github/workflows/check-and-transform-R-notebooks.yml index 7552b629..b82b14d3 100644 --- a/.github/workflows/check-and-transform-R-notebooks.yml +++ b/.github/workflows/check-and-transform-R-notebooks.yml @@ -203,6 +203,11 @@ jobs: rm -rf r_scripts rm ${{ matrix.directory }}_r_scripts.zip + - name: Check out the branch for pull request + if: "(github.event_name == 'pull_request') && (env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + run: | + git checkout ${{ github.event.pull_request.head.ref }} + - name: Check if there are any changes if: env.notebooks_changed == 'true' id: verify_diff diff --git a/.github/workflows/python-notebooks.yml b/.github/workflows/python-notebooks.yml index 77c670ec..e5d22290 100644 --- a/.github/workflows/python-notebooks.yml +++ b/.github/workflows/python-notebooks.yml @@ -177,12 +177,17 @@ jobs: - name: Check for errors if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')) && (env.flake8_errors != '0' || env.script_errors != '0'))" run: exit 1 + + - name: Check out the branch for pull request + if: "(github.event_name == 'pull_request') && (env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + run: | + git checkout ${{ github.event.pull_request.head.ref }} - name: Check if there are any changes (e.g. stripped outputs) if: "(env.notebooks_changed == 'true') && (! 
(matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" id: verify_diff run: | - git pull + git pull git diff --quiet ${{ matrix.directory }}/*.ipynb || echo "changed=true" >> $GITHUB_OUTPUT - name: Commit and push stripped .ipynb files From 1bbe9e0dd577e9057fe6867043929a846a3a690e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 20 Jul 2024 14:49:27 +0000 Subject: [PATCH 196/261] Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in PM3 --- PM3/r_functional_approximation_by_nn_and_rf.Rmd | 1 - 1 file changed, 1 deletion(-) diff --git a/PM3/r_functional_approximation_by_nn_and_rf.Rmd b/PM3/r_functional_approximation_by_nn_and_rf.Rmd index 2a55f428..aca4a796 100644 --- a/PM3/r_functional_approximation_by_nn_and_rf.Rmd +++ b/PM3/r_functional_approximation_by_nn_and_rf.Rmd @@ -16,7 +16,6 @@ install.packages("randomForest") install.packages("rpart") install.packages("gbm") install.packages("keras") - ``` ```{r} From 9fa38c0c7be57d35e7f51a562cc5223b7798d13c Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 07:54:35 -0700 Subject: [PATCH 197/261] added fetch all --- .github/workflows/check-and-transform-R-notebooks.yml | 3 ++- .github/workflows/python-notebooks.yml | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/check-and-transform-R-notebooks.yml b/.github/workflows/check-and-transform-R-notebooks.yml index b82b14d3..801f0c3c 100644 --- a/.github/workflows/check-and-transform-R-notebooks.yml +++ b/.github/workflows/check-and-transform-R-notebooks.yml @@ -204,8 +204,9 @@ jobs: rm ${{ matrix.directory }}_r_scripts.zip - name: Check out the branch for pull request - if: "(github.event_name == 'pull_request') && (env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + if: "(github.event_name == 'pull_request') && (env.notebooks_changed == 'true')" run: | + git fetch --all git checkout ${{ github.event.pull_request.head.ref }} - name: Check if there are any changes diff --git a/.github/workflows/python-notebooks.yml b/.github/workflows/python-notebooks.yml index e5d22290..d5665acd 100644 --- a/.github/workflows/python-notebooks.yml +++ b/.github/workflows/python-notebooks.yml @@ -181,6 +181,7 @@ jobs: - name: Check out the branch for pull request if: "(github.event_name == 'pull_request') && (env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" run: | + git fetch --all git checkout ${{ github.event.pull_request.head.ref }} - name: Check if there are any changes (e.g. 
stripped outputs) From 433f75c46f95164ecd41beb99cafb4ef275bdb21 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 08:11:26 -0700 Subject: [PATCH 198/261] Update python-linear-model-overfitting.ipynb --- PM1/python-linear-model-overfitting.ipynb | 33 +++++++++++++++++++---- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/PM1/python-linear-model-overfitting.ipynb b/PM1/python-linear-model-overfitting.ipynb index 9762d5d3..25b05c01 100644 --- a/PM1/python-linear-model-overfitting.ipynb +++ b/PM1/python-linear-model-overfitting.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "id": "YTaOmenI7TfT" }, @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "id": "C57WYtYM7OL0", "papermill": { @@ -56,12 +56,35 @@ }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "p/n is: 1.0\n", + "R^2 is 1.0\n", + "Adjusted R^2 is nan\n", + "p/n is: 0.5\n", + "R^2 is 0.5232392347513539\n", + "Adjusted R^2 is 0.04647846950270784\n", + "p/n is: 0.05\n", + "R^2 is 0.05610356497156044\n", + "Adjusted R^2 is 0.006424805233221531\n" + ] + } + ], "source": [ "regression_stats(1000, 1000)\n", "regression_stats(1000, 500)\n", "regression_stats(1000, 50)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -83,7 +106,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.12.4" }, "papermill": { "default_parameters": {}, @@ -99,5 +122,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } From ae0e80243f7139db400a5fd9d24de25398a9463c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 20 Jul 2024 15:16:01 +0000 Subject: [PATCH 199/261] Strip outputs from .ipynb files in PM1 --- PM1/python-linear-model-overfitting.ipynb | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/PM1/python-linear-model-overfitting.ipynb b/PM1/python-linear-model-overfitting.ipynb index 25b05c01..424560c0 100644 --- a/PM1/python-linear-model-overfitting.ipynb +++ b/PM1/python-linear-model-overfitting.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "id": "YTaOmenI7TfT" }, @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "id": "C57WYtYM7OL0", "papermill": { @@ -56,23 +56,7 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "p/n is: 1.0\n", - "R^2 is 1.0\n", - "Adjusted R^2 is nan\n", - "p/n is: 0.5\n", - "R^2 is 0.5232392347513539\n", - "Adjusted R^2 is 0.04647846950270784\n", - "p/n is: 0.05\n", - "R^2 is 0.05610356497156044\n", - "Adjusted R^2 is 0.006424805233221531\n" - ] - } - ], + "outputs": [], "source": [ "regression_stats(1000, 1000)\n", "regression_stats(1000, 500)\n", From bccceb4b30d437380989230a046f96d4fb14052b Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 08:27:01 -0700 Subject: [PATCH 200/261] Testing PR continuous integration --- .../check-and-transform-R-notebooks.yml | 4 ++-- .github/workflows/python-notebooks.yml | 4 ++-- PM1/python-linear-model-overfitting.ipynb | 22 ++++++++++++++++--- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/.github/workflows/check-and-transform-R-notebooks.yml 
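The git fetch --all added in patch 197 is there because the default checkout in the runner only knows about the ref that triggered the job, so the PR's source branch has to be fetched before it can be checked out. A hedged local approximation of the sequence (feature-branch is a placeholder name):

# make the remote branches visible, then switch to the pull request's source branch
git fetch --all
git checkout feature-branch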
b/.github/workflows/check-and-transform-R-notebooks.yml index 801f0c3c..dc9b199d 100644 --- a/.github/workflows/check-and-transform-R-notebooks.yml +++ b/.github/workflows/check-and-transform-R-notebooks.yml @@ -223,7 +223,7 @@ jobs: git config --global user.email 'github-actions[bot]@users.noreply.github.com' git pull git add ${{ matrix.directory }}/*.irnb ${{ matrix.directory }}/*.Rmd - git commit -m 'Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in ${{ matrix.directory }}' - git push + git commit -m 'Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in ${{ matrix.directory }} [skip ci]' + git push --force-with-lease env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/python-notebooks.yml b/.github/workflows/python-notebooks.yml index d5665acd..c39f90f1 100644 --- a/.github/workflows/python-notebooks.yml +++ b/.github/workflows/python-notebooks.yml @@ -198,7 +198,7 @@ jobs: git config --global user.email 'github-actions[bot]@users.noreply.github.com' git pull git add ${{ matrix.directory }}/*.ipynb - git commit -m 'Strip outputs from .ipynb files in ${{ matrix.directory }}' - git push + git commit -m 'Strip outputs from .ipynb files in ${{ matrix.directory }} [skip ci]' + git push --force-with-lease env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/PM1/python-linear-model-overfitting.ipynb b/PM1/python-linear-model-overfitting.ipynb index 424560c0..c57c8825 100644 --- a/PM1/python-linear-model-overfitting.ipynb +++ b/PM1/python-linear-model-overfitting.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "id": "YTaOmenI7TfT" }, @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "id": "C57WYtYM7OL0", "papermill": { @@ -56,7 +56,23 @@ }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "p/n is: 1.0\n", + "R^2 is 1.0\n", + "Adjusted R^2 is nan\n", + "p/n is: 0.5\n", + "R^2 is 0.5232392347513539\n", + "Adjusted R^2 is 0.04647846950270784\n", + "p/n is: 0.05\n", + "R^2 is 0.05610356497156044\n", + "Adjusted R^2 is 0.006424805233221531\n" + ] + } + ], "source": [ "regression_stats(1000, 1000)\n", "regression_stats(1000, 500)\n", From 57742b0f892cbbb92527a712e2674b5050cb34ea Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 20 Jul 2024 15:31:32 +0000 Subject: [PATCH 201/261] Strip outputs from .ipynb files in PM1 [skip ci] --- PM1/python-linear-model-overfitting.ipynb | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/PM1/python-linear-model-overfitting.ipynb b/PM1/python-linear-model-overfitting.ipynb index c57c8825..424560c0 100644 --- a/PM1/python-linear-model-overfitting.ipynb +++ b/PM1/python-linear-model-overfitting.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "id": "YTaOmenI7TfT" }, @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "id": "C57WYtYM7OL0", "papermill": { @@ -56,23 +56,7 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "p/n is: 1.0\n", - "R^2 is 1.0\n", - "Adjusted R^2 is nan\n", - "p/n is: 0.5\n", - "R^2 is 0.5232392347513539\n", - "Adjusted R^2 is 0.04647846950270784\n", - "p/n is: 0.05\n", - "R^2 is 0.05610356497156044\n", - "Adjusted R^2 is 0.006424805233221531\n" 
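Two small conventions appear in this patch: the bot's commit message now carries [skip ci], which tells GitHub Actions not to start another run for that automated push, and the push uses --force-with-lease, which refuses to overwrite the remote branch if someone else pushed in the meantime. A minimal sketch of the same pattern outside the workflow (the commit message is illustrative):

# commit regenerated files without kicking off another CI run, then push without clobbering newer remote work
git add PM1/*.ipynb
git commit -m 'Strip notebook outputs [skip ci]'
git push --force-with-lease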
- ] - } - ], + "outputs": [], "source": [ "regression_stats(1000, 1000)\n", "regression_stats(1000, 500)\n", From fc90b4d7de20104dc318bc832cf32014d0f9251e Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 08:41:43 -0700 Subject: [PATCH 202/261] Update r-colliderbias-hollywood.irnb --- CM2/r-colliderbias-hollywood.irnb | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/CM2/r-colliderbias-hollywood.irnb b/CM2/r-colliderbias-hollywood.irnb index c032372c..738dd6c7 100644 --- a/CM2/r-colliderbias-hollywood.irnb +++ b/CM2/r-colliderbias-hollywood.irnb @@ -29,7 +29,21 @@ "languageId": "r" } }, - "outputs": [], + "outputs": [ + "name": "stdout", + "output_type": "stream", + "text": [ + "p/n is: 1.0\n", + "R^2 is 1.0\n", + "Adjusted R^2 is nan\n", + "p/n is: 0.5\n", + "R^2 is 0.5232392347513539\n", + "Adjusted R^2 is 0.04647846950270784\n", + "p/n is: 0.05\n", + "R^2 is 0.05610356497156044\n", + "Adjusted R^2 is 0.006424805233221531\n" + ] + ], "source": [ "install.packages(\"dagitty\")" ] From bd9fd7af0b83f59f6dccc504f23f7493a41eafe6 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sat, 20 Jul 2024 08:53:33 -0700 Subject: [PATCH 203/261] Update r-colliderbias-hollywood.irnb --- CM2/r-colliderbias-hollywood.irnb | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/CM2/r-colliderbias-hollywood.irnb b/CM2/r-colliderbias-hollywood.irnb index 738dd6c7..c032372c 100644 --- a/CM2/r-colliderbias-hollywood.irnb +++ b/CM2/r-colliderbias-hollywood.irnb @@ -29,21 +29,7 @@ "languageId": "r" } }, - "outputs": [ - "name": "stdout", - "output_type": "stream", - "text": [ - "p/n is: 1.0\n", - "R^2 is 1.0\n", - "Adjusted R^2 is nan\n", - "p/n is: 0.5\n", - "R^2 is 0.5232392347513539\n", - "Adjusted R^2 is 0.04647846950270784\n", - "p/n is: 0.05\n", - "R^2 is 0.05610356497156044\n", - "Adjusted R^2 is 0.006424805233221531\n" - ] - ], + "outputs": [], "source": [ "install.packages(\"dagitty\")" ] From b973e4b1cc715e2d81291a6a37d64f4422402433 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 21 Jul 2024 23:18:20 -0700 Subject: [PATCH 204/261] Added new workflow Removed pushing stripped notebooks at PR or at schedule event. Now all stripping will happen from another workflow that happens at every push. There was a problem with pushing automated commits during a PR. So now the stripping of any remnant outputs will happen when a PR is merged, which will trigger a push event. --- .github/workflows/python-notebooks.yml | 51 ++++++----- .github/workflows/strip-python-notebooks.yml | 89 ++++++++++++++++++++ 2 files changed, 113 insertions(+), 27 deletions(-) create mode 100644 .github/workflows/strip-python-notebooks.yml diff --git a/.github/workflows/python-notebooks.yml b/.github/workflows/python-notebooks.yml index c39f90f1..23eac65e 100644 --- a/.github/workflows/python-notebooks.yml +++ b/.github/workflows/python-notebooks.yml @@ -1,9 +1,6 @@ name: Run and Test Jupyter Notebooks on: - push: - branches: - - main pull_request: branches: - main @@ -178,27 +175,27 @@ jobs: if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')) && (env.flake8_errors != '0' || env.script_errors != '0'))" run: exit 1 - - name: Check out the branch for pull request - if: "(github.event_name == 'pull_request') && (env.notebooks_changed == 'true') && (! 
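The hand-edited outputs block in patch 202 is not valid notebook JSON (each entry of the outputs array has to be an object, not bare key/value pairs), which is why patch 203 reverts it. A quick sanity check before committing a hand-edited notebook, using only the Python standard library:

# exits non-zero and reports the parse error if the notebook is not well-formed JSON
python -m json.tool CM2/r-colliderbias-hollywood.irnb > /dev/null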
(matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" - run: | - git fetch --all - git checkout ${{ github.event.pull_request.head.ref }} - - - name: Check if there are any changes (e.g. stripped outputs) - if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" - id: verify_diff - run: | - git pull - git diff --quiet ${{ matrix.directory }}/*.ipynb || echo "changed=true" >> $GITHUB_OUTPUT - - - name: Commit and push stripped .ipynb files - if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest'))) && (steps.verify_diff.outputs.changed == 'true')" - run: | - git config --global user.name 'github-actions[bot]' - git config --global user.email 'github-actions[bot]@users.noreply.github.com' - git pull - git add ${{ matrix.directory }}/*.ipynb - git commit -m 'Strip outputs from .ipynb files in ${{ matrix.directory }} [skip ci]' - git push --force-with-lease - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # - name: Check out the branch for pull request + # if: "(github.event_name == 'pull_request') && (env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + # run: | + # git fetch --all + # git checkout ${{ github.event.pull_request.head.ref }} + + # - name: Check if there are any changes (e.g. stripped outputs) + # if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + # id: verify_diff + # run: | + # git pull + # git diff --quiet ${{ matrix.directory }}/*.ipynb || echo "changed=true" >> $GITHUB_OUTPUT + + # - name: Commit and push stripped .ipynb files + # if: "(env.notebooks_changed == 'true') && (! 
(matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest'))) && (steps.verify_diff.outputs.changed == 'true')" + # run: | + # git config --global user.name 'github-actions[bot]' + # git config --global user.email 'github-actions[bot]@users.noreply.github.com' + # git pull + # git add ${{ matrix.directory }}/*.ipynb + # git commit -m 'Strip outputs from .ipynb files in ${{ matrix.directory }} [skip ci]' + # git push --force-with-lease + # env: + # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/strip-python-notebooks.yml b/.github/workflows/strip-python-notebooks.yml new file mode 100644 index 00000000..ad658db8 --- /dev/null +++ b/.github/workflows/strip-python-notebooks.yml @@ -0,0 +1,89 @@ +name: Strip Python Jupyter Notebooks from Outputs + +on: + push: + +concurrency: + group: strip-python-notebooks-${{ github.ref }} + cancel-in-progress: true + +jobs: + strip-python-notebooks: + runs-on: ubuntu-latest + strategy: + matrix: + directory: [PM1, PM2, PM3, PM4, PM5, CM1, CM2, CM3, AC1, AC2, T] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.10' # Use Python 3.10 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install jupyter nbstripout + shell: bash + + - name: Strip outputs from .ipynb files + run: | + dirs=(PM1 PM2 PM3 PM4 PM5 CM1 CM2 CM3 AC1 AC2 T) + for dir in "${dirs[@]}"; do + # Check if directory exists + if [ -d "$dir" ]; then + echo "Processing directory: $dir" + # Loop over all .ipynb files in the directory (non-recursively) + for notebook in "$dir"/*.ipynb; do + # Check if there are any .ipynb files + if [ -e "$notebook" ]; then + echo "Stripping output from notebook: $notebook" + nbstripout "$notebook" + fi + done + else + echo "Directory $dir does not exist." + fi + done + + - name: Check if there are any changes (e.g. stripped outputs) + id: verify_diff + run: | + git pull + for dir in "${dirs[@]}"; do + # Check if directory exists + if [ -d "$dir" ]; then + echo "Processing directory: $dir" + if git diff --quiet "$dir"/*.ipynb; then + echo "changed=true" >> $GITHUB_OUTPUT + break 1 + fi + else + echo "Directory $dir does not exist." + fi + done + + - name: Commit and push stripped .ipynb files + if: steps.verify_diff.outputs.changed == 'true' + run: | + git config --global user.name 'github-actions[bot]' + git config --global user.email 'github-actions[bot]@users.noreply.github.com' + git pull + for dir in "${dirs[@]}"; do + # Check if directory exists + if [ -d "$dir" ]; then + echo "Adding changed files from directory: $dir" + git add "$dir"/*.ipynb + else + echo "Directory $dir does not exist." 
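The stripping step in the new workflow is plain shell around nbstripout; a cut-down local version, assuming nbstripout is installed and the command is run from the repository root, looks like:

# remove stored outputs and execution counts from every Python notebook in one course folder
for notebook in PM1/*.ipynb; do
  nbstripout "$notebook"
done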
+ fi + done + git commit -m 'Strip outputs from .ipynb files [skip ci]' + git push --force-with-lease + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From e68c336b8f85412fb3cf8ac5f8481212eb95687f Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 21 Jul 2024 23:19:44 -0700 Subject: [PATCH 205/261] Update strip-python-notebooks.yml --- .github/workflows/strip-python-notebooks.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/strip-python-notebooks.yml b/.github/workflows/strip-python-notebooks.yml index ad658db8..cc9c63cb 100644 --- a/.github/workflows/strip-python-notebooks.yml +++ b/.github/workflows/strip-python-notebooks.yml @@ -10,9 +10,6 @@ concurrency: jobs: strip-python-notebooks: runs-on: ubuntu-latest - strategy: - matrix: - directory: [PM1, PM2, PM3, PM4, PM5, CM1, CM2, CM3, AC1, AC2, T] steps: - name: Checkout repository From 345979b9f04801354e1b70fde1b6fdfaaa7344a3 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 21 Jul 2024 23:30:34 -0700 Subject: [PATCH 206/261] Testing stripping --- .github/workflows/strip-python-notebooks.yml | 20 +++++++++++++----- PM1/python-linear-model-overfitting.ipynb | 22 +++++++++++++++++--- PM1/r-linear-model-overfitting.irnb | 4 +++- 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/.github/workflows/strip-python-notebooks.yml b/.github/workflows/strip-python-notebooks.yml index cc9c63cb..b50100ef 100644 --- a/.github/workflows/strip-python-notebooks.yml +++ b/.github/workflows/strip-python-notebooks.yml @@ -1,14 +1,14 @@ -name: Strip Python Jupyter Notebooks from Outputs +name: Strip Python and R Notebooks from Outputs on: push: concurrency: - group: strip-python-notebooks-${{ github.ref }} + group: strip-notebooks-${{ github.ref }} cancel-in-progress: true jobs: - strip-python-notebooks: + strip-notebooks: runs-on: ubuntu-latest steps: @@ -43,6 +43,16 @@ jobs: nbstripout "$notebook" fi done + for notebook in "$dir"/*.irnb; do + # Check if there are any .irnb files + if [ -e "$notebook" ]; then + echo "Stripping output from notebook: $notebook" + ipynb_notebook="${notebook%.irnb}.ipynb" + mv "$notebook" "$ipynb_notebook" + nbstripout "$ipynb_notebook" + mv "$ipynb_notebook" "$notebook" + fi + done else echo "Directory $dir does not exist." fi @@ -56,7 +66,7 @@ jobs: # Check if directory exists if [ -d "$dir" ]; then echo "Processing directory: $dir" - if git diff --quiet "$dir"/*.ipynb; then + if git diff --quiet "$dir"/*.ipynb "$dir"/*.irnb; then echo "changed=true" >> $GITHUB_OUTPUT break 1 fi @@ -75,7 +85,7 @@ jobs: # Check if directory exists if [ -d "$dir" ]; then echo "Adding changed files from directory: $dir" - git add "$dir"/*.ipynb + git add "$dir"/*.ipynb "$dir"/*.irnb else echo "Directory $dir does not exist." 
fi diff --git a/PM1/python-linear-model-overfitting.ipynb b/PM1/python-linear-model-overfitting.ipynb index 424560c0..050258b2 100644 --- a/PM1/python-linear-model-overfitting.ipynb +++ b/PM1/python-linear-model-overfitting.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "id": "YTaOmenI7TfT" }, @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "id": "C57WYtYM7OL0", "papermill": { @@ -56,7 +56,23 @@ }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "p/n is: 1.0\n", + "R^2 is 1.0\n", + "Adjusted R^2 is nan\n", + "p/n is: 0.5\n", + "R^2 is 0.5232392347513539\n", + "Adjusted R^2 is 0.04647846950270784\n", + "p/n is: 0.05\n", + "R^2 is 0.05610356497156044\n", + "Adjusted R^2 is 0.006424805233221531\n" + ] + } + ], "source": [ "regression_stats(1000, 1000)\n", "regression_stats(1000, 500)\n", diff --git a/PM1/r-linear-model-overfitting.irnb b/PM1/r-linear-model-overfitting.irnb index 60d7aafc..5385e5f8 100644 --- a/PM1/r-linear-model-overfitting.irnb +++ b/PM1/r-linear-model-overfitting.irnb @@ -65,7 +65,9 @@ "languageId": "r" } }, - "outputs": [], + "outputs": [ + "text": ["asdfas\n"] + ], "source": [ "\n", "set.seed(123)\n", From 7e833a55de1524dd004c24a086e66af3aa87c064 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 21 Jul 2024 23:36:06 -0700 Subject: [PATCH 207/261] Update strip-python-notebooks.yml Using a formal temp file name to avoid messing up python notebooks when moving irnb to ipynb --- .github/workflows/strip-python-notebooks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/strip-python-notebooks.yml b/.github/workflows/strip-python-notebooks.yml index b50100ef..3cff1d97 100644 --- a/.github/workflows/strip-python-notebooks.yml +++ b/.github/workflows/strip-python-notebooks.yml @@ -47,7 +47,7 @@ jobs: # Check if there are any .irnb files if [ -e "$notebook" ]; then echo "Stripping output from notebook: $notebook" - ipynb_notebook="${notebook%.irnb}.ipynb" + ipynb_notebook=$(mktemp --suffix=.ipynb) mv "$notebook" "$ipynb_notebook" nbstripout "$ipynb_notebook" mv "$ipynb_notebook" "$notebook" From d75604af7bc4f8adfbefee393b775af9b101d117 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 21 Jul 2024 23:45:52 -0700 Subject: [PATCH 208/261] Update r-linear-model-overfitting.irnb --- PM1/r-linear-model-overfitting.irnb | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/PM1/r-linear-model-overfitting.irnb b/PM1/r-linear-model-overfitting.irnb index 5385e5f8..b2638c37 100644 --- a/PM1/r-linear-model-overfitting.irnb +++ b/PM1/r-linear-model-overfitting.irnb @@ -65,9 +65,6 @@ "languageId": "r" } }, - "outputs": [ - "text": ["asdfas\n"] - ], "source": [ "\n", "set.seed(123)\n", @@ -83,7 +80,22 @@ "print(summary(lm(y ~ X))$r.squared)\n", "print(\"Adjusted R2 is\")\n", "print(summary(lm(y ~ X))$adj.r.squared)\n" - ] + ], + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[1] \"p/n is\"\n", + "[1] 0.5\n", + "[1] \"R2 is\"\n", + "[1] 0.4922339\n", + "[1] \"Adjusted R2 is\"\n", + "[1] -0.01654975\n" + ] + } + ] }, { "cell_type": "markdown", From 51104f074ebc365b140038cc5b0740084d62df57 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 21 Jul 2024 23:50:32 -0700 Subject: [PATCH 209/261] Update strip-python-notebooks.yml --- .github/workflows/strip-python-notebooks.yml | 3 
+++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/strip-python-notebooks.yml b/.github/workflows/strip-python-notebooks.yml index 3cff1d97..0e6bda45 100644 --- a/.github/workflows/strip-python-notebooks.yml +++ b/.github/workflows/strip-python-notebooks.yml @@ -62,11 +62,13 @@ jobs: id: verify_diff run: | git pull + dirs=(PM1 PM2 PM3 PM4 PM5 CM1 CM2 CM3 AC1 AC2 T) for dir in "${dirs[@]}"; do # Check if directory exists if [ -d "$dir" ]; then echo "Processing directory: $dir" if git diff --quiet "$dir"/*.ipynb "$dir"/*.irnb; then + echo "Found changes in directory: $dir" echo "changed=true" >> $GITHUB_OUTPUT break 1 fi @@ -81,6 +83,7 @@ jobs: git config --global user.name 'github-actions[bot]' git config --global user.email 'github-actions[bot]@users.noreply.github.com' git pull + dirs=(PM1 PM2 PM3 PM4 PM5 CM1 CM2 CM3 AC1 AC2 T) for dir in "${dirs[@]}"; do # Check if directory exists if [ -d "$dir" ]; then From 4b2cdbf707139fc18ce5ba837ffa451dfc068fdb Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 22 Jul 2024 06:51:27 +0000 Subject: [PATCH 210/261] Strip outputs from .ipynb files [skip ci] --- PM1/python-linear-model-overfitting.ipynb | 22 +++------------------- PM1/r-linear-model-overfitting.irnb | 18 ++---------------- 2 files changed, 5 insertions(+), 35 deletions(-) diff --git a/PM1/python-linear-model-overfitting.ipynb b/PM1/python-linear-model-overfitting.ipynb index 050258b2..424560c0 100644 --- a/PM1/python-linear-model-overfitting.ipynb +++ b/PM1/python-linear-model-overfitting.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "id": "YTaOmenI7TfT" }, @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "id": "C57WYtYM7OL0", "papermill": { @@ -56,23 +56,7 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "p/n is: 1.0\n", - "R^2 is 1.0\n", - "Adjusted R^2 is nan\n", - "p/n is: 0.5\n", - "R^2 is 0.5232392347513539\n", - "Adjusted R^2 is 0.04647846950270784\n", - "p/n is: 0.05\n", - "R^2 is 0.05610356497156044\n", - "Adjusted R^2 is 0.006424805233221531\n" - ] - } - ], + "outputs": [], "source": [ "regression_stats(1000, 1000)\n", "regression_stats(1000, 500)\n", diff --git a/PM1/r-linear-model-overfitting.irnb b/PM1/r-linear-model-overfitting.irnb index b2638c37..60d7aafc 100644 --- a/PM1/r-linear-model-overfitting.irnb +++ b/PM1/r-linear-model-overfitting.irnb @@ -65,6 +65,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "\n", "set.seed(123)\n", @@ -80,22 +81,7 @@ "print(summary(lm(y ~ X))$r.squared)\n", "print(\"Adjusted R2 is\")\n", "print(summary(lm(y ~ X))$adj.r.squared)\n" - ], - "execution_count": 3, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[1] \"p/n is\"\n", - "[1] 0.5\n", - "[1] \"R2 is\"\n", - "[1] 0.4922339\n", - "[1] \"Adjusted R2 is\"\n", - "[1] -0.01654975\n" - ] - } - ] + ] }, { "cell_type": "markdown", From 8937dfaa2fa0ad2d2968e4511762d5ca70ba5cdd Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Sun, 21 Jul 2024 23:56:27 -0700 Subject: [PATCH 211/261] Update strip-python-notebooks.yml --- .github/workflows/strip-python-notebooks.yml | 22 +++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/.github/workflows/strip-python-notebooks.yml b/.github/workflows/strip-python-notebooks.yml index 0e6bda45..751ed53c 100644 --- a/.github/workflows/strip-python-notebooks.yml +++ 
b/.github/workflows/strip-python-notebooks.yml @@ -27,7 +27,16 @@ jobs: python -m pip install --upgrade pip pip install jupyter nbstripout shell: bash - + + - name: Set up R + if: env.notebooks_changed == 'true' + uses: r-lib/actions/setup-r@v2 + + - name: Install rmarkdown, knitr, and lintr packages + if: env.notebooks_changed == 'true' + run: | + R -e 'install.packages(c("rmarkdown"), repos="https://cloud.r-project.org")' + - name: Strip outputs from .ipynb files run: | dirs=(PM1 PM2 PM3 PM4 PM5 CM1 CM2 CM3 AC1 AC2 T) @@ -53,6 +62,13 @@ jobs: mv "$ipynb_notebook" "$notebook" fi done + echo "Converting .irnb to .Rmd to update the .Rmd version" + R -e ' + files <- list.files(path = "${{ dir }}", pattern = "\\.irnb$", full.names = TRUE, recursive = FALSE) + lapply(files, function(input) { + rmarkdown::convert_ipynb(input) + }) + ' else echo "Directory $dir does not exist." fi @@ -67,7 +83,7 @@ jobs: # Check if directory exists if [ -d "$dir" ]; then echo "Processing directory: $dir" - if git diff --quiet "$dir"/*.ipynb "$dir"/*.irnb; then + if git diff --quiet "$dir"/*.ipynb "$dir"/*.irnb "$dir"/*.Rmd; then echo "Found changes in directory: $dir" echo "changed=true" >> $GITHUB_OUTPUT break 1 @@ -88,7 +104,7 @@ jobs: # Check if directory exists if [ -d "$dir" ]; then echo "Adding changed files from directory: $dir" - git add "$dir"/*.ipynb "$dir"/*.irnb + git add "$dir"/*.ipynb "$dir"/*.irnb "$dir"/*.Rmd else echo "Directory $dir does not exist." fi From d01e77314d5886e81403f11c0393ff2524897e43 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 00:06:26 -0700 Subject: [PATCH 212/261] update strip action --- .../check-and-transform-R-notebooks.yml | 53 +++++++++---------- .github/workflows/strip-python-notebooks.yml | 12 ++--- README.md | 4 +- 3 files changed, 34 insertions(+), 35 deletions(-) diff --git a/.github/workflows/check-and-transform-R-notebooks.yml b/.github/workflows/check-and-transform-R-notebooks.yml index dc9b199d..3c0162a3 100644 --- a/.github/workflows/check-and-transform-R-notebooks.yml +++ b/.github/workflows/check-and-transform-R-notebooks.yml @@ -1,9 +1,6 @@ -name: Check and Transform R Notebooks +name: Check R Notebooks on: - push: - branches: - - main pull_request: branches: - main @@ -203,27 +200,27 @@ jobs: rm -rf r_scripts rm ${{ matrix.directory }}_r_scripts.zip - - name: Check out the branch for pull request - if: "(github.event_name == 'pull_request') && (env.notebooks_changed == 'true')" - run: | - git fetch --all - git checkout ${{ github.event.pull_request.head.ref }} - - - name: Check if there are any changes - if: env.notebooks_changed == 'true' - id: verify_diff - run: | - git pull - git diff --quiet ${{ matrix.directory }}/*.irnb ${{ matrix.directory }}/*.Rmd || echo "changed=true" >> $GITHUB_OUTPUT - - - name: Commit and push stripped .irnb and .Rmd files - if: "(env.notebooks_changed == 'true') && (steps.verify_diff.outputs.changed == 'true')" - run: | - git config --global user.name 'github-actions[bot]' - git config --global user.email 'github-actions[bot]@users.noreply.github.com' - git pull - git add ${{ matrix.directory }}/*.irnb ${{ matrix.directory }}/*.Rmd - git commit -m 'Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in ${{ matrix.directory }} [skip ci]' - git push --force-with-lease - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # - name: Check out the branch for pull request + # if: "(github.event_name == 'pull_request') && (env.notebooks_changed == 'true')" + # run: | + # git fetch --all + 
# git checkout ${{ github.event.pull_request.head.ref }} + + # - name: Check if there are any changes + # if: env.notebooks_changed == 'true' + # id: verify_diff + # run: | + # git pull + # git diff --quiet ${{ matrix.directory }}/*.irnb ${{ matrix.directory }}/*.Rmd || echo "changed=true" >> $GITHUB_OUTPUT + + # - name: Commit and push stripped .irnb and .Rmd files + # if: "(env.notebooks_changed == 'true') && (steps.verify_diff.outputs.changed == 'true')" + # run: | + # git config --global user.name 'github-actions[bot]' + # git config --global user.email 'github-actions[bot]@users.noreply.github.com' + # git pull + # git add ${{ matrix.directory }}/*.irnb ${{ matrix.directory }}/*.Rmd + # git commit -m 'Strip outputs from .irnb, convert to .Rmd, lint .Rmd files, and execute .R files in ${{ matrix.directory }} [skip ci]' + # git push --force-with-lease + # env: + # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/strip-python-notebooks.yml b/.github/workflows/strip-python-notebooks.yml index 751ed53c..22c848e9 100644 --- a/.github/workflows/strip-python-notebooks.yml +++ b/.github/workflows/strip-python-notebooks.yml @@ -1,14 +1,14 @@ -name: Strip Python and R Notebooks from Outputs +name: Strip Python and R Notebooks from Outputs and Transform .irnb to .Rmd on: push: concurrency: - group: strip-notebooks-${{ github.ref }} + group: strip-transform-notebooks-${{ github.ref }} cancel-in-progress: true jobs: - strip-notebooks: + strip-transform-notebooks: runs-on: ubuntu-latest steps: @@ -63,12 +63,12 @@ jobs: fi done echo "Converting .irnb to .Rmd to update the .Rmd version" - R -e ' - files <- list.files(path = "${{ dir }}", pattern = "\\.irnb$", full.names = TRUE, recursive = FALSE) + R -e " + files <- list.files(path = '$dir', pattern = '\\.irnb$', full.names = TRUE, recursive = FALSE) lapply(files, function(input) { rmarkdown::convert_ipynb(input) }) - ' + " else echo "Directory $dir does not exist." fi diff --git a/README.md b/README.md index ef93d9be..7f14be6d 100644 --- a/README.md +++ b/README.md @@ -9,4 +9,6 @@ pip install -r requirements.txt # For Contributors -The .Rmd files are auto-generated by a Github Action, whenever one pushes a .irnb (R Jupyter notebook) to one of the main folders of the repo. So .Rmd files, should never be altered directly. Only changes to .irnb files should be made. Any change to a .Rmd file will be over-written by the corresponding .irnb file and will not survive the Github Action. +The .Rmd files are auto-generated by a Github Action, whenever one pushes a .irnb (R Jupyter notebook) to one of the main folders of the repo on the main branch. So .Rmd files, should never be altered directly. Only changes to .irnb files should be made. Any change to a .Rmd file will be over-written by the corresponding .irnb file and will not survive the Github Action. + +Moreover, whenever a push happens to the main branch, all python and R notebooks and all R Markdown files are stripped from their outputs. It is advisable that you always strip the notebooks before pushing to the repo. You can use `nbstripout --install` on your local git directory, which does this automatically for you. 
\ No newline at end of file From 24452bdb35d042ae845bb71dcbce6cbb8d9174cb Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 00:09:27 -0700 Subject: [PATCH 213/261] Update strip-python-notebooks.yml --- .github/workflows/strip-python-notebooks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/strip-python-notebooks.yml b/.github/workflows/strip-python-notebooks.yml index 22c848e9..41e59a0e 100644 --- a/.github/workflows/strip-python-notebooks.yml +++ b/.github/workflows/strip-python-notebooks.yml @@ -64,7 +64,7 @@ jobs: done echo "Converting .irnb to .Rmd to update the .Rmd version" R -e " - files <- list.files(path = '$dir', pattern = '\\.irnb$', full.names = TRUE, recursive = FALSE) + files <- list.files(path = '$dir', pattern = '\\\\.irnb$', full.names = TRUE, recursive = FALSE) lapply(files, function(input) { rmarkdown::convert_ipynb(input) }) From 8c6e12576eb57ceb1e6a43e0bc5b5f19b74aba0f Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 00:13:50 -0700 Subject: [PATCH 214/261] Testing strip --- .github/workflows/strip-python-notebooks.yml | 2 -- PM1/python-linear-model-overfitting.ipynb | 22 +++++++++++++++++--- PM1/r-linear-model-overfitting.irnb | 16 +++++++++++++- 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/.github/workflows/strip-python-notebooks.yml b/.github/workflows/strip-python-notebooks.yml index 41e59a0e..2f4dd949 100644 --- a/.github/workflows/strip-python-notebooks.yml +++ b/.github/workflows/strip-python-notebooks.yml @@ -29,11 +29,9 @@ jobs: shell: bash - name: Set up R - if: env.notebooks_changed == 'true' uses: r-lib/actions/setup-r@v2 - name: Install rmarkdown, knitr, and lintr packages - if: env.notebooks_changed == 'true' run: | R -e 'install.packages(c("rmarkdown"), repos="https://cloud.r-project.org")' diff --git a/PM1/python-linear-model-overfitting.ipynb b/PM1/python-linear-model-overfitting.ipynb index 424560c0..c337e415 100644 --- a/PM1/python-linear-model-overfitting.ipynb +++ b/PM1/python-linear-model-overfitting.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "id": "YTaOmenI7TfT" }, @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "id": "C57WYtYM7OL0", "papermill": { @@ -56,7 +56,23 @@ }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "p/n is: 1.0\n", + "R^2 is 1.0\n", + "Adjusted R^2 is nan\n", + "p/n is: 0.5\n", + "R^2 is 0.5232392347513539\n", + "Adjusted R^2 is 0.04647846950270784\n", + "p/n is: 0.05\n", + "R^2 is 0.05610356497156044\n", + "Adjusted R^2 is 0.006424805233221531\n" + ] + } + ], "source": [ "regression_stats(1000, 1000)\n", "regression_stats(1000, 500)\n", diff --git a/PM1/r-linear-model-overfitting.irnb b/PM1/r-linear-model-overfitting.irnb index 60d7aafc..f1355cfe 100644 --- a/PM1/r-linear-model-overfitting.irnb +++ b/PM1/r-linear-model-overfitting.irnb @@ -29,7 +29,6 @@ "languageId": "r" } }, - "outputs": [], "source": [ "\n", "set.seed(123)\n", @@ -45,6 +44,21 @@ "print(summary(lm(y ~ X))$r.squared)\n", "print(\"Adjusted R2 is\")\n", "print(summary(lm(y ~ X))$adj.r.squared)\n" + ], + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[1] \"p/n is\"\n", + "[1] 1\n", + "[1] \"R2 is\"\n", + "[1] 1\n", + "[1] \"Adjusted R2 is\"\n", + "[1] NaN\n" + ] + } ] }, { From 86339f60040b6f9eeee2367440abaae0afaa2893 Mon Sep 17 
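The escaping fix in patch 213 is about two layers of unescaping: the pattern sits inside a double-quoted shell string that is then parsed again by R, so bash strips one level of backslashes and R's string parser strips another. Writing \\\\.irnb$ in the workflow therefore reaches the regex engine as \.irnb$, a literal ".irnb" suffix, while the earlier \\.irnb$ left R with an invalid string escape. A standalone way to check the behaviour (PM1 is just one of the repo's folders):

# bash turns \\\\ into \\, and R turns '\\.irnb$' into the regex \.irnb$
R -e "print(list.files(path = 'PM1', pattern = '\\\\.irnb$'))"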
00:00:00 2001 From: "github-actions[bot]" Date: Mon, 22 Jul 2024 07:18:27 +0000 Subject: [PATCH 215/261] Strip outputs from .ipynb files [skip ci] --- PM1/python-linear-model-overfitting.ipynb | 22 +++------------------- PM1/r-linear-model-overfitting.irnb | 16 +--------------- 2 files changed, 4 insertions(+), 34 deletions(-) diff --git a/PM1/python-linear-model-overfitting.ipynb b/PM1/python-linear-model-overfitting.ipynb index c337e415..424560c0 100644 --- a/PM1/python-linear-model-overfitting.ipynb +++ b/PM1/python-linear-model-overfitting.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "id": "YTaOmenI7TfT" }, @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "id": "C57WYtYM7OL0", "papermill": { @@ -56,23 +56,7 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "p/n is: 1.0\n", - "R^2 is 1.0\n", - "Adjusted R^2 is nan\n", - "p/n is: 0.5\n", - "R^2 is 0.5232392347513539\n", - "Adjusted R^2 is 0.04647846950270784\n", - "p/n is: 0.05\n", - "R^2 is 0.05610356497156044\n", - "Adjusted R^2 is 0.006424805233221531\n" - ] - } - ], + "outputs": [], "source": [ "regression_stats(1000, 1000)\n", "regression_stats(1000, 500)\n", diff --git a/PM1/r-linear-model-overfitting.irnb b/PM1/r-linear-model-overfitting.irnb index f1355cfe..60d7aafc 100644 --- a/PM1/r-linear-model-overfitting.irnb +++ b/PM1/r-linear-model-overfitting.irnb @@ -29,6 +29,7 @@ "languageId": "r" } }, + "outputs": [], "source": [ "\n", "set.seed(123)\n", @@ -44,21 +45,6 @@ "print(summary(lm(y ~ X))$r.squared)\n", "print(\"Adjusted R2 is\")\n", "print(summary(lm(y ~ X))$adj.r.squared)\n" - ], - "execution_count": 5, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[1] \"p/n is\"\n", - "[1] 1\n", - "[1] \"R2 is\"\n", - "[1] 1\n", - "[1] \"Adjusted R2 is\"\n", - "[1] NaN\n" - ] - } ] }, { From c99e311e16f144e544d18dfe45a3cb60114aa1aa Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 00:21:54 -0700 Subject: [PATCH 216/261] testing strip --- .github/workflows/strip-python-notebooks.yml | 2 +- PM1/python-linear-model-overfitting.ipynb | 14 +++----------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/.github/workflows/strip-python-notebooks.yml b/.github/workflows/strip-python-notebooks.yml index 2f4dd949..59bb9062 100644 --- a/.github/workflows/strip-python-notebooks.yml +++ b/.github/workflows/strip-python-notebooks.yml @@ -31,7 +31,7 @@ jobs: - name: Set up R uses: r-lib/actions/setup-r@v2 - - name: Install rmarkdown, knitr, and lintr packages + - name: Install rmarkdown package run: | R -e 'install.packages(c("rmarkdown"), repos="https://cloud.r-project.org")' diff --git a/PM1/python-linear-model-overfitting.ipynb b/PM1/python-linear-model-overfitting.ipynb index c337e415..7ffe1ff9 100644 --- a/PM1/python-linear-model-overfitting.ipynb +++ b/PM1/python-linear-model-overfitting.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": { "id": "YTaOmenI7TfT" }, @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "id": "C57WYtYM7OL0", "papermill": { @@ -61,15 +61,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "p/n is: 1.0\n", - "R^2 is 1.0\n", - "Adjusted R^2 is nan\n", - "p/n is: 0.5\n", - "R^2 is 0.5232392347513539\n", - "Adjusted R^2 is 0.04647846950270784\n", - "p/n is: 0.05\n", 
- "R^2 is 0.05610356497156044\n", - "Adjusted R^2 is 0.006424805233221531\n" + "p/n is: 1.0\n" ] } ], From 15a85e3cd47b3211b585d6d487766936219f078b Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 00:22:52 -0700 Subject: [PATCH 217/261] Update python-linear-model-overfitting.ipynb --- PM1/python-linear-model-overfitting.ipynb | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/PM1/python-linear-model-overfitting.ipynb b/PM1/python-linear-model-overfitting.ipynb index 7ffe1ff9..a8103986 100644 --- a/PM1/python-linear-model-overfitting.ipynb +++ b/PM1/python-linear-model-overfitting.ipynb @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "id": "C57WYtYM7OL0", "papermill": { @@ -61,7 +61,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "p/n is: 1.0\n" + "p/n is: 1.0\n", + "R^2 is 1.0\n", + "Adjusted R^2 is nan\n", + "p/n is: 0.5\n", + "R^2 is 0.5232392347513539\n", + "Adjusted R^2 is 0.04647846950270784\n", + "p/n is: 0.05\n", + "R^2 is 0.05610356497156044\n", + "Adjusted R^2 is 0.006424805233221531\n" ] } ], From c8f685402dffeb3647bf20c5c51b28ea2feada85 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 22 Jul 2024 07:27:44 +0000 Subject: [PATCH 218/261] Strip outputs from .ipynb files [skip ci] --- PM1/python-linear-model-overfitting.ipynb | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/PM1/python-linear-model-overfitting.ipynb b/PM1/python-linear-model-overfitting.ipynb index a8103986..424560c0 100644 --- a/PM1/python-linear-model-overfitting.ipynb +++ b/PM1/python-linear-model-overfitting.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "id": "YTaOmenI7TfT" }, @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "id": "C57WYtYM7OL0", "papermill": { @@ -56,23 +56,7 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "p/n is: 1.0\n", - "R^2 is 1.0\n", - "Adjusted R^2 is nan\n", - "p/n is: 0.5\n", - "R^2 is 0.5232392347513539\n", - "Adjusted R^2 is 0.04647846950270784\n", - "p/n is: 0.05\n", - "R^2 is 0.05610356497156044\n", - "Adjusted R^2 is 0.006424805233221531\n" - ] - } - ], + "outputs": [], "source": [ "regression_stats(1000, 1000)\n", "regression_stats(1000, 500)\n", From 991ad5b8307003e511eee791de1f05ec3c4ec01d Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 00:29:28 -0700 Subject: [PATCH 219/261] Update strip-python-notebooks.yml --- .github/workflows/strip-python-notebooks.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/strip-python-notebooks.yml b/.github/workflows/strip-python-notebooks.yml index 59bb9062..ab5f5002 100644 --- a/.github/workflows/strip-python-notebooks.yml +++ b/.github/workflows/strip-python-notebooks.yml @@ -81,6 +81,7 @@ jobs: # Check if directory exists if [ -d "$dir" ]; then echo "Processing directory: $dir" + git diff "$dir"/*.ipynb "$dir"/*.irnb "$dir"/*.Rmd if git diff --quiet "$dir"/*.ipynb "$dir"/*.irnb "$dir"/*.Rmd; then echo "Found changes in directory: $dir" echo "changed=true" >> $GITHUB_OUTPUT From 49b7d82fc31250830b99ef1ad57adffef612bc89 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 00:31:14 -0700 Subject: [PATCH 220/261] Update python-linear-model-overfitting.ipynb --- PM1/python-linear-model-overfitting.ipynb | 22 +++++++++++++++++++--- 1 file changed, 
19 insertions(+), 3 deletions(-) diff --git a/PM1/python-linear-model-overfitting.ipynb b/PM1/python-linear-model-overfitting.ipynb index 424560c0..679001a9 100644 --- a/PM1/python-linear-model-overfitting.ipynb +++ b/PM1/python-linear-model-overfitting.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "id": "YTaOmenI7TfT" }, @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { "id": "C57WYtYM7OL0", "papermill": { @@ -56,7 +56,23 @@ }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "p/n is: 1.0\n", + "R^2 is 1.0\n", + "Adjusted R^2 is nan\n", + "p/n is: 0.5\n", + "R^2 is 0.5232392347513539\n", + "Adjusted R^2 is 0.04647846950270784\n", + "p/n is: 0.05\n", + "R^2 is 0.05610356497156044\n", + "Adjusted R^2 is 0.006424805233221531\n" + ] + } + ], "source": [ "regression_stats(1000, 1000)\n", "regression_stats(1000, 500)\n", From 380ecd53a8b23f80549c611c49a9891036b88888 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 22 Jul 2024 07:35:54 +0000 Subject: [PATCH 221/261] Strip outputs from .ipynb files [skip ci] --- PM1/python-linear-model-overfitting.ipynb | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/PM1/python-linear-model-overfitting.ipynb b/PM1/python-linear-model-overfitting.ipynb index 679001a9..424560c0 100644 --- a/PM1/python-linear-model-overfitting.ipynb +++ b/PM1/python-linear-model-overfitting.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": { "id": "YTaOmenI7TfT" }, @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": { "id": "C57WYtYM7OL0", "papermill": { @@ -56,23 +56,7 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "p/n is: 1.0\n", - "R^2 is 1.0\n", - "Adjusted R^2 is nan\n", - "p/n is: 0.5\n", - "R^2 is 0.5232392347513539\n", - "Adjusted R^2 is 0.04647846950270784\n", - "p/n is: 0.05\n", - "R^2 is 0.05610356497156044\n", - "Adjusted R^2 is 0.006424805233221531\n" - ] - } - ], + "outputs": [], "source": [ "regression_stats(1000, 1000)\n", "regression_stats(1000, 500)\n", From ab0de707848d5864d12f88b191c53cca0e91bd59 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 11:20:26 +0300 Subject: [PATCH 222/261] Update strip-python-notebooks.yml --- .github/workflows/strip-python-notebooks.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/strip-python-notebooks.yml b/.github/workflows/strip-python-notebooks.yml index ab5f5002..02c79157 100644 --- a/.github/workflows/strip-python-notebooks.yml +++ b/.github/workflows/strip-python-notebooks.yml @@ -81,8 +81,10 @@ jobs: # Check if directory exists if [ -d "$dir" ]; then echo "Processing directory: $dir" - git diff "$dir"/*.ipynb "$dir"/*.irnb "$dir"/*.Rmd + if git diff --quiet "$dir"/*.ipynb "$dir"/*.irnb "$dir"/*.Rmd; then + echo "No changes were found" + else echo "Found changes in directory: $dir" echo "changed=true" >> $GITHUB_OUTPUT break 1 From dcd9ffd6134410e1e72ae6d7a415dca6ed08280f Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 11:28:57 +0300 Subject: [PATCH 223/261] Update strip-python-notebooks.yml --- .github/workflows/strip-python-notebooks.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/strip-python-notebooks.yml 
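The else branch added in patch 222 matters because of how git diff --quiet reports its result: it exits 0 when the listed paths are unchanged and 1 when they differ, so the then branch is the "nothing changed" case and the else branch is where changed=true belongs. A minimal illustration of the same check:

# --quiet suppresses the diff output and encodes "any differences?" in the exit status
if git diff --quiet -- PM1; then
  echo "no changes under PM1"
else
  echo "found changes under PM1"
fi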
b/.github/workflows/strip-python-notebooks.yml index 02c79157..fc8560f1 100644 --- a/.github/workflows/strip-python-notebooks.yml +++ b/.github/workflows/strip-python-notebooks.yml @@ -2,6 +2,8 @@ name: Strip Python and R Notebooks from Outputs and Transform .irnb to .Rmd on: push: + branches: + - main concurrency: group: strip-transform-notebooks-${{ github.ref }} From b40a21b93b48ca608dd308e3e3b463bf111712d9 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 04:58:40 -0700 Subject: [PATCH 224/261] Update check-and-transform-R-notebooks.yml --- .../workflows/check-and-transform-R-notebooks.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/check-and-transform-R-notebooks.yml b/.github/workflows/check-and-transform-R-notebooks.yml index 3c0162a3..bf5b9cc6 100644 --- a/.github/workflows/check-and-transform-R-notebooks.yml +++ b/.github/workflows/check-and-transform-R-notebooks.yml @@ -30,21 +30,28 @@ jobs: # git fetch origin ${{ github.event.pull_request.base.ref }} ${{ github.event.pull_request.head.ref }} # git diff --name-only origin/${{ github.event.pull_request.base.ref }}...origin/${{ github.event.pull_request.head.ref }} > changed_files.txt git diff --name-only -r HEAD^1 HEAD > changed_files.txt - grep -E '\.irnb$|\.Rmd$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt + if grep -q -E '^${{ matrix.directory }}/.*\.Rmd$' changed_files.txt; then + echo "Changing directly the .Rmd files is prohibited. You should only be changing the .irnb files" + echo "The .Rmd files will be automatically generated and updated when the PR is merged in the main branch" + echo "It seems that you changed directly the following files:" + grep -E '^${{ matrix.directory }}/.*\.Rmd$' changed_files.txt + exit 1 + fi + grep -E '^${{ matrix.directory }}/.*\.irnb$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - name: Find changed notebooks in Push if: github.event_name == 'push' id: find_notebooks_push run: | git diff --name-only ${{ github.event.before }} ${{ github.event.after }} > changed_files.txt - grep -E '\.irnb$|\.Rmd$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt + grep -E '^${{ matrix.directory }}/.*\.irnb$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - name: Check if any notebooks changed in PR or Push if: (github.event_name == 'push') || (github.event_name == 'pull_request') id: check_notebooks run: | cat changed_notebooks.txt - if grep -q -E '^${{ matrix.directory }}/.*\.irnb$|^${{ matrix.directory }}/.*\.Rmd$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_notebooks.txt; then + if grep -q -E '^${{ matrix.directory }}/.*\.irnb$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_notebooks.txt; then echo "notebooks_changed=true" >> $GITHUB_ENV else echo "notebooks_changed=false" >> $GITHUB_ENV From 03c6bc8c6260ccc61c382065d207605681b26074 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 05:01:06 -0700 Subject: [PATCH 225/261] Update python-notebooks.yml --- .github/workflows/python-notebooks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/python-notebooks.yml b/.github/workflows/python-notebooks.yml index 23eac65e..366c4d98 100644 --- a/.github/workflows/python-notebooks.yml +++ b/.github/workflows/python-notebooks.yml @@ -1,4 +1,4 @@ -name: Run and Test Jupyter Notebooks +name: Check Python Notebooks on: pull_request: From a59ea99922fc536dd788060bda1cf48b7370a23a Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 05:02:32 -0700 Subject: [PATCH 226/261] Update python-notebooks.yml --- .github/workflows/python-notebooks.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python-notebooks.yml b/.github/workflows/python-notebooks.yml index 366c4d98..6e63419c 100644 --- a/.github/workflows/python-notebooks.yml +++ b/.github/workflows/python-notebooks.yml @@ -34,25 +34,25 @@ jobs: # git fetch origin ${{ github.event.pull_request.base.ref }} ${{ github.event.pull_request.head.ref }} # git diff --name-only origin/${{ github.event.pull_request.base.ref }}...origin/${{ github.event.pull_request.head.ref }} > changed_files.txt git diff --name-only -r HEAD^1 HEAD > changed_files.txt - grep -E '\.ipynb$|\.github/workflows/python-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt + grep -E '\.ipynb$|\.py$|\.github/workflows/python-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - name: Find changed notebooks in Push if: github.event_name == 'push' id: find_notebooks_push run: | git diff --name-only ${{ github.event.before }} ${{ github.event.after }} > changed_files.txt - grep -E '\.ipynb$|\.github/workflows/python-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt + grep -E '\.ipynb$|\.py$|\.github/workflows/python-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - name: Check if any notebooks changed in PR or Push if: (github.event_name == 'push') || (github.event_name == 'pull_request') id: check_notebooks run: | cat changed_notebooks.txt - if grep -q -E '^${{ matrix.directory }}/.*\.ipynb$|\.github/workflows/python-notebooks.yml$' changed_notebooks.txt; then + if grep -q -E '^${{ matrix.directory }}/.*\.ipynb$|^${{ matrix.directory }}/.*\.py$|\.github/workflows/python-notebooks.yml$' changed_notebooks.txt; then echo "notebooks_changed=true" >> $GITHUB_ENV else echo "notebooks_changed=false" >> $GITHUB_ENV - echo "No Python notebooks changed in folder ${{ matrix.directory }} in this PR." + echo "No Python notebooks or python scripts changed in folder ${{ matrix.directory }} in this PR." fi - name: Set notebooks changed to true for schedule (or other ways of triggering the job) From b423c6b335c80a71dfa1d8cd287aa636aa96949a Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 05:08:34 -0700 Subject: [PATCH 227/261] Removed .Rmd files All the .Rmd files will be added automatically when the PR is merged to main. 
--- AC1/r-proxy-controls.Rmd | 200 --- ...analysis-with-sensmakr-and-debiased-ml.Rmd | 290 ---- ...iased-ml-for-partially-linear-iv-model.Rmd | 286 ---- AC2/r-dml-401k-IV.Rmd | 1309 ----------------- AC2/r-weak-iv-experiments.Rmd | 75 - ...yzing-rct-reemployment-experiment.irnb.Rmd | 198 --- ...book-analyzing-rct-with-precision.irnb.Rmd | 120 -- CM1/Old/r-notebook-some-rct-examples.irnb.Rmd | 259 ---- CM1/r-rct-penn-precision-adj.Rmd | 238 --- CM1/r-rct-vaccines.Rmd | 275 ---- CM1/r-sim-precision-adj.Rmd | 117 -- CM2/Old/r-colliderbias-hollywood.irnb.Rmd | 66 - CM2/r-colliderbias-hollywood.Rmd | 59 - CM3/Old/notebook-dagitty.irnb.Rmd | 220 --- CM3/Old/notebook-dosearch.irnb.Rmd | 235 --- CM3/r-dagitty.Rmd | 199 --- CM3/r-dosearch.Rmd | 215 --- ...-and-lasso-for-wage-gap-inference.irnb.Rmd | 419 ------ ...ols-and-lasso-for-wage-prediction.irnb.Rmd | 440 ------ ...notebook-linear-model-overfitting.irnb.Rmd | 79 - PM1/r-linear-model-overfitting.Rmd | 64 - ...r-ols-and-lasso-for-wage-gap-inference.Rmd | 398 ----- PM1/r-ols-and-lasso-for-wage-prediction.Rmd | 405 ----- PM2/Old/heterogenous-wage-effects.irnb.Rmd | 76 - PM2/Old/ml-for-wage-prediction.irnb.Rmd | 291 ---- ...experiment-on-orthogonal-learning.irnb.Rmd | 113 -- .../r-notebook-linear-penalized-regs.irnb.Rmd | 291 ---- PM2/r_convergence_hypothesis_double_lasso.Rmd | 268 ---- PM2/r_experiment_non_orthogonal.Rmd | 516 ------- PM2/r_heterogenous_wage_effects.Rmd | 102 -- PM2/r_linear_penalized_regs.Rmd | 707 --------- PM2/r_ml_for_wage_prediction.Rmd | 447 ------ PM2/r_orthogonal_orig.Rmd | 116 -- PM3/Old/automl-for-wage-prediction.irnb.Rmd | 126 -- ..._functional_approximation_by_nn_and_rf.Rmd | 193 --- PM3/r_ml_wage_prediction.Rmd | 594 -------- ...ural-networks-for-wage-prediction.irnb.Rmd | 139 -- ...so-for-the-convergence-hypothesis.irnb.Rmd | 178 --- ...-analysis-of-401-k-example-w-dags.irnb.Rmd | 213 --- PM4/r-dml-401k.Rmd | 1019 ------------- ...ation-analysis-of-401-k-example-w-dags.Rmd | 198 --- ...d_ml_for_partially_linear_model_growth.Rmd | 204 --- PM4/r_dml_inference_for_gun_ownership.Rmd | 573 -------- PM5/Autoencoders.Rmd | 262 ---- T/T-3 Diff-in-Diff Minimum Wage Example.Rmd | 667 --------- ...ression_Discontinuity_on_Progresa_Data.Rmd | 601 -------- ...or-partially-linear-iv-model-in-r.irnb.Rmd | 264 ---- ...r-ate-and-late-of-401-k-on-wealth.irnb.Rmd | 677 --------- .../dml-inference-for-gun-ownership.irnb.Rmd | 377 ----- ...erence-using-nn-for-gun-ownership.irnb.Rmd | 172 --- T/deprecated/r-weak-iv-experiments.irnb.Rmd | 92 -- ...r-conditional-average-treatment-effect.Rmd | 610 -------- ...or-partially-linear-iv-model-in-r.irnb.Rmd | 264 ---- ...sis-with-sensmakr-and-debiased-ml.irnb.Rmd | 265 ---- 54 files changed, 16781 deletions(-) delete mode 100644 AC1/r-proxy-controls.Rmd delete mode 100644 AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.Rmd delete mode 100644 AC2/r-debiased-ml-for-partially-linear-iv-model.Rmd delete mode 100644 AC2/r-dml-401k-IV.Rmd delete mode 100644 AC2/r-weak-iv-experiments.Rmd delete mode 100644 CM1/Old/analyzing-rct-reemployment-experiment.irnb.Rmd delete mode 100644 CM1/Old/r-notebook-analyzing-rct-with-precision.irnb.Rmd delete mode 100644 CM1/Old/r-notebook-some-rct-examples.irnb.Rmd delete mode 100644 CM1/r-rct-penn-precision-adj.Rmd delete mode 100644 CM1/r-rct-vaccines.Rmd delete mode 100644 CM1/r-sim-precision-adj.Rmd delete mode 100644 CM2/Old/r-colliderbias-hollywood.irnb.Rmd delete mode 100644 CM2/r-colliderbias-hollywood.Rmd delete mode 100644 
CM3/Old/notebook-dagitty.irnb.Rmd delete mode 100644 CM3/Old/notebook-dosearch.irnb.Rmd delete mode 100644 CM3/r-dagitty.Rmd delete mode 100644 CM3/r-dosearch.Rmd delete mode 100644 PM1/Old/ols-and-lasso-for-wage-gap-inference.irnb.Rmd delete mode 100644 PM1/Old/ols-and-lasso-for-wage-prediction.irnb.Rmd delete mode 100644 PM1/Old/r-notebook-linear-model-overfitting.irnb.Rmd delete mode 100644 PM1/r-linear-model-overfitting.Rmd delete mode 100644 PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd delete mode 100644 PM1/r-ols-and-lasso-for-wage-prediction.Rmd delete mode 100644 PM2/Old/heterogenous-wage-effects.irnb.Rmd delete mode 100644 PM2/Old/ml-for-wage-prediction.irnb.Rmd delete mode 100644 PM2/Old/r-notebook-experiment-on-orthogonal-learning.irnb.Rmd delete mode 100644 PM2/Old/r-notebook-linear-penalized-regs.irnb.Rmd delete mode 100644 PM2/r_convergence_hypothesis_double_lasso.Rmd delete mode 100644 PM2/r_experiment_non_orthogonal.Rmd delete mode 100644 PM2/r_heterogenous_wage_effects.Rmd delete mode 100644 PM2/r_linear_penalized_regs.Rmd delete mode 100644 PM2/r_ml_for_wage_prediction.Rmd delete mode 100644 PM2/r_orthogonal_orig.Rmd delete mode 100644 PM3/Old/automl-for-wage-prediction.irnb.Rmd delete mode 100644 PM3/r_functional_approximation_by_nn_and_rf.Rmd delete mode 100644 PM3/r_ml_wage_prediction.Rmd delete mode 100644 PM4/Old/deep-neural-networks-for-wage-prediction.irnb.Rmd delete mode 100644 PM4/Old/double-lasso-for-the-convergence-hypothesis.irnb.Rmd delete mode 100644 PM4/Old/identification-analysis-of-401-k-example-w-dags.irnb.Rmd delete mode 100644 PM4/r-dml-401k.Rmd delete mode 100644 PM4/r-identification-analysis-of-401-k-example-w-dags.Rmd delete mode 100644 PM4/r_debiased_ml_for_partially_linear_model_growth.Rmd delete mode 100644 PM4/r_dml_inference_for_gun_ownership.Rmd delete mode 100644 PM5/Autoencoders.Rmd delete mode 100644 T/T-3 Diff-in-Diff Minimum Wage Example.Rmd delete mode 100644 T/T_4_Regression_Discontinuity_on_Progresa_Data.Rmd delete mode 100644 T/deprecated/debiased-ml-for-partially-linear-iv-model-in-r.irnb.Rmd delete mode 100644 T/deprecated/dml-for-ate-and-late-of-401-k-on-wealth.irnb.Rmd delete mode 100644 T/deprecated/dml-inference-for-gun-ownership.irnb.Rmd delete mode 100644 T/deprecated/dml-inference-using-nn-for-gun-ownership.irnb.Rmd delete mode 100644 T/deprecated/r-weak-iv-experiments.irnb.Rmd delete mode 100644 T/dml-for-conditional-average-treatment-effect.Rmd delete mode 100644 deprecated/CM4/Old/debiased-ml-for-partially-linear-iv-model-in-r.irnb.Rmd delete mode 100644 deprecated/CM4/Old/sensitivity-analysis-with-sensmakr-and-debiased-ml.irnb.Rmd diff --git a/AC1/r-proxy-controls.Rmd b/AC1/r-proxy-controls.Rmd deleted file mode 100644 index 21be891b..00000000 --- a/AC1/r-proxy-controls.Rmd +++ /dev/null @@ -1,200 +0,0 @@ ---- -title: An R Markdown document converted from "AC1/r-proxy-controls.irnb" -output: html_document ---- - -# Negative (Proxy) Controls for Unobserved Confounding - -Consider the following SEM, where $Y$ is the outcome, $D$ is the treatment, $A$ is some unobserved confounding, and $Q$, $X$, $S$ are the observed covariates. In particular, $Q$ is considered to be the proxy control treatment as it a priori has no effect on the actual outcome $Y$, and $S$ is considered to be the proxy control outcome as it a priori is not affected by the actual treatment $D$. See also [An Introduction to Proximal Causal Learning](https://arxiv.org/pdf/2009.10982.pdf), for more information on this setting. 
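For concreteness, the linear version of this SEM that the simulation below (`gen_data`) implements can be written out as follows; the coefficient symbols here are illustrative names only:

\begin{align}
A :=~& f_A(X) + \epsilon_A, \\
Q :=~& q_A A + f_Q(X) + \epsilon_Q, \\
S :=~& s_A A + f_S(X) + \epsilon_S, \\
D :=~& d_Q Q + d_A A + f_D(X) + \epsilon_D, \\
Y :=~& \alpha D + \delta S + \lambda A + f_Y(X) + \epsilon_Y,
\end{align}

with mutually independent noise terms. Note that $Q$ does not enter the $Y$ equation and $D$ does not enter the $S$ equation, which is exactly the proxy structure described above.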
- -![proxy_dag.png](https://raw.githubusercontent.com/stanford-msande228/winter23/main/proxy_dag.png) - -Under linearity assumptions, the average treatment effect can be estimated by solving the vector of moment equations: -\begin{align} -E\left[(\tilde{Y} - \alpha \tilde{D} - \delta \tilde{S}) \left(\begin{aligned}\tilde{D}\\ \tilde{Q}\end{aligned}\right) \right] = 0 -\end{align} -where for every variable $V$ we denote with $\tilde{V} = V - E[V|X]$. - -When the dimension of the proxy treatment variables $Q$ is larger than the dimension of proxy outcome variables $S$, then the above system of equations is over-identified. In these settings, we first project the "technical instrument" variables $\tilde{V}=(\tilde{D}, \tilde{Q})$ onto the space of "technical treatment" variables $\tilde{W}=(\tilde{D}, \tilde{S})$ and use the projected $\tilde{V}$ as a new "technical instrument". In particular, we run an OLS regression of $\tilde{W}$ on $\tilde{V},$ and define $\tilde{Z} = E[\tilde{W}\mid \tilde{V}] = B \tilde{V}$, where the $t$-th row $\beta_t$ of the matrix $B$ is the OLS coefficient in the regression of $\tilde{W}_t$ on $\tilde{V}$. These new variables $\tilde{Z}$, can also be viewed as engineered technical instrumental variables. Then we have the exactly identified system of equations: -\begin{align} -E\left[(\tilde{Y} - \alpha \tilde{D} - \delta \tilde{S}) \tilde{Z} \right] := E\left[(\tilde{Y} - \alpha \tilde{D} - \delta \tilde{S}) B \left(\begin{aligned}\tilde{D}\\ \tilde{Q}\end{aligned}\right) \right] = 0 -\end{align} - -The solution to this system of equations is numerically equivalent to the following two stage algorithm: -- Run OLS of $\tilde{W}=(\tilde{D}, \tilde{S})$ on $\tilde{V}=(\tilde{D}, \tilde{Q})$ -- Define $\tilde{Z}$ as the predictions of the OLS model -- Run OLS of $\tilde{Y}$ on $\tilde{Z}$. -This is the well-known Two-Stage-Least-Squares (2SLS) algorithm for instrumental variable regression. - -Since we're considering only linear models and in a low-dimensional setting, we'll focus on just using linear IV methods. - -```{r} -install.packages("hdm") -``` - -```{r} -library(hdm) - -set.seed(1) -``` - -# Analyzing Simulated Data - -First, let's evaluate the methods on simulated data generated from a linear SEM characterized by the above DAG. For this simulation, we'll set the ATE to 2. - -```{r} -gen_data <- function(n, ate) { - X <- matrix(rnorm(n * 10), ncol = 10) - A <- 2 * X[, 1] + rnorm(n) - Q <- 10 * A + 2 * X[, 1] + rnorm(n) - S <- 5 * A + X[, 1] + rnorm(n) - D <- Q - A + 2 * X[, 1] + rnorm(n) - Y <- ate * D + 5 * A + 2 * S + 0.5 * X[, 1] + rnorm(n) - return(list(X, A, Q, S, D, Y)) -} -``` - -```{r} -data_list <- gen_data(5000, 2) -X <- data_list[[1]] -A <- data_list[[2]] -Q <- data_list[[3]] -S <- data_list[[4]] -D <- data_list[[5]] -Y <- data_list[[6]] -``` - -We define the technical instrument $V=(D, Q)$ and technical treatment $W=(D, S)$. Estimating the treatement effect is then just a matter of solving an instrument variable regression problem with instruments $V$ and treatments $W$ and looking at the first coefficient associated with $D$. 
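The numerical equivalence with the two-stage algorithm described above can be checked by hand. The following is a minimal sketch (using the simulated `X`, `D`, `Q`, `S`, `Y` just created, and plain OLS for the partialling-out step); the first coefficient should be close to the true ATE of 2 and to the `tsls` estimate computed next.

```{r}
# Partial out X from every variable by OLS
Dt <- lm(D ~ X)$residuals
Qt <- lm(Q ~ X)$residuals
St <- lm(S ~ X)$residuals
Yt <- lm(Y ~ X)$residuals

Wt <- cbind(Dt, St) # "technical treatments"
Vt <- cbind(Dt, Qt) # "technical instruments"

# Stage 1: OLS of the technical treatments on the technical instruments
Zt <- lm(Wt ~ Vt)$fitted.values
# Stage 2: OLS of the residualized outcome on the stage-1 predictions
coef(lm(Yt ~ 0 + Zt))[1]
```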
- -```{r} -W <- cbind(D, S) -V <- cbind(D, Q) -``` - -```{r} -piv <- tsls(X, W, Y, V, homoscedastic = FALSE) -cat("Estimated coefficient:", piv$coefficients["D", 1], "\n") -cat("Standard error:", piv$se["D"], "\n") -``` - -# With Cross-Fitting - -We can also consider partialling out the controls using DML with cross-fitting - -```{r} -lm_dml_for_proxyiv <- function(x, d, q, s, y, dreg, qreg, yreg, sreg, nfold = 5) { - # this implements DML for a partially linear IV model - nobs <- nrow(x) - foldid <- rep.int(1:nfold, times = ceiling(nobs / nfold))[sample.int(nobs)] - I <- split(1:nobs, foldid) - # create residualized objects to fill - ytil <- dtil <- qtil <- stil <- rep(NA, nobs) - # obtain cross-fitted residuals - cat("fold: ") - for (b in seq_along(I)) { - dfit <- dreg(x[-I[[b]], ], d[-I[[b]]]) # take a fold out - qfit <- qreg(x[-I[[b]], ], q[-I[[b]]]) # take a fold out - yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a fold out - sfit <- sreg(x[-I[[b]], ], s[-I[[b]]]) # take a fold out - dtil[I[[b]]] <- (d[I[[b]]] - x[I[[b]], ] %*% as.matrix(dfit$coefficients)) # record residual - qtil[I[[b]]] <- (q[I[[b]]] - x[I[[b]], ] %*% as.matrix(qfit$coefficients)) # record residual - ytil[I[[b]]] <- (y[I[[b]]] - x[I[[b]], ] %*% as.matrix(yfit$coefficients)) # record residial - stil[I[[b]]] <- (s[I[[b]]] - x[I[[b]], ] %*% as.matrix(sfit$coefficients)) # record residual - cat(b, " ") - } - ivfit <- tsls(y = ytil, d = cbind(dtil, stil), x = NULL, z = cbind(dtil, qtil), - intercept = FALSE, homoscedastic = FALSE) - coef_est <- ivfit$coef[1] # extract coefficient - se <- ivfit$se[1] # record standard error - cat(sprintf("\ncoef (se) = %g (%g)\n", coef_est, se)) - return(list(coef_est = coef_est, se = se, dtil = dtil, qtil = qtil, - ytil = ytil, stil = stil, foldid = foldid, spI = I)) -} -``` - -We'll just use OLS for partialling out again. We could of course try something more elaborate if we wanted. - -```{r} -dreg <- function(x, d) { - lm.fit(x, d) -} # ML method=ols -qreg <- function(x, q) { - lm.fit(x, q) -} # ML method=ols -yreg <- function(x, y) { - lm.fit(x, y) -} # ML method=ols -sreg <- function(x, s) { - lm.fit(x, s) -} # ML method=ols - -dml_piv <- lm_dml_for_proxyiv(X, D, Q, S, Y, dreg, qreg, yreg, sreg, nfold = 5) -dml_piv -``` - -## Real Data - Effects of Smoking on Birth Weight - -In this study, we will be studying the effects of smoking on baby weight. We will consider the following stylized setup: - -Outcome ($Y$): baby weight - -Treatment ($D$): smoking - -Unobserved confounding ($A$): family income - -The observed covariates are put in to 3 groups: - - -* Proxy treatment control ($Q$): mother's education -* Proxy outcome control ($S$): parity (total number of previous pregnancies) -* Other observed covariates ($X$): mother's race and age - - -Education serves as a proxy treatment control $Q$ because it reflects unobserved confounding due to household income $A$ but has no direct medical effect on birth weight $Y$. Parity serves as a proxy outcome control $S$ because family size reflects household income $A$ but is not directly caused by smoking $D$ or education $Q$. - -A description of the data used can be found [here](https://www.stat.berkeley.edu/users/statlabs/data/babies.readme). 
- -```{r} -data <- read.table("https://www.stat.berkeley.edu/users/statlabs/data/babies23.data", header = TRUE) -summary(data) -``` - -```{r} -# Filter data to exclude entries where income, number of cigarettes smoked, -# race, age are not asked or not known -data <- data[data$race != 99, ] -data <- data[!(data$number %in% c(98, 99)), ] -data <- data[!(data$inc %in% c(98, 99)), ] -data <- data[data$age != 99, ] -dim(data) -``` - -```{r} -# Create matrices for X, D, Q, S, A, Y -X <- model.matrix(~ 0 + C(race) + age, data) -D <- model.matrix(~ 0 + number, data) -Q <- model.matrix(~ 0 + ed, data) -S <- model.matrix(~ 0 + parity, data) -A <- model.matrix(~ 0 + inc, data) -Y <- model.matrix(~ 0 + wt, data) -``` - -```{r} -# Use cross-fitting with OLS to estimate treatment effect within linear model context -dreg <- function(x, d) { - lm.fit(x, d) -} # ML method=ols -qreg <- function(x, q) { - lm.fit(x, q) -} # ML method=ols -yreg <- function(x, y) { - lm.fit(x, y) -} # ML method=ols -sreg <- function(x, s) { - lm.fit(x, s) -} # ML method=ols - -dml_bw_piv <- lm_dml_for_proxyiv(X, D, Q, S, Y, dreg, qreg, yreg, sreg, nfold = 5) -dml_bw_piv -``` - diff --git a/AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.Rmd b/AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.Rmd deleted file mode 100644 index 96f7e7d7..00000000 --- a/AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.Rmd +++ /dev/null @@ -1,290 +0,0 @@ ---- -title: An R Markdown document converted from "AC1/r-sensitivity-analysis-with-sensmakr-and-debiased-ml.irnb" -output: html_document ---- - -# Sensitivity Analysis for Unobserved Confounder with DML and Sensmakr - -Here we experiment with using package "sensemakr" in conjunction with debiased ML. - -## Partially Linear SEM - -Consider the SEM -\begin{align} -Y :=~& \alpha D + \delta A + f_Y(X) + \epsilon_Y, \\ -D :=~& \gamma A + f_D(X) + \epsilon_D, \\ -A :=~& f_A(X) + \epsilon_A, \\ -X :=~& \epsilon_X, -\end{align} -where, conditional on $X$, $\epsilon_Y, \epsilon_D, \epsilon_A$ are mean zero -and mutually uncorrelated. We further normalize -$$ -E[\epsilon_A^2] =1. -$$ -The key structural -parameter is $\alpha$: $$\alpha = \partial_d Y(d)$$ -where $$Y(d) := (Y: do (D=d)).$$ - -To give context to our example, we can interpret $Y$ as earnings, -$D$ as education, $A$ as ability, and $X$ as a set of observed background variables. In this example, we can interpret $\alpha$ as the returns to schooling. - -We start by applying the partialling out operator to get rid of the $X$'s in all of the equations. Define the partialling out operation of any random vector $V$ with respect to another random vector $X$ as the residual that is left after subtracting the best predictor of $V$ given $X$: -$$\tilde V = V - E [V \mid X].$$ -If $f$'s are linear, we can replace $E [V \mid X]$ -by linear projection. After partialling out, we have a simplified system: -\begin{align} -\tilde Y :=~& \alpha \tilde D + \delta \tilde A + \epsilon_Y, \\ -\tilde D :=~& \gamma \tilde A + \epsilon_D, \\ -\tilde A :=~& \epsilon_A, -\end{align} -where $\epsilon_Y$, $\epsilon_D$, and $\epsilon_A$ are uncorrelated. - -Then the projection of $\tilde Y$ on $\tilde D$ recovers -$$ -\beta = E [\tilde Y \tilde D]/ E [\tilde D^2] = \alpha + \phi, -$$ -where -$$ -\phi = \delta \gamma/ E \left[(\gamma^2 + \epsilon^2_D)\right], -$$ -is the omitted confounder bias or omitted variable bias. 
- -The formula follows from inserting the expression for $\tilde D$ into the definition of $\beta$ and then simplifying the resulting expression using the assumptions on the $\epsilon$'s. - -We can use this formula to bound $\phi$ directly by making assumptions on the size of $\delta$ -and $\gamma$. An alternative approach can be based on the following characterization, -based on partial $R^2$'s. This characterization essentially follows -from Cinelli and Hazlett, with the slight difference that we have adapted -the result to the partially linear model. - -*Theorem* [Omitted Confounder Bias in Terms of Partial $R^2$'s] - -In the partially linear SEM setting above, -$$ -\phi^2 = \frac{R^2_{\tilde Y \sim \tilde A \mid \tilde D} R^2_{\tilde D \sim \tilde A} }{ (1 - R^2_{\tilde D \sim \tilde A}) } \ -\frac{E \left[ (\tilde Y - \beta \tilde D)^2 \right] }{E \left[ ( \tilde D )^2 \right]}, -$$ -where $R^2_{V \sim W \mid X}$ denotes the population $R^2$ in the linear regression of $V$ on $W$, after partialling out $X$ from $V$ and $W$ linearly. - - -Therefore, if we place bounds on how much of the variation in $\tilde Y$ and in $\tilde D$ -the unobserved confounder $\tilde A$ is able to explain, we can bound the omitted confounder bias by $$\sqrt{\phi^2}.$$ - -# Empirical Example - -We consider an empirical example based on data surrounding the Darfur war. Specifically, we are interested in the effect of having experienced direct war violence on attitudes towards peace. Data is described here -https://cran.r-project.org/web/packages/sensemakr/vignettes/sensemakr.html - -The main outcome is attitude towards peace -- ``peacefactor``. -The key variable of interest is whether the responders were directly harmed (``directlyharmed``). -We want to know if being directly harmed in the conflict causes people to support peace-enforcing measures. -The measured confounders include dummy variables ``female``, ``farmer``, ``herder``, ``pastvoted``, along with ``age`` and household size (``hhsize``). -There is also a village indicator. We deal with village by and partialling out a full set of village dummy variables before conducting -the analysis. The standard errors will be clustered at the village level. - - -## Outline - -We will: -- mimic the partialling out procedure with machine learning tools; -- invoke Sensmakr to compute $\phi^2$ and plot sensitivity results. 
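Before turning to the Darfur data, the bias formula can be illustrated with a short simulation from the partialled-out system; the parameter values below are arbitrary and chosen purely for illustration.

```{r}
# Simulate the simplified system: tilde A -> tilde D -> tilde Y, with A also entering Y
set.seed(123)
n <- 10^5
alpha <- 1 # true effect of D on Y
delta <- 2 # effect of the confounder on Y
gamma <- 0.5 # effect of the confounder on D
sd_d <- 1 # sd of the noise in the D equation

tA <- rnorm(n) # tilde A, normalized to unit variance
tD <- gamma * tA + rnorm(n, sd = sd_d) # tilde D
tY <- alpha * tD + delta * tA + rnorm(n) # tilde Y

beta_hat <- as.numeric(coef(lm(tY ~ tD))[2]) # what the short regression recovers
phi <- delta * gamma / (gamma^2 + sd_d^2) # omitted confounder bias from the formula
c(beta_hat = beta_hat, alpha_plus_phi = alpha + phi)
```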
- -```{r} -# load package -install.packages("sensemakr") -install.packages("lfe") -install.packages("hdm") -install.packages("randomForest") -``` - -```{r} -library(sensemakr) -library(lfe) -library(hdm) -library(randomForest) - -set.seed(1) -``` - -```{r} -file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/darfur.csv" -data <- read.csv(file) -dim(data) -attach(darfur) -``` - -## Preprocessing -Take out village fixed effects and run basic linear analysis - -```{r} -# get rid of village fixed effects -peacefactorR <- lm(peacefactor ~ village)$res -directlyharmedR <- lm(directlyharmed ~ village)$res -femaleR <- lm(female ~ village)$res -ageR <- lm(age ~ village)$res -farmerR <- lm(farmer_dar ~ village)$res -herderR <- lm(herder_dar ~ village)$res -pastvotedR <- lm(pastvoted ~ village)$res -hhsizeR <- lm(hhsize_darfur ~ village)$res - - -# Preliminary linear model analysis -summary(felm(peacefactorR ~ directlyharmedR + femaleR + - ageR + farmerR + herderR + pastvotedR + hhsizeR | 0 | 0 | village)) - -# here we are clustering standard errors at the village level -summary(felm(peacefactorR ~ femaleR + - ageR + farmerR + herderR + pastvotedR + hhsizeR | 0 | 0 | village)) - -summary(felm(directlyharmedR ~ femaleR + - ageR + farmerR + herderR + pastvotedR + hhsizeR | 0 | 0 | village)) -``` - -For benchmarking, we want to understand the "partial $R^2$" of the controls on the outcome after partialling out the variable of interest. - -```{r} -# partial out variable of interest -peacefactorR2 <- lm(peacefactorR ~ directlyharmedR)$res -femaleR2 <- lm(femaleR ~ directlyharmedR)$res -ageR2 <- lm(ageR ~ directlyharmedR)$res -farmerR2 <- lm(farmerR ~ directlyharmedR)$res -herderR2 <- lm(herderR ~ directlyharmedR)$res -pastvotedR2 <- lm(pastvotedR ~ directlyharmedR)$res -hhsizeR2 <- lm(hhsizeR ~ directlyharmedR)$res - - -# R^2 of controls after partialling out variable of interest -summary(lm(peacefactorR2 ~ femaleR2 + - ageR2 + farmerR2 + herderR2 + pastvotedR2 + hhsizeR2)) -``` - -## Lasso for partialling out controls - -```{r} -resY <- rlasso(peacefactorR ~ (femaleR + ageR + farmerR + herderR + pastvotedR + hhsizeR)^3, - post = FALSE)$res - -resD <- rlasso(directlyharmedR ~ (femaleR + ageR + farmerR + herderR + pastvotedR + hhsizeR)^3, - post = FALSE)$res - -print(c("Controls explain the following fraction of variance of Outcome", 1 - var(resY) / var(peacefactorR))) -print(c("Controls explain the following fraction of variance of Treatment", 1 - var(resD) / var(directlyharmedR))) - -dml_darfur_model <- felm(resY ~ resD | 0 | 0 | village) # cluster SEs by village - -summary(dml_darfur_model, robust = TRUE) # cluster SE by village - -dml_darfur_model <- lm(resY ~ resD) # linear model to use as input in sensemakr -``` - -## Manual Bias Analysis - -```{r} -# Main estimate -beta <- dml_darfur_model$coef[2] - -# Hypothetical values of partial R2s -r2_yc <- .13 -r2_dc <- .01 - -# Elements of the bias equation -kappa <- (r2_yc * r2_dc) / (1 - r2_dc) -varianceRatio <- mean(dml_darfur_model$res^2) / mean(resD^2) - -# Compute square bias -BiasSq <- kappa * varianceRatio - -# Compute absolute value of the bias -print(cat("absolute value of the bias:", sqrt(BiasSq)[1])) - -# plotting -gridr2_dc <- seq(0, .3, by = .001) -gridr2_yc <- kappa * (1 - gridr2_dc) / gridr2_dc -gridr2_yc <- ifelse(gridr2_yc > 1, 1, gridr2_yc) -plot(gridr2_dc, gridr2_yc, - type = "l", col = 4, xlab = "Partial R2 of Treatment with Confounder", - ylab = "Partial R2 of Outcome with Confounder", - main = 
paste("Combination of R2 such that |Bias| < ", round(sqrt(BiasSq), digits = 4)) -) -``` - -## Bias Analysis with Sensemakr - -```{r} -dml_darfur_sensitivity <- sensemakr( - model = dml_darfur_model, - treatment = "resD" -) -summary(dml_darfur_sensitivity) -plot(dml_darfur_sensitivity, nlevels = 15) -``` - -## Random Forest for partialling out - -The following code does DML with clustered standard errors by village - -```{r} -dml2_for_plm <- function(x, d, y, dreg, yreg, nfold = 2, clusterID) { - nobs <- nrow(x) # number of observations - foldid <- rep.int(1:nfold, times = ceiling(nobs / nfold))[sample.int(nobs)] # define folds indices - I <- split(1:nobs, foldid) # split observation indices into folds - ytil <- dtil <- rep(NA, nobs) - cat("fold: ") - for (b in seq_along(I)) { - dfit <- dreg(x[-I[[b]], ], d[-I[[b]]]) # take a fold out - yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a foldt out - dhat <- predict(dfit, x[I[[b]], ], type = "response") # predict the left-out fold - yhat <- predict(yfit, x[I[[b]], ], type = "response") # predict the left-out fold - dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold - ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold - cat(b, " ") - } - rfit <- felm(ytil ~ dtil | 0 | 0 | clusterID) # get clustered standard errors using felm - rfitSummary <- summary(rfit) - coef_est <- rfitSummary$coef[2] # extract coefficient - se <- rfitSummary$coef[2, 2] # record robust standard error - cat(sprintf("\ncoef (se) = %g (%g)\n", coef_est, se)) # printing output - return(list(coef_est = coef_est, se = se, dtil = dtil, ytil = ytil)) # save output and residuals -} -``` - -```{r} -x <- model.matrix(~ femaleR + ageR + farmerR + herderR + pastvotedR + hhsizeR) -dim(x) - -d <- directlyharmedR -y <- peacefactorR -# DML with Random Forest: -dreg <- function(x, d) { - randomForest(x, d) -} # ML method=Forest -yreg <- function(x, y) { - randomForest(x, y) -} # ML method=Forest - -dml2_rf <- dml2_for_plm(x, d, y, dreg, yreg, nfold = 10, clusterID = village) - -resY <- dml2_rf$ytil -resD <- dml2_rf$dtil - -print(c("Controls explain the following fraction of variance of Outcome", - max(1 - var(resY) / var(peacefactorR), 0))) -print(c("Controls explain the following fraction of variance of Treatment", - max(1 - var(resD) / var(directlyharmedR), 0))) - -dml_darfur_model <- lm(resY ~ resD) -``` - -## Bias Analysis with Sensemakr - - -```{r} -dml_darfur_sensitivity <- sensemakr( - model = dml_darfur_model, - treatment = "resD" -) -summary(dml_darfur_sensitivity) - -plot(dml_darfur_sensitivity, nlevels = 15) -``` - diff --git a/AC2/r-debiased-ml-for-partially-linear-iv-model.Rmd b/AC2/r-debiased-ml-for-partially-linear-iv-model.Rmd deleted file mode 100644 index ce73b72a..00000000 --- a/AC2/r-debiased-ml-for-partially-linear-iv-model.Rmd +++ /dev/null @@ -1,286 +0,0 @@ ---- -title: An R Markdown document converted from "AC2/r-debiased-ml-for-partially-linear-iv-model.irnb" -output: html_document ---- - -# Double/Debiased ML for Partially Linear IV Model - -This is a simple implementation of Debiased Machine Learning for the Partially Linear -IV Regression Model, which provides an application of DML IV inference. - - -Reference: - -- https://arxiv.org/abs/1608.00060 -- https://www.amazon.com/Business-Data-Science-Combining-Accelerate/dp/1260452778 - -The code is based on the book. 
- - -# Partially Linear IV Model - -We consider the partially linear structural equation model: -\begin{align} - Y :=~& D\theta_0 + g_0(X) + \zeta, & E[\zeta \mid Z,X]= 0,\\ - Z :=~& m_0(X) + V, & E[V \mid X] = 0. -\end{align} - - -Note that this model is not a regression model unless $Z=D$. The model is a canonical -model in causal inference, going back to P. Wright's work on IV methods for estimating demand/supply equations, with the modern difference being that $g_0$ and $m_0$ are nonlinear, potentially complicated functions of high-dimensional $X$. - - -The idea of this model is that there is a structural or causal relation between $Y$ and $D$, captured by $\theta_0$, and $g_0(X) + \zeta$ is the stochastic error, partly explained by covariates $X$. $V$ and $\zeta$ are stochastic errors that are not explained by $X$. Since $Y$ and $D$ are jointly determined, we need an external factor, commonly referred to as an instrument, $Z$ to create exogenous variation -in $D$. Note that $Z$ should affect $D$. The $X$ here serve again as confounding factors, so we can think of variation in $Z$ as being exogenous only conditional on $X$. - - -The causal DAG this model corresponds to is given by: -$$ -Z \to D, X \to (Y, Z, D), L \to (Y,D), -$$ -where $L$ is the latent confounder affecting both $Y$ and $D$, but not $Z$. - - - ---- - -# Example - -A simple contextual example is from biostatistics, where $Y$ is a health outcome and $D$ is indicator of smoking. Thus, $\theta_0$ is captures the effect of smoking on health. Health outcome $Y$ and smoking behavior $D$ are treated as being jointly determined. $X$ represents patient characteristics, and $Z$ could be a doctor's advice not to smoke (or another behavioral treatment) that may affect the outcome $Y$ only through shifting the behavior $D$, conditional on characteristics $X$. - ----- - - - -# PLIVM in Residualized Form - -The PLIV model above can be rewritten in the following residualized form: -$$ - \tilde Y = \tilde D \theta_0 + \zeta, \quad E[\zeta \mid V,X]= 0, -$$ -where -$$ - \tilde Y = (Y- \ell_0(X)), \quad \ell_0(X) = E[Y \mid X] \\ - \tilde D = (D - r_0(X)), \quad r_0(X) = E[D \mid X] \\ - \tilde Z = (Z- m_0(X)), \quad m_0(X) = E[Z \mid X]. -$$ - The "tilde" variables (e.g. $\tilde Y$) above represent original variables after taking out or "partialling out" - the effect of $X$. Note that $\theta_0$ is identified from this equation if $V$ - and $U$ have non-zero correlation, which automatically means that $U$ and $V$ - must have non-zero variation. - - - ------ - -# DML for PLIV Model - -Given identification, DML proceeds as follows - -Compute the estimates $\hat \ell_0$, $\hat r_0$, and $\hat m_0$ , which amounts -to solving the three problems of predicting $Y$, $D$, and $Z$ using -$X$, using any generic ML method, giving us estimated residuals -$$ -\tilde Y = Y - \hat \ell_0(X), \\ \tilde D= D - \hat r_0(X), \\ \tilde Z = Z- \hat m_0(X). -$$ -The estimates should be of a cross-validated form, as detailed in the algorithm below. - -Estimate $\theta_0$ by the the intstrumental -variable regression of $\tilde Y$ on $\tilde D$ using $\tilde Z$ as an instrument. -Use the conventional inference for the IV regression estimator, ignoring -the estimation error in these residuals. - -The reason we work with this residualized form is that it eliminates the bias -arising when solving the prediction problem in stage 1. The role of cross-validation -is to avoid another source of bias due to potential overfitting. 
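To make the second step explicit: since the model is just-identified, the IV regression of $\tilde Y$ on $\tilde D$ with instrument $\tilde Z$ has the closed form
$$
\hat \theta = \frac{\mathbb{E}_n[\tilde Z \tilde Y]}{\mathbb{E}_n[\tilde Z \tilde D]}, \qquad
\hat \theta \overset{a}{\sim} N\left(\theta_0, \ \frac{E[\tilde Z^2 \zeta^2]}{n \left(E[\tilde Z \tilde D]\right)^2}\right),
$$
and conventional IV inference simply plugs sample analogues and $\hat \zeta = \tilde Y - \hat \theta \tilde D$ into this variance expression (or its homoskedastic simplification, depending on the options passed to `tsls`).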
- -The estimator is adaptive, -in the sense that the first stage estimation errors do not affect the second -stage errors. - -```{r} -install.packages("hdm") -install.packages("AER") -install.packages("randomForest") -install.packages("lfe") -install.packages("glmnet") -``` - -```{r} -library(hdm) -library(AER) # applied econometrics library -library(randomForest) # random Forest library -library(lfe) # high-dimensional econometrics library -library(glmnet) # glm net - -set.seed(1) -``` - -```{r} -# DML for PLIVM - -dml2_for_plivm <- function(x, d, z, y, dreg, yreg, zreg, nfold = 5) { - # this implements DML2 algorithm, where there moments are estimated via DML, before constructing - # the pooled estimate of theta randomly split data into folds - nobs <- nrow(x) - foldid <- rep.int(1:nfold, times = ceiling(nobs / nfold))[sample.int(nobs)] - I <- split(1:nobs, foldid) - # create residualized objects to fill - ytil <- dtil <- ztil <- rep(NA, nobs) - # obtain cross-fitted residuals - cat("fold: ") - for (b in seq_along(I)) { - dfit <- dreg(x[-I[[b]], ], d[-I[[b]]]) # take a fold out - zfit <- zreg(x[-I[[b]], ], z[-I[[b]]]) # take a fold out - yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a folot out - dhat <- predict(dfit, x[I[[b]], ], type = "response") # predict the fold out - zhat <- predict(zfit, x[I[[b]], ], type = "response") # predict the fold out - yhat <- predict(yfit, x[I[[b]], ], type = "response") # predict the fold out - dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual - ztil[I[[b]]] <- (z[I[[b]]] - zhat) # record residual - ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial - cat(b, " ") - } - ivfit <- tsls(y = ytil, d = dtil, x = NULL, z = ztil, intercept = FALSE) - coef_est <- ivfit$coef # extract coefficient - se <- ivfit$se # record standard error - cat(sprintf("\ncoef (se) = %g (%g)\n", coef_est, se)) - return(list(coef_est = coef_est, se = se, dtil = dtil, ytil = ytil, ztil = ztil)) -} -``` - ------ - -# Emprical Example: Acemoglu, Johnson, Robinson (AER). 
- - -* Y is log GDP; -* D is a measure of Protection from Expropriation, a proxy for -quality of insitutions; -* Z is the log of Settler's mortality; -* W are geographical variables (latitude, latitude squared, continent dummies as well as interactions) - - -```{r} -file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/AJR.csv" -AJR <- read.csv(file) -dim(AJR) -``` - -```{r} -y <- AJR$GDP -d <- AJR$Exprop -z <- AJR$logMort -xraw <- model.matrix(~ Latitude + Africa + Asia + Namer + Samer, data = AJR) -x <- model.matrix(~ -1 + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2, data = AJR) -dim(x) -``` - -```{r} -set.seed(1) -# DML with PostLasso -cat(sprintf("\n DML with Post-Lasso \n")) - -dreg <- function(x, d) { - hdm::rlasso(x, d) -} # ML method=lasso -yreg <- function(x, y) { - hdm::rlasso(x, y) -} # ML method=lasso -zreg <- function(x, z) { - hdm::rlasso(x, z) -} # ML method=lasso - -dml2_lasso <- dml2_for_plivm(x, d, z, y, dreg, yreg, zreg, nfold = 5) - - -# DML with Random Forest -cat(sprintf("\n DML with Random Forest \n")) - -dreg <- function(x, d) { - randomForest(x, d) -} # ML method=Forest -yreg <- function(x, y) { - randomForest(x, y) -} # ML method=Forest -zreg <- function(x, z) { - randomForest(x, z) -} # ML method=Forest - -dml2_rf <- dml2_for_plivm(xraw, d, z, y, dreg, yreg, zreg, nfold = 5) - -# Compare Forest vs Lasso -comp_tab <- matrix(NA, 3, 2) -comp_tab[1, ] <- c(sqrt(mean((dml2_rf$ytil)^2)), sqrt(mean((dml2_lasso$ytil)^2))) -comp_tab[2, ] <- c(sqrt(mean((dml2_rf$dtil)^2)), sqrt(mean((dml2_lasso$dtil)^2))) -comp_tab[3, ] <- c(sqrt(mean((dml2_rf$ztil)^2)), sqrt(mean((dml2_lasso$ztil)^2))) -rownames(comp_tab) <- c("RMSE for Y:", "RMSE for D:", "RMSE for Z:") -colnames(comp_tab) <- c("RF", "LASSO") -print(comp_tab, digits = 3) -``` - -# Weak Instruments? - -```{r} -# using lfe package -summary(felm(dml2_lasso$dtil ~ dml2_lasso$ztil), robust = TRUE) -summary(felm(dml2_rf$dtil ~ dml2_rf$ztil), robust = TRUE) -``` - -## Anderson-Rubin Idea for Inference with Weak Instruments - -As shown above, we may have weak instruments because the t-stat in the regression $\tilde D \sim \tilde Z$ is small relative to standard rules of thumb -- for example Stock and Yogo (2005) suggest accounting for weak instruments if the first stage F-statistic is less than 10 (and more recent work suggests even larger cutoffs). - - - Here, we consider one specific approach (from Anderson and Rubin (1949)) to doing valid inference under weak identification based upon the statistic: -$$ -C(\theta) = \frac{ |E_n [(\tilde Y - \theta \tilde D) \tilde Z]|^2}{ \mathbb{V}_n [(\tilde Y - \theta \tilde D) \tilde Z ]/n}. -$$ -The empirical variance $\mathbb{V}_n$ is defined as: -\begin{align*} -\mathbb{V}_{n}[ g(W_i)] &:= \mathbb{E}_{n}g(W_i)g(W_i)' - \mathbb{E}_{n}[ g(W_i)]\mathbb{E}_{n}[ g(W_i)]'. -\end{align*} - -If $\theta_0 = \theta$, then $C(\theta) \overset{a}{\sim} N(0,1)^2 = \chi^2(1)$. Therefore, we can reject the hypothesis $\theta_0 = \theta$ at level $a$ (for example $a = .05$ for a 5\% level test) if $C(\theta)> c(1-a)$ where $c(1-a)$ is the $(1-a)$- quantile of a $\chi^2(1)$ variable. The probability of falsely rejecting the true hypothesis is approximately $a \times 100\%$. -To construct a $(1-a) \times 100$\% confidence region for $\theta$, we can then simply invert the test by collecting all parameter values that are not rejected at the $a$ level: -$$ -CR(\theta) = \{ \theta \in \Theta: C(\theta) \leq c(1-a)\}. 
-$$ - - -In more complex settings with many controls or controls that enter with unknown functional form, we can simply replace the residuals $\tilde Y$, $\tilde D$, and $\tilde Z$ by machine -learned cross-fitted residuals $\check Y$, $\check D$, and $ \check Z$. Thanks to the orthogonality of the IV moment condition underlying the formulation outlined above, we can formally assert that the properties of $C(\theta)$ and the subsequent testing procedure and confidence region for $\theta$ continue to hold when using cross-fitted residuals. We will further be able to apply the general procedure to cases where $D$ -is a vector, with a suitable adjustment of the statistic $C(\theta)$. - -```{r} -# DML-AR (DML with Anderson-Rubin) - -dml_ar_pliv <- function(rY, rD, rZ, grid, alpha = .05) { - n <- length(rY) - Cstat <- rep(0, length(grid)) - for (i in seq_along(grid)) { - Cstat[i] <- n * (mean((rY - grid[i] * rD) * rZ))^2 / var((rY - grid[i] * rD) * rZ) - } - LB <- min(grid[Cstat < qchisq(1 - alpha, 1)]) - UB <- max(grid[Cstat < qchisq(1 - alpha, 1)]) - plot(range(grid), range(c(Cstat)), type = "n", xlab = "Effect of institutions", ylab = "Statistic", main = " ") - lines(grid, Cstat, lty = 1, col = 1) - abline(h = qchisq(1 - alpha, 1), lty = 3, col = 4) - abline(v = LB, lty = 3, col = 2) - abline(v = UB, lty = 3, col = 2) - return(list(UB = UB, LB = LB)) -} -``` - -```{r} -dml_ar_pliv( - rY = dml2_lasso$ytil, rD = dml2_lasso$dtil, rZ = dml2_lasso$ztil, - grid = seq(-2, 2, by = .01) -) - -dml_ar_pliv( - rY = dml2_rf$ytil, rD = dml2_rf$dtil, rZ = dml2_rf$ztil, - grid = seq(-2, 2, by = .01) -) -``` - diff --git a/AC2/r-dml-401k-IV.Rmd b/AC2/r-dml-401k-IV.Rmd deleted file mode 100644 index 602944aa..00000000 --- a/AC2/r-dml-401k-IV.Rmd +++ /dev/null @@ -1,1309 +0,0 @@ ---- -title: An R Markdown document converted from "AC2/r-dml-401k-IV.irnb" -output: html_document ---- - -# Inference on Predictive and Causal Effects in High-Dimensional Nonlinear Models - -## Impact of 401(k) on Financial Wealth - -We consider estimation of the effect of 401(k) participation -on accumulated assets. 401(k) plans are pension accounts sponsored by employers. The key problem in determining the effect of participation in 401(k) plans on accumulated assets is saver heterogeneity coupled with the fact that the decision to enroll in a 401(k) is non-random. It is generally recognized that some people have a higher preference for saving than others. It also seems likely that those individuals with high unobserved preference for saving would be most likely to choose to participate in tax-advantaged retirement savings plans and would tend to have otherwise high amounts of accumulated assets. The presence of unobserved savings preferences with these properties then implies that conventional estimates that do not account for saver heterogeneity and endogeneity of participation will be biased upward, tending to overstate the savings effects of 401(k) participation. - -One can argue that eligibility for enrolling in a 401(k) plan in this data can be taken as exogenous after conditioning on a few observables of which the most important for their argument is income. The basic idea is that, at least around the time 401(k)’s initially became available, people were unlikely to be basing their employment decisions on whether an employer offered a 401(k) but would instead focus on income and other aspects of the job. 
- -```{r} -install.packages("xtable") -install.packages("hdm") -install.packages("sandwich") -install.packages("ggplot2") -install.packages("randomForest") -install.packages("glmnet") -install.packages("rpart") -install.packages("data.table") -install.packages("gbm") -``` - -```{r} -library(xtable) -library(hdm) -library(sandwich) -library(ggplot2) -library(randomForest) -library(data.table) -library(glmnet) -library(rpart) -library(gbm) - -set.seed(123) -``` - -### Data - -The raw dataset can be found [here](https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/401k.csv). -The data set can be loaded from the `hdm` package for R directly by typing: - - -```{r} -data(pension) -data <- pension -dim(data) -``` - -See the "Details" section on the description of the data set, which can be accessed by - -```{r} -help(pension) -``` - -The data consist of 9,915 observations at the household level drawn from the 1991 Survey of Income and Program Participation (SIPP). All the variables are referred to 1990. We use net financial assets (*net\_tfa*) as the outcome variable, $Y$, in our analysis. The net financial assets are computed as the sum of IRA balances, 401(k) balances, checking accounts, saving bonds, other interest-earning accounts, other interest-earning assets, stocks, and mutual funds less non mortgage debts. - -Among the $9915$ individuals, $3682$ are eligible to participate in the program. The variable *e401* indicates eligibility and *p401* indicates participation, respectively. - -```{r} -hist_e401 <- ggplot(data, aes(x = e401, fill = factor(e401))) + - geom_bar() -hist_e401 -``` - -Eligibility is highly associated with financial wealth: - -```{r} -dens_net_tfa <- ggplot(data, aes(x = net_tfa, color = factor(e401), fill = factor(e401))) + - geom_density() + - xlim(c(-20000, 150000)) + - facet_wrap(. ~ e401) - -dens_net_tfa -``` - -The unconditional APE of e401 is about $19559$: - -```{r} -e1 <- data[data$e401 == 1, ] -e0 <- data[data$e401 == 0, ] -round(mean(e1$net_tfa) - mean(e0$net_tfa), 0) -``` - -Among the $3682$ individuals that are eligible, $2594$ decided to participate in the program. The unconditional APE of p401 is about $27372$: - -```{r} -p1 <- data[data$p401 == 1, ] -p0 <- data[data$p401 == 0, ] -round(mean(p1$net_tfa) - mean(p0$net_tfa), 0) -``` - -As discussed, these estimates are biased since they do not account for saver heterogeneity and endogeneity of participation. - -```{r} -# instrument variable -Z <- data[, "e401"] -# treatment variable -D <- data[, "p401"] -# outcome variable -y <- data[, "net_tfa"] -``` - -### We construct the engineered features for controls - -```{r} -# Constructing the controls -x_formula <- paste("~ poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + ", - "poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown") -X <- as.data.table(model.frame(x_formula, pension)) -head(X) -``` - -# Instrumental Variables: Effect of 401k Participation on Financial Assets - -## Double ML IV under Partial Linearity - -Now, we consider estimation of average treatment effects of participation in 401k, i.e. `p401`, with the binary instrument being eligibility in 401k, i.e. `e401`. As before, $Y$ denotes the outcome `net_tfa`, and $X$ is the vector of covariates. 
We consider a partially linear structural equation model: -\begin{align} -Y :=~& g_Y(\epsilon_Y) D + f_Y(A, X, \epsilon_Y), \\ -D :=~& f_D(Z, X, A, \epsilon_D), \\ -Z :=~& f_Z(X, \epsilon_Z),\\ -A :=~& f_A(X, \epsilon_A), \\ -X :=~& \epsilon_X, -\end{align} -where $A$ is a vector of un-observed confounders. - -Under this structural equation model, the average treatment effect: -\begin{align} -\alpha = E[Y(1) - Y(0)] -\end{align} -can be identified by the moment restriction: -\begin{align} -E[(\tilde{Y} - \alpha \tilde{D}) \tilde{Z}] = 0 -\end{align} -where for any variable $V$, we denote with $\tilde{V} = V - E[V|X]$. - -```{r} -set.seed(1) -# family gaussian means that we'll be using square loss -yfit_lasso_cv <- cv.glmnet(as.matrix(X), y, family = "gaussian", alpha = 1) -# family gaussian means that we'll be using square loss -dfit_lasso_cv <- cv.glmnet(as.matrix(X), D, family = "gaussian", alpha = 1) -# family gaussian means that we'll be using square loss -zfit_lasso_cv <- cv.glmnet(as.matrix(X), Z, family = "gaussian", alpha = 1) - - -yhat_lasso_cv <- predict(yfit_lasso_cv, newx = as.matrix(X)) # predictions -dhat_lasso_cv <- predict(dfit_lasso_cv, newx = as.matrix(X)) # predictions -zhat_lasso_cv <- predict(zfit_lasso_cv, newx = as.matrix(X)) # predictions - -resy <- y - yhat_lasso_cv -resD <- D - dhat_lasso_cv -resZ <- Z - zhat_lasso_cv - -# Estimate -mean(resy * resZ) / mean(resZ * resD) -``` - -Recall if we want to do inference, we need to either use the theoretically driven penalty paramter for Lasso or perform cross-fitting. - -### DML with Non-Linear ML Models and Cross-fitting - -```{r} -# DML for PLIVM with D and Z as classifiers or regressors -dml2_for_plivm <- function(x, d, z, y, dreg, yreg, zreg, nfold = 5, method = "regression") { - nobs <- nrow(x) - foldid <- rep.int(1:nfold, times = ceiling(nobs / nfold))[sample.int(nobs)] - I <- split(1:nobs, foldid) - # create residualized objects to fill - ytil <- dtil <- ztil <- rep(NA, nobs) - # obtain cross-fitted residuals - cat("fold: ") - for (b in seq_along(I)) { - if (method == "randomforest") { - # take a fold out - dfit <- dreg(x[-I[[b]], ], d[-I[[b]]]) - zfit <- zreg(x[-I[[b]], ], z[-I[[b]]]) - yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) - # predict the fold out - dhat <- predict(dfit, x[I[[b]], ], type = "prob")[, 2] # type = "prob" is like predict_proba in scikitlearn - zhat <- predict(zfit, x[I[[b]], ], type = "prob")[, 2] - # default type = "response" for regression for RF, type = "vector" for regression for Decision Trees - yhat <- predict(yfit, x[I[[b]], ]) - # record residual - # as.numeric will turn d = as.factor(d) from 0,1 to 1,2 so subtract 1! 
- dtil[I[[b]]] <- (as.numeric(d[I[[b]]]) - 1 - dhat) - ztil[I[[b]]] <- (as.numeric(z[I[[b]]]) - 1 - zhat) - ytil[I[[b]]] <- (y[I[[b]]] - yhat) - } else if (method == "regression") { # works for both boosted trees and glmnet - # take a fold out - dfit <- dreg(x[-I[[b]], ], d[-I[[b]]]) - zfit <- zreg(x[-I[[b]], ], z[-I[[b]]]) - yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) - # predict the fold out - dhat <- predict(dfit, x[I[[b]], ], type = "response") - zhat <- predict(zfit, x[I[[b]], ], type = "response") - yhat <- predict(yfit, x[I[[b]], ], type = "response") - # record residual - dtil[I[[b]]] <- (d[I[[b]]] - dhat) - ztil[I[[b]]] <- (z[I[[b]]] - zhat) - ytil[I[[b]]] <- (y[I[[b]]] - yhat) - } else if (method == "decisiontrees") { - # take a fold out - dfit <- dreg(x[-I[[b]], ], as.factor(d)[-I[[b]]]) - zfit <- zreg(x[-I[[b]], ], as.factor(z)[-I[[b]]]) - yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) - # predict the fold out - dhat <- predict(dfit, x[I[[b]], ])[, 2] - zhat <- predict(zfit, x[I[[b]], ])[, 2] - yhat <- predict(yfit, x[I[[b]], ]) - # record residual - dtil[I[[b]]] <- (d[I[[b]]] - dhat) - ztil[I[[b]]] <- (z[I[[b]]] - zhat) - ytil[I[[b]]] <- (y[I[[b]]] - yhat) - } - - cat(b, " ") - } - ivfit <- tsls(y = ytil, d = dtil, x = NULL, z = ztil, intercept = FALSE) - coef_est <- ivfit$coef # extract coefficient - se <- ivfit$se # record standard error - cat(sprintf("\ncoef (se) = %g (%g)\n", coef_est, se)) - - return(list(coef_est = coef_est, se = se, dtil = dtil, ytil = ytil, ztil = ztil)) -} -``` - -```{r} -summary_for_plivm <- function(point, stderr, resy, resD, resZ, name) { - data <- data.frame( - estimate = point, # point estimate - stderr = stderr, # standard error - lower = point - 1.96 * stderr, # lower end of 95% confidence interval - upper = point + 1.96 * stderr, # upper end of 95% confidence interval - `rmse y` = sqrt(mean(resy^2)), # RMSE of model that predicts outcome y - `rmse D` = sqrt(mean(resD^2)), # RMSE of model that predicts treatment D - `rmse Z` = sqrt(mean(resZ^2)), # RMSE of model that predicts treatment D - `accuracy D` = mean(abs(resD) < 0.5), # binary classification accuracy of model for D - `accuracy Z` = mean(abs(resZ) < 0.5) # binary classification accuracy of model for Z - ) - rownames(data) <- name - return(data) -} -``` - -#### Double Lasso with Cross-Fitting - -```{r} -# DML with LassoCV -set.seed(123) -cat(sprintf("\nDML with Lasso CV \n")) - -dreg_lasso_cv <- function(x, d) { - cv.glmnet(x, d, family = "gaussian", alpha = 1, nfolds = 5) -} -yreg_lasso_cv <- function(x, y) { - cv.glmnet(x, y, family = "gaussian", alpha = 1, nfolds = 5) -} -zreg_lasso_cv <- function(x, z) { - cv.glmnet(x, z, family = "gaussian", alpha = 1, nfolds = 5) -} - -dml2_results <- dml2_for_plivm(as.matrix(X), D, Z, y, dreg_lasso_cv, yreg_lasso_cv, zreg_lasso_cv, - nfold = 5, method = "regression") -sum_lasso_cv <- summary_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, - dml2_results$ztil, name = "LassoCV") -tableplr <- data.frame() -tableplr <- rbind(sum_lasso_cv) -tableplr - -ytil_lasso <- dml2_results$ytil -dtil_lasso <- dml2_results$dtil -ztil_lasso <- dml2_results$ztil -``` - -#### Using a $\ell_2$ Penalized Logistic Regression for D and Z - -```{r} -# DML with Lasso/Ridge -set.seed(123) -cat(sprintf("\nDML with Lasso/Logistic \n")) - -dreg_ridge_cv <- function(x, d) { - cv.glmnet(x, d, family = "binomial", alpha = 0, nfolds = 5) -} -yreg_ridge_cv <- function(x, y) { - cv.glmnet(x, y, family = "gaussian", alpha = 1, nfolds = 5) -} -zreg_ridge_cv 
<- function(x, z) { - cv.glmnet(x, z, family = "binomial", alpha = 0, nfolds = 5) -} - -dml2_results <- dml2_for_plivm(as.matrix(X), D, Z, y, dreg_ridge_cv, yreg_ridge_cv, zreg_ridge_cv, - nfold = 5, method = "regression") -sum_lasso_ridge_cv <- summary_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, - dml2_results$ztil, name = "LassoCV/LogisticCV") -tableplr <- rbind(tableplr, sum_lasso_ridge_cv) -tableplr - -ytil_ridge <- dml2_results$ytil -dtil_ridge <- dml2_results$dtil -ztil_ridge <- dml2_results$ztil -``` - -### Random Forests - -```{r} -# DML with Random Forest -set.seed(123) -cat(sprintf("\nDML with Random Forest \n")) - -dreg_rf <- function(x, d) { - randomForest(x, d, ntree = 1000, nodesize = 10) -} # ML method=Forest -yreg_rf <- function(x, y) { - randomForest(x, y, ntree = 1000, nodesize = 10) -} # ML method=Forest -zreg_rf <- function(x, z) { - randomForest(x, z, ntree = 1000, nodesize = 10) -} # ML method=Forest - -dml2_results <- dml2_for_plivm(as.matrix(X), as.factor(D), as.factor(Z), y, dreg_rf, yreg_rf, zreg_rf, - nfold = 5, method = "randomforest") -sum_rf <- summary_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, - dml2_results$ztil, name = "RF") -tableplr <- rbind(tableplr, sum_rf) -tableplr - -ytil_rf <- dml2_results$ytil -dtil_rf <- dml2_results$dtil -ztil_rf <- dml2_results$ztil -``` - -### Decision Trees - -```{r} -# DML with Decision Trees -set.seed(123) -cat(sprintf("\nDML with Decision Trees \n")) - -# decision tree takes in X as dataframe, not matrix/array -dreg_tr <- function(x, d) { - rpart(as.formula("D~."), cbind(data.frame(D = d), x), method = "class", minbucket = 10, cp = 0.001) -} -yreg_tr <- function(x, y) { - rpart(as.formula("y~."), cbind(data.frame(y = y), x), minbucket = 10, cp = 0.001) -} -zreg_tr <- function(x, z) { - rpart(as.formula("Z~."), cbind(data.frame(Z = z), x), method = "class", minbucket = 10, cp = 0.001) -} - -dml2_results <- dml2_for_plivm(X, D, Z, y, dreg_tr, yreg_tr, zreg_tr, - nfold = 5, method = "decisiontrees") -sum_tr <- summary_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, - dml2_results$ztil, name = "Decision Trees") -tableplr <- rbind(tableplr, sum_tr) -tableplr - -ytil_tr <- dml2_results$ytil -dtil_tr <- dml2_results$dtil -ztil_tr <- dml2_results$ztil -``` - -### Boosted Trees - -```{r} -# DML with Boosted Trees -set.seed(123) -cat(sprintf("\nDML with Boosted Trees \n")) - -# NB: early stopping cannot easily be implemented with gbm -## set n.trees = best, where best <- gbm.perf(dreg_boost, plot.it = FALSE) -dreg_boost <- function(x, d) { - gbm(as.formula("D~."), cbind(data.frame(D = d), x), distribution = "bernoulli", - interaction.depth = 2, n.trees = 100, shrinkage = .1) -} -yreg_boost <- function(x, y) { - gbm(as.formula("y~."), cbind(data.frame(y = y), x), distribution = "gaussian", - interaction.depth = 2, n.trees = 100, shrinkage = .1) -} -zreg_boost <- function(x, z) { - gbm(as.formula("Z~."), cbind(data.frame(Z = z), x), distribution = "bernoulli", - interaction.depth = 2, n.trees = 100, shrinkage = .1) -} - -# passing these through regression as type="response", and D and Z should not be factors! 
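# Note: with distribution = "bernoulli", predict(..., type = "response") returns fitted
# probabilities, so the method = "regression" branch of dml2_for_plivm still produces the
# intended residuals D - Phat(D = 1 | X) and Z - Phat(Z = 1 | X).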
-dml2_results <- dml2_for_plivm(X, D, Z, y, dreg_boost, yreg_boost, zreg_boost, - nfold = 5, method = "regression") -sum_boost <- summary_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, - dml2_results$ztil, name = "Boosted Trees") -tableplr <- rbind(tableplr, sum_boost) -tableplr - -ytil_boost <- dml2_results$ytil -dtil_boost <- dml2_results$dtil -ztil_boost <- dml2_results$ztil -``` - -## Ensembles - -Boosted trees give the best RMSE for Y, D, and Z, so the ensemble based on choosing the best performing prediction rule is identical to boosting in this case. - -```{r} -# Best fit is boosted trees for D, Z, Y - -sum_best <- summary_for_plivm(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, - dml2_results$ztil, name = "Best") -tableplr <- rbind(tableplr, sum_best) -tableplr -``` - -We'll form a model average with unconstrained least squares weights. - -```{r} -# Least squares model average - -dhat_lasso <- D - dtil_lasso -dhat_ridge <- D - dtil_ridge -dhat_rf <- D - dtil_rf -dhat_tr <- D - dtil_tr -dhat_boost <- D - dtil_boost - -yhat_lasso <- y - ytil_lasso -yhat_ridge <- y - ytil_ridge -yhat_rf <- y - ytil_rf -yhat_tr <- y - ytil_tr -yhat_boost <- y - ytil_boost - -zhat_lasso <- Z - ztil_lasso -zhat_ridge <- Z - ztil_ridge -zhat_rf <- Z - ztil_rf -zhat_tr <- Z - ztil_tr -zhat_boost <- Z - ztil_boost - -ma_dtil <- lm(D ~ dhat_lasso + dhat_ridge + dhat_rf + dhat_tr + dhat_boost)$residuals -ma_ytil <- lm(y ~ yhat_lasso + yhat_ridge + yhat_rf + yhat_tr + yhat_boost)$residuals -ma_ztil <- lm(Z ~ zhat_lasso + zhat_ridge + zhat_rf + zhat_tr + zhat_boost)$residuals - -ivfit <- tsls(y = ma_ytil, d = ma_dtil, x = NULL, z = ma_ztil, intercept = FALSE) -coef_est <- ivfit$coef # extract coefficient -se <- ivfit$se # record standard error - -sum_ma <- summary_for_plivm(coef_est, se, ma_ytil, ma_dtil, ma_ztil, name = "Model Average") -tableplr <- rbind(tableplr, sum_ma) -tableplr -``` - -## Inference Robust to Weak Identification - -Now we turn toward robustness when the instrument is weak. - -Ideally, we would do (semi) cross-fitting with AutoML in order to find good first-stage models and re-run DML with these models. Unfortunately this is not easy to do in R. In the case of semi cross-fitting, we can use R's H20 AutoML trained on the entire training set $y\sim X$, $D \sim X$, $Z\sim X$ to determine the best model (eg ensemble), but H20 does not allow you to extract the best model so we can re-use that in DML. - -Thus, in the below analysis of robust inference, we choose Boosted Trees as they perform well. - -```{r} -robust_inference <- function(point, stderr, resD, resy, resZ, grid, alpha = 0.05) { - # Inference in the partially linear IV model that is robust to weak identification. 
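  # This inverts an Anderson-Rubin-style test: at the true theta the orthogonal moment
  # (resy - theta * resD) * resZ has mean zero, so a candidate theta is kept whenever
  # n * mean(moment)^2 / var(moment) falls below the chi-squared(1) critical value; the
  # accepted values form a confidence region that remains valid even when the instrument is weak.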
- # grid: grid of theta values to search over when trying to identify the confidence region - # alpha: confidence level - - n <- dim(X)[1] - thr <- qchisq(1 - alpha, df = 1) - accept <- c() - - for (theta in grid) { - moment <- (resy - theta * resD) * resZ - test <- n * mean(moment)^2 / var(moment) - if (test <= thr) { - accept <- c(accept, theta) - } - } - - return(accept) -} -``` - -```{r} -grid <- seq(0, 20000, length.out = 10000) -region <- robust_inference(dml2_results$coef_est, dml2_results$stderr, dml2_results$dtil, dml2_results$ytil, - dml2_results$ztil, grid = grid) -``` - -```{r} -grid <- seq(0, 20000, length.out = 10000) -region <- robust_inference(dml2_results$coef_est, dml2_results$stderr, dml2_results$dtil, dml2_results$ytil, - dml2_results$ztil, grid = grid) -# Calculate min and max -min_region <- min(region) -max_region <- max(region) - -print(min_region) -print(max_region) -``` - -# Interactive IV Model and LATE - -Now, we consider estimation of local average treatment effects (LATE) of participation `p401`, with the binary instrument `e401`. As before, $Y$ denotes the outcome `net_tfa`, and $X$ is the vector of covariates. Here the structural equation model is: -\begin{align} -Y :=~& f_Y (D, X, A, \epsilon_Y) \\ -D :=~& f_D(Z, X, A, \epsilon_D) \in \{0,1\}, \\ -Z :=~& f_Z(X,\epsilon_Z) \in \{0,1\}, \\ -X :=~& \epsilon_X, \quad A = \epsilon_A, -\end{align} -where $\epsilon$'s are all exogenous and independent, -and -$$ -z \mapsto f_D(z , A, X, \epsilon_D) \text{ is weakly increasing (weakly monotone)}. -$$ -and $A$ is a vector of unobserved confounders. Note that in our setting monotonicity is satisfied, since participation is only feasible when it is eligible. Thus we have that $D=0$ whenever $Z=0$. Thus it can only be that $f_D(1, A, X, \epsilon_D) \geq 0 = f_D(0, A, X, \epsilon_D)$. 
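One-sided non-compliance of this kind can be checked directly in the data; a minimal sanity check, using the treatment and instrument vectors already constructed above, is to tabulate $D$ against $Z$ and confirm that nobody participates without being eligible.

```{r}
# Sanity check of one-sided non-compliance: the D = 1, Z = 0 cell should be empty
table(D = D, Z = Z)
mean(D[Z == 0]) # share of participants among the non-eligible; should be exactly 0
```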
- -In this case, we can estimate the local average treatment effect (LATE): -$$ -\alpha = E[Y(1) - Y(0) | D(1) > D(0)] -$$ -This can be identified using the Neyman orthogonal moment equation: -\begin{align} -E\left[g(1, X) - g(0, X) + H(Z) (Y - g(Z, X)) - \alpha \cdot (m(1, X) - m(0, X) + H(Z) (D - m(Z, X))\right] = 0 -\end{align} -where -\begin{align} -g(Z,X) =~& E[Y|Z,X],\\ -m(Z,X) =~& E[D|Z,X],\\ -H(Z) =~& \frac{Z}{Pr(Z=1|X)} - \frac{1 - Z}{1 - Pr(Z=1|X)} -\end{align} - -```{r} -get_dhat0 <- function(XZ0, DZ0, Xb, dreg0, type = NULL, DZ0factor = NULL) { - # train a treatment model on training data that received Z=0 and predict treatment on all data in test set - if (mean(DZ0) > 0) { # it could be that D=0, whenever Z=0 deterministically - dreg0_ <- dreg0 - if (is.null(DZ0factor)) { - dfit0 <- dreg0_((XZ0), DZ0) - } else { - dfit0 <- dreg0_((XZ0), DZ0factor) - } - if (is.null(type)) { - return(predict(dfit0, (Xb))[, 2]) - } else if (type == "prob") { - return(predict(dfit0, (Xb), type = "prob")[, 2]) - } else if (type == "reponse") { - return(predict(dfit0, (Xb), type = "response")) - } else { - stop("Invalid argument `type`.") - } - } else { - return(0) - } -} - -get_dhat1 <- function(XZ1, DZ1, Xb, dreg1, type = NULL, DZ1factor = NULL) { - # train a treamtent model on training data that received Z=1 and predict treatment on all data in test set - if (mean(DZ1) < 1) { # it could be that D=1, whenever Z=1 deterministically - dreg1_ <- dreg1 - if (is.null(DZ1factor)) { - dfit1 <- dreg1_((XZ1), DZ1) - } else { - dfit1 <- dreg1_((XZ1), DZ1factor) - } - if (is.null(type)) { - return(predict(dfit1, (Xb))[, 2]) - } else if (type == "prob") { - return(predict(dfit1, (Xb), type = "prob")[, 2]) - } else if (type == "response") { - return(predict(dfit1, (Xb), type = "response")) - } else { - stop("Invalid argument `type`.") - } - } else { - return(1) - } -} - -# DML for IIVM with D and Z as classifiers or regressors -dml2_for_iivm <- function(x, d, z, y, dreg0, dreg1, yreg0, yreg1, zreg, - trimming = 0.01, nfold = 5, method = "classification", dt = 0, bt = 0) { - # this implements DML2 algorithm, where there moments are estimated via DML, before constructing - # the pooled estimate of theta randomly split data into folds - - ## NB This method has many if statements to accommodate the various estimators we will use. - ## Unlike Python's sklearn, all methods have idfferent default arguments in their predict functions. - ## See official R documentation for details. 
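  ## Outline of the cross-fitting loop below: for each fold b we
  ##   (1) fit outcome models E[Y | X] separately on the Z = 0 and Z = 1 training observations,
  ##   (2) fit treatment models E[D | X] separately on the Z = 0 and Z = 1 training observations,
  ##   (3) fit the instrument propensity E[Z | X] on the training folds and trim it away from 0 and 1,
  ##   (4) predict all of these on the held-out fold;
  ## the fold-wise predictions are then combined into the doubly robust LATE estimate below.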
- - yhat0 <- rep(0, length(y)) - yhat1 <- rep(0, length(y)) - dhat0 <- rep(0, length(d)) - dhat1 <- rep(0, length(d)) - zhat <- rep(0, length(Z)) - - nobs <- nrow(X) - foldid <- rep.int(1:nfold, times = ceiling(nobs / nfold))[sample.int(nobs)] - I <- split(1:nobs, foldid) - # create residualized objects to fill - ytil <- dtil <- ztil <- rep(NA, nobs) - - # obtain cross-fitted residuals - cat("fold: ") - for (b in seq_along(I)) { - # define helpful variables - Xb <- X[I[[b]], ] - Xnotb <- X[-I[[b]], ] - Znotb <- Z[-I[[b]]] - - # training dfs subsetted on the -I[[b]] fold - XZ0 <- X[-I[[b]], ][Z[-I[[b]]] == 0] - yZ0 <- y[-I[[b]]][Z[-I[[b]]] == 0] - XZ1 <- X[-I[[b]], ][Z[-I[[b]]] == 1] - yZ1 <- y[-I[[b]]][Z[-I[[b]]] == 1] - DZ0 <- d[-I[[b]]][Z[-I[[b]]] == 0] - DZ1 <- d[-I[[b]]][Z[-I[[b]]] == 1] - - - if (method == "regression") { - XZ0 <- as.matrix(XZ0) - XZ1 <- as.matrix(XZ1) - Xb <- as.matrix(Xb) - Xnotb <- as.matrix(Xnotb) - - # Train an outcome model on training data that received Z=0 and predict outcome on all data in the test set - yfit0 <- yreg0((XZ0), yZ0) - yhat0[I[[b]]] <- predict(yfit0, (Xb)) # default type = "response" - - # train an outcome model on training data that received Z=1 and predict outcome on all data in test set - yfit1 <- yreg1((XZ1), yZ1) - yhat1[I[[b]]] <- predict(yfit1, (Xb)) - - # train a treatment model on training data that received Z=0 and predict treatment on all data in test set - # default type = "response", but for family binomial it's logg odds - dhat0[I[[b]]] <- get_dhat0(XZ0, DZ0, Xb, dreg0, type = "response") - dhat1[I[[b]]] <- get_dhat1(XZ1, DZ1, Xb, dreg1, type = "response") - - } else if (method == "randomforest") { - DZ0factor <- as.factor(D)[-I[[b]]][Z[-I[[b]]] == 0] - DZ1factor <- as.factor(D)[-I[[b]]][Z[-I[[b]]] == 1] - Znotb <- as.factor(Znotb) - - yfit0 <- yreg0((XZ0), yZ0) - yhat0[I[[b]]] <- predict(yfit0, (Xb), type = "response") - yfit1 <- yreg1((XZ1), yZ1) - yhat1[I[[b]]] <- predict(yfit1, (Xb), type = "response") - - dhat0[I[[b]]] <- get_dhat0(XZ0, DZ0, Xb, dreg0, type = "prob", DZ0factor = DZ0factor) - dhat1[I[[b]]] <- get_dhat1(XZ1, DZ1, Xb, dreg1, type = "prob", DZ1factor = DZ1factor) - - } else if (method == "decisiontrees") { - XZ0 <- as.data.frame(XZ0) - XZ1 <- as.data.frame(XZ1) - Xb <- as.data.frame(Xb) - Xnotb <- as.data.frame(Xnotb) - - yfit0 <- yreg0((XZ0), yZ0) - # default type = "response" for decision trees for continuous response - yhat0[I[[b]]] <- predict(yfit0, (Xb)) - - yfit1 <- yreg1((XZ1), yZ1) - yhat1[I[[b]]] <- predict(yfit1, (Xb)) - - dhat0[I[[b]]] <- get_dhat0(XZ0, DZ0, Xb, dreg0, DZ0factor = as.factor(DZ0)) - dhat1[I[[b]]] <- get_dhat1(XZ1, DZ1, Xb, dreg1, DZ1factor = as.factor(DZ1)) - - } else if (method == "boostedtrees") { - XZ0 <- as.data.frame(XZ0) - XZ1 <- as.data.frame(XZ1) - Xb <- as.data.frame(Xb) - Xnotb <- as.data.frame(Xnotb) - - yfit0 <- yreg0((XZ0), yZ0) - yhat0[I[[b]]] <- predict(yfit0, (Xb)) # default type = "response" for boosted trees - yfit1 <- yreg1((XZ1), yZ1) - yhat1[I[[b]]] <- predict(yfit1, (Xb)) - - dhat0[I[[b]]] <- get_dhat0(XZ0, DZ0, Xb, dreg0, type = "response") - dhat1[I[[b]]] <- get_dhat1(XZ1, DZ1, Xb, dreg1, type = "response") - - } - - # propensity scores: - if (method == "regression") { - zfit_b <- zreg((Xnotb), Znotb) - zhat_b <- predict(zfit_b, (Xb), type = "response") - } else if (method == "randomforest") { - zfit_b <- zreg((Xnotb), Znotb) - zhat_b <- predict(zfit_b, (Xb), type = "prob")[, 2] - } else if (method == "decisiontrees") { - zfit_b <- zreg((Xnotb), as.factor(Znotb)) 
- zhat_b <- predict(zfit_b, (Xb)) # default is prob, so get second column - zhat_b <- zhat_b[, 2] - } else if (method == "boostedtrees") { - zfit_b <- zreg((Xnotb), Znotb) - zhat_b <- predict(zfit_b, (Xb), type = "response") - } - zhat_b <- pmax(pmin(zhat_b, 1 - trimming), trimming) # trimming so scores are between [trimming, (1-trimming)] - zhat[I[[b]]] <- zhat_b - - cat(b, " ") - } - - - # Prediction of treatment and outcome for observed instrument - yhat <- yhat0 * (1 - Z) + yhat1 * Z - dhat <- dhat0 * (1 - Z) + dhat1 * Z - - # residuals - ytil <- y - yhat - dtil <- D - dhat - ztil <- Z - zhat - - # doubly robust quantity for every sample - HZ <- Z / zhat - (1 - Z) / (1 - zhat) - drZ <- yhat1 - yhat0 + (y - yhat) * HZ - drD <- dhat1 - dhat0 + (D - dhat) * HZ - coef_est <- mean(drZ) / mean(drD) - cat("point", coef_est) - psi <- drZ - coef_est * drD - Jhat <- mean(drD) - variance <- mean(psi^2) / Jhat^2 - se <- sqrt(variance / nrow(X)) - cat("se", se) - - return(list(coef_est = coef_est, se = se, yhat = yhat, dhat = dhat, zhat = zhat, ytil = ytil, - dtil = dtil, ztil = ztil, drZ = drZ, drD = drD, - yhat0 = yhat0, yhat1 = yhat1, dhat0 = dhat0, dhat1 = dhat1)) -} -``` - -```{r} -summary_for_iivm <- function(coef_est, se, yhat, dhat, zhat, ytil, dtil, ztil, drZ, drD, X, Z, D, y, name) { - summary_data <- data.frame( - estimate = coef_est, # point estimate - se = se, # standard error - lower = coef_est - 1.96 * se, # lower end of 95% confidence interval - upper = coef_est + 1.96 * se, # upper end of 95% confidence interval - rmse_y = sqrt(mean(ytil^2)), # res of model that predicts outcome y - rmse_D = sqrt(mean(dtil^2)), # res of model that predicts treatment D - rmse_Z = sqrt(mean(ztil^2)), # res of model that predicts instrument Z - accuracy_D = mean(abs(dtil) < 0.5), # binary classification accuracy of model for D - accuracy_Z = mean(abs(ztil) < 0.5) # binary classification accuracy of model for Z - ) - row.names(summary_data) <- name - return(summary_data) -} -``` - -```{r} -# DML with Lasso/Ridge -set.seed(123) -cat(sprintf("\nDML with Lasso/Logistic \n")) -# DML with Lasso/Ridge -dreg0 <- function(x, d) { - cv.glmnet(x, d, family = "binomial", alpha = 0, nfolds = 5) -} -dreg1 <- function(x, d) { - cv.glmnet(x, d, family = "binomial", alpha = 0, nfolds = 5) -} -yreg0 <- function(x, y) { - cv.glmnet(x, y, family = "gaussian", alpha = 1, nfolds = 5) -} -yreg1 <- function(x, y) { - cv.glmnet(x, y, family = "gaussian", alpha = 1, nfolds = 5) -} -zreg <- function(x, z) { - cv.glmnet(x, z, family = "binomial", alpha = 0, nfolds = 5) -} - -dml2_results <- dml2_for_iivm(as.matrix(X), D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, - trimming = 0.01, nfold = 5, method = "regression") -sum_lasso_ridge_cv <- summary_for_iivm(dml2_results$coef_est, dml2_results$se, dml2_results$yhat, dml2_results$dhat, - dml2_results$zhat, dml2_results$ytil, dml2_results$dtil, dml2_results$ztil, - dml2_results$drZ, dml2_results$drD, name = "LassoCV/LogisticCV") -table <- data.frame() -table <- rbind(table, sum_lasso_ridge_cv) -table - -yhat_lasso <- dml2_results$yhat -dhat_lasso <- dml2_results$dhat -yhat0_lasso <- dml2_results$yhat0 -yhat1_lasso <- dml2_results$yhat1 -dhat0_lasso <- dml2_results$dhat0 -dhat1_lasso <- dml2_results$dhat1 -zhat_lasso <- dml2_results$zhat -``` - -```{r} -# DML with Random Forest -set.seed(123) -cat(sprintf("\nDML with Random Forest \n")) - -dreg0 <- function(x, d) { - randomForest(x, d, ntree = 1000, nodesize = 10) -} # ML method=Forest -dreg1 <- function(x, d) { - randomForest(x, d, ntree 
= 1000, nodesize = 10) -} # ML method=Forest -yreg0 <- function(x, y) { - randomForest(x, y, ntree = 1000, nodesize = 10) -} # ML method=Forest -yreg1 <- function(x, y) { - randomForest(x, y, ntree = 1000, nodesize = 10) -} # ML method=Forest -zreg <- function(x, z) { - randomForest(x, z, ntree = 1000, nodesize = 10) -} # ML method=Forest - -dml2_results <- dml2_for_iivm(X, D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, - trimming = 0.01, nfold = 5, method = "randomforest") -sum_rf <- summary_for_iivm(dml2_results$coef_est, dml2_results$se, dml2_results$yhat, dml2_results$dhat, - dml2_results$zhat, dml2_results$ytil, dml2_results$dtil, dml2_results$ztil, - dml2_results$drZ, dml2_results$drD, name = "RF") -table <- rbind(table, sum_rf) -table - -yhat_rf <- dml2_results$yhat -dhat_rf <- dml2_results$dhat -yhat0_rf <- dml2_results$yhat0 -yhat1_rf <- dml2_results$yhat1 -dhat0_rf <- dml2_results$dhat0 -dhat1_rf <- dml2_results$dhat1 -zhat_rf <- dml2_results$zhat -``` - -```{r} -# DML with Decision Trees -set.seed(123) -cat(sprintf("\nDML with Decision Trees \n")) - -dreg0 <- function(x, d) { - rpart(as.formula("D ~ ."), cbind(data.frame(D = d), x), method = "class", minbucket = 10, cp = 0.001) -} -dreg1 <- function(x, d) { - rpart(as.formula("D ~ ."), cbind(data.frame(D = d), x), method = "class", minbucket = 10, cp = 0.001) -} -yreg0 <- function(x, y) { - rpart(as.formula("y ~ ."), cbind(data.frame(y = y), x), minbucket = 10, cp = 0.001) -} -yreg1 <- function(x, y) { - rpart(as.formula("y ~ ."), cbind(data.frame(y = y), x), minbucket = 10, cp = 0.001) -} -zreg <- function(x, z) { - rpart(as.formula("Z ~ ."), cbind(data.frame(Z = z), x), method = "class", minbucket = 10, cp = 0.001) -} - -dml2_results <- dml2_for_iivm(X, D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, - trimming = 0.01, nfold = 5, method = "decisiontrees") -sum_tr <- summary_for_iivm(dml2_results$coef_est, dml2_results$se, dml2_results$yhat, dml2_results$dhat, - dml2_results$zhat, dml2_results$ytil, dml2_results$dtil, dml2_results$ztil, - dml2_results$drZ, dml2_results$drD, name = "Decision Trees") -table <- rbind(table, sum_tr) -table - -yhat_tr <- dml2_results$yhat -dhat_tr <- dml2_results$dhat -yhat0_tr <- dml2_results$yhat0 -yhat1_tr <- dml2_results$yhat1 -dhat0_tr <- dml2_results$dhat0 -dhat1_tr <- dml2_results$dhat1 -zhat_tr <- dml2_results$zhat -``` - -```{r} -# DML with Boosted Trees -set.seed(123) -cat(sprintf("\nDML with Boosted Trees \n")) - -# NB: early stopping cannot easily be implemented with gbm -## set n.trees = best, where best <- gbm.perf(dreg_boost, plot.it = FALSE) -dreg0 <- function(x, d) { - gbm(as.formula("D ~ ."), cbind(data.frame(D = d), x), distribution = "bernoulli", - interaction.depth = 2, n.trees = 100, shrinkage = .1) -} -dreg1 <- function(x, d) { - gbm(as.formula("D ~ ."), cbind(data.frame(D = d), x), distribution = "bernoulli", - interaction.depth = 2, n.trees = 100, shrinkage = .1) -} -yreg0 <- function(x, y) { - gbm(as.formula("y ~ ."), cbind(data.frame(y = y), x), distribution = "gaussian", - interaction.depth = 2, n.trees = 100, shrinkage = .1) -} -yreg1 <- function(x, y) { - gbm(as.formula("y ~ ."), cbind(data.frame(y = y), x), distribution = "gaussian", - interaction.depth = 2, n.trees = 100, shrinkage = .1) -} -zreg <- function(x, z) { - gbm(as.formula("Z ~ ."), cbind(data.frame(Z = z), x), distribution = "bernoulli", - interaction.depth = 2, n.trees = 100, shrinkage = .1) -} - -# passing these through regression as type="response", and D and Z should not be factors! 
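# (gbm with distribution = "bernoulli" expects a numeric 0/1 response rather than a factor,
#  and predict(..., type = "response") then returns probabilities on the [0, 1] scale)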
-dml2_results <- dml2_for_iivm(X, D, Z, y, dreg0, dreg1, yreg0, yreg1, zreg, - trimming = 0.01, nfold = 5, method = "boostedtrees") -sum_boost <- summary_for_iivm(dml2_results$coef_est, dml2_results$se, dml2_results$yhat, dml2_results$dhat, - dml2_results$zhat, dml2_results$ytil, dml2_results$dtil, dml2_results$ztil, - dml2_results$drZ, dml2_results$drD, name = "Boosted Trees") -table <- rbind(table, sum_boost) -table - -yhat_boost <- dml2_results$yhat -dhat_boost <- dml2_results$dhat -yhat0_boost <- dml2_results$yhat0 -yhat1_boost <- dml2_results$yhat1 -dhat0_boost <- dml2_results$dhat0 -dhat1_boost <- dml2_results$dhat1 -zhat_boost <- dml2_results$zhat -``` - -## Ensembles - -Boosted trees give the best RMSE for D and Z and random forests give the best RMSE for Y. - -```{r} -# Best fit is boosted trees for D, Z and random forests for Y - -best_yhat0 <- yhat0_rf -best_yhat1 <- yhat1_rf -best_yhat <- yhat_rf - -best_dhat0 <- dhat0_boost -best_dhat1 <- dhat1_boost -best_dhat <- dhat_boost - -best_zhat <- zhat_boost - -ytil_best <- y - best_yhat -dtil_best <- D - best_dhat -ztil_best <- Z - best_zhat - -# doubly robust quantity for every sample -HZ <- Z / best_zhat - (1 - Z) / (1 - best_zhat) -drZ <- best_yhat1 - best_yhat0 + (y - best_yhat) * HZ -drD <- best_dhat1 - best_dhat0 + (D - best_dhat) * HZ -coef_est <- mean(drZ) / mean(drD) -psi <- drZ - coef_est * drD -Jhat <- mean(drD) -variance <- mean(psi^2) / Jhat^2 -se <- sqrt(variance / nrow(X)) - -sum_best <- summary_for_iivm(coef_est, se, best_yhat, best_dhat, best_zhat, - ytil_best, dtil_best, ztil_best, drZ, drD, name = "Best") -table <- rbind(table, sum_best) -table -``` - -We'll form a model average with unconstrained least squares weights. - -```{r} -# Least squares model average -ma_dcoef <- lm(D ~ dhat_lasso + dhat_rf + dhat_tr + dhat_boost - 1)$coef -ma_ycoef <- lm(y ~ yhat_lasso + yhat_rf + yhat_tr + yhat_boost - 1)$coef -ma_zcoef <- lm(Z ~ zhat_lasso + zhat_rf + zhat_tr + zhat_boost - 1)$coef - -ma_yhat0 <- cbind(yhat0_lasso, yhat0_rf, yhat0_tr, yhat0_boost) %*% as.matrix(ma_ycoef) -ma_yhat1 <- cbind(yhat1_lasso, yhat1_rf, yhat1_tr, yhat1_boost) %*% as.matrix(ma_ycoef) -ma_dhat0 <- cbind(dhat0_lasso, dhat0_rf, dhat0_tr, dhat0_boost) %*% as.matrix(ma_dcoef) -ma_dhat1 <- cbind(dhat1_lasso, dhat1_rf, dhat1_tr, dhat1_boost) %*% as.matrix(ma_dcoef) -ma_zhat <- cbind(zhat_lasso, zhat_rf, zhat_tr, zhat_boost) %*% as.matrix(ma_zcoef) - -# Prediction of treatment and outcome for observed instrument -ma_yhat <- ma_yhat0 * (1 - Z) + ma_yhat1 * Z -ma_dhat <- ma_dhat0 * (1 - Z) + ma_dhat1 * Z - -# residuals -ma_ytil <- y - ma_yhat -ma_dtil <- D - ma_dhat -ma_ztil <- Z - ma_zhat - -# doubly robust quantity for every sample -HZ <- Z / ma_zhat - (1 - Z) / (1 - ma_zhat) -drZ <- ma_yhat1 - ma_yhat0 + (y - ma_yhat) * HZ -drD <- ma_dhat1 - ma_dhat0 + (D - ma_dhat) * HZ -coef_est <- mean(drZ) / mean(drD) -psi <- drZ - coef_est * drD -Jhat <- mean(drD) -variance <- mean(psi^2) / Jhat^2 -se <- sqrt(variance / nrow(X)) - -sum_ma <- summary_for_iivm(coef_est, se, ma_yhat, ma_dhat, ma_zhat, - ma_ytil, ma_dtil, ma_ztil, drZ, drD, name = "Model Average") -table <- rbind(table, sum_ma) -table -``` - -Comparing with the PLR model - -```{r} -tableplr -``` - -We find that the PLR model overestimates the effect by around 1k; though both sets of results have overlapping confidence intervals. - - -Again as before, ideally we would do (semi) cross-fitting with AutoML in order to find good first-stage models and re-run DML with these models. 
Unfortunately this is not easy to do in R. - -As before, in the below analysis of robust inference, we choose Boosted Trees as they perform well in RMSE and accuracy on first-stage models. - -```{r} -iivm_robust_inference <- function(point, stderr, yhat, Dhat, Zhat, resy, resD, resZ, - drZ, drD, X, Z, D, y, grid, alpha = 0.05) { - # Inference in the partially linear IV model that is robust to weak identification. - # grid: grid of theta values to search over when trying to identify the confidence region - # alpha: confidence level - - n <- dim(X)[1] - thr <- qchisq(1 - alpha, df = 1) - accept <- c() - - for (theta in grid) { - moment <- drZ - theta * drD - test <- n * mean(moment)^2 / var(moment) - if (test <= thr) { - accept <- c(accept, theta) - } - } - - return(accept) -} -``` - -```{r} -grid <- seq(0, 20000, length.out = 10000) -region <- iivm_robust_inference(point = dml2_results$coef_est, stderr = dml2_results$se, - yhat = dml2_results$yhat, Dhat = dml2_results$dhat, Zhat = dml2_results$zhat, - resy = dml2_results$ytil, resD = dml2_results$dtil, resZ = dml2_results$ztil, - drZ = dml2_results$drZ, drD = dml2_results$drD, - X = X, Z = Z, D = D, y = y, grid = grid) - -# Calculate min and max -min_region <- min(region) -max_region <- max(region) - -print(min_region) -print(max_region) -``` - -We find again that the robust inference confidence region is almost identical to the normal based inference. We are most probably in the strong instrument regime. We can check the t-statistic for the effect of the instrument on the treatment to verify this. - -# DoubleML package - -There exist nice packages out there that can help us do our estimation with the simple call of a function. Such packages include `EconML` (Python) and `DoubleML` (Python and R). - -We run through IIVM using `DoubleML` below to illustrate. The `DoubleML` package internally builds on `mlr3`. We use the meta package `mlr3` to generate predictions with machine learning methods. A comprehensive introduction and description of the `mlr3` package is provided in the [mlr3book](https://mlr3book.mlr-org.com/). A list of all learners that you can use in `mlr3` can be found [here](https://mlr3extralearners.mlr-org.com/articles/learners/list_learners.html). The entry in the columns *mlr3 Package* and *Packages* indicate which packages must be installed/loaded in your R session. - -You find additional information about `DoubleML` on the package on the package website https://docs.doubleml.org/ and the R documentation page https://docs.doubleml.org/r/stable/. - -```{r} -install.packages("DoubleML") -install.packages("mlr3learners") -install.packages("mlr3") -install.packages("data.table") -install.packages("ranger") -``` - -```{r} -library(DoubleML) -library(mlr3learners) -library(mlr3) -library(data.table) -library(ranger) -``` - -## Local Average Treatment Effects of 401(k) Participation on Net Financial Assets - -## Interactive IV Model (IIVM) - -Now, we consider estimation of local average treatment effects (LATE) of participation with the binary instrument `e401`. As before, $Y$ denotes the outcome `net_tfa`, and $X$ is the vector of covariates. Here the structural equation model is: - -\begin{align} -Y :=~& g_0(Z,X) + U, &\quad E[U\mid Z,X] = 0,\\ -D :=~& r_0(Z,X) + V, &\quad E[V\mid Z, X] = 0,\\ -Z :=~& m_0(X) + \zeta, &\quad E[\zeta \mid X] = 0. 
-\end{align} - -```{r} -# Constructing the data (as DoubleMLData) -formula_flex2 <- paste("net_tfa ~ p401+ e401 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + ", - "poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown") -model_flex2 <- as.data.table(model.frame(formula_flex2, data)) -x_cols <- colnames(model_flex2)[-c(1, 2, 3)] -data_iv <- DoubleMLData$new(model_flex2, y_col = "net_tfa", d_cols = "p401", z_cols = "e401", x_cols = x_cols) -``` - -```{r} -lgr::get_logger("mlr3")$set_threshold("warn") -lasso <- lrn("regr.cv_glmnet", nfolds = 5, s = "lambda.min") -lasso_class <- lrn("classif.cv_glmnet", nfolds = 5, s = "lambda.min") -dml_mliivm <- DoubleMLIIVM$new(data_iv, - ml_g = lasso, - ml_m = lasso_class, ml_r = lasso_class, n_folds = 5, subgroups = list( - always_takers = FALSE, - never_takers = TRUE - ) -) -dml_mliivm$fit(store_predictions = TRUE) -dml_mliivm$summary() -lasso_mliivm <- dml_mliivm$coef -lasso_std_mliivm <- dml_mliivm$se -``` - -The confidence interval for the local average treatment effect of participation is given by - -```{r} -dml_mliivm$confint(level = 0.95) -``` - -Here we can also check the accuracy of the model: - -```{r} -# variables -y <- as.matrix(pension$net_tfa) # true observations -d <- as.matrix(pension$p401) -z <- as.matrix(pension$e401) - -# predictions -dml_mliivm$params_names() -g0_hat <- as.matrix(dml_mliivm$predictions$ml_g0) # predictions of g_0(z=0, X) -g1_hat <- as.matrix(dml_mliivm$predictions$ml_g1) # predictions of g_0(z=1, X) -g_hat <- z * g1_hat + (1 - z) * g0_hat # predictions of g_0 -r0_hat <- as.matrix(dml_mliivm$predictions$ml_r0) # predictions of r_0(z=0, X) -r1_hat <- as.matrix(dml_mliivm$predictions$ml_r1) # predictions of r_0(z=1, X) -r_hat <- z * r1_hat + (1 - z) * r0_hat # predictions of r_0 -m_hat <- as.matrix(dml_mliivm$predictions$ml_m) # predictions of m_o -``` - -```{r} -# cross-fitted RMSE: outcome -lasso_y_mliivm <- sqrt(mean((y - g_hat)^2)) -lasso_y_mliivm - -# cross-fitted RMSE: treatment -lasso_d_mliivm <- sqrt(mean((d - r_hat)^2)) -lasso_d_mliivm - -# cross-fitted RMSE: instrument -lasso_z_mliivm <- sqrt(mean((z - m_hat)^2)) -lasso_z_mliivm -``` - -Again, we repeat the procedure for the other machine learning methods: - -```{r} -# needed to run boosting -remotes::install_github("mlr-org/mlr3extralearners") -install.packages("mlr3extralearners") -install.packages("mboost") -``` - -```{r} -library(mlr3extralearners) -library(mboost) -``` - -```{r} -# Forest -randomForest <- lrn("regr.ranger") -random_forest_class <- lrn("classif.ranger") - -# Trees -trees <- lrn("regr.rpart") -trees_class <- lrn("classif.rpart") - -# Boosting -boost <- lrn("regr.glmboost") -boost_class <- lrn("classif.glmboost") -``` - -```{r} -### random forest ### - -lgr::get_logger("mlr3")$set_threshold("warn") -dml_mliivm <- DoubleMLIIVM$new(data_iv, - ml_g = randomForest, - ml_m = random_forest_class, ml_r = random_forest_class, n_folds = 3, subgroups = list( - always_takers = FALSE, - never_takers = TRUE - ) -) -dml_mliivm$fit(store_predictions = TRUE) -dml_mliivm$summary() -forest_mliivm <- dml_mliivm$coef -forest_std_mliivm <- dml_mliivm$se - -# predictions -g0_hat <- as.matrix(dml_mliivm$predictions$ml_g0) # predictions of g_0(Z=0, X) -g1_hat <- as.matrix(dml_mliivm$predictions$ml_g1) # predictions of g_0(Z=1, X) -g_hat <- z * g1_hat + (1 - z) * g0_hat # predictions of g_0 -r0_hat <- as.matrix(dml_mliivm$predictions$ml_r0) # predictions of r_0(Z=0, X) -r1_hat <- as.matrix(dml_mliivm$predictions$ml_r1) # 
predictions of r_0(Z=1, X) -r_hat <- z * r1_hat + (1 - z) * r0_hat # predictions of r_0 -m_hat <- as.matrix(dml_mliivm$predictions$ml_m) # predictions of m_o - -# cross-fitted RMSE: outcome -forest_y_mliivm <- sqrt(mean((y - g_hat)^2)) -forest_y_mliivm - -# cross-fitted RMSE: treatment -forest_d_mliivm <- sqrt(mean((d - r_hat)^2)) -forest_d_mliivm - -# cross-fitted RMSE: instrument -forest_z_mliivm <- sqrt(mean((z - m_hat)^2)) -forest_z_mliivm - -### trees ### - -dml_mliivm <- DoubleMLIIVM$new(data_iv, - ml_g = trees, - ml_m = trees_class, ml_r = trees_class, n_folds = 3, subgroups = list( - always_takers = FALSE, - never_takers = TRUE - ) -) -dml_mliivm$fit(store_predictions = TRUE) -dml_mliivm$summary() -tree_mliivm <- dml_mliivm$coef -tree_std_mliivm <- dml_mliivm$se - -# predictions -g0_hat <- as.matrix(dml_mliivm$predictions$ml_g0) # predictions of g_0(Z=0, X) -g1_hat <- as.matrix(dml_mliivm$predictions$ml_g1) # predictions of g_0(Z=1, X) -g_hat <- z * g1_hat + (1 - z) * g0_hat # predictions of g_0 -r0_hat <- as.matrix(dml_mliivm$predictions$ml_r0) # predictions of r_0(Z=0, X) -r1_hat <- as.matrix(dml_mliivm$predictions$ml_r1) # predictions of r_0(Z=1, X) -r_hat <- z * r1_hat + (1 - z) * r0_hat # predictions of r_0 -m_hat <- as.matrix(dml_mliivm$predictions$ml_m) # predictions of m_o - -# cross-fitted RMSE: outcome -tree_y_mliivm <- sqrt(mean((y - g_hat)^2)) -tree_y_mliivm - -# cross-fitted RMSE: treatment -tree_d_mliivm <- sqrt(mean((d - r_hat)^2)) -tree_d_mliivm - -# cross-fitted RMSE: instrument -tree_z_mliivm <- sqrt(mean((z - m_hat)^2)) -tree_z_mliivm - - -### boosting ### -dml_mliivm <- DoubleMLIIVM$new(data_iv, - ml_g = boost, - ml_m = boost_class, ml_r = boost_class, n_folds = 3, subgroups = list( - always_takers = FALSE, - never_takers = TRUE - ) -) -dml_mliivm$fit(store_predictions = TRUE) -dml_mliivm$summary() -boost_mliivm <- dml_mliivm$coef -boost_std_mliivm <- dml_mliivm$se - -# predictions -g0_hat <- as.matrix(dml_mliivm$predictions$ml_g0) # predictions of g_0(Z=0, X) -g1_hat <- as.matrix(dml_mliivm$predictions$ml_g1) # predictions of g_0(Z=1, X) -g_hat <- z * g1_hat + (1 - z) * g0_hat # predictions of g_0 -r0_hat <- as.matrix(dml_mliivm$predictions$ml_r0) # predictions of r_0(Z=0, X) -r1_hat <- as.matrix(dml_mliivm$predictions$ml_r1) # predictions of r_0(Z=1, X) -r_hat <- z * r1_hat + (1 - z) * r0_hat # predictions of r_0 -m_hat <- as.matrix(dml_mliivm$predictions$ml_m) # predictions of m_o - -# cross-fitted RMSE: outcome -boost_y_mliivm <- sqrt(mean((y - g_hat)^2)) -boost_y_mliivm - -# cross-fitted RMSE: treatment -boost_d_mliivm <- sqrt(mean((d - r_hat)^2)) -boost_d_mliivm - -# cross-fitted RMSE: instrument -boost_z_mliivm <- sqrt(mean((z - m_hat)^2)) -boost_z_mliivm -``` - -```{r} -table <- matrix(0, 5, 4) -table[1, 1:4] <- c(lasso_mliivm, forest_mliivm, tree_mliivm, boost_mliivm) -table[2, 1:4] <- c(lasso_std_mliivm, forest_std_mliivm, tree_std_mliivm, boost_std_mliivm) -table[3, 1:4] <- c(lasso_y_mliivm, forest_y_mliivm, tree_y_mliivm, boost_y_mliivm) -table[4, 1:4] <- c(lasso_d_mliivm, forest_d_mliivm, tree_d_mliivm, boost_d_mliivm) -table[5, 1:4] <- c(lasso_z_mliivm, forest_z_mliivm, tree_z_mliivm, boost_z_mliivm) -rownames(table) <- c("Estimate", "Std.Error", "RMSE Y", "RMSE D", "RMSE Z") -colnames(table) <- c("Lasso", "Random Forest", "Trees", "Boosting") -tab <- xtable(table, digits = 2) -tab -``` - -We report results based on four ML methods for estimating the nuisance functions used in -forming the orthogonal estimating equations. 
We find again that the estimates of the treatment effect are stable across ML methods. The estimates are highly significant, hence we would reject the hypothesis -that 401(k) participation has no effect on financial health. - -We might rerun the model using the best ML method for each equation to get a final estimate for the treatment effect of participation: - -```{r} -lgr::get_logger("mlr3")$set_threshold("warn") -dml_mliivm <- DoubleMLIIVM$new(data_iv, - ml_g = randomForest, - ml_m = lasso_class, ml_r = lasso_class, n_folds = 5, subgroups = list( - always_takers = FALSE, - never_takers = TRUE - ) -) -dml_mliivm$fit(store_predictions = TRUE) -dml_mliivm$summary() -best_mliivm <- dml_mliivm$coef -best_std_mliivm <- dml_mliivm$se -``` - diff --git a/AC2/r-weak-iv-experiments.Rmd b/AC2/r-weak-iv-experiments.Rmd deleted file mode 100644 index daf93cc7..00000000 --- a/AC2/r-weak-iv-experiments.Rmd +++ /dev/null @@ -1,75 +0,0 @@ ---- -title: An R Markdown document converted from "AC2/r-weak-iv-experiments.irnb" -output: html_document ---- - -# A Simple Example of Properties of IV estimator when Instruments are Weak - -```{r} -install.packages("hdm") -``` - -```{r} -library(hdm) -``` - -Simulation Design - -```{r} -# Simulation Design -set.seed(1) - -n <- 100 -beta <- .1 # .1 weak IV (change to 1.0 for strong IV) - -# One realization -U <- rnorm(n) -Z <- rnorm(n) # generate instrument -D <- beta * Z + U # generate endogenous variable -Y <- D + U # the true causal effect is 1 - -summary(lm(D ~ Z)) # first stage is very weak here when we set beta = .1 -``` - -```{r} -summary(tsls(x = NULL, d = D, y = Y, z = Z)) -``` - -Note that the instrument is weak here (strength of the instrument is controlled by setting $\beta$) -- the t-stat is smaller than any rule-of-thumb suggested in the literature (e.g., $\sqrt{10}$).
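A quick way to quantify this is to compare the first-stage t-statistic (equivalently, with a single instrument, the first-stage F-statistic, which is its square) against the usual rule of thumb; a minimal sketch using the realization generated above:

```{r}
# First-stage strength check for the single-instrument design above
first_stage <- summary(lm(D ~ Z))
t_stat <- first_stage$coefficients["Z", "t value"]
c(t_stat = t_stat, F_stat = t_stat^2, rule_of_thumb_F = 10)
```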
- -# Run 10000 trials to evaluate distribution of the IV estimator - -```{r} -# Simulation Design - -set.seed(1) -B <- 10000 # trials -IVEst <- rep(0, B) - -for (i in 1:B) { - U <- rnorm(n) - Z <- rnorm(n) # generate instrument - D <- beta * Z + U # generate endogenougs variable - Y <- D + U # the true causal effect is 1 - IVEst[i] <- coef(tsls(x = NULL, d = D, y = Y, z = Z))[1, 1] -} -``` - -# Plot the Actual Distribution against the Normal Approximation (based on Strong Instrument Assumption) - -```{r} -plot(density(IVEst - 1, n = 1000, from = -5, to = 5), - col = 4, xlim = c(-5, 5), - xlab = "IV Estimator -True Effect", main = "Actual Distribution vs Gaussian" -) - -val <- seq(-5, 5, by = .05) -var <- (1 / beta^2) * (1 / 100) # theoretical variance of IV -sd <- sqrt(var) -lines(val, dnorm(val, sd = sd), col = 2, lty = 2) - -rejection_frequency <- sum((abs(IVEst - 1) / sd > 1.96)) / B - -cat(c("Rejection Frequency is ", rejection_frequency, " while we expect it to be .05")) -``` - diff --git a/CM1/Old/analyzing-rct-reemployment-experiment.irnb.Rmd b/CM1/Old/analyzing-rct-reemployment-experiment.irnb.Rmd deleted file mode 100644 index 4cd8137f..00000000 --- a/CM1/Old/analyzing-rct-reemployment-experiment.irnb.Rmd +++ /dev/null @@ -1,198 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - -# Analyzing RCT data with Precision Adjustment - - -## Data - -In this lab, we analyze the Pennsylvania re-employment bonus experiment, which was previously studied in "Sequential testing of duration data: the case of the Pennsylvania ‘reemployment bonus’ experiment" (Bilias, 2000), among others. These experiments were conducted in the 1980s by the U.S. Department of Labor to test the incentive effects of alternative compensation schemes for unemployment insurance (UI). In these experiments, UI claimants were randomly assigned either to a control group or one of five treatment groups. Actually, there are six treatment groups in the experiments. Here we focus on treatment group 4, but feel free to explore other treatment groups. In the control group the current rules of the UI applied. Individuals in the treatment groups were offered a cash bonus if they found a job within some pre-specified period of time (qualification period), provided that the job was retained for a specified duration. The treatments differed in the level of the bonus, the length of the qualification period, and whether the bonus was declining over time in the qualification period; see http://qed.econ.queensu.ca/jae/2000-v15.6/bilias/readme.b.txt for further details on data. - - -```{r} -## loading the data -Penn <- as.data.frame(read.table("../input/reemployment-experiment/penn_jae.dat", header=T )) -n <- dim(Penn)[1] -p_1 <- dim(Penn)[2] -Penn<- subset(Penn, tg==4 | tg==0) -attach(Penn) -``` - -```{r} -T4<- (tg==4) -summary(T4) -``` - -```{r} -head(Penn) -``` - - -### Model -To evaluate the impact of the treatments on unemployment duration, we consider the linear regression model: - -$$ -Y = D \beta_1 + W'\beta_2 + \varepsilon, \quad E \varepsilon (D,W')' = 0, -$$ - -where $Y$ is the log of duration of unemployment, $D$ is a treatment indicators, and $W$ is a set of controls including age group dummies, gender, race, number of dependents, quarter of the experiment, location within the state, existence of recall expectations, and type of occupation. 
Here $\beta_1$ is the ATE, if the RCT assumptions hold rigorously. - - -We also consider interactive regression model: - -$$ -Y = D \alpha_1 + D W' \alpha_2 + W'\beta_2 + \varepsilon, \quad E \varepsilon (D,W', DW')' = 0, -$$ -where $W$'s are demeaned (apart from the intercept), so that $\alpha_1$ is the ATE, if the RCT assumptions hold rigorously. - - -Under RCT, the projection coefficient $\beta_1$ has -the interpretation of the causal effect of the treatment on -the average outcome. We thus refer to $\beta_1$ as the average -treatment effect (ATE). Note that the covariates, here are -independent of the treatment $D$, so we can identify $\beta_1$ by -just linear regression of $Y$ on $D$, without adding covariates. -However we do add covariates in an effort to improve the -precision of our estimates of the average treatment effect. - - -### Analysis - -We consider - -* classical 2-sample approach, no adjustment (CL) -* classical linear regression adjustment (CRA) -* interactive regression adjusment (IRA) - -and carry out robust inference using the *estimatr* R packages. - - -# Carry out covariate balance check - - -This is done using "lm_robust" command which unlike "lm" in the base command automatically does the correct Eicher-Huber-White standard errors, instead othe classical non-robus formula based on the homoscdedasticity command. - -```{r} - -m <- lm(T4~(female+black+othrace+factor(dep)+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)^2) -library(lmtest) -library(sandwich) -coeftest(m, vcov = vcovHC(m, type="HC1")) - -``` - -We see that that even though this is a randomized experiment, balance conditions are failed. - - -# Model Specification - -```{r} -# model specifications - - -# no adjustment (2-sample approach) -formula_cl <- log(inuidur1)~T4 - -# adding controls -formula_cra <- log(inuidur1)~T4+ (female+black+othrace+factor(dep)+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)^2 -# Omitted dummies: q1, nondurable, muld - - -ols.cl <- lm(formula_cl) -ols.cra <- lm(formula_cra) - - -ols.cl = coeftest(ols.cl, vcov = vcovHC(ols.cl, type="HC1")) -ols.cra = coeftest(ols.cra, vcov = vcovHC(ols.cra, type="HC1")) - -print(ols.cl) -print(ols.cra) - - -``` - -The interactive specificaiton corresponds to the approach introduced in Lin (2013). - -```{r} - -#interactive regression model; - -X = model.matrix (~ (female+black+othrace+factor(dep)+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)^2)[,-1] -dim(X) -demean<- function(x){ x - mean(x)} -X = apply(X, 2, demean) - -ols.ira = lm(log(inuidur1) ~ T4*X) -ols.ira= coeftest(ols.ira, vcov = vcovHC(ols.ira, type="HC1")) -print(ols.ira) - - - -``` - -Next we try out partialling out with lasso - -```{r} -library(hdm) - -T4 = demean(T4) - -DX = model.matrix(~T4*X)[,-1] - -rlasso.ira = summary(rlassoEffects(DX, log(inuidur1), index = 1)) - - -print(rlasso.ira) - - -``` - -### Results - -```{r} -str(ols.ira) -ols.ira[2,1] -``` - -```{r} -library(xtable) -table<- matrix(0, 2, 4) -table[1,1]<- ols.cl[2,1] -table[1,2]<- ols.cra[2,1] -table[1,3]<- ols.ira[2,1] -table[1,4]<- rlasso.ira[[1]][1] - -table[2,1]<- ols.cl[2,2] -table[2,2]<- ols.cra[2,2] -table[2,3]<- ols.ira[2,2] -table[2,4]<- rlasso.ira[[1]][2] - - -colnames(table)<- c("CL","CRA","IRA", "IRA w Lasso") -rownames(table)<- c("estimate", "standard error") -tab<- xtable(table, digits=5) -tab - -print(tab, type="latex", digits=5) -``` - - -Treatment group 4 experiences an average decrease of about $7.8\%$ in the length of unemployment spell. 
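Since the outcome is the log of unemployment duration, the coefficient on T4 is in log points; a rough sketch of how such an estimate maps into the percentage change quoted above:

```{r}
# Convert the IRA log-point estimate into an approximate percent change in duration
b <- ols.ira[2, 1]
100 * (exp(b) - 1)
```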
- - -Observe that regression estimators delivers estimates that are slighly more efficient (lower standard errors) than the simple 2 mean estimator, but essentially all methods have very similar standard errors. From IRA results we also see that there is not any statistically detectable heterogeneity. We also see the regression estimators offer slightly lower estimates -- these difference occur perhaps to due minor imbalance in the treatment allocation, which the regression estimators try to correct. - - - - diff --git a/CM1/Old/r-notebook-analyzing-rct-with-precision.irnb.Rmd b/CM1/Old/r-notebook-analyzing-rct-with-precision.irnb.Rmd deleted file mode 100644 index 667b07d3..00000000 --- a/CM1/Old/r-notebook-analyzing-rct-with-precision.irnb.Rmd +++ /dev/null @@ -1,120 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - -# Analyzing RCT with Precision by Adjusting for Baseline Covariates - - -# Jonathan Roth's DGP - -Here we set up a DGP with heterogenous effects. In this example, which is due to Jonathan Roth, we have -$$ -E [Y(0) | Z] = - Z, \quad E [Y(1) |Z] = Z, \quad Z \sim N(0,1). -$$ -The CATE is -$$ -E [Y(1) - Y(0) | Z ]= 2 Z. -$$ -and the ATE is -$$ -2 E Z = 0. -$$ - -We would like to estimate the ATE as precisely as possible. - -An economic motivation for this example could be provided as follows: Let D be the treatment of going to college, and $Z$ academic skills. Suppose that academic skills cause lower earnings Y(0) in jobs that don't require college degree, and cause higher earnings Y(1) in jobs that require college degrees. This type of scenario is reflected in the DGP set-up above. - - - -```{r} -# generate the simulated dataset -set.seed(123) # set MC seed -n = 1000 # sample size -Z = rnorm(n) # generate Z -Y0 = -Z + rnorm(n) # conditional average baseline response is -Z -Y1 = Z + rnorm(n) # conditional average treatment effect is +Z -D = (runif(n)<.2) # treatment indicator; only 20% get treated -Y = Y1*D + Y0*(1-D) # observed Y -D = D - mean(D) # demean D -Z = Z-mean(Z) # demean Z -``` - -# Analyze the RCT data with Precision Adjustment - -Consider the follow regression models: - -* classical 2-sample approach, no adjustment (CL) -* classical linear regression adjustment (CRA) -* interactive regression adjusment (IRA) - -We carry out inference using heteroskedasticity robust inference, using the sandwich formulas for variance (Eicker-Huber-White). - -We observe that the CRA delivers estimates that are less efficient than the CL (pointed out by Freedman), whereas the IRA delivers a more efficient approach (pointed out by Lin). In order for the CRA to be more efficient than the CL, we need the CRA to be a correct model of the conditional expectation function of Y given D and X, which is not the case here. - -```{r} -# implement each of the models on the simulated data -CL = lm(Y ~ D) -CRA = lm(Y ~ D+ Z) #classical -IRA = lm(Y ~ D+ Z+ Z*D) #interactive approach - -# we are interested in the coefficients on variable "D". -library(sandwich) # heterokedasticity robust standard errors -library(lmtest) # coefficient testing -coeftest(CL, vcov = vcovHC(CL, type="HC1")) -coeftest(CRA, vcov = vcovHC(CRA, type="HC1")) -coeftest(IRA, vcov = vcovHC(IRA, type="HC1")) -``` - -# Using classical standard errors (non-robust) is misleading here. 
- -We don't teach non-robust standard errors in econometrics courses, but the default statistical inference for lm() procedure in R, summary.lm(), still uses 100-year old concepts, perhaps in part due to historical legacy. - -Here the non-robust standard errors suggest that there is not much difference between the different approaches, contrary to the conclusions reached using the robust standard errors. - - -```{r} -summary(CL) -summary(CRA) -summary(IRA) -``` - -# Verify Asymptotic Approximations Hold in Finite-Sample Simulation Experiment - -```{r _uuid="051d70d956493feee0c6d64651c6a088724dca2a", _execution_state="idle"} -set.seed(123) -n = 1000 -B= 1000 - -CLs = rep(0, B) -CRAs = rep(0, B) -IRAs = rep(0, B) - -for ( i in 1:B){ - Z = rnorm(n) - Y0 = -Z + rnorm(n) - Y1 = Z + rnorm(n) - Z = Z - mean(Z) - D = (runif(n)<.1) - D = D- mean(D) - Y = Y1*D + Y0*(1-D) - CLs[i]= lm(Y ~ D)$coef[2] - CRAs[i] = lm(Y ~ D+ Z)$coef[2] - IRAs[i] = lm(Y ~ D+ Z+ Z*D)$coef[2] - } - -print("Standard deviations for estimators") - -sqrt(mean(CLs^2)) -sqrt(mean(CRAs^2)) -sqrt(mean(IRAs^2)) -``` diff --git a/CM1/Old/r-notebook-some-rct-examples.irnb.Rmd b/CM1/Old/r-notebook-some-rct-examples.irnb.Rmd deleted file mode 100644 index baed1868..00000000 --- a/CM1/Old/r-notebook-some-rct-examples.irnb.Rmd +++ /dev/null @@ -1,259 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - -This notebook contains some RCT examples for teaching. - - - -# Polio RCT - -One of the earliest randomized experiments were the Polio vaccination trials conducted by the Public Health Service in 1954. The question was whether Salk vaccine prevented polio. Children in the study were randomly assigned either a treatment (polio vaccine shot) or a placebo (saline solution shot), without knowing which one they received. The doctors in the study, making the diagnosis, did not know whether a child received a vaccine or not. In other words, the trial was a double-blind, randomized control trial. The trial had to be large, because the rate at which Polio occured in the population was 50 per 100,000. The treatment group saw 33 polio cases per 200,745; the control group saw 115 cases per 201,229. The estimated avearage treatment effect is about -$$ --40 -$$ -with the 95% confidence band (based on approximate normality of the two sample means and their differences): -$$[-52, -28].$$ -As this is an RCT, the confidence band suggests that the Polio vaccine **caused** the reduction in the risk of polio. - -The interesting thing here is that we don't need the underlying individual data to evaluate the effectivess of the vaccine. This is because the outcomes are Bernoulli random variables, and we have enough information to compute the estimate of ATE as well as the confidence intervals from the group case counts. - -We also compute the Vaccine Efficacy metric, which refers to the following measure according to the [CDC](https://www.cdc.gov/csels/dsepd/ss1978/lesson3/section6.html): -$$ -VE = \frac{\text{Risk for Unvaccinated - Risk for Vaccinated}}{\text{Risk for Unvaccinated}}. -$$ -It describes the relative reduction in risk caused by vaccination. - - -It is staightforward to get the VE estimate by just plugging-in the numbers, but how do we get the approximate variance estimate? 
I am too lazy to do calculations for the delta method, so I will just use a simulation (a form of approximate bootstrap) to obtain the confidence intervals. - - - - -```{r _uuid="8f2839f25d086af736a60e9eeb907d3b93b6e0e5", _cell_guid="b1076dfc-b9ad-4769-8c92-a6c4dae69d19"} -NV = 200745 # number of vaccinated (treated) -NU = 201229 # number of unvaccinated (control) -RV= 33/NV # average outcome for vaccinated -RU =115/NU # average outcome for unvaccinated -VE = (RU - RV)/RU; # vaccine efficacy - -# incidence per 100000 -Incidence.RV=RV*100000 -Incidence.RU=RU*100000 - -print(paste("Incidence per 100000 among treated:", round(Incidence.RV,4))) - -print(paste("Incidence per 100000 among controlled:", round(Incidence.RU,4))) - -# treatment effect - estimated reduction in incidence per 100000 people -delta.hat = 100000*(RV-RU) - -print(paste("Estimated ATE of occurances per 100,000 is", round(delta.hat,4))) - -# variance, standard deviation and confidence interval of ATE using that outcomes are Bernoulli -Var.RV = RV*(1-RV)/NV -Var.RU = RU*(1-RU)/NU -Var.delta.hat = 100000^2*(Var.RV + Var.RU) -Std.delta.hat = sqrt(Var.delta.hat) - -print(paste("Standard deviation for ATE is", round(Std.delta.hat,4))) - -CI.delta = c(delta.hat -1.96*sqrt(Var.delta.hat), - delta.hat +1.96*sqrt(Var.delta.hat)) - -print(paste("95% confidence interval of ATE is [", round(CI.delta[1],4), ",", - round(CI.delta[2],4), "]" )) - -print(paste("Overall VE is", round(VE,4) )) - -# we use an approximate bootstrap to find the confidence interval of vaccine efficacy -# via Monte Carlo draws -set.seed(1) -B = 10000 # number of bootstraps -RVs = RV + rnorm(B)*sqrt(Var.RV) -RUs = RU + rnorm(B)*sqrt(Var.RU) -VEs= (RUs - RVs)/RUs - -plot(density(VEs), col=2, main="Approximate Distribution of VE estimates") - -CI.VE = quantile(VEs, c(.025, .975)) # use the empirical quantiles from the bootstraps - -print(paste("95% confidence interval of VE is [", round(CI.VE[1],4), ",", - round(CI.VE[2],4), "]")) -``` - -# Pfizer/BNTX Covid-19 RCT - -Here is a link to the FDA [briefing](https://www.fda.gov/media/144245/download) and an interesting [discussion]( -https://garycornell.com/2020/12/09/statistics-in-the-pfizer-data-how-good-is-the-vaccine/?fbclid=IwAR282lS0Vl3tWmicQDDhIJAQCMO8NIsCXyWbUWwTtPuKcnuJ2v0VWXRDQac), as well as data. - -Pfizer/BNTX was the first vaccine approved for emergency use to reduce the risk of Covid-19 decease. In studies to assess vaccine efficacy, volunteers were randomly assigned to receive either a treatment (2-dose vaccination) or a placebo, without knowing which they recieved. The doctors making the diagnoses did not know now whether a given volunteer received a vaccination or not. The results of the study are given in the following table. - -![](https://lh6.googleusercontent.com/oiO6gYom1UZyrOhgpFx2iq8ike979u3805JHiVygP-Efh1Yaz2ttyPcgWKlT1AqHDM4v46th3EPIkOvRLyXA0fNUloPL-mL9eOFmSAzfbNOHyCZSQ0DyzMhcFUtQuZ520R5Qd2lj): - -Here we see both the overall effects and the effects by age group. The confidence intervals for the overall ATE are tight and suggest high effectiveness of the vaccine. The confidence intervals for the age group 65-75 are much wider due to the relatively small number of people in this group. We could group 65-75 and >75 groups to evaluate the effectiveness of the vaccine for this broader age group and narrow the width of the confidence band. - -We use the same approach as that for the Polio example. 
This gives slightly different results than the FDA result, because the FDA used inversion of exact binomial tests to construct confidence intervals. We use asymptotic approaches based on approximate normality, which is more crude, but delivers a rather similar result. - -```{r _uuid="d629ff2d2480ee46fbb7e2d37f6b5fab8052498a", _cell_guid="79c7e3d0-c299-4dcb-8224-4455121ee9b0"} -NV = 19965; # number vaccinated -NU = 20172; # number unvaccinated -RV = 9/NV; # average outcome for vaccinated -RU = 169/NU; # average outcome for unvaccinated -VE = (RU - RV)/RU; # vaccine efficacy - -# incidence per 100000 -Incidence.RV=RV*100000 -Incidence.RU=RU*100000 - -print(paste("Incidence per 100000 among vaccinated:", round(Incidence.RV,4))) - -print(paste("Incidence per 100000 among unvaccinated:", round(Incidence.RU,4))) - -# treatment effect - estimated reduction in incidence per 100000 people -delta.hat = 100000*(RV-RU) - -print(paste("Estimated ATE of occurrences per 100,000 is", round(delta.hat,4))) - -# variance, standard deviation and confidence interval of ATE using that outcomes are Bernoulli -Var.RV = RV*(1-RV)/NV -Var.RU = RU*(1-RU)/NU -Var.delta.hat = 100000^2*(Var.RV + Var.RU) -Std.delta.hat = sqrt(Var.delta.hat) - -print(paste("Standard deviation for ATE is", round(Std.delta.hat,4))) - -CI.delta = c(delta.hat -1.96*sqrt(Var.delta.hat), - delta.hat +1.96*sqrt(Var.delta.hat)) - -print(paste("95% confidence interval of ATE is [", round(CI.delta[1],4), ",", - round(CI.delta[2],4), "]" )) - -print(paste("Overall VE is", round(VE,4) )) - -# we use an approximate bootstrap to find the VE confidence interval -# using Monte Carlo draws as before -set.seed(1) -B = 10000 -RVs = RV + rnorm(B)*sqrt(Var.RV) -RUs = RU + rnorm(B)*sqrt(Var.RU) -VEs= (RUs - RVs)/RUs - -plot(density(VEs), col=2, main="Approximate Distribution of VE estimates") - -CI.VE = quantile(VEs, c(.025, .975)) - -print(paste("95% confidence interval of VE is [", round(CI.VE[1],4), ",", - round(CI.VE[2],4), "]" )) -``` - -In the code cell below we calculate the effectiveness of the vaccine for the two groups that are 65 or older. - -```{r} -# Here we calculate the overall effectiveness of the vaccine for the two groups that are 65 or older -NV = 3239+805; -NU = 3255+812; -RV = 1/NV; -RU = (14+5)/NU; -VE = (RU - RV)/RU; - -print(paste("Overall VE is", round(VE,4)) ) - -Var.RV = RV*(1-RV)/NV -Var.RU = RU*(1-RU)/NU - -# As before, we use an approximate bootstrap to find the confidence intervals -# using Monte Carlo draws - -set.seed(1) -B = 10000 - RVs = RV + rnorm(B)*sqrt(Var.RV)+ 10^(-10) - RUs = RU + rnorm(B)*sqrt(Var.RU)+ 10^(-10) - VEs= (RUs - RVs)/RUs - -plot(density(VEs), col=2, main="Approximate Distribution of VE estimates") - -CI.VE = quantile(VEs, c(.025, .975)) - -print(paste("two-sided 95 % confidence interval is [", CI.VE[1], ",", - CI.VE[2], "]" )) - -OneSidedCI.VE = quantile(VEs, c(.05)) - -print(paste("one-sided 95 % confidence interval is [", OneSidedCI.VE[1], ",", - 1, "]" )) -``` - -Let's try the parametric bootstrap next, using the fact that the outcome is Bernoulli.
- -```{r} -NV = 3239+805; -NU = 3255+812; -RV = 1/NV; -RU = (14+5)/NU; -VE = (RU - RV)/RU; - -print(paste("Overall VE is", VE) ) - -set.seed(1) -B = 10000 #number of simulation draw - RVs = rbinom(100000, size= NV, prob = RV) - RUs = rbinom(100000, size= NU, prob = RU) - VEs= (RUs - RVs)/RUs - -plot(density(VEs), col=2, main="Approximate Distribution of VE estimates") - -CI.VE = quantile(VEs, c(.025, .975)) - -print(paste("two-sided 95 % confidence interval is [", CI.VE[1], ",", - CI.VE[2], "]" )) - -OneSidedCI.VE = quantile(VEs, c(.05)) - -print(paste("one-sided 95 % confidence interval is [", OneSidedCI.VE[1], ",", 1, "]" )) -``` - -# Exact Binomial Test Inversion - -It is possible to use exact inference by inverting tests based on the exact binomial nature of the outcome variable. Here, we perform the Cornfield Procedure to find the exact confidence interval on the estimate of vaccine efficacy. - -```{r} -# Exact CI exploiting Bernoulli outcome using the Cornfield Procedure -install.packages("ORCI") -library(ORCI) - -NV = 19965; -NU = 20172; -RV = 9/NV; -RU = 169/NU; -VE = (RU - RV)/RU; - -1- Cornfieldexact.CI(9, NV, 169, NU, conf = 0.95, interval = c(1e-08, 1e+08)) -``` - -Note that this exactly recovers the result in the FDA table (first row). - - -Next we repeat the cornfield procedure to find the exact confidence interval on vaccine effectiveness for the two groups that are 65 or older. Here we see a big discrepancy between various asymptotic approaches and the exact finite-sample inference. This occurs because the binomial counts are too low for central limit theorems to work successfully. - -```{r} -# Exact CI exploiting Bernoulli outcome for the two groups that are 65 or older -NV = 3239+805; -NU = 3255+812; -RV = 1/NV; -RU = (14+5)/NU; -VE = (RU - RV)/RU; - -1- Cornfieldexact.CI(1, NV, 19, NU, conf = 0.95, interval = c(1e-08, 1e+08)) -``` diff --git a/CM1/r-rct-penn-precision-adj.Rmd b/CM1/r-rct-penn-precision-adj.Rmd deleted file mode 100644 index 5a2dc0fa..00000000 --- a/CM1/r-rct-penn-precision-adj.Rmd +++ /dev/null @@ -1,238 +0,0 @@ ---- -title: An R Markdown document converted from "CM1/r-rct-penn-precision-adj.irnb" -output: html_document ---- - -# Analyzing RCT data with Precision Adjustment - -```{r} -install.packages("sandwich") -install.packages("lmtest") -install.packages("xtable") -install.packages("hdm") -``` - -```{r} -library(sandwich) -library(lmtest) -library(xtable) -library(hdm) -``` - -## Data - -In this lab, we analyze the Pennsylvania re-employment bonus experiment, which was previously studied in "Sequential testing of duration data: the case of the Pennsylvania ‘reemployment bonus’ experiment" (Bilias, 2000), among others. These experiments were conducted in the 1980s by the U.S. Department of Labor to test the incentive effects of alternative compensation schemes for unemployment insurance (UI). In these experiments, UI claimants were randomly assigned either to a control group or one of five treatment groups. Actually, there are six treatment groups in the experiments. Here we focus on treatment group 4, but feel free to explore other treatment groups. In the control group the current rules of the UI applied. Individuals in the treatment groups were offered a cash bonus if they found a job within some pre-specified period of time (qualification period), provided that the job was retained for a specified duration. 
The treatments differed in the level of the bonus, the length of the qualification period, and whether the bonus was declining over time in the qualification period; see http://qed.econ.queensu.ca/jae/2000-v15.6/bilias/readme.b.txt for further details on data. - - -```{r} -## loading the data -file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/penn_jae.dat" -Penn <- as.data.frame(read.table(file, header = TRUE)) - -n <- dim(Penn)[1] -p_1 <- dim(Penn)[2] -Penn <- subset(Penn, tg == 4 | tg == 0) -attach(Penn) -``` - -```{r} -T4 <- (tg == 4) -summary(T4) -``` - -```{r} -head(Penn) -``` - -### Model -To evaluate the impact of the treatments on unemployment duration, we consider the linear regression model: - -$$ -Y = D \beta_1 + W'\beta_2 + \varepsilon, \quad E \varepsilon (D,W')' = 0, -$$ - -where $Y$ is the log of duration of unemployment, $D$ is a treatment indicators, and $W$ is a set of controls including age group dummies, gender, race, number of dependents, quarter of the experiment, location within the state, existence of recall expectations, and type of occupation. Here $\beta_1$ is the ATE, if the RCT assumptions hold rigorously. - - -We also consider interactive regression model: - -$$ -Y = D \alpha_1 + D W' \alpha_2 + W'\beta_2 + \varepsilon, \quad E \varepsilon (D,W', DW')' = 0, -$$ -where $W$'s are demeaned (apart from the intercept), so that $\alpha_1$ is the ATE, if the RCT assumptions hold rigorously. - -Under RCT, the projection coefficient $\beta_1$ has -the interpretation of the causal effect of the treatment on -the average outcome. We thus refer to $\beta_1$ as the average -treatment effect (ATE). Note that the covariates, here are -independent of the treatment $D$, so we can identify $\beta_1$ by -just linear regression of $Y$ on $D$, without adding covariates. -However we do add covariates in an effort to improve the -precision of our estimates of the average treatment effect. - -### Analysis - -We consider - -* classical 2-sample approach, no adjustment (CL) -* classical linear regression adjustment (CRA) -* interactive regression adjusment (IRA) - -and carry out robust inference using the *estimatr* R packages. - -# Carry out covariate balance check - - -We first look at the coefficients individually with a $t$-test, and then we adjust the $p$-values to control for family-wise error. - -The regression below is done using "type='HC1'" which computes the correct Eicker-Huber-White standard errors, instead of the classical non-robust formula based on homoscedasticity. - -```{r} -data <- model.matrix(T4 ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 + - agelt35 + agegt54 + durable + lusd + husd)^2) - -# individual t-tests -m <- lm(T4 ~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 + - agelt35 + agegt54 + durable + lusd + husd)^2, data = as.data.frame(data)) -coeftest(m, vcov = vcovHC(m, type = "HC1")) -``` - - - -To test balance conditions, we employ the Holm-Bonferroni step-down method. With 100+ hypotheses, the family-wise type I error, or the probability of making at least one type I error treating all hypotheses independently, is close to 1. To control for this, we adjust p-values with the following procedure. - -First, set $\alpha=0.05$ and denote the list of $n$ p-values from the regression with the vector $p$. - -1. Sort $p$ from smallest to largest, so $p_{(1)} \leq p_{(2)} \leq \cdots \leq p_{(n)}$. Denote the corresponding hypothesis for $p_{(i)}$ as $H_{(i)}$. -2. 
For $i=1,\ldots, n$, -- If $$p_{(i)} > \frac{\alpha}{n-i+1} $$ Break the loop and do not reject any $H_{(j)}$ for $j \geq i$. -- Else reject $H_{(i)}$ if $$p_{(i)} \leq \frac{\alpha}{n-i+1} $$ Increment $i := i+1$. - - - - -```{r} -holm_bonferroni <- function(p, alpha = 0.05) { - n <- length(p) - sig_beta <- c() - - for (i in 1:n) { - if (sort(p)[i] > alpha / (n - i + 1)) { - break - } else { - sig_beta <- c(sig_beta, order(p)[i]) - } - } - - return(sig_beta) -} - -p_values <- as.vector(coeftest(m, vcov = vcovHC(m, type = "HC1"))[, 4]) -significant_indices <- holm_bonferroni(p_values, alpha = 0.05) -print(paste("Significant Coefficients (Indices): ", significant_indices)) -``` - -There is also a built in R function to do this. - -```{r} -p_values <- as.vector(coeftest(m, vcov = vcovHC(m, type = "HC1"))[, 4]) -holm_reject <- p.adjust(sort(p_values), "holm") <= 0.05 -holm_reject -``` - -We see that that even though this is a randomized experiment, balance conditions are failed. - - -# Model Specification - -```{r} -# model specifications - -# no adjustment (2-sample approach) -formula_cl <- log(inuidur1) ~ T4 - -# adding controls -formula_cra <- log(inuidur1) ~ T4 + (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 + - agelt35 + agegt54 + durable + lusd + husd)^2 -# Omitted dummies: q1, nondurable, muld - -ols_cl <- lm(formula_cl) -ols_cra <- lm(formula_cra) - -ols_cl <- coeftest(ols_cl, vcov = vcovHC(ols_cl, type = "HC1")) -ols_cra <- coeftest(ols_cra, vcov = vcovHC(ols_cra, type = "HC1")) - -print(ols_cl) -print(ols_cra) -``` - -The interactive specificaiton corresponds to the approach introduced in Lin (2013). - -```{r} -# interactive regression model; - -X <- model.matrix(~ (female + black + othrace + factor(dep) + q2 + q3 + q4 + q5 + q6 + - agelt35 + agegt54 + durable + lusd + husd)^2)[, -1] -dim(X) - -demean <- function(x) { - x - mean(x) -} - -X <- apply(X, 2, demean) - -ols_ira <- lm(log(inuidur1) ~ T4 * X) -ols_ira <- coeftest(ols_ira, vcov = vcovHC(ols_ira, type = "HC1")) -print(ols_ira) -``` - -Next we try out partialling out with lasso - -```{r} -T4 <- demean(T4) - -DX <- model.matrix(~ T4 * X)[, -1] - -rlasso_ira <- summary(rlassoEffects(DX, log(inuidur1), index = 1)) - -print(rlasso_ira) -``` - -### Results - -```{r} -str(ols_ira) -ols_ira[2, 1] -``` - -```{r} -table <- matrix(0, 2, 4) -table[1, 1] <- ols_cl[2, 1] -table[1, 2] <- ols_cra[2, 1] -table[1, 3] <- ols_ira[2, 1] -table[1, 4] <- rlasso_ira[[1]][1] - -table[2, 1] <- ols_cl[2, 2] -table[2, 2] <- ols_cra[2, 2] -table[2, 3] <- ols_ira[2, 2] -table[2, 4] <- rlasso_ira[[1]][2] - - -colnames(table) <- c("CL", "CRA", "IRA", "IRA w Lasso") -rownames(table) <- c("estimate", "standard error") -tab <- xtable(table, digits = 5) -tab - -print(tab, type = "latex", digits = 5) -``` - -Treatment group 4 experiences an average decrease of about $7.8\%$ in the length of unemployment spell. - - -Observe that regression estimators delivers estimates that are slighly more efficient (lower standard errors) than the simple 2 mean estimator, but essentially all methods have very similar standard errors. From IRA results we also see that there is not any statistically detectable heterogeneity. We also see the regression estimators offer slightly lower estimates -- these difference occur perhaps to due minor imbalance in the treatment allocation, which the regression estimators try to correct. 
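To read these estimates in percentage terms, here is a minimal sketch (assuming the results matrix `table` constructed in the previous cell is still in memory); the conversion $100(\exp(\hat\beta)-1)$ turns a log-duration coefficient into an approximate percent change in the length of the unemployment spell.

```{r}
# Convert log-duration estimates into approximate percent changes (a sketch,
# assuming the results matrix `table` from the previous cell is in memory)
percent_change <- 100 * (exp(table["estimate", ]) - 1)
round(percent_change, 2)
```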
- - - diff --git a/CM1/r-rct-vaccines.Rmd b/CM1/r-rct-vaccines.Rmd deleted file mode 100644 index 43c78d85..00000000 --- a/CM1/r-rct-vaccines.Rmd +++ /dev/null @@ -1,275 +0,0 @@ ---- -title: An R Markdown document converted from "CM1/r-rct-vaccines.irnb" -output: html_document ---- - -This notebook contains some RCT examples for teaching. - -```{r} -install.packages("PropCIs") # Exact CI exploiting Bernoulli outcome using the Cornfield Procedure -``` - -```{r} -library(PropCIs) -``` - - -# Polio RCT - -One of the earliest randomized experiments were the Polio vaccination trials conducted by the Public Health Service in 1954. The question was whether Salk vaccine prevented polio. Children in the study were randomly assigned either a treatment (polio vaccine shot) or a placebo (saline solution shot), without knowing which one they received. The doctors in the study, making the diagnosis, did not know whether a child received a vaccine or not. In other words, the trial was a double-blind, randomized controlled trial. The trial had to be large, because the rate at which Polio occured in the population was 50 per 100,000. The treatment group saw 33 polio cases per 200,745; the control group saw 115 cases per 201,229. The estimated average treatment effect is about -$$ --40 -$$ -with the 95% confidence band (based on approximate normality of the two sample means and their differences): -$$[-52, -28].$$ -As this is an RCT, the confidence band suggests that the Polio vaccine **caused** the reduction in the risk of polio. - -The interesting thing here is that we don't need the underlying individual data to evaluate the effectivess of the vaccine. This is because the outcomes are Bernoulli random variables, and we have enough information to compute the estimate of ATE as well as the confidence intervals from the group case counts. - -We also compute the Vaccine Efficacy metric, which refers to the following measure according to the [CDC](https://www.cdc.gov/csels/dsepd/ss1978/lesson3/section6.html): -$$ -VE = \frac{\text{Risk for Unvaccinated - Risk for Vaccinated}}{\text{Risk for Unvaccinated}}. -$$ -It describes the relative reduction in risk caused by vaccination. - - -It is staightforward to get the VE estimate by just plugging-in the numbers, but how do we get the approximate variance estimate? I am too lazy to do calculations for the delta method, so I will just use a simulation (a form of approximate bootstrap) to obtain the confidence intervals. 
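As an aside, the delta-method calculation is actually short. Below is a minimal sketch that duplicates the Polio counts defined in the next cell; it should give an interval close to the simulation-based one computed there.

```{r}
# Delta-method variance for VE = 1 - RV/RU (a sketch; same Polio counts as in the next cell)
NV <- 200745 # number vaccinated
NU <- 201229 # number unvaccinated
RV <- 33 / NV # risk among vaccinated
RU <- 115 / NU # risk among unvaccinated
VE <- 1 - RV / RU

var_rv <- RV * (1 - RV) / NV
var_ru <- RU * (1 - RU) / NU

# gradient of g(rv, ru) = 1 - rv/ru is (-1/ru, rv/ru^2)
var_ve <- (1 / RU)^2 * var_rv + (RV / RU^2)^2 * var_ru

ci_ve_delta <- VE + c(-1.96, 1.96) * sqrt(var_ve)
round(ci_ve_delta, 4)
```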
- - -```{r} -NV <- 200745 # number of vaccinated (treated) -NU <- 201229 # number of unvaccinated (control) -RV <- 33 / NV # average outcome for vaccinated -RU <- 115 / NU # average outcome for unvaccinated -VE <- (RU - RV) / RU -# vaccine efficacy - -# incidence per 100000 -incidence_rv <- RV * 100000 -incidence_ru <- RU * 100000 - -print(paste("Incidence per 100000 among treated:", round(incidence_rv, 4))) - -print(paste("Incidence per 100000 among controlled:", round(incidence_ru, 4))) - -# treatment effect - estimated reduction in incidence per 100000 people -delta_hat <- 100000 * (RV - RU) - -print(paste("Estimated ATE of occurances per 100,000 is", round(delta_hat, 4))) - -# variance, standard deviation and confidence interval of ATE using that outcomes are Bernoulli -var_rv <- RV * (1 - RV) / NV -var_ru <- RU * (1 - RU) / NU -var_delta_hat <- 100000^2 * (var_rv + var_ru) -std_delta_hat <- sqrt(var_delta_hat) - -print(paste("Standard deviation for ATE is", round(std_delta_hat, 4))) - -ci_delta <- c( - delta_hat - 1.96 * sqrt(var_delta_hat), - delta_hat + 1.96 * sqrt(var_delta_hat) -) - -print(paste( - "95% confidence interval of ATE is [", round(ci_delta[1], 4), ",", - round(ci_delta[2], 4), "]" -)) - -print(paste("Overall VE is", round(VE, 4))) - -# we use an approximate bootstrap to find the confidence interval of vaccine efficacy -# via Monte Carlo draws -set.seed(1) -B <- 10000 # number of bootstraps -RVs <- RV + rnorm(B) * sqrt(var_rv) -RUs <- RU + rnorm(B) * sqrt(var_ru) -VEs <- (RUs - RVs) / RUs - -plot(density(VEs), col = 2, main = "Approximate Distribution of VE estimates") - -ci_ve <- quantile(VEs, c(.025, .975)) # use the empirical quantiles from the bootstraps - -print(paste( - "95% confidence interval of VE is [", round(ci_ve[1], 4), ",", - round(ci_ve[2], 4), "]" -)) -``` - -# Pfizer/BNTX Covid-19 RCT - -Here is a link to the FDA [briefing](https://www.fda.gov/media/144245/download) and an interesting [discussion]( -https://garycornell.com/2020/12/09/statistics-in-the-pfizer-data-how-good-is-the-vaccine/?fbclid=IwAR282lS0Vl3tWmicQDDhIJAQCMO8NIsCXyWbUWwTtPuKcnuJ2v0VWXRDQac), as well as data. - -Pfizer/BNTX was the first vaccine approved for emergency use to reduce the risk of Covid-19 decease. In studies to assess vaccine efficacy, volunteers were randomly assigned to receive either a treatment (2-dose vaccination) or a placebo, without knowing which they received. The doctors making the diagnoses did not know now whether a given volunteer received a vaccination or not. The results of the study are given in the following table. - -![](https://lh6.googleusercontent.com/oiO6gYom1UZyrOhgpFx2iq8ike979u3805JHiVygP-Efh1Yaz2ttyPcgWKlT1AqHDM4v46th3EPIkOvRLyXA0fNUloPL-mL9eOFmSAzfbNOHyCZSQ0DyzMhcFUtQuZ520R5Qd2lj): - -Here we see both the overall effects and the effects by age group. The confidence intervals for the overall ATE are tight and suggest high effectiveness of the vaccine. The confidence intervals for the age groups 65-74 and 75+ are much wider due to the relatively small number of people in these groups. We could group 65-74 and >75 groups to evaluate the effectiveness of the vaccine for this broader age group and narrow the width of the confidence band. - -We use the same approach as that for the Polio example. This gives slightly different results than the FDA result, because the FDA used inversion of exact binomial tests to construct confidence intervals. 
We use asymptotic approches based on approximate normality, which is more crude, but delivers a rather similar result. - -```{r} -NV <- 19965 -# number vaccinated -NU <- 20172 -# number unvaccinated -RV <- 9 / NV -# average outcome for vaccinated -RU <- 169 / NU -# average outcome for unvaccinated -VE <- (RU - RV) / RU -# vaccine efficacy - -# incidence per 100000 -incidence_rv <- RV * 100000 -incidence_ru <- RU * 100000 - -print(paste("Incidence per 100000 among vaccinated:", round(incidence_rv, 4))) - -print(paste("Incidence per 100000 among unvaccinated:", round(incidence_ru, 4))) - -# treatment effect - estimated reduction in incidence per 100000 people -delta_hat <- 100000 * (RV - RU) - -print(paste("Estimated ATE of occurances per 100,000 is", round(delta_hat, 4))) - -# variance, standard deviation and confidence interval of ATE using that outcomes are Bernoulli -var_rv <- RV * (1 - RV) / NV -var_ru <- RU * (1 - RU) / NU -var_delta_hat <- 100000^2 * (var_rv + var_ru) -std_delta_hat <- sqrt(var_delta_hat) - -print(paste("Standard deviation for ATE is", round(std_delta_hat, 4))) - -ci_delta <- c( - delta_hat - 1.96 * sqrt(var_delta_hat), - delta_hat + 1.96 * sqrt(var_delta_hat) -) - -print(paste( - "95% confidence interval of ATE is [", round(ci_delta[1], 4), ",", - round(ci_delta[2], 4), "]" -)) - -print(paste("Overall VE is", round(VE, 4))) - -# we use an approximate bootstrap to find the VE confidence interval -# using Monte Carlo draws as before -set.seed(1) -B <- 10000 -RVs <- RV + rnorm(B) * sqrt(var_rv) -RUs <- RU + rnorm(B) * sqrt(var_ru) -VEs <- (RUs - RVs) / RUs - -plot(density(VEs), col = 2, main = "Approximate Distribution of VE estimates") - -ci_ve <- quantile(VEs, c(.025, .975)) - -print(paste( - "95% confidence interval of VE is [", round(ci_ve[1], 4), ",", - round(ci_ve[2], 4), "]" -)) -``` - -In the code cell below we calculate the effectiveness of the vaccine for the two groups that are 65 or older - -```{r} -# Here we calculate the overall effectiveness of the vaccine for the two groups that are 65 or older -NV <- 3239 + 805 -NU <- 3255 + 812 -RV <- 1 / NV -RU <- (14 + 5) / NU -VE <- (RU - RV) / RU -print(paste("Overall VE is", round(VE, 4))) - -var_rv <- RV * (1 - RV) / NV -var_ru <- RU * (1 - RU) / NU - -# As before, we use an approximate bootstrap to find the confidence intervals -# using Monte Carlo draws - -set.seed(1) -B <- 10000 -RVs <- RV + rnorm(B) * sqrt(var_rv) + 10^(-10) -RUs <- RU + rnorm(B) * sqrt(var_ru) + 10^(-10) -VEs <- (RUs - RVs) / RUs - -plot(density(VEs), col = 2, main = "Approximate Distribution of VE estimates") - -ci_ve <- quantile(VEs, c(.025, .975)) - -print(paste( - "two-sided 95 % confidence interval is [", ci_ve[1], ",", - ci_ve[2], "]" -)) - -one_sided_ci_ve <- quantile(VEs, c(.05)) - -print(paste( - "one-sided 95 % confidence interval is [", one_sided_ci_ve[1], ",", - 1, "]" -)) -``` - -Let's try the parametric boostrap next, using the fact that the outcome is Bernouli. 
- -```{r} -NV <- 3239 + 805 -NU <- 3255 + 812 -RV <- 1 / NV -RU <- (14 + 5) / NU -VE <- (RU - RV) / RU -print(paste("Overall VE is", VE)) - -set.seed(1) -B <- 10000 # number of simulation draw -RVs <- rbinom(100000, size = NV, prob = RV) -RUs <- rbinom(100000, size = NU, prob = RU) -VEs <- (RUs - RVs) / RUs - -plot(density(VEs), col = 2, main = "Approximate Distribution of VE estimates") - -ci_ve <- quantile(VEs, c(.025, .975)) - -print(paste( - "two-sided 95 % confidence interval is [", ci_ve[1], ",", - ci_ve[2], "]" -)) - -one_sided_ci_ve <- quantile(VEs, c(.05)) - -print(paste("one-sided 95 % confidence interval is [", one_sided_ci_ve[1], ",", 1, "]")) -``` - -# Exact Binomial Test Inversion - -It is possible to use exact inference by inverting tests based on the exact binomial nature of the outcome variable. Here, we perform the Cornfield Procedure to find the exact confidence interval on the estimate of vaccine efficacy. - -```{r} -# Exact CI exploiting Bernoulli outcome using the Cornfield Procedure -NV <- 19965 -NU <- 20172 -RV <- 9 / NV -RU <- 169 / NU -VE <- (RU - RV) / RU - -1 - riskscoreci(9, NV, 169, NU, 0.95)$conf.int[2] -1 - riskscoreci(9, NV, 169, NU, 0.95)$conf.int[1] -``` - -Note that this exactly recovers the result in the FDA table (first row). - -Next we repeat the cornfield procedure to find the exact confidence interval on vaccine effectiveness for the two groups that are 65 or older. Here we see a big discrepancy between various asymptotic approaches and the exact finite-sample inference. This occurs because the binomial counts are too low for central limit theorems to work successfully. - -```{r} -# Exact CI exploiting Bernoulli outcome for the two groups that are 65 or older -NV <- 3239 + 805 -NU <- 3255 + 812 -RV <- 1 / NV -RU <- (14 + 5) / NU -VE <- (RU - RV) / RU - -1 - riskscoreci(1, NV, 19, NU, 0.95)$conf.int[2] -1 - riskscoreci(1, NV, 19, NU, 0.95)$conf.int[1] -``` - diff --git a/CM1/r-sim-precision-adj.Rmd b/CM1/r-sim-precision-adj.Rmd deleted file mode 100644 index da358e95..00000000 --- a/CM1/r-sim-precision-adj.Rmd +++ /dev/null @@ -1,117 +0,0 @@ ---- -title: An R Markdown document converted from "CM1/r-sim-precision-adj.irnb" -output: html_document ---- - -# Analyzing RCT with Precision by Adjusting for Baseline Covariates - -```{r} -install.packages("sandwich") -install.packages("lmtest") -``` - -```{r} -library(sandwich) # heterokedasticity robust standard errors -library(lmtest) # coefficient testing -``` - -# Jonathan Roth's DGP - -Here we set up a DGP with heterogenous effects. In this example, which is due to Jonathan Roth, we have -$$ -E [Y(0) | Z] = - Z, \quad E [Y(1) |Z] = Z, \quad Z \sim N(0,1). -$$ -The CATE is -$$ -E [Y(1) - Y(0) | Z ]= 2 Z. -$$ -and the ATE is -$$ -2 E Z = 0. -$$ - -We would like to estimate the ATE as precisely as possible. - -An economic motivation for this example could be provided as follows: Let D be the treatment of going to college, and let $Z$ be academic skills. Suppose that academic skills cause lower earnings Y(0) in jobs that don't require a college degree, and cause higher earnings Y(1) in jobs that do require college degrees. This type of scenario is reflected in the DGP set-up above. 
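Before generating the main simulation data, here is a tiny self-contained check (a sketch with its own seed and sample size, separate from the cell below) that the ATE in this DGP is indeed zero:

```{r}
# Sanity check: E[Y(1) - Y(0)] = E[2 Z] = 0 under this DGP (a sketch)
set.seed(42)
n_check <- 100000
z_check <- rnorm(n_check)
y0_check <- -z_check + rnorm(n_check)
y1_check <- z_check + rnorm(n_check)
mean(y1_check - y0_check) # should be close to 0
```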
- - -```{r} -# generate the simulated dataset -set.seed(123) # set MC seed -n <- 1000 # sample size -Z <- rnorm(n) # generate Z -Y0 <- -Z + rnorm(n) # conditional average baseline response is -Z -Y1 <- Z + rnorm(n) # conditional average treatment effect is +Z -D <- (runif(n) < .2) # treatment indicator; only 20% get treated -Y <- Y1 * D + Y0 * (1 - D) # observed Y -D <- D - mean(D) # demean D -Z <- Z - mean(Z) # demean Z -``` - -# Analyze the RCT data with Precision Adjustment - -Consider the follow regression models: - -* classical 2-sample approach, no adjustment (CL) -* classical linear regression adjustment (CRA) -* interactive regression adjusment (IRA) - -We carry out inference using heteroskedasticity robust inference, using the sandwich formulas for variance (Eicker-Huber-White). - -We observe that the CRA delivers estimates that are less efficient than the CL (pointed out by Freedman), whereas the IRA delivers a more efficient approach (pointed out by Lin). In order for the CRA to be more efficient than the CL, we need the linear model to be a correct model of the conditional expectation function of Y given D and X, which is not the case here. - -```{r} -# implement each of the models on the simulated data -CL <- lm(Y ~ D) -CRA <- lm(Y ~ D + Z) # classical -IRA <- lm(Y ~ D + Z + Z * D) # interactive approach - -# we are interested in the coefficients on variable "D". -coeftest(CL, vcov = vcovHC(CL, type = "HC1")) -coeftest(CRA, vcov = vcovHC(CRA, type = "HC1")) -coeftest(IRA, vcov = vcovHC(IRA, type = "HC1")) -``` - -# Using classical standard errors (non-robust) is misleading here. - -We don't teach non-robust standard errors in econometrics courses, but the default statistical inference for lm() procedure in R, summary.lm(), still uses 100-year old concepts, perhaps in part due to historical legacy. - -Here the non-robust standard errors suggest that there is not much difference between the different approaches, contrary to the conclusions reached using the robust standard errors. - -```{r} -summary(CL) -summary(CRA) -summary(IRA) -``` - -# Verify Asymptotic Approximations Hold in Finite-Sample Simulation Experiment - -```{r} -set.seed(123) -n <- 1000 -B <- 1000 - -CLs <- rep(0, B) -CRAs <- rep(0, B) -IRAs <- rep(0, B) - -for (i in 1:B) { - Z <- rnorm(n) - Y0 <- -Z + rnorm(n) - Y1 <- Z + rnorm(n) - Z <- Z - mean(Z) - D <- (runif(n) < .1) - D <- D - mean(D) - Y <- Y1 * D + Y0 * (1 - D) - CLs[i] <- lm(Y ~ D)$coef[2] - CRAs[i] <- lm(Y ~ D + Z)$coef[2] - IRAs[i] <- lm(Y ~ D + Z + Z * D)$coef[2] -} - -print("Standard deviations for estimators") - -sqrt(mean(CLs^2)) -sqrt(mean(CRAs^2)) -sqrt(mean(IRAs^2)) -``` - diff --git a/CM2/Old/r-colliderbias-hollywood.irnb.Rmd b/CM2/Old/r-colliderbias-hollywood.irnb.Rmd deleted file mode 100644 index 5c57b0b3..00000000 --- a/CM2/Old/r-colliderbias-hollywood.irnb.Rmd +++ /dev/null @@ -1,66 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - -# Collider Bias - - -Here is a simple mnemonic example to illustate the collider or M-bias. - -Here the idea is that people who get to Hollywood have to have high congenility = talent + beauty. Funnily enough this induces a negative correlation between talents and looks, when we condition on the set of actors or celebrities. This simple example explains an anecdotal observation that "talent and beaty are negatively correlated" for celebrities. 
- -```{r} -install.packages("dagitty") -library(dagitty) -``` - -```{r} -g <- dagitty( "dag{ T -> C <- B }" ) -plot(g) -``` - -```{r _uuid="8f2839f25d086af736a60e9eeb907d3b93b6e0e5", _cell_guid="b1076dfc-b9ad-4769-8c92-a6c4dae69d19"} -#collider bias -n=1000000 -T = rnorm(n) #talent -B = rnorm(n) #beaty -C = T+B + rnorm(n) #congeniality -T.H= subset(T, C>0) # condition on C>0 -B.H= subset(B, C>0) # condition on C>0 - -summary(lm(T~ B)) #regression of T on B -summary(lm(T~ B +C)) #regression of T on B and C -summary(lm(T.H~ B.H)) #regression of T on B, conditional on C>0. - - - -``` - -We can also use package Dagitty to illustrate collider bias, also known as M-bias. - -```{r _uuid="d629ff2d2480ee46fbb7e2d37f6b5fab8052498a", _cell_guid="79c7e3d0-c299-4dcb-8224-4455121ee9b0"} -## If we want to infer causal effec of B on T, -## we can apply the command to figure out -## variables we should condition on: - -adjustmentSets( g, "T", "B" ) - -## empty set -- we should not condition on the additional -## variable C. - -## Generate data where C = .5T + .5B -set.seed( 123); d <- simulateSEM( g, .5 ) -confint( lm( T ~ B, d ) )["B",] # includes 0 -confint( lm( T ~ B + C, d ) )["B",] # does not include 0 - -``` diff --git a/CM2/r-colliderbias-hollywood.Rmd b/CM2/r-colliderbias-hollywood.Rmd deleted file mode 100644 index 674a968e..00000000 --- a/CM2/r-colliderbias-hollywood.Rmd +++ /dev/null @@ -1,59 +0,0 @@ ---- -title: An R Markdown document converted from "CM2/r-colliderbias-hollywood.irnb" -output: html_document ---- - -# Collider Bias - -Here is a simple mnemonic example to illustate the collider or M-bias. - -Here the idea is that people who get to Hollywood have to have high congenility = talent + beauty. Funnily enough this induces a negative correlation between talents and looks, when we condition on the set of actors or celebrities. This simple example explains an anecdotal observation that "talent and beauty are negatively correlated" for celebrities. - -```{r} -install.packages("dagitty") -``` - -```{r} -library(dagitty) -``` - -```{r} -causal_graph <- dagitty("dag{ Talent -> Congen <- Beauty }") -plot(causal_graph) -``` - -```{r} -# collider bias -n <- 1000000 -Talent <- rnorm(n) # talent -Beauty <- rnorm(n) # beauty -Congen <- Talent + Beauty + rnorm(n) # congeniality -TalentH <- subset(Talent, Congen > 0) # condition on Congen>0 -BeautyH <- subset(Beauty, Congen > 0) # condition on Congen>0 - -summary(lm(Talent ~ Beauty)) # regression of Talent on Beauty -summary(lm(Talent ~ Beauty + Congen)) # regression of Talent on Beauty and Congen -summary(lm(TalentH ~ BeautyH)) # regression of Talent on Beauty, conditional on Congen>0. -``` - -We can also use package Dagitty to illustrate collider bias, also known as M-bias. - -```{r} -## If we want to infer causal effect of Beauty on Talent, -## we can apply the command to figure out -## variables we should condition on: - -adjustmentSets(causal_graph, "Talent", "Beauty") - -## empty set -- we should not condition on the additional -## variable Congen. 
-``` - -```{r} -## Generate data where Congen = .5*Talent + .5*Beauty -set.seed(123) -data <- simulateSEM(causal_graph, .5) -confint(lm(Talent ~ Beauty, data))["Beauty", ] -confint(lm(Talent ~ Beauty + Congen, data))["Beauty", ] -``` - diff --git a/CM3/Old/notebook-dagitty.irnb.Rmd b/CM3/Old/notebook-dagitty.irnb.Rmd deleted file mode 100644 index 73737f28..00000000 --- a/CM3/Old/notebook-dagitty.irnb.Rmd +++ /dev/null @@ -1,220 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - -# Causal Identification in DAGs using Backdoor and Swigs, Equivalence Classes, Falsifiability Tests - - -```{r _uuid="051d70d956493feee0c6d64651c6a088724dca2a", _execution_state="idle"} -#install and load package -install.packages("dagitty") -install.packages("ggdag") -library(dagitty) -library(ggdag) - -``` - -# Graph Generation and Plotting - - -The following DAG is due to Judea Pearl - -```{r} -#generate a couple of DAGs and plot them - -G = dagitty('dag{ -Z1 [pos="-2,-1.5"] -X1 [pos="-2,0"] -Z2 [pos="1.5,-1.5"] -X3 [pos="1.5, 0"] -Y [outcome,pos="1.5,1.5"] -D [exposure,pos="-2,1.5"] -M [mediator, pos="0,1.5"] -X2 [pos="0,0"] -Z1 -> X1 -X1 -> D -Z1 -> X2 -Z2 -> X3 -X3 -> Y -Z2 -> X2 -D -> Y -X2 -> Y -X2 -> D -M->Y -D->M -}') - - -ggdag(G)+ theme_dag() -``` - -# Report Relatives of X2 - -```{r} -print(parents(G, "X2")) -print(children(G, "X2")) -print(ancestors(G, "X2")) -print(descendants(G, "X2")) - - -``` - -# Find Paths Between D and Y - - - -```{r} -paths(G, "D", "Y") -``` - -# List All Testable Implications of the Model - -```{r} -print( impliedConditionalIndependencies(G) ) -``` - -# Identification by Backdoor: List minimal adjustment sets to identify causal effecs $D \to Y$ - -```{r} -print( adjustmentSets( G, "D", "Y" ) ) -``` - -# Identification via SWIG and D-separation - -```{r} -SWIG = dagitty('dag{ -Z1 [pos="-2,-1.5"] -X1 [pos="-2,0"] -Z2 [pos="1.5,-1.5"] -X3 [pos="1.5, 0"] -Yd [outcome,pos="1.5,1.5"] -D [exposure,pos="-2,1.5"] -d [pos="-1, 1.5"] -Md [mediator, pos="0,1.5"] -X2 [pos="0,0"] -Z1 -> X1 -X1 -> D -Z1 -> X2 -Z2 -> X3 -X3 -> Yd -Z2 -> X2 -X2 -> Yd -X2 -> D -X3-> Yd -Md-> Yd -d-> Md -}') - -ggdag(SWIG)+ theme_dag() - -``` - - -# Deduce Conditional Exogeneity or Ignorability by D-separation - - -```{r} -print( impliedConditionalIndependencies(SWIG)[5:8] ) - -``` - -This coincides with the backdoor criterion for this graph. - - -# Print All Average Effects Identifiable by Conditioning - -```{r} -for( n in names(G) ){ - for( m in children(G,n) ){ - a <- adjustmentSets( G, n, m ) - if( length(a) > 0 ){ - cat("The effect ",n,"->",m, - " is identifiable by controlling for:\n",sep="") - print( a, prefix=" * " ) - } - } -} -``` - -# Equivalence Classes - -```{r} -P=equivalenceClass(G) -plot(P) -#equivalentDAGs(G,10) -``` - -Next Consider the elemntary Triangular Model: -$$ -D \to Y, \quad X \to (D,Y). -$$ -This model has not testable implications and is Markov-equivalent to any other DAG difined on names $(X, D, Y)$. - -```{r} -G3<- dagitty('dag{ -D -> Y -X -> D -X -> Y -} -') - -ggdag(G3)+ theme_dag() - -print(impliedConditionalIndependencies(G3)) - - -``` - -```{r} -P=equivalenceClass(G3) -plot(P) -equivalentDAGs(G3,10) - - -``` - - -# Example of Testing DAG Validity - -Next we simulate the data from a Linear SEM associated to DAG G, and perform a test of conditional independence restrictions, exploting linearity. 
- - -There are many other options for nonlinear models and discrete categorical variabales. Type help(localTests). - - -```{r} -set.seed(1) -x <- simulateSEM(G) -head(x) -#cov(x) -localTests(G, data = x, type = c("cis")) - - -``` - -Next we replaced $D$ by $\bar D$ generated differently: -$$ -\bar D= (D + Y)/2 -$$ -So basically $\bar D$ is an average of $D$ and $Y$ generated by $D$. We then test if the resulting collection of random variables satisifes conditional indepdendence restrictions, exploiting linearity. We end up rejectiong these restrictions and thefore the validity of this model for the data generated in this way. This makes sense, because the new data no longer obeys the previous DAG structure. - - - -```{r} -x.R = x -x.R$D = (x$D+ x$Y)/2 - -localTests(G, data = x.R, type = c("cis")) - - -``` diff --git a/CM3/Old/notebook-dosearch.irnb.Rmd b/CM3/Old/notebook-dosearch.irnb.Rmd deleted file mode 100644 index bd5b8e15..00000000 --- a/CM3/Old/notebook-dosearch.irnb.Rmd +++ /dev/null @@ -1,235 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - - -# Dosearch for Causal Identification in DAGs. - - -This a simple notebook for teaching that illustrates capabilites of the "dosearch" package, which is a great tool. - -NB. In my experience, the commands are sensitive to syntax ( e.g. spacing when -> are used), so be careful when changing to other examples. - - -```{r _uuid="8f2839f25d086af736a60e9eeb907d3b93b6e0e5", _cell_guid="b1076dfc-b9ad-4769-8c92-a6c4dae69d19"} -install.packages("dosearch") -library("dosearch") - - -``` - -We start with the simplest graph, with the simplest example -where $D$ is policy, $Y$ is outcomes, $X$ is a confounder: -$$ -D\to Y, \quad X \to (D,Y) -$$ - - - -Now suppose we want conditional average policy effect. - -```{r} -data <- "p(y,d,x)" #data structure - -query <- "p(y | do(d),x)" #query -- target parameter - -graph <- "x -> y - x -> d - d -> y" - -dosearch(data, query, graph) -``` - -This recovers the correct identification formula for law of the counterfactual $Y(d)$ induced by $do(D=d)$: -$$ -p_{Y(d)|X}(y|x) := p(y|do(d),x) = p(y|d,x). -$$ - -```{r} -data <- "p(y,d,x)" - -query <- "p(y | do(d))" - -graph <- "x -> y - x -> d - d -> y" - - -dosearch(data, query, graph) - -``` - -This recovers the correct identification formula: -$$ -p_{Y(d)}(y) := p(y: do(d)) = \sum_{x}\left(p(x)p(y|d,x)\right) -$$ -We integreate out $x$ in the previous formula. - - - -Suppose we don't observe the confounder. The effect is generally not identified. - - -```{r} - -data <- "p(y,d)" - -query <- "p(y | do(d))" - -graph <- "x -> y - x -> d - d -> y" - -dosearch(data, query, graph) - -``` - -The next graph is an example of J. Pearl (different notation), where the graph is considerably more complicated. We are interested in $D \to Y$. - -![image.png](attachment:image.png) - -Here we try conditioning on $X_2$. This would block one backdoor path from $D$ to $Y$, but would open another path on which $X_2$ is a collider, so this shouldn't work. The application below gave a correct answer (after I put the spacings carefully). - - -```{r} - -data <- "p(y,d,x2)" #observed only (Y, D, X_2) - -query<- "p(y|do(d))" #target parameter - -graph<- "z1 -> x1 -z1 -> x2 -z2 -> x2 -z2 -> x3 -x2 -> d -x2 -> y -x3 -> y -x1 -> d -d -> m -m -> y -" - -dosearch(data, query, graph) - -``` - -Intuitively, we should add more common causes. 
For example, adding $X_3$ and using $S = (X_2, X_3)$ should work. - -```{r} - -data <- "p(y,d,x2,x3)" - -conditional.query<- "p(y|do(d),x2, x3)" #can ID conditional average effect? -query<- "p(y|do(d))" #can ID unconditional effect? - -graph<- "z1 -> x1 -z1 -> x2 -z2 -> x2 -z2 -> x3 -x2 -> d -x2 -> y -x3 -> y -x1 -> d -d -> m -m -> y -" - -print(dosearch(data, conditional.query, graph)) -print(dosearch(data, query, graph)) - -``` - -This retrives correct formulas for counterfactual distributions of $Y(d)$ induced by $Do(D=d)$: - -The conditional distribution is identified by: -$$ -p_{Y(d) \mid X_2, X_3}(y) := p(y |x_2, x_3: do(d)) = p(y|x_2,x_3,d). -$$ - -The unconditional distribution is obtained by integration out $x_2$ and $x_3$: - -$$ -p_{Y(d) }(y) := p(y do(d)) = \sum_{x2,x3}\left(p(x_2,x_3)p(y|x_2,x_3,d)\right). -$$ - - - - - -Next we suppose that we observe only $(Y,D, M)$. Can we identify the effect $D \to Y$? Can we use back-door-criterion twice to get $D \to M$ and $M \to Y$ affect? Yes, that's called front-door criterion -- so we just need to remember only the back-door and the fact that we can use it iteratively. - -```{r} -data <- "p(y,d, m)" - -query.dm<- "p(m|do(d))" -query.md<- "p(y|do(m))" -query<- "p(y|do(d))" - -graph<- "z1 -> x1 -z1 -> x2 -z2 -> x2 -z2 -> x3 -x2 -> d -x2 -> y -x3 -> y -x1 -> d -d -> m -m -> y -" -print(dosearch(data, query.dm, graph)) -print(dosearch(data, query.md, graph)) -print(dosearch(data, query, graph)) - -``` - -So we get identification results: -First, -$$ -p_{M(d)}(m) := p(m|do(d)) = p(m|d). -$$ -Second, -$$ -p_{Y(m)}(y) := p(y|do(m)) = \sum_{d}\left(p(d)p(y|d,m)\right), -$$ -and the last by integrating the product of these two formulas: -$$ -p_{Y(d)}(y) := p(y|do(d)) = \sum_{m}\left(p(m|d)\sum_{d}\left(p(d)p(y|d,m)\right)\right) -$$ - - - -The package is very rich and allows identification analysis, when the data comes from multiple sources. Suppose we observe marginal distributions $(Y,D)$ and $(D,M)$ only. Can we identify the effect of $D \to Y$. The answer is (guess) and the package correctly recovers it. - -```{r} -data <- "p(y,m) - p(m,d)" - -query.dm<- "p(m|do(d))" -query.md<- "p(y|do(m))" -query<- "p(y|do(d))" - -graph<- "z1 -> x1 -z1 -> x2 -z2 -> x2 -z2 -> x3 -x2 -> d -x2 -> y -x3 -> y -x1 -> d -d -> m -m -> y -" -print(dosearch(data, query.dm, graph)) -print(dosearch(data, query.md, graph)) -print(dosearch(data, query, graph)) -``` diff --git a/CM3/r-dagitty.Rmd b/CM3/r-dagitty.Rmd deleted file mode 100644 index d7885b91..00000000 --- a/CM3/r-dagitty.Rmd +++ /dev/null @@ -1,199 +0,0 @@ ---- -title: An R Markdown document converted from "CM3/r-dagitty.irnb" -output: html_document ---- - -There are system packages that some of the R packages need. We install them here. 
- -```{r} -system("sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable") -system("sudo apt-get update") -system("sudo apt-get install libglpk-dev libgmp-dev libxml2-dev") -``` - -# Causal Identification in DAGs using Backdoor and Swigs, Equivalence Classes, Falsifiability Tests - -```{r} -# install and load package -install.packages("dagitty") -install.packages("ggdag") -``` - -```{r} -library(dagitty) -library(ggdag) -``` - -# Graph Generation and Plotting - -The following DAG is due to Judea Pearl - -```{r} -# generate a couple of DAGs and plot them - -G <- dagitty('dag{ -Z1 [pos="-2,-1.5"] -X1 [pos="-2,0"] -Z2 [pos="1.5,-1.5"] -X3 [pos="1.5, 0"] -Y [outcome,pos="1.5,1.5"] -D [exposure,pos="-2,1.5"] -M [mediator, pos="0,1.5"] -X2 [pos="0,0"] -Z1 -> X1 -X1 -> D -Z1 -> X2 -Z2 -> X3 -X3 -> Y -Z2 -> X2 -D -> Y -X2 -> Y -X2 -> D -M->Y -D->M -}') - - -ggdag(G) + theme_dag() -``` - -# Report Relatives of X2 - -```{r} -print(parents(G, "X2")) -print(children(G, "X2")) -print(ancestors(G, "X2")) -print(descendants(G, "X2")) -``` - -# Find Paths Between D and Y - - -```{r} -paths(G, "D", "Y") -``` - -# List All Testable Implications of the Model - -```{r} -print(impliedConditionalIndependencies(G)) -``` - -# Identification by Backdoor: List minimal adjustment sets to identify causal effecs $D \to Y$ - -```{r} -print(adjustmentSets(G, "D", "Y")) -``` - -# Identification via SWIG and D-separation - -```{r} -SWIG <- dagitty('dag{ -Z1 [pos="-2,-1.5"] -X1 [pos="-2,0"] -Z2 [pos="1.5,-1.5"] -X3 [pos="1.5, 0"] -Yd [outcome,pos="1.5,1.5"] -D [exposure,pos="-2,1.5"] -d [pos="-1, 1.5"] -Md [mediator, pos="0,1.5"] -X2 [pos="0,0"] -Z1 -> X1 -X1 -> D -Z1 -> X2 -Z2 -> X3 -X3 -> Yd -Z2 -> X2 -X2 -> Yd -X2 -> D -X3-> Yd -Md-> Yd -d-> Md -}') - -ggdag(SWIG) + theme_dag() -``` - - -# Deduce Conditional Exogeneity or Ignorability by D-separation - -```{r} -print(impliedConditionalIndependencies(SWIG)[5:8]) -``` - -This coincides with the backdoor criterion for this graph. - -# Print All Average Effects Identifiable by Conditioning - -```{r} -for (n in names(G)) { - for (m in children(G, n)) { - a <- adjustmentSets(G, n, m) - if (length(a) > 0) { - cat("The effect ", n, "->", m, " is identifiable by controlling for:\n", sep = "") - print(a, prefix = " * ") - } - } -} -``` - -# Equivalence Classes - -```{r} -P <- equivalenceClass(G) -plot(P) -``` - -Next Consider the elemntary Triangular Model: -$$ -D \to Y, \quad X \to (D,Y). -$$ -This model has no testable implications and is Markov-equivalent to any other DAG difined on names $(X, D, Y)$. - -```{r} -G3 <- dagitty("dag{ -D -> Y -X -> D -X -> Y -} -") - -ggdag(G3) + theme_dag() - -print(impliedConditionalIndependencies(G3)) -``` - -```{r} -P <- equivalenceClass(G3) -plot(P) -equivalentDAGs(G3, 10) -``` - -# Example of Testing DAG Validity - -Next we simulate the data from a Linear SEM associated to DAG G, and perform a test of conditional independence restrictions, exploting linearity. - - -There are many other options for nonlinear models and discrete categorical variabales. Type help(localTests). - -```{r} -set.seed(1) -x <- simulateSEM(G) -head(x) -localTests(G, data = x, type = c("cis")) -``` - -Next we replaced $D$ by $\bar D$ generated differently: -$$ -\bar D= (D + Y)/2. -$$ -$\bar D$ is an average of $D$ and $Y$ generated by $D$. We then test if the resulting collection of random variables satisifes conditional independence restrictions, exploiting linearity. 
We end up rejectiong these restrictions and therefore the validity of this model for the data generated in this way. This makes sense, because the new data no longer obeys the previous DAG structure. - - -```{r} -xR <- x -xR$D <- (x$D + x$Y) / 2 - -localTests(G, data = xR, type = c("cis")) -``` - diff --git a/CM3/r-dosearch.Rmd b/CM3/r-dosearch.Rmd deleted file mode 100644 index b5d2b0e0..00000000 --- a/CM3/r-dosearch.Rmd +++ /dev/null @@ -1,215 +0,0 @@ ---- -title: An R Markdown document converted from "CM3/r-dosearch.irnb" -output: html_document ---- - -# Dosearch for Causal Identification in DAGs. - - -This a simple notebook for teaching that illustrates capabilites of the "dosearch" package, which is a great tool. - -NB. In my experience, the commands are sensitive to syntax ( e.g. spacing when -> are used), so be careful when changing to other examples. - -```{r} -install.packages("dosearch") -``` - -```{r} -library("dosearch") -``` - -We start with the simplest graph, with the simplest example -where $D$ is policy, $Y$ is outcomes, $X$ is a confounder: -$$ -D\to Y, \quad X \to (D,Y) -$$ - -Now suppose we want conditional average policy effect. - -```{r} -data <- "p(y, d, x)" # data structure - -query <- "p(y | do(d), x)" # query -- target parameter - -graph <- " - x -> y - x -> d - d -> y -" - -dosearch(data, query, graph) -``` - -This recovers the correct identification formula for law of the counterfactual $Y(d)$ induced by $do(D=d)$: -$$ -p_{Y(d)|X}(y|x) := p(y|do(d),x) = p(y|d,x). -$$ - -```{r} -data <- "p(y, d, x)" - -query <- "p(y | do(d))" - -graph <- " - x -> y - x -> d - d -> y -" - -dosearch(data, query, graph) -``` - -This recovers the correct identification formula: -$$ -p_{Y(d)}(y) := p(y: do(d)) = \sum_{x}\left(p(x)p(y|d,x)\right) -$$ -We integrate out $x$ in the previous formula. - -Suppose we don't observe the confounder. The effect is generally not identified. - -```{r} -data <- "p(y, d)" - -query <- "p(y | do(d))" - -graph <- " - x -> y - x -> d - d -> y -" - -dosearch(data, query, graph) -``` - -The next graph is an example of J. Pearl (different notation), where the graph is considerably more complicated. See "Pearl's Example" in the book - e.g. Figure 7.14. We are interested in $D \to Y$. - -Here we try conditioning on $X_2$. This would block one backdoor path from $D$ to $Y$, but would open another path on which $X_2$ is a collider, so this shouldn't work. The application below gave a correct answer (after I put the spacings carefully). - -```{r} -data <- "p(y, d, x2)" # observed only (Y, D, X_2) - -query <- "p(y | do(d))" # target parameter - -graph <- " - z1 -> x1 - z1 -> x2 - z2 -> x2 - z2 -> x3 - x2 -> d - x2 -> y - x3 -> y - x1 -> d - d -> m - m -> y -" - -dosearch(data, query, graph) -``` - -Intuitively, we should add more common causes. For example, adding $X_3$ and using $S = (X_2, X_3)$ should work. - -```{r} -data <- "p(y, d, x2, x3)" - -conditional_query <- "p(y | do(d), x2, x3)" # can ID conditional average effect? -query <- "p(y | do(d))" # can ID unconditional effect? - -graph <- " - z1 -> x1 - z1 -> x2 - z2 -> x2 - z2 -> x3 - x2 -> d - x2 -> y - x3 -> y - x1 -> d - d -> m - m -> y -" - -print(dosearch(data, conditional_query, graph)) -print(dosearch(data, query, graph)) -``` - -This retrieves the correct formulas for counterfactual distributions of $Y(d)$ induced by $Do(D=d)$: - -The conditional distribution is identified by -$$ -p_{Y(d) \mid X_2, X_3}(y) := p(y |x_2, x_3: do(d)) = p(y|x_2,x_3,d). 
-$$ - -The unconditional distribution is obtained by integration out over $x_2$ and $x_3$: - -$$ -p_{Y(d) }(y) := p(y do(d)) = \sum_{x2,x3}\left(p(x_2,x_3)p(y|x_2,x_3,d)\right). -$$ - - - -Next we suppose that we observe only $(Y,D, M)$. Can we identify the effect $D \to Y$? Can we use back-door-criterion twice to get $D \to M$ and $M \to Y$ affect? Yes, that's called front-door criterion -- so we just need to remember only the back-door and the fact that we can use it iteratively. - -```{r} -data <- "p(y, d, m)" - -query_dm <- "p(m | do(d))" -query_md <- "p(y | do(m))" -query <- "p(y | do(d))" - -graph <- " - z1 -> x1 - z1 -> x2 - z2 -> x2 - z2 -> x3 - x2 -> d - x2 -> y - x3 -> y - x1 -> d - d -> m - m -> y -" -print(dosearch(data, query_dm, graph)) -print(dosearch(data, query_md, graph)) -print(dosearch(data, query, graph)) -``` - -So we get identification results: -First, -$$ -p_{M(d)}(m) := p(m|do(d)) = p(m|d). -$$ -Second, -$$ -p_{Y(m)}(y) := p(y|do(m)) = \sum_{d}\left(p(d)p(y|d,m)\right), -$$ -and the last by integrating the product of these two formulas: -$$ -p_{Y(d)}(y) := p(y|do(d)) = \sum_{m}\left(p(m|d)\sum_{d}\left(p(d)p(y|d,m)\right)\right) -$$ - -The package is very rich and allows identification analysis, when the data comes from multiple sources. Suppose we observe marginal distributions $(Y,D)$ and $(D,M)$ only. Can we identify the effect of $D \to Y$. The answer is (guess) and the package correctly recovers it. - -```{r} -data <- "p(y, m) - p(m, d)" - -query_dm <- "p(m | do(d))" -query_md <- "p(y | do(m))" -query <- "p(y | do(d))" - -graph <- " - z1 -> x1 - z1 -> x2 - z2 -> x2 - z2 -> x3 - x2 -> d - x2 -> y - x3 -> y - x1 -> d - d -> m - m -> y -" -print(dosearch(data, query_dm, graph)) -print(dosearch(data, query_md, graph)) -print(dosearch(data, query, graph)) -``` - diff --git a/PM1/Old/ols-and-lasso-for-wage-gap-inference.irnb.Rmd b/PM1/Old/ols-and-lasso-for-wage-gap-inference.irnb.Rmd deleted file mode 100644 index 396faa4a..00000000 --- a/PM1/Old/ols-and-lasso-for-wage-gap-inference.irnb.Rmd +++ /dev/null @@ -1,419 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - -This notebook contains an example for teaching. - - -# An inferential problem: The Wage Gap - - -In the previous lab, we analyzed data from the March Supplement of the U.S. Current Population Survey (2015) and answered the question of how to use job-relevant characteristics, such as education and experience, to best predict wages. Now, we focus on the following inference question: - -What is the difference in predicted wages between men and women with the same job-relevant characteristics? - -Thus, we analyze if the differences in the wages of groups defined by the "sex" variable in the data. This wage gap may partly reflect *discrimination* against women, both in the labor market and in settings that affect future labor marketability such as in education, and may partly reflect a *selection effect*, namely that women are relatively more likely to take on occupations that pay somewhat less (for example, school teaching). 
- - -To investigate the wage gap, we consider the following log-linear regression model - -\begin{align} -\log(Y) &= \beta'X + \epsilon\\ -&= \beta_1 D + \beta_2' W + \epsilon, -\end{align} - -where $Y$ is hourly wage, $D$ is the indicator of being female ($1$ if female and $0$ otherwise) and the -$W$'s are a vector of worker characteristics explaining variation in wages. Considering transformed wages by the logarithm, we are analyzing the relative difference in the payment of men and women. - - -## Data analysis - - -We consider the same subsample of the U.S. Current Population Survey (2015) as in the previous lab. Let us load the data set. - -```{r} -load("../data/wage2015_subsample_inference.Rdata") -attach(data) -dim(data) -``` - -To start our (causal) analysis, we compare the sample means given the "sex" variable: - -```{r} -library(xtable) - -Z <- data[which(colnames(data) %in% c("lwage","sex","shs","hsg","scl","clg","ad","ne","mw","so","we","exp1"))] - -data_female <- data[data$sex==1,] -Z_female <- data_female[which(colnames(data) %in% c("lwage","sex","shs","hsg","scl","clg","ad","ne","mw","so","we","exp1"))] - -data_male <- data[data$sex==0,] -Z_male <- data_male[which(colnames(data) %in% c("lwage","sex","shs","hsg","scl","clg","ad","ne","mw","so","we","exp1"))] - -table <- matrix(0, 12, 3) -table[1:12,1] <- as.numeric(lapply(Z,mean)) -table[1:12,2] <- as.numeric(lapply(Z_male,mean)) -table[1:12,3] <- as.numeric(lapply(Z_female,mean)) -rownames(table) <- c("Log Wage","Sex","Less then High School","High School Graduate","Some College","College Graduate","Advanced Degree", "Northeast","Midwest","South","West","Experience") -colnames(table) <- c("All","Men","Women") -tab<- xtable(table, digits = 4) -tab -``` - -```{r, results = FALSE} -print(tab,type="html") # set type="latex" for printing table in LaTeX -``` - - - - - - - - - - - - - - - - - -
|                        |     All |     Men |   Women |
|------------------------|--------:|--------:|--------:|
| Log Wage               |  2.9708 |  2.9878 |  2.9495 |
| Sex                    |  0.4445 |  0.0000 |  1.0000 |
| Less than High School  |  0.0233 |  0.0318 |  0.0127 |
| High School Graduate   |  0.2439 |  0.2943 |  0.1809 |
| Some College           |  0.2781 |  0.2733 |  0.2840 |
| College Graduate       |  0.3177 |  0.2940 |  0.3473 |
| Advanced Degree        |  0.1371 |  0.1066 |  0.1752 |
| Northeast              |  0.2596 |  0.2590 |  0.2604 |
| Midwest                |  0.2965 |  0.2981 |  0.2945 |
| South                  |  0.2161 |  0.2209 |  0.2101 |
| West                   |  0.2278 |  0.2220 |  0.2350 |
| Experience             | 13.7606 | 13.7840 | 13.7313 |
- - -In particular, the table above shows that the difference in average *logwage* between men and women is equal to $0.038$ - -```{r} -mean(data_female$lwage)-mean(data_male$lwage) -``` - -Thus, the unconditional wage gap is about $3.8$\% for the group of never married workers (women get paid less on average in our sample). We also observe that never married working women are relatively more educated than working men and have lower working experience. - - -This unconditional (predictive) effect of the variable "sex" equals the coefficient $\beta$ in the univariate ols regression of $Y$ on $D$: - -\begin{align} -\log(Y) &=\beta D + \epsilon. -\end{align} - - -We verify this by running an ols regression in R. - -```{r} -library(sandwich) # a package used to compute robust standard errors -nocontrol.fit <- lm(lwage ~ sex) -nocontrol.est <- summary(nocontrol.fit)$coef["sex",1] -HCV.coefs <- vcovHC(nocontrol.fit, type = 'HC'); # HC HEW - "heteroskedasticity consistent" -nocontrol.se <- sqrt(diag(HCV.coefs))[2] # Estimated std errors - -# print unconditional effect of sex and the corresponding standard error -cat ("The estimated coefficient on the dummy for \"sex\" is",nocontrol.est," and the corresponding robust standard error is",nocontrol.se) - -``` - -Note that the standard error is computed with the *R* package *sandwich* to be robust to heteroskedasticity. - - - -Next, we run an ols regression of $Y$ on $(D,W)$ to control for the effect of covariates summarized in $W$: - -\begin{align} -\log(Y) &=\beta_1 D + \beta_2' W + \epsilon. -\end{align} - -Here, we are considering the flexible model from the previous lab. Hence, $W$ controls for experience, education, region, and occupation and industry indicators plus transformations and two-way interactions. - - -Let us run the ols regression with controls. - -```{r} -# ols regression with controls - -flex <- lwage ~ sex + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we) - -# Note that ()*() operation in formula objects in R creates a formula of the sort: -# (exp1+exp2+exp3+exp4)+ (shs+hsg+scl+clg+occ2+ind2+mw+so+we) + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we) -# This is not intuitive at all, but that's what it does. - -control.fit <- lm(flex, data=data) -control.est <- summary(control.fit)$coef[2,1] - -summary(control.fit) - -cat("Coefficient for OLS with controls", control.est) - -HCV.coefs <- vcovHC(control.fit, type = 'HC'); -control.se <- sqrt(diag(HCV.coefs))[2] # Estimated std errors -JCV.coefs <- vcovHC(control.fit, type = 'HC3'); # Jackknife estimate is more appropriate in moderate dimensional settings -control.Jse <- sqrt(diag(JCV.coefs))[2] # Estimated std errors -``` - -We now show how the conditional gap and the remainder decompose the marginal wage gap into the parts explained and unexplained by the additional controls. (Note that this does *not* explain why there is a difference in the controls to begin with in the two groups.) 
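Concretely, writing $\bar{L}_f$ and $\bar{L}_m$ for the group means of $\log(Y)$ and $\bar{W}_f$, $\bar{W}_m$ for the group means of the controls, the identity computed in the next cell is (a sketch in the notation of the model above)

$$
\bar{L}_f - \bar{L}_m \;=\; \underbrace{\hat\beta_1}_{\text{unexplained}} \;+\; \underbrace{\hat\beta_2'\,(\bar{W}_f - \bar{W}_m)}_{\text{explained}},
$$

where $\hat\beta_1$ and $\hat\beta_2$ are the OLS coefficients on $D$ and $W$ from the regression with controls. The within-group mean residuals drop out because OLS residuals are orthogonal to both $D$ and the intercept.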
- -```{r} -cat("The marginal gap:",mean(data_female$lwage)-mean(data_male$lwage)) - -cat("The unexplained difference: ",control.est) - -XX0 = model.matrix(~(exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we), data = data[data$sex==0,]) -y0 = data[data$sex==0,]$lwage -XX1 = model.matrix(~(exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we), data = data[data$sex==1,]) -y1 = data[data$sex==1,]$lwage -mu1 = colMeans(XX1) -mu0 = colMeans(XX0) -betarest = summary(control.fit)$coef[3:246,1] # the coefficients excluding intercept and "sex" -cat("The explained difference:",sum(betarest*(mu1[2:245]-mu0[2:245]))) - -cat("The sum of these differences:",control.est + sum(betarest*(mu1[2:245]-mu0[2:245]))) -``` - -We next consider a Oaxaca-Blinder decomposition that also incorporates an interaction term. - -```{r} -library(MASS) -beta0 = ginv(t(XX0) %*% XX0) %*% t(XX0) %*% y0 -beta1 = ginv(t(XX1) %*% XX1) %*% t(XX1) %*% y1 - -cat("The marginal gap:",mean(data_female$lwage)-mean(data_male$lwage)) -cat("The unexplained difference:",beta1[1]-beta0[1]) -cat("The difference explained by endowment:",sum(beta0[2:245]*(mu1[2:245]-mu0[2:245]))) -cat("The difference explained by coefficient:",sum((beta1[2:245]-beta0[2:245])*mu1[2:245])) -cat("The sum of these differences:",beta1[1]-beta0[1] + sum(beta0[2:245]*(mu1[2:245]-mu0[2:245])) + sum((beta1[2:245]-beta0[2:245])*mu1[2:245])) -``` - -Let's compare Huber-Eicker-White (HEW) standard errors to jackknife standard errors (which are more appropriate in moderate dimensional settings.) We can see that they're pretty close in this case. - -```{r} -cat("HEW s.e. : ", control.se) -cat("Jackknife s.e. : ", control.Jse) - -``` - -The estimated regression coefficient $\beta_1\approx-0.0696$ measures how our linear prediction of wage changes if we set the "sex" variable $D$ from 0 to 1, holding the controls $W$ fixed. -We can call this the *predictive effect* (PE), as it measures the impact of a variable on the prediction we make. Overall, we see that the unconditional wage gap of size $4$\% for women increases to about $7$\% after controlling for worker characteristics. - - - -Next, we use the Frisch-Waugh-Lovell (FWL) theorem from lecture, partialling-out the linear effect of the controls via ols. - -```{r} -# Partialling-out using ols - -# models -flex.y <- lwage ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we) # model for Y -flex.d <- sex ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we) # model for D - -# partialling-out the linear effect of W from Y -t.Y <- lm(flex.y, data=data)$res -# partialling-out the linear effect of W from D -t.D <- lm(flex.d, data=data)$res - -# regression of Y on D after partialling-out the effect of W -partial.fit <- lm(t.Y~t.D-1) -partial.est <- summary(partial.fit)$coef[1] - -cat("Coefficient for D via partialling-out", partial.est) - -# standard error -HCV.coefs <- vcovHC(partial.fit, type = 'HC') -partial.se <- sqrt(diag(HCV.coefs)) -# Note that jackknife standard errors depend on all the variables in the model and so are not appropriate for the partialed out regression (without adjustment) - - -# confidence interval -confint(partial.fit) -``` - -Again, the estimated coefficient measures the linear predictive effect (PE) of $D$ on $Y$ after taking out the linear effect of $W$ on both of these variables. This coefficient is numerically equivalent to the estimated coefficient from the ols regression with controls, confirming the FWL theorem. 
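A quick numerical check of this equivalence (a minimal sketch, assuming `control.est` and `partial.est` from the cells above are still in memory):

```{r}
# FWL check: the two point estimates should agree up to numerical precision
cat("full regression estimate: ", control.est, "\n")
cat("partialling-out estimate: ", partial.est, "\n")
cat("difference:               ", control.est - partial.est, "\n")
```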
- - -We know that the partialling-out approach works well when the dimension of $W$ is low -in relation to the sample size $n$. When the dimension of $W$ is relatively high, we need to use variable selection -or penalization for regularization purposes. - -In the following, we illustrate the partialling-out approach using lasso instead of ols. - -```{r} -# Partialling-out using lasso - -library(hdm) - -# models -flex.y <- lwage ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we) # model for Y -flex.d <- sex ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we) # model for D - -# partialling-out the linear effect of W from Y -t.Y <- rlasso(flex.y, data=data)$res -# partialling-out the linear effect of W from D -t.D <- rlasso(flex.d, data=data)$res - -# regression of Y on D after partialling-out the effect of W -partial.lasso.fit <- lm(t.Y~t.D-1) -partial.lasso.est <- summary(partial.lasso.fit)$coef[1] - -cat("Coefficient for D via partialling-out using lasso", partial.lasso.est) - -# standard error -HCV.coefs <- vcovHC(partial.lasso.fit, type = 'HC') -partial.lasso.se <- sqrt(diag(HCV.coefs)) -``` - -Using lasso for partialling-out here provides similar results as using ols. - - -Next, we summarize the results. - -```{r} -table<- matrix(0, 4, 2) -table[1,1]<- nocontrol.est -table[1,2]<- nocontrol.se -table[2,1]<- control.est -table[2,2]<- control.se -table[3,1]<- partial.est -table[3,2]<- partial.se -table[4,1]<- partial.lasso.est -table[4,2]<- partial.lasso.se -colnames(table)<- c("Estimate","Std. Error") -rownames(table)<- c("Without controls", "full reg", "partial reg", "partial reg via lasso") -tab<- xtable(table, digits=c(3, 3, 4)) -tab -``` - -```{r, results = FALSE} -print(tab, type="html") -``` - - - - - - - - - -
|                       | Estimate | Std. Error |
|-----------------------|---------:|-----------:|
| Without controls      |   -0.038 |     0.0159 |
| full reg              |   -0.070 |     0.0150 |
| partial reg           |   -0.070 |     0.0150 |
| partial reg via lasso |   -0.072 |     0.0154 |
- - - -It it worth noticing that controlling for worker characteristics increases the wage gap from less than 4\% to 7\%. The controls we used in our analysis include 5 educational attainment indicators (less than high school graduates, high school graduates, some college, college graduate, and advanced degree), 4 region indicators (midwest, south, west, and northeast); a quartic term (first, second, third, and fourth power) in experience and 22 occupation and 23 industry indicators. - -Keep in mind that the predictive effect (PE) does not only measures discrimination (causal effect of being female), it also may reflect -selection effects of unobserved differences in covariates between men and women in our sample. - - - -Next we try an "extra" flexible model, where we take interactions of all controls, giving us about 800 non-redundant controls. - -```{r} -# extra flexible model - -extraflex <- lwage ~ sex + (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+occ2+ind2+mw+so+we)^2 - -control.fit <- lm(extraflex, data=data) -#summary(control.fit) -control.est <- summary(control.fit)$coef[2,1] - -cat("Number of Extra-Flex Controls", summary(control.fit)$df[1]-1, "\n") - -cat("Coefficient for OLS with extra flex controls", control.est) - -HCV.coefs <- vcovHC(control.fit, type = 'HC'); -control.se <- sqrt(diag(HCV.coefs))[2] # Estimated std errors - -JCV.coefs <- vcovHC(control.fit, type = 'HC3'); # Jackknife -control.Jse <- sqrt(diag(JCV.coefs))[2] # Estimated std errors - -cat("HEW s.e. : ", control.se) -cat("Jackknife s.e. : ", control.Jse) - -# From Cattaneo, Jannson, and Newey (2018), we expect jackknife s.e.'s to be -# conservative. -``` - -Interestingly, jackknife standard errors are undefined in this case. Due to the high dimensional control vector, we know that conventional heteroskedasticity robust standard errors will also be severely biased. That is, the approximation obtained under $p/n$ being small is clearly breaking down here. We might then like to implement Cattaneo, Jannson, and Newey (2018) (CJN) which is supposed to work in the $p/n \rightarrow c < 1$ regime. However, computing CJN requires inversion of a matrix which is computationally singular in this example (which is related to why the jackknife s.e. are undefined.) - -```{r, eval = FALSE, echo = TRUE} -# Try to make a brute force implementation of Cattaneo, Jannson, Newey (2018). -# This is slow and doesn't actually add anything as the matrix needed to -# construct CJN is (at least) numerically singular. Don't run this block. - -# models -extraflex.y <- lwage ~ (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+occ2+ind2+mw+so+we)^2 # model for Y -extraflex.d <- sex ~ (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+occ2+ind2+mw+so+we)^2 # model for D - -# CJN requires that M.*M is invertible where M = I-W(W'W)^{-1}W' and .* is the Hadamard product - -regexflex.y = lm(extraflex.y, data = data, x = TRUE) # Regression of outcome on controls -W = tmp$x # Design matrix -Wli = W[,!is.na(regexflex.y$coefficients)] # Linearly independent columns of W -np = dim(Wli) -M = diag(np[1])-Wli%*%solve(t(Wli)%*%Wli)%*%t(Wli) # Matrix M (after removing redundant columns) -scM = 1 - min(diag(M)) # This number needs to be smaller than 1/2 for CJN theoretical results -scM # Just stop here - -#MM = M^2 # Hadamard product M.*M - -#library(Matrix) -#rankMatrix(MM) # find the (numeric) rank of MM which ends up being less than its dimension. Tried to actually move forward and use other methods to invert/assess invertibility, but none worked. 
Not going to use a generalized inverse as this goes against the idea of the theory. - -# Not going to be able to compute CJN -``` - -We can also try to use Lasso to partial out the control variables. We'll justify this later. - -```{r} -library(hdm) - -# models -extraflex.y <- lwage ~ (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+occ2+ind2+mw+so+we)^2 # model for Y -extraflex.d <- sex ~ (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+occ2+ind2+mw+so+we)^2 # model for D - -# partialling-out the linear effect of W from Y -t.Y <- rlasso(extraflex.y, data=data)$res -# partialling-out the linear effect of W from D -t.D <- rlasso(extraflex.d, data=data)$res - -# regression of Y on D after partialling-out the effect of W -partial.lasso.fit <- lm(t.Y~t.D-1) -partial.lasso.est <- summary(partial.lasso.fit)$coef[1] - -cat("Coefficient for D via partialling-out using lasso", partial.lasso.est) - -# standard error -HCV.coefs <- vcovHC(partial.lasso.fit, type = 'HC3') -partial.lasso.se <- sqrt(diag(HCV.coefs)) -``` - -```{r} -table<- matrix(0, 2, 2) -table[1,1]<- control.est -table[1,2]<- control.se -table[2,1]<- partial.lasso.est -table[2,2]<- partial.lasso.se -colnames(table)<- c("Estimate","Std. Error") -rownames(table)<- c("full reg","partial reg via lasso") -tab<- xtable(table, digits=c(3, 3, 4)) -tab - -print(tab, type="latex") -``` - -In this case $p/n$ = 20%, that is $p/n$ is no longer small and we start seeing the differences between unregularized partialling out and regularized partialling out with lasso (double lasso). The results based on double lasso have rigorous guarantees in this non-small $p/n$ regime under approximate sparsity. The results based on OLS still -have guarantees in $p/n< 1$ regime under assumptions laid out in Cattaneo, Newey, and Jansson (2018), without approximate sparsity, although other regularity conditions are needed. These additional regularity conditions appear to potentially be violated in our "very flexible" specification. - - - - - diff --git a/PM1/Old/ols-and-lasso-for-wage-prediction.irnb.Rmd b/PM1/Old/ols-and-lasso-for-wage-prediction.irnb.Rmd deleted file mode 100644 index 342b5c8f..00000000 --- a/PM1/Old/ols-and-lasso-for-wage-prediction.irnb.Rmd +++ /dev/null @@ -1,440 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - -This notebook contains an example for teaching. - - -## Introduction - - -An important question in labor economics is what determines the wage of workers. This is a causal question, but we can begin to investigate it from a predictive perspective. - -In the following wage example, $Y$ is the hourly wage of a worker and $X$ is a vector of worker's characteristics, e.g., education, experience, gender. Two main questions here are - -* How can we use job-relevant characteristics, such as education and experience, to best predict wages? - -* What is the difference in predicted wages between men and women with the same job-relevant characteristics? - -In this lab, we focus on the prediction question first. - - -## Data - - - -The data set we consider is from the 2015 March Supplement of the U.S. Current Population Survey. We select white non-hispanic individuals, aged 25 to 64 years, and working more than 35 hours per week for at least 50 weeks of the year. 
We exclude self-employed workers; individuals living in group quarters; individuals in the military, agricultural or private household sectors; individuals with inconsistent reports on earnings and employment status; individuals with allocated or missing information in any of the variables used in the analysis; and individuals with hourly wage below $3$. - -The variable of interest $Y$ is the hourly wage rate constructed as the ratio of the annual earnings to the total number of hours worked, which is constructed in turn as the product of number of weeks worked and the usual number of hours worked per week. In our analysis, we also focus on single (never married) workers. The final sample is of size $n=5150$. - - -## Data analysis - - -We start by loading the data set. - -```{r} -load("../input/wage2015-inference/wage2015_subsample_inference.Rdata") -#load("../Data/wage2015_subsample_inference.Rdata") # To run locally on Hansen's PC -dim(data) -``` - -Let's have a look at the structure of the data. - -```{r} -str(data) -``` - -We construct the output variable $Y$ and the matrix $Z$ which includes the characteristics of workers that are given in the data. - -```{r} -# construct matrices for estimation from the data -Y <- log(data$wage) -n <- length(Y) -Z <- data[-which(colnames(data) %in% c("wage","lwage"))] -p <- dim(Z)[2] - -cat("Number of observations:", n, '\n') -cat( "Number of raw regressors:", p) -``` - -For the outcome variable *wage* and a subset of the raw regressors, we calculate the empirical mean to get familiar with the data. - -```{r} -# generate a table of means of variables -library(xtable) -Z_subset <- data[which(colnames(data) %in% c("lwage","sex","shs","hsg","scl","clg","ad","mw","so","we","ne","exp1"))] -table <- matrix(0, 12, 1) -table[1:12,1] <- as.numeric(lapply(Z_subset,mean)) -rownames(table) <- c("Log Wage","Sex","Some High School","High School Graduate","Some College","College Graduate", "Advanced Degree","Midwest","South","West","Northeast","Experience") -colnames(table) <- c("Sample mean") -tab<- xtable(table, digits = 2) -tab -``` - -E.g., the share of female workers in our sample is ~44% ($sex=1$ if female). - - -Alternatively, using the xtable package, we can also print the table in LaTeX. - -```{r} -print(tab, type="latex") # type="latex" for printing table in LaTeX -``` - -## Prediction Question - - -Now, we will construct a prediction rule for hourly wage $Y$, which depends linearly on job-relevant characteristics $X$: - -\begin{equation}\label{decompose} -Y = \beta'X+ \epsilon. -\end{equation} - - -Our goals are - -* Predict wages using various characteristics of workers. - -* Assess the predictive performance of a given model using the (adjusted) sample MSE, the (adjusted) sample $R^2$ and the out-of-sample MSE and $R^2$. - - -We employ two different specifications for prediction: - - -1. Basic Model: $X$ consists of a set of raw regressors (e.g. gender, experience, education indicators, occupation and industry indicators and regional indicators). - - -2. Flexible Model: $X$ consists of all raw regressors from the basic model plus a dictionary of transformations (e.g., ${exp}^2$ and ${exp}^3$) and additional two-way interactions of a polynomial in experience with other regressors. An example of a regressor created through a two-way interaction is *experience* times the indicator of having a *college degree*. - -3. ``Extra Flexible'' Model: $X$ consists of two way interactions of all raw variables, giving us about 1000 controls. 
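To make the two-way-interaction construction behind the flexible and ``extra flexible'' models concrete, the following minimal sketch (our addition, not part of the original analysis; the toy data frame and variable names are purely illustrative) shows how R's `^2` formula operator expands a set of regressors into main effects plus all pairwise interactions.

```{r}
# Illustrative only: the ^2 operator in a formula generates all main effects
# and all two-way interactions of the variables inside the parentheses.
toy <- data.frame(x1 = rnorm(5), x2 = rnorm(5), x3 = rnorm(5))
colnames(model.matrix(~ (x1 + x2 + x3)^2, data = toy))
# "(Intercept)" "x1" "x2" "x3" "x1:x2" "x1:x3" "x2:x3"
```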
- -Using more flexible models enables us to approximate the real relationship by a more complex regression model and therefore has the potential to reduce the bias relative to a simpler specification that cannot capture a complex relationship. That is, flexible models increase the range of potential shapes that can be accommodated by the estimated regression function. With sufficient data, flexible models often deliver higher prediction accuracy than simpler models but are harder to interpret. In small data sets, simpler models often perform relatively well. - - -Now, let us fit our three candidate models to our data by running ordinary least squares (ols): - -```{r} -# 1. basic model -basic <- lwage~ (sex + exp1 + shs + hsg+ scl + clg + mw + so + we +occ2+ind2) -regbasic <- lm(basic, data=data) # perform ols using the defined model -regbasic # estimated coefficients -cat( "Number of regressors in the basic model:",length(regbasic$coef), '\n') # number of regressors in the Basic Model - -``` - -##### Note that the basic model consists of $51$ regressors. - -```{r} -# 2. flexible model -flex <- lwage ~ sex + shs+hsg+scl+clg+mw+so+we+occ2+ind2 + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we) -regflex <- lm(flex, data=data) -regflex # estimated coefficients -cat( "Number of regressors in the flexible model:",length(regflex$coef), "\n") # number of regressors in the Flexible Model - -``` - -##### Note that the flexible model consists of $246$ regressors. - - -```{r} -# 3. extra flexible model -extraflex <- lwage ~ sex + (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+occ2+ind2+mw+so+we)^2 -regextra <- lm(extraflex, data=data) -cat( "Number of regressors in the extra flexible model:",sum(!is.na(regextra$coefficients)), "\n") # number of regressors in the extra flexible Model - -``` - -##### Note that the extra flexible model consists of $780$ non-redundant regressors. - - - -#### Re-estimating the flexible and extra-flexible model using Lasso -We re-estimate the flexible model using Lasso (the least absolute shrinkage and selection operator) rather than ols. Lasso is a penalized regression method that can be used to reduce the complexity of a regression model when the ratio $p/n$ is not small. We will introduce this approach formally later in the course, but for now, we try it out here as a black-box method, using a simple default plug-in rule for choosing the penalization for Lasso.
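As a point of reference for the plug-in rule mentioned above (stated here without derivation, and only approximately as implemented in `hdm::rlasso`), the penalty level takes the form

$$\lambda = 2 c \sqrt{n}\, \Phi^{-1}\left(1 - \frac{\gamma}{2p}\right),$$

with default constants $c = 1.1$ and $\gamma = 0.1/\log(n)$, combined with data-driven penalty loadings for each regressor. We return to the justification of this choice later in the course.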
- -```{r, results = FALSE} -# Flexible model using Lasso -library(hdm) -lassoreg<- rlasso(flex, data=data, post=FALSE) # Post= FALSE gives lasso -sumlasso<- summary(lassoreg) - -lassoexflex <- rlasso(extraflex, data = data, post=FALSE) # Post= FALSE gives lasso -sumlassoflex <- summary(lassoexflex) - -``` - -#### Evaluating the predictive performance of the models in-sample -Now, we can evaluate the performance of our models based on in-sample measures of fit -- the (adjusted) $R^2_{sample}$ and the (adjusted) $MSE_{sample}$: - -```{r} -# Assess predictive performance -sumbasic <- summary(regbasic) -sumflex <- summary(regflex) -sumextra <- summary(regextra) - -# R-squared and adjusted R-squared -R2.1 <- sumbasic$r.squared -cat("R-squared for the basic model: ", R2.1, "\n") -R2.adj1 <- sumbasic$adj.r.squared -cat("adjusted R-squared for the basic model: ", R2.adj1, "\n") - -R2.2 <- sumflex$r.squared -cat("R-squared for the flexible model: ", R2.2, "\n") -R2.adj2 <- sumflex$adj.r.squared -cat("adjusted R-squared for the flexible model: ", R2.adj2, "\n") - -R2.3 <- sumextra$r.squared -cat("R-squared for the extra flexible model: ", R2.3, "\n") -R2.adj3 <- sumextra$adj.r.squared -cat("adjusted R-squared for the extra flexible model: ", R2.adj3, "\n") - -R2.L <- sumlasso$r.squared -cat("R-squared for lasso with the flexible model: ", R2.L, "\n") -R2.adjL <- sumlasso$adj.r.squared -cat("adjusted R-squared for lasso with the flexible model: ", R2.adjL, "\n") - -R2.L2 <- sumlassoflex$r.squared -cat("R-squared for lasso with the very flexible model: ", R2.L2, "\n") -R2.adjL2 <- sumlassoflex$adj.r.squared -cat("adjusted R-squared for lasso with the flexible model: ", R2.adjL2, "\n") - - -# MSE and adjusted MSE -MSE1 <- mean(sumbasic$res^2) -cat("MSE for the basic model: ", MSE1, "\n") -p1 <- sumbasic$df[1] # number of regressors -MSE.adj1 <- (n/(n-p1))*MSE1 -cat("adjusted MSE for the basic model: ", MSE.adj1, "\n") - -MSE2 <-mean(sumflex$res^2) -cat("MSE for the flexible model: ", MSE2, "\n") -p2 <- sumflex$df[1] -MSE.adj2 <- (n/(n-p2))*MSE2 -cat("adjusted MSE for the flexible model: ", MSE.adj2, "\n") - -MSE3 <-mean(sumextra$res^2) -cat("MSE for the extra flexible model: ", MSE3, "\n") -p3 <- sumextra$df[1] -MSE.adj3 <- (n/(n-p3))*MSE3 -cat("adjusted MSE for the extra flexible model: ", MSE.adj3, "\n") - - -MSEL <-mean(sumlasso$res^2) -cat("MSE for the lasso with the flexible model: ", MSEL, "\n") -pL <- sum(sumlasso$coef != 0) -MSE.adjL <- (n/(n-pL))*MSEL -cat("adjusted MSE for the lasso with the flexible model: ", MSE.adjL, "\n") - -MSEL2 <-mean(sumlassoflex$res^2) -cat("MSE for the lasso with very flexible model: ", MSEL2, "\n") -pL2 <- sum(sumlassoflex$coef != 0) -MSE.adjL2 <- (n/(n-pL2))*MSEL2 -cat("adjusted MSE for the lasso with very flexible model: ", MSE.adjL2, "\n") - -``` - -```{r} -# Output the table -library(xtable) -table <- matrix(0, 5, 5) -table[1,1:5] <- c(p1,R2.1,MSE1,R2.adj1,MSE.adj1) -table[2,1:5] <- c(p2,R2.2,MSE2,R2.adj2,MSE.adj2) -table[3,1:5] <- c(p3,R2.3,MSE3,R2.adj3,MSE.adj3) -table[4,1:5] <- c(pL,R2.L,MSEL,R2.adjL,MSE.adjL) -table[5,1:5] <- c(pL2,R2.L2,MSEL2,R2.adjL2,MSE.adjL2) -colnames(table)<- c("p","$R^2_{sample}$","$MSE_{sample}$","$R^2_{adjusted}$", "$MSE_{adjusted}$") -rownames(table)<- c("basic","flexible","very flexible","flexible-Lasso","very flexible-Lasso") -tab<- xtable(table, digits =c(0,0,2,2,2,2)) -print(tab,type="latex") -tab -``` - -Considering the measures above, the very flexible model estimated by OLS seems to perform better than the other approaches. 
Note, however, that the adjusted and regular measures are very different for this specification because $p/n$ is not small in this case. We also see that the differences between the usual and adjusted measures of fit increase as $p$ increases -- as predicted by theory. Finally, Lasso produces relatively stable results in both regimes that is comparable, though seems to be mildly worse in terms of predictive performance, than the OLS prediction rules. - -Let's now look at **data splitting** which provides a general procedure to assess predictive performance regardless of the ratio $p/n$. We illustrate the approach in the following. - - -## Data Splitting - -- Randomly split the data into one training sample and one testing sample. Here we just use a simple method (stratified splitting is a more sophisticated version of splitting that we might consider). -- Use the training sample to estimate the parameters of the different predictive models. -- Use the testing sample for evaluation. Predict the $\mathtt{wage}$ of every observation in the testing sample based on the estimated parameters in the training sample. -- Calculate the Mean Squared Prediction Error $MSE_{test}$ based on the testing sample for the candidate prediction models. - -```{r} -# splitting the data -set.seed(1) # to make the results replicable (we will generate random numbers) -random <- sample(1:n, floor(n*4/5)) -# draw (4/5)*n random numbers from 1 to n without replacing them -train <- data[random,] # training sample -test <- data[-random,] # testing sample -nV <- nrow(train) - -``` - -```{r} -# basic model -# estimating the parameters in the training sample -regbasic <- lm(basic, data=train) - -# calculating the out-of-sample MSE -trainregbasic <- predict(regbasic, newdata=test) -y.test <- log(test$wage) -MSE.test1 <- sum((y.test-trainregbasic)^2)/length(y.test) -R2.test1<- 1- MSE.test1/(sum((y.test-mean(train$lwage))^2)/length(y.test)) - -cat("Test MSE for the basic model: ", MSE.test1, " ") - -cat("Test R2 for the basic model: ", R2.test1) - -# in-sample MSE and R^2 -sumbasicV <- summary(regbasic) - -R2V.1 <- sumbasicV$r.squared -cat("Training R-squared for the basic model: ", R2V.1, "\n") -R2V.adj1 <- sumbasicV$adj.r.squared -cat("Training adjusted R-squared for the basic model: ", R2V.adj1, "\n") - -MSE1V <- mean(sumbasicV$res^2) -cat("Training MSE for the basic model: ", MSE1V, "\n") -p1V <- sumbasicV$df[1] # number of regressors -MSEV.adj1 <- (nV/(nV-p1V))*MSE1V -cat("Training adjusted MSE for the basic model: ", MSEV.adj1, "\n") - -``` - -In the basic model, the $MSE_{test}$ is relatively close to the $MSE_{sample}$. 
- -```{r} -# flexible model -# estimating the parameters -options(warn=-1) # ignore warnings -regflex <- lm(flex, data=train) - -# calculating the out-of-sample MSE -trainregflex<- predict(regflex, newdata=test) -y.test <- log(test$wage) -MSE.test2 <- sum((y.test-trainregflex)^2)/length(y.test) -R2.test2<- 1- MSE.test2/(sum((y.test-mean(train$lwage))^2)/length(y.test)) - -cat("Test MSE for the flexible model: ", MSE.test2, " ") - -cat("Test R2 for the flexible model: ", R2.test2) - -# in-sample MSE and R^2 -sumflexV <- summary(regflex) - -R2V.2 <- sumflexV$r.squared -cat("Training R-squared for the flexible model: ", R2V.2, "\n") -R2V.adj2 <- sumflexV$adj.r.squared -cat("Training adjusted R-squared for the flexible model: ", R2V.adj2, "\n") - -MSE2V <-mean(sumflexV$res^2) -cat("Training MSE for the flexible model: ", MSE2V, "\n") -p2V <- sumflexV$df[1] -MSEV.adj2 <- (nV/(nV-p2V))*MSE2V -cat("Training adjusted MSE for the flexible model: ", MSEV.adj2, "\n") - -``` - -In the flexible model, the discrepancy between the $MSE_{test}$ and the $MSE_{sample}$ is modest. - -```{r} -# very flexible model -# estimating the parameters -options(warn=-1) # ignore warnings -regextra <- lm(extraflex, data=train) - -# calculating the out-of-sample MSE -trainregextra<- predict(regextra, newdata=test) -y.test <- log(test$wage) -MSE.test3 <- sum((y.test-trainregextra)^2)/length(y.test) -R2.test3<- 1- MSE.test3/(sum((y.test-mean(train$lwage))^2)/length(y.test)) - -cat("Test MSE for the very flexible model: ", MSE.test3, " ") - -cat("Test R2 for the very flexible model: ", R2.test3) - -# in-sample MSE and R^2 -sumextraV <- summary(regextra) - -R2V.3 <- sumextraV$r.squared -cat("Training R-squared for the extra flexible model: ", R2V.3, "\n") -R2V.adj3 <- sumextraV$adj.r.squared -cat("Training adjusted R-squared for the extra flexible model: ", R2V.adj3, "\n") - -MSE3V <-mean(sumextraV$res^2) -cat("Training MSE for the extra flexible model: ", MSE3V, "\n") -p3V <- sumextraV$df[1] -MSEV.adj3 <- (nV/(nV-p3V))*MSE3V -cat("Training adjusted MSE for the extra flexible model: ", MSEV.adj3, "\n") - -``` - -In the very flexible model, the discrepancy between the $MSE_{test}$ and the $MSE_{sample}$ is large because $p/n$ is not small. - -It is worth noticing that the $MSE_{test}$ varies across different data splits. Hence, it is a good idea to average the out-of-sample MSE over different data splits to get valid results. - -Nevertheless, we observe that, based on the out-of-sample $MSE$, the basic model using ols regression performs **about as well (or slightly better)** than the flexible model and both perform much better than the very flexible model. - -Next, let us use lasso regression in the flexible and very flexible models instead of ols regression. The out-of-sample $MSE$ on the test sample can be computed for any black-box prediction method, so we also compare the performance of lasso regression in these models to our previous ols regressions. 
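Since the test-sample criterion only requires predictions and realized outcomes, it can be computed by one small generic helper. The sketch below is our addition rather than part of the original notebook (the function name `oos_performance` is made up), and the commented usage line assumes the objects `regbasic`, `test`, `y.test`, and `train` created earlier in this section.

```{r}
# Generic out-of-sample evaluation for any prediction method:
# takes test-set predictions, test outcomes, and a benchmark predictor
# (here, the training-sample mean of the outcome).
oos_performance <- function(yhat, y, benchmark) {
  mse <- mean((y - yhat)^2)                  # test MSE
  r2 <- 1 - mse / mean((y - benchmark)^2)    # test R^2 relative to the benchmark
  c(MSE_test = mse, R2_test = r2)
}
# Example usage with the basic model fitted on the training sample:
# oos_performance(predict(regbasic, newdata = test), y.test, mean(train$lwage))
```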
- -```{r} -# flexible model using lasso -library(hdm) # a library for high-dimensional metrics -reglasso <- rlasso(flex, data=train, post=FALSE) # estimating the parameters -lassoexflex <- rlasso(extraflex, data = data, post=FALSE) # Post= FALSE gives lasso - -# calculating the out-of-sample MSE -trainreglasso<- predict(reglasso, newdata=test) -MSE.lasso <- sum((y.test-trainreglasso)^2)/length(y.test) -R2.lasso<- 1- MSE.lasso/(sum((y.test-mean(train$lwage))^2)/length(y.test)) - -cat("Test MSE for the lasso on flexible model: ", MSE.lasso, " ") - -cat("Test R2 for the lasso flexible model: ", R2.lasso) - -trainlassoexflex<- predict(lassoexflex, newdata=test) -MSE.lassoexflex <- sum((y.test-trainlassoexflex)^2)/length(y.test) -R2.lassoexflex <- 1- MSE.lassoexflex/(sum((y.test-mean(train$lwage))^2)/length(y.test)) - -cat("Test MSE for the lasso on the very flexible model: ", MSE.lassoexflex, " ") - -cat("Test R2 for the lasso on the very flexible model: ", R2.lassoexflex) - -``` - -Finally, let us summarize the results: - -```{r} -# Output the comparison table -table2 <- matrix(0, 5,2) -table2[1,1] <- MSE.test1 -table2[2,1] <- MSE.test2 -table2[3,1] <- MSE.test3 -table2[4,1] <- MSE.lasso -table2[5,1] <- MSE.lassoexflex -table2[1,2] <- R2.test1 -table2[2,2] <- R2.test2 -table2[3,2] <- R2.test3 -table2[4,2] <- R2.lasso -table2[5,2] <- R2.lassoexflex - -rownames(table2)<- rownames(table)<- c("basic","flexible","very flexible","flexible-Lasso","very flexible-Lasso") -colnames(table2)<- c("$MSE_{test}$", "$R^2_{test}$") -tab2 <- xtable(table2, digits =3) -tab2 -``` - -```{r} -print(tab2,type="latex") -``` diff --git a/PM1/Old/r-notebook-linear-model-overfitting.irnb.Rmd b/PM1/Old/r-notebook-linear-model-overfitting.irnb.Rmd deleted file mode 100644 index f925cba0..00000000 --- a/PM1/Old/r-notebook-linear-model-overfitting.irnb.Rmd +++ /dev/null @@ -1,79 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - - -# Simple Exercise on Overfitting - - - -First set p=n - -```{r} - -set.seed(123) -n = 1000 - -p = n -X<- matrix(rnorm(n*p), n, p) -Y<- rnorm(n) - -print("p/n is") -print(p/n) -print("R2 is") -print(summary(lm(Y~X))$r.squared) -print("Adjusted R2 is") -print(summary(lm(Y~X))$adj.r.squared) - -``` - -Second, set p=n/2. 
- -```{r} - -set.seed(123) -n = 1000 - -p = n/2 -X<- matrix(rnorm(n*p), n, p) -Y<- rnorm(n) - -print("p/n is") -print(p/n) -print("R2 is") -print(summary(lm(Y~X))$r.squared) -print("Adjusted R2 is") -print(summary(lm(Y~X))$adj.r.squared) - -``` - -Third, set p/n =.05 - -```{r} - -set.seed(123) -n = 1000 - -p = .05*n -X<- matrix(rnorm(n*p), n, p) -Y<- rnorm(n) - -print("p/n is") -print(p/n) -print("R2 is") -print(summary(lm(Y~X))$r.squared) -print("Adjusted R2 is") -print(summary(lm(Y~X))$adj.r.squared) - - -``` diff --git a/PM1/r-linear-model-overfitting.Rmd b/PM1/r-linear-model-overfitting.Rmd deleted file mode 100644 index 2dbba836..00000000 --- a/PM1/r-linear-model-overfitting.Rmd +++ /dev/null @@ -1,64 +0,0 @@ ---- -title: An R Markdown document converted from "PM1/r-linear-model-overfitting.irnb" -output: html_document ---- - -# Simple Exercise on Overfitting - -First set p=n - -```{r} - -set.seed(123) -n <- 1000 - -p <- n -X <- matrix(rnorm(n * p), n, p) -y <- rnorm(n) - -print("p/n is") -print(p / n) -print("R2 is") -print(summary(lm(y ~ X))$r.squared) -print("Adjusted R2 is") -print(summary(lm(y ~ X))$adj.r.squared) -``` - -Second, set p=n/2. - -```{r} - -set.seed(123) -n <- 1000 - -p <- n / 2 -X <- matrix(rnorm(n * p), n, p) -y <- rnorm(n) - -print("p/n is") -print(p / n) -print("R2 is") -print(summary(lm(y ~ X))$r.squared) -print("Adjusted R2 is") -print(summary(lm(y ~ X))$adj.r.squared) -``` - -Third, set p/n =.05 - -```{r} - -set.seed(123) -n <- 1000 - -p <- .05 * n -X <- matrix(rnorm(n * p), n, p) -y <- rnorm(n) - -print("p/n is") -print(p / n) -print("R2 is") -print(summary(lm(y ~ X))$r.squared) -print("Adjusted R2 is") -print(summary(lm(y ~ X))$adj.r.squared) -``` - diff --git a/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd b/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd deleted file mode 100644 index ea7e6892..00000000 --- a/PM1/r-ols-and-lasso-for-wage-gap-inference.Rmd +++ /dev/null @@ -1,398 +0,0 @@ ---- -title: An R Markdown document converted from "PM1/r-ols-and-lasso-for-wage-gap-inference.irnb" -output: html_document ---- - -# An inferential problem: The Gender Wage Gap - -In the previous lab, we analyzed data from the March Supplement of the U.S. Current Population Survey (2015) and answered the question of how to use job-relevant characteristics, such as education and experience, to best predict wages. Now, we focus on the following inference question: - -What is the difference in predicted wages between men and women with the same job-relevant characteristics? - -Thus, we analyze if there is a difference in the payment of men and women (*gender wage gap*). The gender wage gap may partly reflect *discrimination* against women in the labor market or may partly reflect a *selection effect*, namely that women are relatively more likely to take on occupations that pay somewhat less (for example, school teaching). - -To investigate the gender wage gap, we consider the following log-linear regression model - -\begin{align} -\log(Y) &= \beta'X + \epsilon\\ -&= \beta_1 D + \beta_2' W + \epsilon, -\end{align} - -where $Y$ is hourly wage, $D$ is the indicator of being female ($1$ if female and $0$ otherwise) and the -$W$'s are a vector of worker characteristics explaining variation in wages. Considering transformed wages by the logarithm, we are analyzing the relative difference in the payment of men and women. 
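Because the outcome is in logs, a coefficient $\beta_1$ on the female indicator is (approximately) a proportional wage difference; the exact proportional difference is $\exp(\beta_1)-1$. The short sketch below is our addition and is only illustrative, using a magnitude close to the unconditional gap reported later in this notebook.

```{r}
# Interpreting a log-wage coefficient: exp(beta1) - 1 is the exact proportional
# wage difference; for small magnitudes it is close to beta1 itself.
beta1 <- -0.038  # illustrative value, close to the unconditional gap found below
exp(beta1) - 1   # approximately -0.037, i.e. wages about 3.7% lower
```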
- -```{r} -install.packages("xtable") -install.packages("hdm") # a library for high-dimensional metrics -install.packages("sandwich") # a package used to compute robust standard errors -``` - -```{r} -library(hdm) -library(xtable) -library(sandwich) -``` - -## Data analysis - -We consider the same subsample of the U.S. Current Population Survey (2015) as in the previous lab. Let us load the data set. - -```{r} -file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv" -data <- read.csv(file) -dim(data) -``` - -To start our (causal) analysis, we compare the sample means given gender: - -```{r} -z <- data[which(colnames(data) %in% c("lwage", "sex", "shs", "hsg", "scl", "clg", - "ad", "ne", "mw", "so", "we", "exp1"))] - -data_female <- data[data$sex == 1, ] -z_female <- data_female[which(colnames(data) %in% c("lwage", "sex", "shs", "hsg", "scl", - "clg", "ad", "ne", "mw", "so", "we", "exp1"))] - -data_male <- data[data$sex == 0, ] -z_male <- data_male[which(colnames(data) %in% c("lwage", "sex", "shs", "hsg", "scl", "clg", "ad", - "ne", "mw", "so", "we", "exp1"))] - -table <- matrix(0, 12, 3) -table[1:12, 1] <- as.numeric(lapply(z, mean)) -table[1:12, 2] <- as.numeric(lapply(z_male, mean)) -table[1:12, 3] <- as.numeric(lapply(z_female, mean)) -rownames(table) <- c("Log Wage", "Sex", "Less then High School", "High School Graduate", "Some College", - "College Graduate", "Advanced Degree", "Northeast", "Midwest", "South", "West", "Experience") -colnames(table) <- c("All", "Men", "Women") -tab <- xtable(table, digits = 4) -tab -``` - -```{r} -print(tab, type = "html") # set type="latex" for printing table in LaTeX -``` - - - - - - - - - - - - - - - - - -
All Men Women
Log Wage 2.9708 2.9878 2.9495
Sex 0.4445 0.0000 1.0000
Less than High School 0.0233 0.0318 0.0127
High School Graduate 0.2439 0.2943 0.1809
Some College 0.2781 0.2733 0.2840
College Graduate 0.3177 0.2940 0.3473
Advanced Degree 0.1371 0.1066 0.1752
Northeast 0.2596 0.2590 0.2604
Midwest 0.2965 0.2981 0.2945
South 0.2161 0.2209 0.2101
West 0.2278 0.2220 0.2350
Experience 13.7606 13.7840 13.7313
- -In particular, the table above shows that the difference in average *logwage* between men and women is equal to $0.038$ - -```{r} -mean(data_female$lwage) - mean(data_male$lwage) -``` - -Thus, the unconditional gender wage gap is about $3,8$\% for the group of never married workers (women get paid less on average in our sample). We also observe that never married working women are relatively more educated than working men and have lower working experience. - -This unconditional (predictive) effect of gender equals the coefficient $\beta$ in the univariate ols regression of $Y$ on $D$: - -\begin{align} -\log(Y) &=\beta D + \epsilon. -\end{align} - -We verify this by running an ols regression in R. - -```{r} -nocontrol_fit <- lm(lwage ~ sex, data = data) -nocontrol_est <- summary(nocontrol_fit)$coef["sex", 1] -# HC - "heteroskedasticity cosistent" -- HC3 is the SE that remains consistent in high dimensions -hcv_coefs <- vcovHC(nocontrol_fit, type = "HC3") -nocontrol_se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors - -# print unconditional effect of gender and the corresponding standard error -cat("The estimated coefficient on the dummy for gender is", nocontrol_est, - " and the corresponding robust standard error is", nocontrol_se) -``` - -Note that the standard error is computed with the *R* package *sandwich* to be robust to heteroskedasticity. - -Next, we run an ols regression of $Y$ on $(D,W)$ to control for the effect of covariates summarized in $W$: - -\begin{align} -\log(Y) &=\beta_1 D + \beta_2' W + \epsilon. -\end{align} - -Here, we are considering the flexible model from the previous lab. Hence, $W$ controls for experience, education, region, and occupation and industry indicators plus transformations and two-way interactions. - -Let us run the ols regression with controls. - -```{r} -# ols regression with controls - -flex <- lwage ~ sex + (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) - -# Note that ()*() operation in formula objects in R creates a formula of the sort: -# '(exp1+exp2+exp3+exp4) + (shs+hsg+scl+clg+occ2+ind2+mw+so+we) -# + (exp1+exp2+exp3+exp4) * (shs+hsg+scl+clg+occ2+ind2+mw+so+we)' -# This is not intuitive at all, but that's what it does. - -control_fit <- lm(flex, data = data) -control_est <- summary(control_fit)$coef[2, 1] - -summary(control_fit) - -cat("Coefficient for OLS with controls", control_est) - -hcv_coefs <- vcovHC(control_fit, type = "HC3") -control_se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors -``` - -The estimated regression coefficient $\beta_1\approx-0.0696$ measures how our linear prediction of wage changes if we set the gender variable $D$ from 0 to 1, holding the controls $W$ fixed. -We can call this the *predictive effect* (PE), as it measures the impact of a variable on the prediction we make. Overall, we see that the unconditional wage gap of size $4$\% for women increases to about $7$\% after controlling for worker characteristics. - -We now show how the conditional gap and the remainder decompose the marginal wage gap into the parts explained and unexplained by the additional controls. (Note that this does *not* explain why there is a difference in the controls to begin with in the two groups.) 
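In symbols, the decomposition computed in the next code block is the following exact identity (it holds because the female indicator $D$ is included as a regressor, so the OLS residuals average to zero within each group):

\begin{align}
\underbrace{\overline{\log Y}_{1} - \overline{\log Y}_{0}}_{\text{marginal gap}}
= \underbrace{\hat{\beta}_1}_{\text{unexplained (conditional gap)}}
+ \underbrace{\hat{\beta}_2'\left(\bar{W}_{1} - \bar{W}_{0}\right)}_{\text{explained by controls}},
\end{align}

where the subscripts $1$ and $0$ denote averages over women ($D=1$) and men ($D=0$), and $\bar{W}_{1}$, $\bar{W}_{0}$ are the corresponding average control vectors.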
- -```{r} -xx0 <- model.matrix(~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we), - data = data[data$sex == 0, ]) -y0 <- data[data$sex == 0, ]$lwage -xx1 <- model.matrix(~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we), - data = data[data$sex == 1, ]) -y1 <- data[data$sex == 1, ]$lwage -mu1 <- colMeans(xx1) -mu0 <- colMeans(xx0) -betarest <- summary(control_fit)$coef[3:(ncol(xx0) + 1), 1] # the coefficients excluding intercept and "sex" - -cat("The marginal gap:", mean(data_female$lwage) - mean(data_male$lwage), "\n") -diff.unexplained <- control_est -cat("The unexplained difference: ", diff.unexplained, "\n") -diff.explained <- sum(betarest * (mu1[2:ncol(xx0)] - mu0[2:ncol(xx0)])) -cat("The explained difference:", diff.explained, "\n") -cat("The sum of these differences:", diff.unexplained + diff.explained, "\n") -``` - -We next consider a Oaxaca-Blinder decomposition that also incorporates an interaction term. - -```{r} -svd0 <- svd(xx0) -svd1 <- svd(xx1) -svd0$d[svd0$d <= 1e-10] <- 0 -svd0$d[svd0$d > 1e-10] <- 1 / svd0$d[svd0$d > 1e-10] -beta0 <- (svd0$v %*% (svd0$d * svd0$d * t(svd0$v))) %*% t(xx0) %*% y0 -svd1$d[svd1$d <= 1e-10] <- 0 -svd1$d[svd1$d > 1e-10] <- 1 / svd1$d[svd1$d > 1e-10] -beta1 <- (svd1$v %*% (svd1$d * svd1$d * t(svd1$v))) %*% t(xx1) %*% y1 - -cat("The marginal gap:", mean(data_female$lwage) - mean(data_male$lwage), "\n") -cat("The unexplained difference:", beta1[1] - beta0[1], "\n") -cat("The difference explained by endowment:", sum(beta0[2:ncol(xx0)] * (mu1[2:ncol(xx0)] - mu0[2:ncol(xx0)])), "\n") -cat("The difference explained by coefficient:", sum((beta1[2:ncol(xx0)] - beta0[2:ncol(xx0)]) * mu1[2:ncol(xx0)]), "\n") -cat("The sum of these differences:", - (beta1[1] - beta0[1] + sum(beta0[2:ncol(xx0)] * (mu1[2:ncol(xx0)] - mu0[2:ncol(xx0)])) - + sum((beta1[2:ncol(xx0)] - beta0[2:ncol(xx0)]) * mu1[2:ncol(xx0)])), "\n") -``` - -Next, we use the Frisch-Waugh-Lovell (FWL) theorem from lecture, partialling-out the linear effect of the controls via ols. - -```{r} -# Partialling-out using ols - -# model for Y -flex_y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) -# model for D -flex_d <- sex ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) - -# partialling-out the linear effect of W from Y -t_y <- lm(flex_y, data = data)$res -# partialling-out the linear effect of W from D -t_d <- lm(flex_d, data = data)$res - -# regression of Y on D after partialling-out the effect of W -partial_fit <- lm(t_y ~ t_d) -partial_est <- summary(partial_fit)$coef[2, 1] - -cat("Coefficient for D via partialling-out", partial_est) - -# standard error -hcv_coefs <- vcovHC(partial_fit, type = "HC3") -partial_se <- sqrt(diag(hcv_coefs))[2] - -# confidence interval -confint(partial_fit)[2, ] -``` - -Again, the estimated coefficient measures the linear predictive effect (PE) of $D$ on $Y$ after taking out the linear effect of $W$ on both of these variables. This coefficient is numerically equivalent to the estimated coefficient from the ols regression with controls, confirming the FWL theorem. - -We know that the partialling-out approach works well when the dimension of $W$ is low -in relation to the sample size $n$. When the dimension of $W$ is relatively high, we need to use variable selection -or penalization for regularization purposes. 
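Before turning to the high-dimensional case, the numerical equivalence asserted above can be checked in one line using the objects `partial_est` and `control_est` already computed (this check is our addition for illustration, not part of the original notebook):

```{r}
# FWL theorem check: the coefficient from the residual-on-residual regression
# should equal the coefficient on sex in the full regression with controls.
all.equal(unname(partial_est), unname(control_est))
```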
- -In the following, we illustrate the partialling-out approach using lasso instead of ols. - -```{r} -# Partialling-out using lasso - -# model for Y -flex_y <- lwage ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) -# model for D -flex_d <- sex ~ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) - -# partialling-out the linear effect of W from Y -t_y <- rlasso(flex_y, data = data)$res -# partialling-out the linear effect of W from D -t_d <- rlasso(flex_d, data = data)$res - -# regression of Y on D after partialling-out the effect of W -partial_lasso_fit <- lm(t_y ~ t_d) -partial_lasso_est <- summary(partial_lasso_fit)$coef[2, 1] - -cat("Coefficient for D via partialling-out using lasso", partial_lasso_est) - -# standard error -hcv_coefs <- vcovHC(partial_lasso_fit, type = "HC3") -partial_lasso_se <- sqrt(diag(hcv_coefs))[2] -``` - -Using lasso for partialling-out here provides similar results as using ols. - -Next, we summarize the results. - -```{r} -table <- matrix(0, 4, 2) -table[1, 1] <- nocontrol_est -table[1, 2] <- nocontrol_se -table[2, 1] <- control_est -table[2, 2] <- control_se -table[3, 1] <- partial_est -table[3, 2] <- partial_se -table[4, 1] <- partial_lasso_est -table[4, 2] <- partial_lasso_se -colnames(table) <- c("Estimate", "Std. Error") -rownames(table) <- c("Without controls", "full reg", "partial reg", "partial reg via lasso") -tab <- xtable(table, digits = c(3, 3, 4)) -tab -``` - -```{r} -print(tab, type = "html") -``` - - - - - - - - - -
Estimate Std. Error
Without controls -0.038 0.0159
full reg -0.070 0.0150
partial reg -0.070 0.0150
partial reg via lasso -0.072 0.0154
- -It is worth noticing that controlling for worker characteristics increases the gender wage gap from less than 4\% to 7\%. The controls we used in our analysis include 5 educational attainment indicators (less than high school graduates, high school graduates, some college, college graduate, and advanced degree), 4 region indicators (midwest, south, west, and northeast); a quartic term (first, second, third, and fourth power) in experience and 22 occupation and 23 industry indicators. - -Keep in mind that the predictive effect (PE) does not only measure discrimination (causal effect of being female), it also may reflect -selection effects of unobserved differences in covariates between men and women in our sample. - -## OLS Overfitting - -Next we motivate the usage of lasso. We try an "extra" flexible model, where we take interactions of all controls, giving us about 1000 controls. To highlight the potential impact of overfitting on inference, we subset to a random sample of 1000 observations so that $p \approx n$. - -```{r} -set.seed(2724) -subset_size <- 1000 -random <- sample(seq_len(nrow(data)), subset_size) -subset <- data[random, ] -``` - -For a linear model, the covariance matrix of the estimated $\hat{\beta}$ coefficients is given by $$\Sigma_{\hat{\beta}} = (X'X)^{-1} X' \Omega X (X'X)^{-1}$$ Under homoskedasticity, $\Omega = \sigma^2 I$ so $\Sigma_{\hat{\beta}}$ reduces to $\sigma^2(X'X)^{-1}$ with $\sigma^2$ estimated with the mean squared residuals. Under heteroskedasticity, $\Omega \neq \sigma^2 I$, so we must use an approach that yields valid standard errors. Under heteroskedasticity, there exists a variety of consistent "sandwich" estimators proposed for $\Sigma_{\hat{\beta}}$. With $e_i$ denoting the residual of observation $i$: - -$ HC0 = (X'X)^{-1} X' \text{diag} [e_i^2] X(X'X)^{-1}$ - -$ HC1 = \frac{n}{n-p-1} (X'X)^{-1} X' \text{diag} [e_i^2] X(X'X)^{-1}$ - -$ HC2 = (X'X)^{-1} X' \text{diag} \left[\frac{e_i^2}{1-h_{ii}} \right] X(X'X)^{-1}$ - -$ HC3 = (X'X)^{-1} X' \text{diag} \left[\frac{e_i^2}{(1-h_{ii})^2} \right] X(X'X)^{-1}$ - - -For small sample sizes, the standard errors from HC0 are biased (usually downward). HC1 is a simple degree-of-freedom adjustment. HC2 is inspired by the insight that HC0's bias in finite samples results from points of high leverage in the design matrix $X$ (intuitively, outliers with respect to the independent variables). Thus, HC2 weights the $i$th squared residual by the reciprocal of $(1-h_{ii})$, with leverage values $h_{ii}$ as the $i$th diagonal element of the "hat" matrix $H = X(X'X)^{-1}X'$ to adjust for the finite-sample bias present in HC0. - -HC3 is similar to HC2, weighting by the squared $(1-h_{ii})^2$ in the denominator instead. HC3 is also equivalent to jackknife standard errors. HC3 has been shown to perform well regardless of the absence/presence of homoskedasticity and remains valid, in the sense of being biased upward under regularity conditions, in high dimensional settings.
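To connect these formulas with the `sandwich` calls used below, here is a minimal sketch (our addition, run on a small illustrative regression rather than the extra-flexible model) that rebuilds the HC3 covariance matrix directly from the residuals and leverage values and compares it with `vcovHC`:

```{r}
# Reproduce the HC3 formula by hand on a low-dimensional regression.
fit_small <- lm(lwage ~ sex + exp1 + shs + hsg + scl + clg, data = data)
Xs <- model.matrix(fit_small)
e <- residuals(fit_small)
h <- hatvalues(fit_small)                  # leverage values h_ii
meat <- crossprod(Xs * (e / (1 - h)))      # X' diag[e_i^2 / (1 - h_ii)^2] X
bread <- solve(crossprod(Xs))              # (X'X)^{-1}
hc3_manual <- bread %*% meat %*% bread
max(abs(hc3_manual - vcovHC(fit_small, type = "HC3")))  # should be essentially 0
```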
- -```{r} -# extra flexible model -extraflex <- lwage ~ sex + (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)^2 - -control_fit <- lm(extraflex, data = subset) -control_est <- summary(control_fit)$coef[2, 1] -cat("Number of Extra-Flex Controls", length(control_fit$coef) - 1, "\n") -cat("Coefficient for OLS with extra flex controls", control_est) - - -n <- subset_size -p <- length(control_fit$coef) - -# HC0 SE -hcv_coefs_hc0 <- vcovHC(control_fit, type = "HC0") -control_se_hc0 <- sqrt(diag(hcv_coefs_hc0))[2] - -# For a more correct approach, we -# would implement the approach of Cattaneo, Jannson, and Newey (2018)'s procedure. - -# Jackknife. Need to trim some leverages or otherwise regularize. Theory shouldn't -# really work here. -coefs <- hatvalues(control_fit) -trim <- 0.99999999999 -coefs_trimmed <- coefs * (coefs < trim) + trim * (coefs >= trim) -omega <- (control_fit$residuals^2) / ((1 - coefs_trimmed)^2) -hcv_coefs <- vcovHC(control_fit, omega = as.vector(omega), type = "HC3") -control_se_hc3 <- sqrt(diag(hcv_coefs))[2] -``` - -```{r} -# model for Y -extraflex_y <- lwage ~ (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2))^2 -# model for D -extraflex_d <- sex ~ (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2))^2 - -# partialling-out the linear effect of W from Y -t_y <- rlasso(extraflex_y, data = subset)$res -# partialling-out the linear effect of W from D -t_d <- rlasso(extraflex_d, data = subset)$res - -# regression of Y on D after partialling-out the effect of W -partial_lasso_fit <- lm(t_y ~ t_d) -partial_lasso_est <- summary(partial_lasso_fit)$coef[2, 1] - -cat("Coefficient for D via partialling-out using lasso", partial_lasso_est) - -# standard error -hcv_coefs <- vcovHC(partial_lasso_fit, type = "HC3") -partial_lasso_se <- sqrt(diag(hcv_coefs))[2] -``` - -```{r} -table <- matrix(0, 3, 2) -table[1, 1] <- control_est -table[1, 2] <- control_se_hc0 -table[2, 1] <- control_est -table[2, 2] <- control_se_hc3 -table[3, 1] <- partial_lasso_est -table[3, 2] <- partial_lasso_se -colnames(table) <- c("Estimate", "Std. Error") -rownames(table) <- c("full reg, HC0", "full reg, HC3", "partial reg via lasso") -tab <- xtable(table, digits = c(3, 3, 4)) -tab - -print(tab, type = "latex") -``` - -In this case $p/n \approx 1$, that is $p/n$ is no longer small and we start seeing the differences between -unregularized partialling out and regularized partialling out with lasso (double lasso). The results based on -double lasso have rigorous guarantees in this non-small p/n regime under approximate sparsity. The results based on OLS still -have guarantees in p/n< 1 regime under assumptions laid out in Cattaneo, Newey, and Jansson (2018), without approximate -sparsity, although other regularity conditions are needed. - diff --git a/PM1/r-ols-and-lasso-for-wage-prediction.Rmd b/PM1/r-ols-and-lasso-for-wage-prediction.Rmd deleted file mode 100644 index cc690a06..00000000 --- a/PM1/r-ols-and-lasso-for-wage-prediction.Rmd +++ /dev/null @@ -1,405 +0,0 @@ ---- -title: An R Markdown document converted from "PM1/r-ols-and-lasso-for-wage-prediction.irnb" -output: html_document ---- - -## Introduction - -An important question in labor economics is what determines the wage of workers. This is a causal question, but we can begin to investigate it from a predictive perspective. 
- -In the following wage example, $Y$ is the (log) hourly wage of a worker and $X$ is a vector of worker's characteristics, e.g., education, experience, gender. Two main questions here are: - -* How can we use job-relevant characteristics, such as education and experience, to best predict wages? - -* What is the difference in predicted wages between men and women with the same job-relevant characteristics? - -In this lab, we focus on the prediction question first. - -## Data - -The data set we consider is from the 2015 March Supplement of the U.S. Current Population Survey. We select white non-hispanic individuals, aged 25 to 64 years, and working more than 35 hours per week for at least 50 weeks of the year. We exclude self-employed workers; individuals living in group quarters; individuals in the military, agricultural or private household sectors; individuals with inconsistent reports on earnings and employment status; individuals with allocated or missing information in any of the variables used in the analysis; and individuals with hourly wage below $3$. - -The variable of interest $Y$ is the (log) hourly wage rate constructed as the ratio of the annual earnings to the total number of hours worked, which is constructed in turn as the product of number of weeks worked and the usual number of hours worked per week. In our analysis, we also focus on single (never married) workers. The final sample is of size $n=5150$. - -```{r} -install.packages("xtable") -install.packages("hdm") # a library for high-dimensional metrics -install.packages("glmnet") # for lasso CV -``` - -```{r} -library(hdm) -library(xtable) -library(glmnet) -``` - -## Data analysis - -We start by loading the data set. - -```{r} -file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv" -data <- read.csv(file) -dim(data) -``` - -Let's have a look at the structure of the data. - -```{r} -str(data) -``` - -We construct the output variable $Y$ and the matrix $Z$ which includes the characteristics of workers that are given in the data. - -```{r} -# construct matrices for estimation from the data -y <- log(data$wage) -n <- length(y) -Z <- data[- which(colnames(data) %in% c("wage", "lwage"))] -p <- dim(Z)[2] - -cat("Number of observations:", n, "\n") -cat("Number of raw regressors:", p) -``` - -For the outcome variable *wage* and a subset of the raw regressors, we calculate the empirical mean to get familiar with the data. - -```{r} -# generate a table of means of variables -Zsubset <- data[which(colnames(data) %in% c("lwage", "sex", "shs", "hsg", "scl", - "clg", "ad", "mw", "so", "we", "ne", "exp1"))] -table <- matrix(0, 12, 1) -table[1:12, 1] <- as.numeric(lapply(Zsubset, mean)) -rownames(table) <- c("Log Wage", "Sex", "Some High School", - "High School Graduate", "Some College", "College Graduate", - "Advanced Degree", "Midwest", "South", "West", "Northeast", "Experience") -colnames(table) <- c("Sample mean") -tab <- xtable(table, digits = 2) -tab -``` - -E.g., the share of female workers in our sample is ~44% ($sex=1$ if female). - -Alternatively, using the xtable package, we can also print the table in LaTeX. - -```{r} -print(tab, type = "latex") # type="latex" for printing table in LaTeX -``` - -## Prediction Question - -Now, we will construct a prediction rule for (log) hourly wage $Y$, which depends linearly on job-relevant characteristics $X$: - -\begin{equation} -Y = \beta'X+ \epsilon. 
-\end{equation} - -Our goals are - -* Predict wages using various characteristics of workers. - -* Assess the predictive performance of a given model using the (adjusted) sample MSE, the (adjusted) sample $R^2$ and the out-of-sample MSE and $R^2$. - - -Toward answering the latter, we measure the prediction quality of the two models via data splitting: - -- Randomly split the data into one training sample and one testing sample. Here we just use a simple method (stratified splitting is a more sophisticated version of splitting that we might consider). -- Use the training sample to estimate the parameters of the Basic Model and the Flexible Model. -- Before using the testing sample, we evaluate in-sample fit. - -```{r} -# splitting the data -set.seed(1) # to make the results replicable (we will generate random numbers) -random <- sample(1:n, floor(n * 4 / 5)) # draw (4/5)*n random numbers from 1 to n without replacing -train <- data[random, ] -test <- data[-random, ] -``` - - -We employ two different specifications for prediction: - - -1. Basic Model: $X$ consists of a set of raw regressors (e.g. gender, experience, education indicators, occupation and industry indicators and regional indicators). - - -2. Flexible Model: $X$ consists of all raw regressors from the basic model plus a dictionary of transformations (e.g., ${exp}^2$ and ${exp}^3$) and additional two-way interactions of a polynomial in experience with other regressors. An example of a regressor created through a two-way interaction is *experience* times the indicator of having a *college degree*. - -Using the **Flexible Model** enables us to approximate the real relationship by a more complex regression model and therefore to reduce the bias. The **Flexible Model** increases the range of potential shapes of the estimated regression function. In general, flexible models often deliver higher prediction accuracy but are harder to interpret. - -## Data-Splitting: In-sample performance - -Let us fit both models to our data by running ordinary least squares (ols): - -```{r} -# 1. basic model -basic <- lwage ~ (sex + exp1 + shs + hsg + scl + clg + mw + so + we + C(occ2) + C(ind2)) -regbasic <- lm(basic, data = train) # perform ols using the defined model -# number of regressors in the Basic Model -cat("Number of regressors in the basic model:", length(regbasic$coef), "\n") -``` - -##### Note that the basic model consists of $51$ regressors. - -```{r} -# 2. flexible model -flex <- lwage ~ sex + shs + hsg + scl + clg + mw + so + we + C(occ2) + C(ind2) + - (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we) -regflex <- lm(flex, data = train) -# number of regressors in the Flexible Model -cat("Number of regressors in the flexible model:", length(regflex$coef)) -``` - -##### Note that the flexible model consists of $246$ regressors. - -#### Re-estimating the flexible model using Lasso -We re-estimate the flexible model using Lasso (the least absolute shrinkage and selection operator) rather than ols. Lasso is a penalized regression method that can be used to reduce the complexity of a regression model when the ratio $p/n$ is not small. We will introduce this approach formally later in the course, but for now, we try it out here as a black-box method. 
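As a preview of the formal treatment later in the course (and stated only approximately, ignoring how `glmnet` standardizes the regressors), the lasso coefficients used below solve a penalized least-squares problem of the form

$$\hat{\beta}^{\text{lasso}} = \arg\min_{b} \ \frac{1}{2n}\sum_{i=1}^{n}\left(Y_i - b'X_i\right)^2 + \lambda \sum_{j=1}^{p}|b_j|,$$

where the penalty level $\lambda$ is chosen by 5-fold cross-validation (`cv.glmnet`) and predictions are formed at the cross-validation minimizer `lambda.min`.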
- -```{r} -# Flexible model using Lasso, in-sample fit -train_flex <- model.matrix(flex, train) # all regressors -fit_lasso_cv <- cv.glmnet(as.matrix(train_flex), train$lwage, family = "gaussian", alpha = 1, nfolds = 5) -# in-sample fit right now, not out-of-sample using "test" -yhat_lasso_cv <- predict(fit_lasso_cv, newx = as.matrix(train_flex), s = "lambda.min") -``` - -#### Evaluating the predictive performance of the basic and flexible models in-sample -Now, we can evaluate the performance of both models based on the (adjusted) $R^2_{sample}$ and the (adjusted) $MSE_{sample}$: - -```{r} -# Assess predictive performance -sumbasic <- summary(regbasic) -sumflex <- summary(regflex) -# no summary() for lassocv - -ntrain <- nrow(train) - -# R-squared and adjusted R-squared -r2_1 <- sumbasic$r.squared -cat("R-squared for the basic model: ", r2_1, "\n") -r2_adj1 <- sumbasic$adj.r.squared -cat("adjusted R-squared for the basic model: ", r2_adj1, "\n") - -r2_2 <- sumflex$r.squared -cat("R-squared for the flexible model: ", r2_2, "\n") -r2_adj2 <- sumflex$adj.r.squared -cat("adjusted R-squared for the flexible model: ", r2_adj2, "\n") - -p_l <- fit_lasso_cv$nzero[fit_lasso_cv$index[1]] -r2_l <- 1 - sum((yhat_lasso_cv - train$lwage)^2) / sum((train$lwage - mean(train$lwage))^2) -cat("R-squared for the lasso with flexible model: ", r2_l, "\n") -r2_adj_l <- 1 - - (sum((yhat_lasso_cv - train$lwage)^2) / (ntrain - p_l - 1)) / - (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1)) -cat("adjusted R-squared for the flexible model: ", r2_adj_l, "\n") - -# MSE and adjusted MSE -mse1 <- mean(sumbasic$res^2) -cat("MSE for the basic model: ", mse1, "\n") -p1 <- sumbasic$df[1] # number of regressors -mse_adj1 <- (ntrain / (ntrain - p1)) * mse1 -cat("adjusted MSE for the basic model: ", mse_adj1, "\n") - -mse2 <- mean(sumflex$res^2) -cat("MSE for the flexible model: ", mse2, "\n") -p2 <- sumflex$df[1] -mse_adj2 <- (ntrain / (ntrain - p2)) * mse2 -cat("adjusted MSE for the lasso flexible model: ", mse_adj2, "\n") - -lasso_res <- train$lwage - yhat_lasso_cv -mse_l <- mean(lasso_res^2) -cat("MSE for the lasso flexible model: ", mse_l, "\n") -mse_adj_l <- (ntrain / (ntrain - p_l - 1)) * mse_l -cat("adjusted MSE for the lasso flexible model: ", mse_adj_l, "\n") -``` - -```{r} -# Output the table -table <- matrix(0, 3, 5) -table[1, 1:5] <- c(p1, r2_1, mse1, r2_adj1, mse_adj1) -table[2, 1:5] <- c(p2, r2_2, mse2, r2_adj2, mse_adj2) -table[3, 1:5] <- c(p_l, r2_l, mse_l, r2_adj_l, mse_adj_l) -colnames(table) <- c("p", "$R^2_{sample}$", "$MSE_{sample}$", "$R^2_{adjusted}$", "$MSE_{adjusted}$") -rownames(table) <- c("basic reg", "flexible reg", "lasso flex") -tab <- xtable(table, digits = c(0, 0, 2, 2, 2, 2)) -print(tab, type = "latex") -tab -``` - -Considering the measures above, the flexible model performs slightly better than the basic model. - -As $p/n$ is not large, the discrepancy between the adjusted and unadjusted measures is not large. However, if it were, we might still like to apply **data splitting** as a more general procedure to deal with potential overfitting if $p/n$. We illustrate the approach in the following. - -## Data Splitting: Out-of-sample performance - -Now that we have seen in-sample fit, we evaluate our models on the out-of-sample performance: -- Use the testing sample for evaluation. Predict the $\mathtt{wage}$ of every observation in the testing sample based on the estimated parameters in the training sample. 
-- Calculate the Mean Squared Prediction Error $MSE_{test}$ based on the testing sample for both prediction models. - -```{r} -# basic model -options(warn = -1) # ignore warnings -regbasic <- lm(basic, data = train) - -# calculating the out-of-sample MSE -yhat_bas <- predict(regbasic, newdata = test) -y_test <- test$lwage -mean_train <- mean(train$lwage) -mse_test1 <- sum((y_test - yhat_bas)^2) / length(y_test) -r2_test1 <- 1 - mse_test1 / mean((y_test - mean_train)^2) - -cat("Test MSE for the basic model: ", mse_test1, " ") -cat("Test R2 for the basic model: ", r2_test1) -``` - -In the basic model, the $MSE_{test}$ is quite close to the $MSE_{sample}$. - -```{r} -# flexible model -options(warn = -1) # ignore warnings -regflex <- lm(flex, data = train) - -# calculating the out-of-sample MSE -yhat_flex <- predict(regflex, newdata = test) -y_test <- test$lwage -mean_train <- mean(train$lwage) -mse_test2 <- sum((y_test - yhat_flex)^2) / length(y_test) -r2_test2 <- 1 - mse_test2 / mean((y_test - mean_train)^2) - -cat("Test MSE for the flexible model: ", mse_test2, " ") - -cat("Test R2 for the flexible model: ", r2_test2) -``` - -In the flexible model too, the discrepancy between the $MSE_{test}$ and the $MSE_{sample}$ is not large. - -It is worth noticing that the $MSE_{test}$ varies across different data splits. Hence, it is a good idea to average the out-of-sample MSE over different data splits to get valid results. - -Nevertheless, we observe that, based on the out-of-sample $MSE$, the basic model using ols regression performs **about as well (or slightly better)** than the flexible model. - -Next, let us use lasso regression in the flexible model instead of ols regression. The out-of-sample $MSE$ on the test sample can be computed for any black-box prediction method, so we also compare the performance of lasso regression in the flexible model to ols regression. - -```{r} -# Flexible model using Lasso -# model matrix should be formed before train/test as some levels dropped -flex_data <- model.matrix(flex, data) -train_flex <- flex_data[random, ] -test_flex <- flex_data[-random, ] - -fit_lasso_cv <- cv.glmnet(train_flex, train$lwage, family = "gaussian", alpha = 1, nfolds = 5) -yhat_lasso_cv <- predict(fit_lasso_cv, newx = test_flex, s = "lambda.min") - -# calculating the out-of-sample MSE -mse_lasso <- sum((y_test - yhat_lasso_cv)^2) / length(y_test) -r2_lasso <- 1 - mse_lasso / mean((y_test - mean(train$lwage))^2) - -cat("Test MSE for the lasso on flexible model: ", mse_lasso, " ") - -cat("Test R2 for the lasso flexible model: ", r2_lasso) -``` - -Finally, let us summarize the results: - -```{r} -# Output the comparison table -table2 <- matrix(0, 3, 2) -table2[1, 1] <- mse_test1 -table2[2, 1] <- mse_test2 -table2[3, 1] <- mse_lasso -table2[1, 2] <- r2_test1 -table2[2, 2] <- r2_test2 -table2[3, 2] <- r2_lasso - -rownames(table2) <- c("basic reg", "flexible reg", "lasso regression") -colnames(table2) <- c("$MSE_{test}$", "$R^2_{test}$") -tab2 <- xtable(table2, digits = 3) -tab2 -``` - -```{r} -print(tab2, type = "latex") -``` - -## Extra flexible model and Overfitting -Given the results above, it is not immediately clear why one would choose to use Lasso as results are fairly similar. To motivate, we consider an extra flexible model to show how OLS can overfit significantly to the in-sample train data and perform poorly on the out-of-sample testing data. 
- - -```{r} -# extra flexible model -extraflex <- lwage ~ sex + (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)^2 -regextra <- lm(extraflex, data = train) -sumextra <- summary(regextra) -cat("Number of Extra-Flex Controls", length(regextra$coef) - 1, "\n") -n <- length(data$wage) -p <- length(regextra$coef) -ntrain <- length(train$wage) -``` - -```{r} -## In-sample -# R-squared and adjusted R-squared -r2_extra <- sumextra$r.squared -cat("R-squared for the extra flexible model (in-sample): ", r2_extra, "\n") -r2_adjextra <- sumextra$adj.r.squared -cat("adjusted R-squared for the extra flexible model (in-sample): ", r2_adjextra, "\n") - -# MSE and adjusted MSE -mse_extra <- mean(sumextra$res^2) -cat("MSE for the extra flexible model (in-sample): ", mse_extra, "\n") -mse_adjextra <- (ntrain / (ntrain - p)) * mse_extra -cat("adjusted MSE for the basic model (in-sample): ", mse_adj1, "\n") -``` - -```{r} -## Out-of-sample -yhat_ex <- predict(regextra, newdata = test) -y_test_ex <- test$lwage -mse_test_ex <- sum((y_test_ex - yhat_ex)^2) / length(y_test_ex) -r2_test_ex <- 1 - mse_test_ex / mean((y_test_ex - mean(train$lwage))^2) - -cat("Test MSE for the basic model: ", mse_test_ex, " ") -cat("Test R2 for the basic model: ", r2_test_ex) -``` - -As we can see, a simple OLS overfits when the dimensionality of covariates is high, as the out-of-sample performance suffers dramatically in comparison to the in-sample performance. - -Contrast this with Lasso: - -```{r} -# model matrix should be formed before train/test as some levels dropped -flex_data <- model.matrix(extraflex, data) -train_flex <- flex_data[random, ] -test_flex <- flex_data[-random, ] - -# fit model -fit_lcv <- cv.glmnet(train_flex, train$lwage, family = "gaussian", alpha = 1, nfolds = 5) - -# in-sample -yhat_lcv <- predict(fit_lcv, newx = train_flex, s = "lambda.min") - -r2_l <- 1 - sum((yhat_lcv - train$lwage)^2) / sum((train$lwage - mean(train$lwage))^2) -p_l <- fit_lcv$nzero[fit_lcv$index[1]] -r2_adj_l <- 1 - - (sum((yhat_lcv - train$lwage) ^ 2) / (ntrain - p_l - 1)) / - (sum((train$lwage - mean(train$lwage))^2) / (ntrain - 1)) - -lasso_res <- train$lwage - yhat_lcv -mse_l <- mean(lasso_res^2) -mse_adj_l <- (ntrain / (ntrain - p_l - 1)) * mse_l - -cat("R-squared for the lasso with the extra flexible model (in-sample): ", r2_l, "\n") -cat("adjusted R-squared for the extra flexible model (in-sample): ", r2_adj_l, "\n") -cat("MSE for the lasso with the extra flexible model (in-sample): ", mse_l, "\n") -cat("adjusted MSE for the lasso with the extraflexible model (in-sample): ", mse_adj_l, "\n") - -# out-of-sample -yhat_lcv_test <- predict(fit_lcv, newx = test_flex, s = "lambda.min") -mse_lasso <- sum((test$lwage - yhat_lcv_test)^2) / length(test$lwage) -r2_lasso <- 1 - mse_lasso / mean((test$lwage - mean(train$lwage))^2) - -cat("\n") -cat("Test R2 for the lasso the extra flexible model: ", r2_lasso, "\n") -cat("Test MSE for the lasso on the extra flexible model: ", mse_lasso) -``` - -As shown above, the overfitting effect is mitigated with the penalized regression model. 
- diff --git a/PM2/Old/heterogenous-wage-effects.irnb.Rmd b/PM2/Old/heterogenous-wage-effects.irnb.Rmd deleted file mode 100644 index 936a40e1..00000000 --- a/PM2/Old/heterogenous-wage-effects.irnb.Rmd +++ /dev/null @@ -1,76 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - - -# Application: Heterogeneous Effect of Gender on Wage Using Double Lasso - - We use US census data from the year 2012 to analyse the effect of gender and interaction effects of other variables with gender on wage jointly. The dependent variable is the logarithm of the wage, the target variable is *female* (in combination with other variables). All other variables denote some other socio-economic characteristics, e.g. marital status, education, and experience. For a detailed description of the variables we refer to the help page. - - - -This analysis allows a closer look how discrimination according to gender is related to other socio-economic variables. - - - - -```{r} -library(hdm) -data(cps2012) -str(cps2012) -``` - -```{r} -# create the model matrix for the covariates -X <- model.matrix(~-1 + female + female:(widowed + divorced + separated + nevermarried + -hsd08 + hsd911 + hsg + cg + ad + mw + so + we + exp1 + exp2 + exp3) + +(widowed + -divorced + separated + nevermarried + hsd08 + hsd911 + hsg + cg + ad + mw + so + -we + exp1 + exp2 + exp3)^2, data = cps2012) -X <- X[, which(apply(X, 2, var) != 0)] # exclude all constant variables -dim(X) - -# target variables, index.gender specifices coefficients we are interested in -index.gender <- grep("female", colnames(X)) -y <- cps2012$lnw -``` - -The parameter estimates for the target parameters, i.e. all coefficients related to gender (i.e. by interaction with other variables) are calculated and summarized by the following commands: - - - -```{r _uuid="051d70d956493feee0c6d64651c6a088724dca2a", _execution_state="idle"} -effects.female <- rlassoEffects(x = X, y = y, index = index.gender) -summary(effects.female) -library(stargazer) -stargazer(summary(effects.female)[1]) -``` - -Now, we estimate and plot confident intervals, first "pointwise" and then the joint confidence intervals. - -```{r} -joint.CI <- confint(effects.female, level = 0.95) -joint.CI -plot(effects.female, level=0.95) # plot of the effects -stargazer(joint.CI) -``` - -Finally, we compare the pointwise confidence intervals to joint confidence intervals. - -```{r} -joint.CI <- confint(effects.female, level = 0.95, joint = TRUE) -joint.CI -plot(effects.female, joint=TRUE, level=0.95) # plot of the effects -stargazer(joint.CI) - -# the plot output does not look great -``` diff --git a/PM2/Old/ml-for-wage-prediction.irnb.Rmd b/PM2/Old/ml-for-wage-prediction.irnb.Rmd deleted file mode 100644 index 71d8ef8e..00000000 --- a/PM2/Old/ml-for-wage-prediction.irnb.Rmd +++ /dev/null @@ -1,291 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - - - - -This notebook contains an example for teaching. 
- - - - -# Penalized Linear Regressions: A Simulation Experiment - - -## Data Generating Process: Approximately Sparse - -```{r} -set.seed(1) - -n = 100; -p = 400; - -Z= runif(n)-1/2; -W = matrix(runif(n*p)-1/2, n, p); - - - -beta = 1/seq(1:p)^2; # approximately sparse beta -#beta = rnorm(p)*.2 # dense beta -gX = exp(4*Z)+ W%*%beta; # leading term nonlinear -X = cbind(Z, Z^2, Z^3, W ); # polynomials in Zs will be approximating exp(4*Z) - - -Y = gX + rnorm(n); #generate Y - - -plot(gX,Y, xlab="g(X)", ylab="Y") #plot V vs g(X) - -print( c("theoretical R2:", var(gX)/var(Y))) - -var(gX)/var(Y); #theoretical R-square in the simulation example - - - - -``` - -We use package Glmnet to carry out predictions using cross-validated lasso, ridge, and elastic net - -```{r} - -library(glmnet) -fit.lasso.cv <- cv.glmnet(X, Y, family="gaussian", alpha=1) # family gaussian means that we'll be using square loss -fit.ridge <- cv.glmnet(X, Y, family="gaussian", alpha=0) # family gaussian means that we'll be using square loss -fit.elnet <- cv.glmnet(X, Y, family="gaussian", alpha=.5) # family gaussian means that we'll be using square loss - -yhat.lasso.cv <- predict(fit.lasso.cv, newx = X) # predictions -yhat.ridge <- predict(fit.ridge, newx = X) -yhat.elnet <- predict(fit.elnet, newx = X) - -MSE.lasso.cv <- summary(lm((gX-yhat.lasso.cv)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) -MSE.ridge <- summary(lm((gX-yhat.ridge)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) -MSE.elnet <- summary(lm((gX-yhat.elnet)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) - -``` - -Here we compute the lasso and ols post lasso using plug-in choices for penalty levels, using package hdm - -```{r} -library(hdm) -fit.rlasso <- rlasso(Y~X, post=FALSE) # lasso with plug-in penalty level -fit.rlasso.post <- rlasso(Y~X, post=TRUE) # post-lasso with plug-in penalty level - -yhat.rlasso <- predict(fit.rlasso) #predict g(X) for values of X -yhat.rlasso.post <- predict(fit.rlasso.post) #predict g(X) for values of X - -MSE.lasso <- summary(lm((gX-yhat.rlasso)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) -MSE.lasso.post <- summary(lm((gX-yhat.rlasso.post)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) - - -``` - -Next we code up lava, which alternates the fitting of lasso and ridge - -```{r} -library(glmnet) - -lava.predict<- function(X,Y, iter=5){ - -g1 = predict(rlasso(X, Y, post=F)) #lasso step fits "sparse part" -m1 = predict(glmnet(X, as.vector(Y-g1), family="gaussian", alpha=0, lambda =20),newx=X ) #ridge step fits the "dense" part - - -i=1 -while(i<= iter) { -g1 = predict(rlasso(X, as.vector(Y-m1), post=F)) #lasso step fits "sparse part" -m1 = predict(glmnet(X, as.vector(Y-g1), family="gaussian", alpha=0, lambda =20),newx=X ); #ridge step fits the "dense" part -i = i+1 } - -return(g1+m1); - } - - -yhat.lava = lava.predict(X,Y) -MSE.lava <- summary(lm((gX-yhat.lava)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) - - -MSE.lava -``` - -```{r} -library(xtable) -table<- matrix(0, 6, 2) -table[1,1:2] <- MSE.lasso.cv -table[2,1:2] <- MSE.ridge -table[3,1:2] <- MSE.elnet -table[4,1:2] <- MSE.lasso -table[5,1:2] <- MSE.lasso.post -table[6,1:2] <- MSE.lava - -colnames(table)<- c("MSA", "S.E. 
for MSA") -rownames(table)<- c("Cross-Validated Lasso", "Cross-Validated ridge","Cross-Validated elnet", - "Lasso","Post-Lasso","Lava") -tab <- xtable(table, digits =3) -print(tab,type="latex") # set type="latex" for printing table in LaTeX -tab - - -``` - -```{r} - -plot(gX, gX, pch=19, cex=1, ylab="predicted value", xlab="true g(X)") - -points(gX, yhat.rlasso, col=2, pch=18, cex = 1.5 ) -points(gX, yhat.rlasso.post, col=3, pch=17, cex = 1.2 ) -points( gX, yhat.lasso.cv,col=4, pch=19, cex = 1.2 ) - - -legend("bottomright", - legend = c("rLasso", "Post-rLasso", "CV Lasso"), - col = c(2,3,4), - pch = c(18,17, 19), - bty = "n", - pt.cex = 1.3, - cex = 1.2, - text.col = "black", - horiz = F , - inset = c(0.1, 0.1)) - - -``` - -## Data Generating Process: Approximately Sparse + Small Dense Part - -```{r} -set.seed(1) - -n = 100; -p = 400; - -Z= runif(n)-1/2; -W = matrix(runif(n*p)-1/2, n, p); - - -beta = rnorm(p)*.2 # dense beta -gX = exp(4*Z)+ W%*%beta; # leading term nonlinear -X = cbind(Z, Z^2, Z^3, W ); # polynomials in Zs will be approximating exp(4*Z) - - -Y = gX + rnorm(n); #generate Y - - -plot(gX,Y, xlab="g(X)", ylab="Y") #plot V vs g(X) - -print( c("theoretical R2:", var(gX)/var(Y))) - -var(gX)/var(Y); #theoretical R-square in the simulation example - - - -``` - -```{r} - -library(glmnet) -fit.lasso.cv <- cv.glmnet(X, Y, family="gaussian", alpha=1) # family gaussian means that we'll be using square loss -fit.ridge <- cv.glmnet(X, Y, family="gaussian", alpha=0) # family gaussian means that we'll be using square loss -fit.elnet <- cv.glmnet(X, Y, family="gaussian", alpha=.5) # family gaussian means that we'll be using square loss - -yhat.lasso.cv <- predict(fit.lasso.cv, newx = X) # predictions -yhat.ridge <- predict(fit.ridge, newx = X) -yhat.elnet <- predict(fit.elnet, newx = X) - -MSE.lasso.cv <- summary(lm((gX-yhat.lasso.cv)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) -MSE.ridge <- summary(lm((gX-yhat.ridge)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) -MSE.elnet <- summary(lm((gX-yhat.elnet)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) - -``` - -```{r} -library(hdm) -fit.rlasso <- rlasso(Y~X, post=FALSE) # lasso with plug-in penalty level -fit.rlasso.post <- rlasso(Y~X, post=TRUE) # post-lasso with plug-in penalty level - -yhat.rlasso <- predict(fit.rlasso) #predict g(X) for values of X -yhat.rlasso.post <- predict(fit.rlasso.post) #predict g(X) for values of X - -MSE.lasso <- summary(lm((gX-yhat.rlasso)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) -MSE.lasso.post <- summary(lm((gX-yhat.rlasso.post)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) - - -``` - -```{r} -library(glmnet) - -lava.predict<- function(X,Y, iter=5){ - -g1 = predict(rlasso(X, Y, post=F)) #lasso step fits "sparse part" -m1 = predict(glmnet(X, as.vector(Y-g1), family="gaussian", alpha=0, lambda =20),newx=X ) #ridge step fits the "dense" part - - -i=1 -while(i<= iter) { -g1 = predict(rlasso(X, as.vector(Y-m1), post=F)) #lasso step fits "sparse part" -m1 = predict(glmnet(X, as.vector(Y-g1), family="gaussian", alpha=0, lambda =20),newx=X ); #ridge step fits the "dense" part -i = i+1 } - -return(g1+m1); - } - - -yhat.lava = lava.predict(X,Y) -MSE.lava <- summary(lm((gX-yhat.lava)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) - - -MSE.lava -``` - -```{r} -library(xtable) -table<- matrix(0, 6, 2) -table[1,1:2] <- 
MSE.lasso.cv -table[2,1:2] <- MSE.ridge -table[3,1:2] <- MSE.elnet -table[4,1:2] <- MSE.lasso -table[5,1:2] <- MSE.lasso.post -table[6,1:2] <- MSE.lava - -colnames(table)<- c("MSA", "S.E. for MSA") -rownames(table)<- c("Cross-Validated Lasso", "Cross-Validated ridge","Cross-Validated elnet", - "Lasso","Post-Lasso","Lava") -tab <- xtable(table, digits =3) -print(tab,type="latex") # set type="latex" for printing table in LaTeX -tab - -``` - -```{r} - -plot(gX, gX, pch=19, cex=1, ylab="predicted value", xlab="true g(X)") - -points(gX, yhat.rlasso, col=2, pch=18, cex = 1.5 ) -points(gX, yhat.elnet, col=3, pch=17, cex = 1.2 ) -points(gX, yhat.lava, col=4, pch=19, cex = 1.2 ) - - -legend("bottomright", - legend = c("rLasso", "Elnet", "Lava"), - col = c(2,3,4), - pch = c(18,17, 19), - bty = "n", - pt.cex = 1.3, - cex = 1.2, - text.col = "black", - horiz = F , - inset = c(0.1, 0.1)) - -``` diff --git a/PM2/Old/r-notebook-experiment-on-orthogonal-learning.irnb.Rmd b/PM2/Old/r-notebook-experiment-on-orthogonal-learning.irnb.Rmd deleted file mode 100644 index 0ae82ac4..00000000 --- a/PM2/Old/r-notebook-experiment-on-orthogonal-learning.irnb.Rmd +++ /dev/null @@ -1,113 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - -```{r _uuid="051d70d956493feee0c6d64651c6a088724dca2a", _execution_state="idle"} -# Simulation Design - -library(hdm) - -set.seed(1) -B= 1000 # trials -Naive = rep(0, B) -Orthogonal = rep(0, B) - -for (i in 1:B){ - -n=100 -p= 100 -beta = 1/(1:p)^2 -gamma =1/(1:p)^2 - -X=matrix(rnorm(n*p), n, p) - - -D= X%*%gamma + rnorm(n)/4 - -Y = D+ X%*%beta + rnorm(n) - -# single selection method - -SX.IDs = which(rlasso(Y~ D+X)$coef[-c(1,2)] !=0) #select covariates by Lasso - - -if (sum(SX.IDs)==0) {Naive[i] = lm(Y~ D)$coef[2]} - -if (sum(SX.IDs)>0) {Naive[i] = lm(Y~ D + X[,SX.IDs])$coef[2]} - - - -#partialling out - -resY = rlasso(Y~ X, Post=F)$res -resD = rlasso(D~ X, Post=F)$res -Orthogonal[i]= lm(resY ~ resD)$coef[2] - -} - -``` - -```{r} -hist(Orthogonal-1,col=4, freq=F, xlim= c(-2, 2), xlab= "Orhtogonal -True ", main="Orthogonal") -hist(Naive-1, col=2, freq=F, xlim= c(-2,2), xlab= "Naive- True", main = "Naive") - -``` - -```{r} -library(hdm) - -set.seed(1) -B= 1000 # trials -Naive = rep(0, B) -Orthogonal = rep(0, B) - -for (i in 1:B){ - -n=100 -p= 100 -beta = 1/(1:p)^2 -gamma =1/(1:p)^2 - -X=matrix(rnorm(n*p), n, p) - - -D= X%*%gamma + rnorm(n)/4 - -Y = D+ X%*%beta + rnorm(n) - -# single selection method - -SX.IDs = which(rlasso(Y~ D+X)$coef[-c(1,2)] !=0) #select covariates by Lasso - - -if (sum(SX.IDs)==0) {Naive[i] = lm(Y~ D)$coef[2]} - -if (sum(SX.IDs)>0) {Naive[i] = lm(Y~ D + X[,SX.IDs])$coef[2]} - - - -#partialling out - -resY = rlasso(Y~ X, Post=T)$res -resD = rlasso(D~ X, Post=T)$res -Orthogonal[i]= lm(resY ~ resD)$coef[2] - -} - -``` - -```{r} -hist(Orthogonal-1,col=4, freq=F, xlim= c(-2, 2), xlab= "Orhtogonal -True ", main="Orthogonal") -hist(Naive-1, col=2, freq=F, xlim= c(-2,2), xlab= "Naive- True", main = "Naive") - -``` diff --git a/PM2/Old/r-notebook-linear-penalized-regs.irnb.Rmd b/PM2/Old/r-notebook-linear-penalized-regs.irnb.Rmd deleted file mode 100644 index 5f4c4d56..00000000 --- a/PM2/Old/r-notebook-linear-penalized-regs.irnb.Rmd +++ /dev/null @@ -1,291 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - 
kernelspec: - display_name: R - language: R - name: ir ---- - - - - -This notebook contains an example for teaching. - - - - -# Penalized Linear Regressions: A Simulation Experiment - - -## Data Generating Process: Approximately Sparse - -```{r} -set.seed(1) - -n = 100; -p = 400; - -Z= runif(n)-1/2; -W = matrix(runif(n*p)-1/2, n, p); - - - -beta = 1/seq(1:p)^2; # approximately sparse beta -#beta = rnorm(p)*.2 # dense beta -gX = exp(4*Z)+ W%*%beta; # leading term nonlinear -X = cbind(Z, Z^2, Z^3, W ); # polynomials in Zs will be approximating exp(4*Z) - - -Y = gX + rnorm(n); #generate Y - - -plot(gX,Y, xlab="g(X)", ylab="Y") #plot V vs g(X) - -print( c("theoretical R2:", var(gX)/var(Y))) - -var(gX)/var(Y); #theoretical R-square in the simulation example - - - - -``` - -We use package Glmnet to carry out predictions using cross-validated lasso, ridge, and elastic net - -```{r} - -library(glmnet) -fit.lasso.cv <- cv.glmnet(X, Y, family="gaussian", alpha=1) # family gaussian means that we'll be using square loss -fit.ridge <- cv.glmnet(X, Y, family="gaussian", alpha=0) # family gaussian means that we'll be using square loss -fit.elnet <- cv.glmnet(X, Y, family="gaussian", alpha=.5) # family gaussian means that we'll be using square loss - -yhat.lasso.cv <- predict(fit.lasso.cv, newx = X) # predictions -yhat.ridge <- predict(fit.ridge, newx = X) -yhat.elnet <- predict(fit.elnet, newx = X) - -MSE.lasso.cv <- summary(lm((gX-yhat.lasso.cv)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) -MSE.ridge <- summary(lm((gX-yhat.ridge)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) -MSE.elnet <- summary(lm((gX-yhat.elnet)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) - -``` - -Here we compute the lasso and ols post lasso using plug-in choices for penalty levels, using package hdm - -```{r} -library(hdm) -fit.rlasso <- rlasso(Y~X, post=FALSE) # lasso with plug-in penalty level -fit.rlasso.post <- rlasso(Y~X, post=TRUE) # post-lasso with plug-in penalty level - -yhat.rlasso <- predict(fit.rlasso) #predict g(X) for values of X -yhat.rlasso.post <- predict(fit.rlasso.post) #predict g(X) for values of X - -MSE.lasso <- summary(lm((gX-yhat.rlasso)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) -MSE.lasso.post <- summary(lm((gX-yhat.rlasso.post)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) - - -``` - -Next we code up lava, which alternates the fitting of lasso and ridge - -```{r} -library(glmnet) - -lava.predict<- function(X,Y, iter=5){ - -g1 = predict(rlasso(X, Y, post=F)) #lasso step fits "sparse part" -m1 = predict(glmnet(X, as.vector(Y-g1), family="gaussian", alpha=0, lambda =20),newx=X ) #ridge step fits the "dense" part - - -i=1 -while(i<= iter) { -g1 = predict(rlasso(X, Y, post=F)) #lasso step fits "sparse part" -m1 = predict(glmnet(X, as.vector(Y-g1), family="gaussian", alpha=0, lambda =20),newx=X ); #ridge step fits the "dense" part -i = i+1 } - -return(g1+m1); - } - - -yhat.lava = lava.predict(X,Y) -MSE.lava <- summary(lm((gX-yhat.lava)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) - - -MSE.lava -``` - -```{r} -library(xtable) -table<- matrix(0, 6, 2) -table[1,1:2] <- MSE.lasso.cv -table[2,1:2] <- MSE.ridge -table[3,1:2] <- MSE.elnet -table[4,1:2] <- MSE.lasso -table[5,1:2] <- MSE.lasso.post -table[6,1:2] <- MSE.lava - -colnames(table)<- c("MSA", "S.E. 
for MSA") -rownames(table)<- c("Cross-Validated Lasso", "Cross-Validated ridge","Cross-Validated elnet", - "Lasso","Post-Lasso","Lava") -tab <- xtable(table, digits =3) -print(tab,type="latex") # set type="latex" for printing table in LaTeX -tab - - -``` - -```{r} - -plot(gX, gX, pch=19, cex=1, ylab="predicted value", xlab="true g(X)") - -points(gX, yhat.rlasso, col=2, pch=18, cex = 1.5 ) -points(gX, yhat.rlasso.post, col=3, pch=17, cex = 1.2 ) -points( gX, yhat.lasso.cv,col=4, pch=19, cex = 1.2 ) - - -legend("bottomright", - legend = c("rLasso", "Post-rLasso", "CV Lasso"), - col = c(2,3,4), - pch = c(18,17, 19), - bty = "n", - pt.cex = 1.3, - cex = 1.2, - text.col = "black", - horiz = F , - inset = c(0.1, 0.1)) - - -``` - -## Data Generating Process: Approximately Sparse + Small Dense Part - -```{r} -set.seed(1) - -n = 100; -p = 400; - -Z= runif(n)-1/2; -W = matrix(runif(n*p)-1/2, n, p); - - -beta = rnorm(p)*.2 # dense beta -gX = exp(4*Z)+ W%*%beta; # leading term nonlinear -X = cbind(Z, Z^2, Z^3, W ); # polynomials in Zs will be approximating exp(4*Z) - - -Y = gX + rnorm(n); #generate Y - - -plot(gX,Y, xlab="g(X)", ylab="Y") #plot V vs g(X) - -print( c("theoretical R2:", var(gX)/var(Y))) - -var(gX)/var(Y); #theoretical R-square in the simulation example - - - -``` - -```{r} - -library(glmnet) -fit.lasso.cv <- cv.glmnet(X, Y, family="gaussian", alpha=1) # family gaussian means that we'll be using square loss -fit.ridge <- cv.glmnet(X, Y, family="gaussian", alpha=0) # family gaussian means that we'll be using square loss -fit.elnet <- cv.glmnet(X, Y, family="gaussian", alpha=.5) # family gaussian means that we'll be using square loss - -yhat.lasso.cv <- predict(fit.lasso.cv, newx = X) # predictions -yhat.ridge <- predict(fit.ridge, newx = X) -yhat.elnet <- predict(fit.elnet, newx = X) - -MSE.lasso.cv <- summary(lm((gX-yhat.lasso.cv)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) -MSE.ridge <- summary(lm((gX-yhat.ridge)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) -MSE.elnet <- summary(lm((gX-yhat.elnet)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) - -``` - -```{r} -library(hdm) -fit.rlasso <- rlasso(Y~X, post=FALSE) # lasso with plug-in penalty level -fit.rlasso.post <- rlasso(Y~X, post=TRUE) # post-lasso with plug-in penalty level - -yhat.rlasso <- predict(fit.rlasso) #predict g(X) for values of X -yhat.rlasso.post <- predict(fit.rlasso.post) #predict g(X) for values of X - -MSE.lasso <- summary(lm((gX-yhat.rlasso)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) -MSE.lasso.post <- summary(lm((gX-yhat.rlasso.post)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) - - -``` - -```{r} -library(glmnet) - -lava.predict<- function(X,Y, iter=5){ - -g1 = predict(rlasso(X, Y, post=F)) #lasso step fits "sparse part" -m1 = predict(glmnet(X, as.vector(Y-g1), family="gaussian", alpha=0, lambda =20),newx=X ) #ridge step fits the "dense" part - - -i=1 -while(i<= iter) { -g1 = predict(rlasso(X, Y, post=F)) #lasso step fits "sparse part" -m1 = predict(glmnet(X, as.vector(Y-g1), family="gaussian", alpha=0, lambda =20),newx=X ); #ridge step fits the "dense" part -i = i+1 } - -return(g1+m1); - } - - -yhat.lava = lava.predict(X,Y) -MSE.lava <- summary(lm((gX-yhat.lava)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) - - -MSE.lava -``` - -```{r} -library(xtable) -table<- matrix(0, 6, 2) -table[1,1:2] <- MSE.lasso.cv 
-table[2,1:2] <- MSE.ridge -table[3,1:2] <- MSE.elnet -table[4,1:2] <- MSE.lasso -table[5,1:2] <- MSE.lasso.post -table[6,1:2] <- MSE.lava - -colnames(table)<- c("MSA", "S.E. for MSA") -rownames(table)<- c("Cross-Validated Lasso", "Cross-Validated ridge","Cross-Validated elnet", - "Lasso","Post-Lasso","Lava") -tab <- xtable(table, digits =3) -print(tab,type="latex") # set type="latex" for printing table in LaTeX -tab - -``` - -```{r} - -plot(gX, gX, pch=19, cex=1, ylab="predicted value", xlab="true g(X)") - -points(gX, yhat.rlasso, col=2, pch=18, cex = 1.5 ) -points(gX, yhat.elnet, col=3, pch=17, cex = 1.2 ) -points(gX, yhat.lava, col=4, pch=19, cex = 1.2 ) - - -legend("bottomright", - legend = c("rLasso", "Elnet", "Lava"), - col = c(2,3,4), - pch = c(18,17, 19), - bty = "n", - pt.cex = 1.3, - cex = 1.2, - text.col = "black", - horiz = F , - inset = c(0.1, 0.1)) - -``` diff --git a/PM2/r_convergence_hypothesis_double_lasso.Rmd b/PM2/r_convergence_hypothesis_double_lasso.Rmd deleted file mode 100644 index 0ab246fd..00000000 --- a/PM2/r_convergence_hypothesis_double_lasso.Rmd +++ /dev/null @@ -1,268 +0,0 @@ ---- -title: An R Markdown document converted from "PM2/r_convergence_hypothesis_double_lasso.irnb" -output: html_document ---- - -# Testing the Convergence Hypothesis - -```{r} -install.packages("hdm") -install.packages("xtable") -install.packages("lmtest") -install.packages("sandwich") -install.packages("glmnet") -install.packages("ggplot2") -``` - -```{r} -library(hdm) -library(xtable) -library(lmtest) -library(sandwich) -library(glmnet) # For LassoCV -library(ggplot2) -``` - -## Introduction - -We provide an additional empirical example of partialling-out with Lasso to estimate the regression coefficient $\beta_1$ in the high-dimensional linear regression model: - $$ - Y = \beta_1 D + \beta_2'W + \epsilon. - $$ - -Specifically, we are interested in how the rates at which economies of different countries grow ($Y$) are related to the initial wealth levels in each country ($D$) controlling for country's institutional, educational, and other similar characteristics ($W$). - -The relationship is captured by $\beta_1$, the *speed of convergence/divergence*, which measures the speed at which poor countries catch up $(\beta_1< 0)$ or fall behind $(\beta_1> 0)$ rich countries, after controlling for $W$. Our inference question here is: do poor countries grow faster than rich countries, controlling for educational and other characteristics? In other words, is the speed of convergence negative: $ \beta_1 <0?$ This is the Convergence Hypothesis predicted by the Solow Growth Model. This is a structural economic model. Under some strong assumptions, that we won't state here, the predictive exercise we are doing here can be given causal interpretation. - -## Data Analysis - -We consider the data set GrowthData which is included in the package *hdm*. First, let us load the data set to get familiar with the data. - -```{r} -getdata <- function(...) { - e <- new.env() - name <- data(..., envir = e)[1] - e[[name]] -} - -# now load your data calling getdata() -growth <- getdata(GrowthData) -``` - -The sample contains $90$ countries and $63$ controls. - -```{r} -growth -``` - -Thus $p \approx 60$, $n=90$ and $p/n$ is not small. We expect the least squares method to provide a poor estimate of $\beta_1$. We expect the method based on partialling-out with Lasso to provide a high quality estimate of $\beta_1$. 
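Before running any regressions, it is worth verifying this dimension count directly. The sketch below assumes the `growth` data frame loaded above and simply treats every column other than the outcome and the intercept as a regressor.

```{r}
# Quick check of the p/n ratio motivating the use of Lasso
# (sketch; assumes the `growth` data frame from above)
n_obs <- nrow(growth)
p_reg <- ncol(growth) - 2  # drop the outcome and intercept columns
c(n = n_obs, p = p_reg, "p/n" = round(p_reg / n_obs, 2))
```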
-To check this hypothesis, we analyze the relation between the output variable $Y$ and the other country's characteristics by running a linear regression in the first step. - -```{r} -## Create the outcome variable y and covariates x -y <- growth$Outcome -X <- growth[-which(colnames(growth) %in% c("intercept"))] -``` - -```{r} -fit <- lm(Outcome ~ ., data = X) -est <- summary(fit)$coef["gdpsh465", 1] - -hcv_coefs <- vcovHC(fit, type = "HC1") # HC - "heteroskedasticity cosistent" -se <- sqrt(diag(hcv_coefs))[2] # Estimated std errors - -# print unconditional effect of gdpsh465 and the corresponding standard error -cat("The estimated coefficient on gdpsh465 is", est, - " and the corresponding robust standard error is", se) - -# Calculate the 95% confidence interval for 'gdpsh465' -lower_ci <- est - 1.96 * se -upper_ci <- est + 1.96 * se - -cat("95% Confidence Interval: [", lower_ci, ",", upper_ci, "]") -``` - -## Summarize OLS results - -```{r} -# Create an empty data frame with column names -table <- data.frame( - Method = character(0), - Estimate = character(0), - `Std. Error` = numeric(0), - `Lower Bound CI` = numeric(0), - `Upper Bound CI` = numeric(0) -) - -# Add OLS results to the table -table <- rbind(table, c("OLS", est, se, lower_ci, upper_ci)) - -# Rename the columns to match the Python table -colnames(table) <- c("Method", "Estimate", "Std. Error", "lower bound CI", "upper bound CI") - -# Print the table -print(table) -``` - -Least squares provides a rather noisy estimate (high standard error) of the -speed of convergence, and does not allow us to answer the question -about the convergence hypothesis since the confidence interval includes zero. - -In contrast, we can use the partialling-out approach based on lasso regression ("Double Lasso"). - -```{r} -y <- growth$Outcome -W <- growth[-which(colnames(growth) %in% c("Outcome", "intercept", "gdpsh465"))] -D <- growth$gdpsh465 -``` - -## Method 1: Lasso with Theoretical Penalty using HDM - -While cross validation is commonly employed for choosing penalty parameters in Lasso, it can be very noisy and tends to choose relatively small penalty leading to some overfitting. For this reason, one should not use cross validation to choose tuning parameters unless sample splitting is employed. We illustrate the use of sample combined with cross validation in later chapters in the book. Since we are using the full sample here, it is much better (and theoretically valid) to use penalties that provably control overfitting, which is what we do here. - -We report the results using cross validation at the end of this notebook for comparison. There, we observe overfitting for the prediction of the outcome. 
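As a rough illustration of the plug-in rule used by `rlasso`, the penalty level takes the form $\lambda = 2 c \hat{\sigma} \sqrt{n} \Phi^{-1}(1-\alpha/(2p))$, the same formula given in the penalized linear regressions notebook. The sketch below evaluates it for placeholder values of the constant $c$ and level $\alpha$ and a crude preliminary noise estimate; `hdm` refines the noise estimate internally, so the exact number it uses will differ.

```{r}
# Illustrative plug-in penalty level (sketch; c_const, alpha_level and sigma_hat are placeholders)
n_w <- nrow(W)
p_w <- ncol(W)
c_const <- 1.1
alpha_level <- 0.05
sigma_hat <- sd(y) # crude preliminary scale estimate
lambda_plugin <- 2 * c_const * sigma_hat * sqrt(n_w) * qnorm(1 - alpha_level / (2 * p_w))
lambda_plugin
```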
- -```{r} -double_lasso <- function(y, D, W) { - - # residualize outcome with Lasso - yfit_rlasso <- hdm::rlasso(W, y, post = FALSE) - yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W)) - yres <- y - as.numeric(yhat_rlasso) - - - # residualize treatment with Lasso - dfit_rlasso <- hdm::rlasso(W, D, post = FALSE) - dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W)) - dres <- D - as.numeric(dhat_rlasso) - - # rest is the same as in the OLS case - hat <- mean(yres * dres) / mean(dres^2) - epsilon <- yres - hat * dres - V <- mean(epsilon^2 * dres^2) / mean(dres^2)^2 - stderr <- sqrt(V / length(y)) - - return(list(hat = hat, stderr = stderr)) -} -``` - -```{r} -results <- double_lasso(y, D, W) -hat <- results$hat -stderr <- results$stderr -# Calculate the 95% confidence interval -ci_lower <- hat - 1.96 * stderr -ci_upper <- hat + 1.96 * stderr -``` - -The least square method provides a rather noisy estimate of the speed of convergence. We can not answer the question if poor countries grow faster than rich countries. The least square method does not work when the ratio $p/n$ is large. - -In sharp contrast, partialling-out via Lasso provides a more precise estimate. The Lasso based point estimate is $-5\%$ and the $95\%$ confidence interval for the (annual) rate of convergence $[-7.8\%,-2.2\%]$ only includes negative numbers. This empirical evidence does support the convergence hypothesis. - -```{r} -# Add Double Lasso results to the table -table <- rbind(table, c("Double Lasso", hat, stderr, ci_lower, ci_upper)) - -# Print the table -print(table) -``` - -## Method 2: Lasso with Cross-Validation - -This section is for illustration purposes only. Given that we are using the full sample, cross validation *should not* be used for choosing tuning parameters here. Cross validation tends to (mildly) overfit, and this overfitting can lead to substantial problems when inference about parameters is the goal. 
- -```{r} -# Choose penalty based on KFold cross validation -set.seed(123) -# Given small sample size, we use an aggressive number of 20 folds -n_folds <- 20 - - -# Define LassoCV models for y and D -model_y <- cv.glmnet( - x = as.matrix(W), - y = y, - alpha = 1, # Lasso penalty - nfolds = n_folds, - family = "gaussian" -) - -model_d <- cv.glmnet( - x = as.matrix(W), - y = D, - alpha = 1, # Lasso penalty - nfolds = n_folds, - family = "gaussian" -) - -# Get the best lambda values for y and D -best_lambda_y <- model_y$lambda.min -best_lambda_d <- model_d$lambda.min - -# Fit Lasso models with the best lambda values -lasso_model_y <- glmnet(as.matrix(W), y, alpha = 1, lambda = best_lambda_y) -lasso_model_d <- glmnet(as.matrix(W), D, alpha = 1, lambda = best_lambda_d) - -# Calculate the residuals -res_y <- y - predict(lasso_model_y, s = best_lambda_y, newx = as.matrix(W)) -res_d <- D - predict(lasso_model_d, s = best_lambda_d, newx = as.matrix(W)) -``` - -```{r} -tmp_df <- as.data.frame(cbind(res_y, res_d)) -colnames(tmp_df) <- c("res_y", "res_d") -``` - -```{r} -fit_cv <- lm(res_y ~ res_d, data = tmp_df) -est_cv <- summary(fit_cv)$coef["res_d", 1] - -hcv_cv_coefs <- vcovHC(fit_cv, type = "HC1") # HC - "heteroskedasticity cosistent" -se_cv <- sqrt(diag(hcv_cv_coefs))[2] # Estimated std errors - -# Calculate the 95% confidence interval for 'gdpsh465' -lower_ci_cv <- est_cv - 1.96 * se_cv -upper_ci_cv <- est_cv + 1.96 * se_cv -``` - -```{r} -# Add LassoCV results to the table -table <- rbind(table, c("Double Lasso CV", est_cv, se_cv, lower_ci_cv, upper_ci_cv)) - -# Print the table -print(table) -``` - -We find that the outcome model chooses too small of a penalty based on cross-validation, leading to overfitting of the outcome and tiny outcome residuals. This leads to artificially small standard errors and a zero treatment effect. Theoretically driven penalty should be preferred for such small sample sizes. 
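The overfitting can also be seen directly by comparing the spread of the cross-validated residuals to the spread of the raw variables. This is a minimal diagnostic sketch; it assumes `res_y`, `res_d`, `y`, and `D` from the chunks above are in scope.

```{r}
# Overfitting diagnostic (sketch; assumes res_y, res_d, y, D from above)
c(
  sd_outcome = sd(y), sd_outcome_residual = sd(as.numeric(res_y)),
  sd_treatment = sd(D), sd_treatment_residual = sd(as.numeric(res_d))
)
```

If the outcome residuals are far smaller than the outcome's own variation while the treatment residuals are not shrunk to a similar degree, the outcome model has essentially memorized the sample, which is what produces the artificially small standard errors noted above.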
- -```{r} -# Create a data frame to store the results -results_y <- data.frame( - Alphas = model_y$lambda, - OutOfSampleR2 = 1 - model_y$cvm / var(y) -) - -results_d <- data.frame( - Alphas = model_d$lambda, - OutOfSampleR2 = 1 - model_d$cvm / var(D) -) - -# Plot Outcome Lasso-CV Model -ggplot(data = results_y, aes(x = Alphas, y = OutOfSampleR2)) + - geom_line() + - labs( - title = "Outcome Lasso-CV Model: Out-of-sample R-squared as function of penalty level", - x = "Penalty Level", - y = "Out-of-sample R-squared" - ) - -# Plot Treatment Lasso-CV Model -ggplot(data = results_d, aes(x = (Alphas), y = OutOfSampleR2)) + - geom_line() + - labs( - title = "Treatment Lasso-CV Model: Out-of-sample R-squared as function of penalty level", - x = "Penalty Level", - y = "Out-of-sample R-squared" - ) -``` - diff --git a/PM2/r_experiment_non_orthogonal.Rmd b/PM2/r_experiment_non_orthogonal.Rmd deleted file mode 100644 index 42778dec..00000000 --- a/PM2/r_experiment_non_orthogonal.Rmd +++ /dev/null @@ -1,516 +0,0 @@ ---- -title: An R Markdown document converted from "PM2/r_experiment_non_orthogonal.irnb" -output: html_document ---- - -# Simulation Design - -```{r} -install.packages("hdm") -``` - -```{r} -library(hdm) -``` - -## Generating RCT data - -```{r} -gen_data <- function(n, d, p, delta, base) { - X <- matrix(rnorm(n * d), nrow = n, ncol = d) - D <- rbinom(n, 1, p) - y0 <- base - X[, 1] + rnorm(n, mean = 0, sd = 0.1) - y1 <- delta + base - X[, 1] + rnorm(n, mean = 0, sd = 0.1) - y <- y1 * D + y0 * (1 - D) - return(list(y = y, D = D, X = X)) -} -``` - -```{r} -n <- 100 # n samples -d <- 100 # n features -delta <- 1.0 # treatment effect -base <- 0.3 # baseline outcome -``` - -## Two Means Estimator - -```{r} -# Simple two means estimate and calculation of variance -twomeans <- function(y, D) { - hat0 <- mean(y[D == 0]) # mean of outcome of un-treated - hat1 <- mean(y[D == 1]) # mean of outcome of treated - V0 <- var(y[D == 0]) / mean(1 - D) # asymptotic variance of the mean of outcome of untreated - V1 <- var(y[D == 1]) / mean(D) # asymptotic variance of the mean of outcome of treated - hat <- hat1 - hat0 # estimate of the treatment effect - # standard error of the estimate of the treatment effect - stderr <- sqrt((V0 + V1) / length(y)) - return(list(hat = hat, stderr = stderr)) -} -``` - -```{r} -# Set the random seed for reproducibility -set.seed(125) - -# Generate RCT data -data <- gen_data(n, d, 0.2, delta, base) -y <- data$y -D <- data$D -X <- data$X - -# Calculate estimation quantities -twomeans(y, D) -``` - -## Partialling-Out Estimator - -```{r} -# We implement the partialling out version of OLS (for pedagogical purposes) -partialling_out <- function(y, D, W) { - # Residualize outcome with OLS - yfit <- lm(y ~ W) - yhat <- predict(yfit, as.data.frame(W)) - yres <- y - as.numeric(yhat) - - # Residualize treatment with OLS - Dfit <- lm(D ~ W) - Dhat <- predict(Dfit, as.data.frame(W)) - Dres <- D - as.numeric(Dhat) - - # Calculate final residual ~ residual OLS estimate - hat <- mean(yres * Dres) / mean(Dres^2) - - # Calculate residual of final regression (epsilon in the BLP decomposition) - epsilon <- yres - hat * Dres - - # Calculate variance of the treatment effect - V <- mean(epsilon^2 * Dres^2) / mean(Dres^2)^2 - stderr <- sqrt(V / length(y)) - - return(list(hat = hat, stderr = stderr)) -} -``` - -```{r} -partialling_out(y, D, cbind(D * X, X)) -``` - -## Double Lasso Partialling-Out Estimator - -```{r} -# Now we simply replace OLS with Lasso to implement the Double Lasso process - 
-double_lasso <- function(y, D, W) { - - # residualize outcome with Lasso - yfit_rlasso <- hdm::rlasso(W, y, post = FALSE) - yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W)) - yres <- y - as.numeric(yhat_rlasso) - - - # residualize treatment with Lasso - dfit_rlasso <- hdm::rlasso(W, D, post = FALSE) - dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W)) - Dres <- D - as.numeric(dhat_rlasso) - - # rest is the same as in the OLS case - hat <- mean(yres * Dres) / mean(Dres^2) - epsilon <- yres - hat * Dres - V <- mean(epsilon^2 * Dres^2) / mean(Dres^2)^2 - stderr <- sqrt(V / length(y)) - - return(list(hat = hat, stderr = stderr)) -} -``` - -```{r} -double_lasso(y, D, cbind(D * X, X)) -``` - -# Simulation - -### Two-Means - -```{r} -# We now check the distributional properties of the different estimators across experiments -# First is the simple two means estimate - -n_experiments <- 100 -# we will keep track of coverage (truth is in CI) and of the point estimate and stderr -cov <- numeric(n_experiments) -hats <- numeric(n_experiments) -stderrs <- numeric(n_experiments) - -for (i in 1:n_experiments) { - # Generate data for each experiment - data <- gen_data(n, d, 0.2, delta, base) - y <- data$y - D <- data$D - X <- data$X - - # Calculate two-means estimate - results <- twomeans(y, D) - hat <- results$hat - stderr <- results$stderr - - # Calculate the 95% confidence interval - ci_lower <- hat - 1.96 * stderr - ci_upper <- hat + 1.96 * stderr - - # Check if the true parameter delta is within the confidence interval - cov[i] <- (ci_lower <= delta) & (delta <= ci_upper) - - hats[i] <- hat - stderrs[i] <- stderr -} -``` - -```{r} -# Calculate average coverage (should be .95 ideally) -coverage_rate <- mean(cov) - -cat("Coverage Rate (95% CI):", coverage_rate, "\n") -``` - -```{r} -hist(hats, main = "Distribution of Estimates", xlab = "Estimate", col = "skyblue") -``` - -```{r} -mean(hats) # mean of estimate; measures how biased the estimate is (should be =delta ideally) -``` - -```{r} -sd(hats)# standard deviation of estimates; should be close to the standard errors we calculated for the CIs -``` - -```{r} -mean(stderrs) -``` - -### Partialling Out - -```{r} -# Let's repeat this for the partialling out process (OLS), controlling for X - -n_experiments <- 100 -cov <- numeric(n_experiments) -hats <- numeric(n_experiments) -stderrs <- numeric(n_experiments) - -for (i in 1:n_experiments) { - # Generate data for each experiment - data <- gen_data(n, d, 0.2, delta, base) - y <- data$y - D <- data$D - X <- data$X - - # Calculate partialling out estimate with OLS - results <- partialling_out(y, D, X) - hat <- results$hat - stderr <- results$stderr - - # Calculate the 95% confidence interval - ci_lower <- hat - 1.96 * stderr - ci_upper <- hat + 1.96 * stderr - - # Check if the true parameter delta is within the confidence interval - cov[i] <- (ci_lower <= delta) & (delta <= ci_upper) - - hats[i] <- hat - stderrs[i] <- stderr -} -``` - -```{r} -mean(cov) -``` - -```{r} -hist(hats, main = "Distribution of Estimates", xlab = "Estimate", col = "skyblue") -``` - -```{r} -mean(hats) # ols is heavily biased... mean of estimates very far from delta=1 -``` - -```{r} -sd(hats) -``` - -```{r} -mean(stderrs) # standard error severely under estimates the variance of the estimate; all this is due to overfitting -``` - -### Double Lasso - -```{r} -# Now let's try the double Lasso. 
-n_experiments <- 100 -cov <- numeric(n_experiments) -hats <- numeric(n_experiments) -stderrs <- numeric(n_experiments) - -for (i in 1:n_experiments) { - # Generate data for each experiment - data <- gen_data(n, d, 0.2, delta, base) - y <- data$y - D <- data$D - X <- data$X - - # Calculate partialling out estimate with OLS - results <- double_lasso(y, D, X) - hat <- results$hat - stderr <- results$stderr - - # Calculate the 95% confidence interval - ci_lower <- hat - 1.96 * stderr - ci_upper <- hat + 1.96 * stderr - - # Check if the true parameter delta is within the confidence interval - cov[i] <- (ci_lower <= delta) & (delta <= ci_upper) - - hats[i] <- hat - stderrs[i] <- stderr -} -``` - -```{r} -mean(cov) -``` - -```{r} -hist(hats, main = "Distribution of Estimates", xlab = "Estimate", col = "skyblue") -``` - -```{r} -mean(hats) # much closer to 1... (almost the same as two-means) -sd(hats) # standard deviation much smaller than two means, which did not adjust for X -mean(stderrs) # and close to the calculate standard errors; we correctly estimated uncertainty -``` - -### Single Lasso - -```{r} -# Now we simply replace OLS with Lasso to implement the Double Lasso process - -double_lasso <- function(y, D, W) { - - # residualize outcome with Lasso - yfit_rlasso <- hdm::rlasso(W, y, post = FALSE) - yhat_rlasso <- predict(yfit_rlasso, as.data.frame(W)) - yres <- y - as.numeric(yhat_rlasso) - - - # residualize treatment with Lasso - dfit_rlasso <- hdm::rlasso(W, D, post = FALSE) - dhat_rlasso <- predict(dfit_rlasso, as.data.frame(W)) - Dres <- D - as.numeric(dhat_rlasso) - - # rest is the same as in the OLS case - hat <- mean(yres * Dres) / mean(Dres^2) - epsilon <- yres - hat * Dres - V <- mean(epsilon^2 * Dres^2) / mean(Dres^2)^2 - stderr <- sqrt(V / length(y)) - - return(list(hat = hat, stderr = stderr)) -} -``` - -```{r} -# Now let's try the double Lasso. - -n_experiments <- 100 -hats <- numeric(n_experiments) - -for (i in 1:n_experiments) { - # Generate data for each experiment - data <- gen_data(n, d, 0.2, delta, base) - y <- data$y - D <- data$D - X <- data$X - - # Calculate single lasso estimate - - - yfit_rlasso <- hdm::rlasso(cbind(D, X), y, post = FALSE) - hat <- yfit_rlasso$coefficients[2] - - hats[i] <- hat -} -``` - -```{r} -hist(hats, main = "Distribution of Estimates", xlab = "Estimate", col = "skyblue") -``` - -```{r} -# bias is comparable and larger than standard deviation. -# Even if we could estimate the standard deviation, confidence intervals would undercover -1 - mean(hats) -sd(hats) -``` - -### Post-Lasso OLS - -```{r} -# Now let's try the post-Lasso. 
-n_experiments <- 100 -cov <- numeric(n_experiments) -hats <- numeric(n_experiments) -stderrs <- numeric(n_experiments) - -for (i in 1:n_experiments) { - # Generate data for each experiment - data <- gen_data(n, d, 0.2, delta, base) - y <- data$y - D <- data$D - X <- data$X - - - # run a big lasso y ~ D, X - DX <- cbind(D, X) - yfit_rlasso <- hdm::rlasso(DX, y, post = FALSE) # could just use this functionality - coefs <- yfit_rlasso$coefficients[2:n] - selected_columns <- X[, abs(coefs) > 0.0] - # run OLS on y ~ D, X[chosen by lasso] - # calculate standard error as if lasso step never happened - results <- partialling_out(y, D - mean(D), selected_columns) - hat <- results$hat - stderr <- results$stderr - - # Calculate the 95% confidence interval - ci_lower <- hat - 1.96 * stderr - ci_upper <- hat + 1.96 * stderr - # Check if the true parameter delta is within the confidence interval - cov[i] <- (ci_lower <= delta) & (delta <= ci_upper) - hats[i] <- hat - stderrs[i] <- stderr -} -``` - -```{r} -mean(cov) -``` - -```{r} -hist(hats, main = "Distribution of Estimates", xlab = "Estimate", col = "skyblue") -``` - -```{r} -1 - mean(hats) # quite un-biased; bias < standard deviation -sd(hats) -``` - -```{r} -# we under-estimated a bit the uncertainty; smaller estimated stderr than true std. -# this is most prob a finite sample error, from ignoring the lasso variable selection step -# this is an RCT and so even post lasso ols is Neyman orthogonal. We should expect good behavior. -mean(stderrs) -``` - -### Not RCT Data - -```{r} -gen_data_non_rct <- function(n, d, p, delta, base) { - X <- matrix(rnorm(n * d), nrow = n, ncol = d) - D <- X[, 1] + rnorm(n, mean = 0, sd = 1 / 4) - y <- delta * D + base - X[, 1] + rnorm(n, mean = 0, sd = 1) - return(list(y = y, D = D, X = X)) -} -``` - -```{r} -# post-lasso -n_experiments <- 100 -cov <- numeric(n_experiments) -hats <- numeric(n_experiments) -stderrs <- numeric(n_experiments) - -for (i in 1:n_experiments) { - # Generate data for each experiment - data <- gen_data_non_rct(n, d, p, delta, base) - y <- data$y - D <- data$D - X <- data$X - - - # run a big lasso y ~ D, X - DX <- cbind(D, X) - yfit_rlasso <- hdm::rlasso(DX, y, post = FALSE) # could just use this functionality - coefs <- yfit_rlasso$coefficients[2:n] - selected_columns <- X[, abs(coefs) > 0.0] - # run OLS on y ~ D, X[chosen by lasso] - # calculate standard error as if lasso step never happened - results <- partialling_out(y, D - mean(D), selected_columns) - hat <- results$hat - stderr <- results$stderr - - # Calculate the 95% confidence interval - ci_lower <- hat - 1.96 * stderr - ci_upper <- hat + 1.96 * stderr - # Check if the true parameter delta is within the confidence interval - cov[i] <- (ci_lower <= delta) & (delta <= ci_upper) - hats[i] <- hat - stderrs[i] <- stderr -} -``` - -```{r} -mean(cov) # Oops! Post Lasso OLS severely undercovers; It is not Neyman orthogonal when D is correlated with X -``` - -```{r} -hist(hats, main = "Distribution of Estimates", xlab = "Estimate", col = "skyblue") -``` - -```{r} -mean(hats) # very heavily biased -``` - -```{r} -# But now let's try the double Lasso. 
-n_experiments <- 100 -cov <- numeric(n_experiments) -hats <- numeric(n_experiments) -stderrs <- numeric(n_experiments) - -for (i in 1:n_experiments) { - # Generate data for each experiment - data <- gen_data_non_rct(n, d, p, delta, base) - y <- data$y - D <- data$D - X <- data$X - - # Calculate partialling out estimate with OLS - results <- double_lasso(y, D, X) - hat <- results$hat - stderr <- results$stderr - - # Calculate the 95% confidence interval - ci_lower <- hat - 1.96 * stderr - ci_upper <- hat + 1.96 * stderr - - # Check if the true parameter delta is within the confidence interval - cov[i] <- (ci_lower <= delta) & (delta <= ci_upper) - - hats[i] <- hat - stderrs[i] <- stderr -} -``` - -```{r} -mean(cov) # great coverage -``` - -```{r} -hist(hats, main = "Distribution of Estimates", xlab = "Estimate", col = "skyblue") -``` - -```{r} -1 - mean(hats) -sd(hats) # very small bias compared to standard deviation -mean(stderrs) -``` - diff --git a/PM2/r_heterogenous_wage_effects.Rmd b/PM2/r_heterogenous_wage_effects.Rmd deleted file mode 100644 index d2709e27..00000000 --- a/PM2/r_heterogenous_wage_effects.Rmd +++ /dev/null @@ -1,102 +0,0 @@ ---- -title: An R Markdown document converted from "PM2/r_heterogenous_wage_effects.irnb" -output: html_document ---- - -# Application: Heterogeneous Effect of Sex on Wage Using Double Lasso - - We use US census data from the year 2015 to analyse the effect of gender and interaction effects of other variables with gender on wage jointly. The dependent variable is the logarithm of the wage, the target variable is *female* (in combination with other variables). All other variables denote some other socio-economic characteristics, e.g. marital status, education, and experience. - - - -This analysis allows a closer look how the gender wage gap is related to other socio-economic variables. 
- - -```{r} -install.packages("hdm") -install.packages("xtable") -``` - -```{r} -library(hdm) -library(xtable) -``` - -```{r} -file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv" -data <- read.csv(file) -str(data) -data -``` - -```{r} -y <- data$lwage -Z <- subset(data, select = -c(lwage, wage)) -``` - -```{r} -center_colmeans <- function(x) { - xcenter <- colMeans(x) - x - rep(xcenter, rep.int(nrow(x), ncol(x))) -} -``` - -```{r} -# create the model matrix for the covariates -controls_formula <- "~ 0 + (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we + exp1 + exp2 + exp3 + exp4)**2" -Zcontrols <- model.matrix(as.formula(controls_formula), data = Z) -Zcontrols <- center_colmeans(Zcontrols) -``` - -Construct all the variables that we will use to model heterogeneity of effect in a linear manner - -```{r} -# create the model matrix for the linear heterogeneity -linear_het_formula <- "~ -1 + (shs + hsg + scl + clg + mw + so + we + exp1 + exp2 + exp3 + exp4)" -Zhet <- model.matrix(as.formula(linear_het_formula), data = Z) -Zhet <- center_colmeans(Zhet) -``` - -Construct all interaction variables between sex and heterogeneity variables - -```{r} -# create the model matrix for the higher order heterogeneity -Zhet <- as.data.frame(cbind(Zhet, "sex" = Z$sex)) -nonlin_het_formula <- "~ -1 + sex + sex * (shs + hsg + scl + clg + mw + so + we + exp1 + exp2 + exp3 + exp4)" -Zinteractions <- model.matrix(as.formula(nonlin_het_formula), data = Zhet) -interaction_cols <- Zinteractions[, grepl("sex", colnames(Zinteractions))] -``` - -Put variables all together - -```{r} -X <- cbind(Zinteractions, Zcontrols) -``` - -Get estimates and CIs - -```{r} -# this cell takes 30 minutes to run -index_gender <- grep("sex", colnames(Zinteractions)) -effects_female <- hdm::rlassoEffects(x = X, y = y, index = index_gender, post = FALSE) -result <- summary(effects_female) -result$coef -print(xtable(result$coef[, c(1, 2, 4)], type = "latex"), digits = 3) -``` - -Now, we estimate and plot confidence intervals, first "pointwise" and then the joint confidence intervals. - -```{r} -pointwise_ci <- confint(effects_female, level = 0.95) -pointwise_ci -print(xtable(pointwise_ci), type = "latex") -``` - -Finally, we compare the pointwise confidence intervals to joint confidence intervals. 
- -```{r} -joint_ci <- confint(effects_female, level = 0.95, joint = TRUE) -joint_ci -print(xtable(joint_ci), type = "latex") -``` - diff --git a/PM2/r_linear_penalized_regs.Rmd b/PM2/r_linear_penalized_regs.Rmd deleted file mode 100644 index a2f52db0..00000000 --- a/PM2/r_linear_penalized_regs.Rmd +++ /dev/null @@ -1,707 +0,0 @@ ---- -title: An R Markdown document converted from "PM2/r_linear_penalized_regs.irnb" -output: html_document ---- - -# Penalized Linear Regressions: A Simulation Experiment - -```{r} -install.packages("xtable") -install.packages("hdm") -install.packages("glmnet") -install.packages("ggplot2") -install.packages("tidyr") -``` - -```{r} -library(hdm) -library(xtable) -library(glmnet) -library(ggplot2) -``` - -## Data Generating Process - -We define a simple data generating process that allows for sparse, dense, and sparse+dense coefficients - -```{r} -gen_data <- function(n, p, regime = "sparse") { - # constants chosen to get R^2 of approximately .80 - if (regime == "sparse") { - beta <- (1 / seq(1:p)^2) * 7 - } else if (regime == "dense") { - beta <- rnorm(p) * 0.5 - } else if (regime == "sparsedense") { - beta_1 <- (1 / seq(1:p)^2) * 6.5 - beta_2 <- rnorm(p, 0, 0.5) * 0.7 - beta <- beta_1 + beta_2 - } - - true_fn <- function(x) { - x[, seq_len(dim(x)[2])] %*% beta - } - - X <- matrix(runif(n * p, min = -0.5, max = 0.5), n, p) - gX <- true_fn(X) - y <- gX + rnorm(n) - - Xtest <- matrix(runif(n * p, min = -0.5, max = 0.5), n, p) - gXtest <- true_fn(Xtest) - ytest <- gXtest + rnorm(n) - - Xpop <- matrix(runif(100000 * p, min = -0.5, max = 0.5), 100000, p) - gXpop <- true_fn(Xpop) - ypop <- gXpop + rnorm(100000) - - return(list( - X = X, y = y, gX = gX, Xtest = Xtest, ytest = ytest, gXtest = gXtest, - Xpop = Xpop, ypop = ypop, gXpop = gXpop, beta = beta - )) -} -``` - -## Data Generating Process: Approximately Sparse - -```{r} -set.seed(1) -n <- 100 -p <- 400 -res <- gen_data(n, p, regime = "sparse") -``` - -```{r} -X <- res$X -y <- res$y -gX <- res$gX -Xtest <- res$Xtest -ytest <- res$ytest -gXtest <- res$gXtest -Xpop <- res$Xpop -ypop <- res$ypop -gXpop <- res$gXpop -betas <- res$beta -``` - -```{r} -plot(gX, y, xlab = "g(X)", ylab = "y") # plot V vs g(X) -print(c("theoretical R2:", var(gX) / var(y))) # theoretical R-square in the simulation example -``` - -```{r} -# Plot betas -plot(seq_along(betas), abs(betas), - log = "y", pch = 20, col = "blue", - xlab = expression(beta), ylab = "Magnitude (log scale)", - main = expression(paste("Beta Magnitude")) -) -``` - -## Lasso, Ridge, ElasticNet - -We use glmnet's penalized estimators, which choose the penalty parameter via cross-validation (by default 10-fold cross-validation). These methods search over an adaptively chosen grid of hyperparameters. The parameter `alpha` controls what penalty (or allows for a convex combination of `l1` and `l2` penalty). Set `alpha=0.5` for elastic net. - -Features will be standardized (by glmnet) so that penalization does not favor different features asymmetrically. 
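For reference, the criterion that `glmnet` minimizes, and that the `alpha` argument interpolates, is the elastic net objective (the same form is written out again in the ridge discussion below):

$$\min_{\beta} \frac{1}{N} \| X\beta - y\|_2^2 + \lambda \left( \frac{1}{2} (1-\alpha) \|\beta\|_2^2 + \alpha \|\beta\|_1 \right),$$

so `alpha = 1` is the lasso, `alpha = 0` is ridge, and `alpha = 0.5` weights the two penalties equally.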
- -```{r} -r2_score <- function(preds, actual, ytrain = y) { - rss <- sum((preds - actual)^2) # residual sum of squares - # total sum of squares, we take mean(ytrain) as mean(actual) is an out-of-sample object - tss <- sum((actual - mean(ytrain))^2) - rsq <- 1 - rss / tss - return(rsq) -} -``` - -```{r} -# family gaussian means that we'll be using square loss -fit_lasso_cv <- cv.glmnet(X, y, family = "gaussian", alpha = 1, nfolds = 5) -# family gaussian means that we'll be using square loss -fit_ridge <- cv.glmnet(X, y, family = "gaussian", alpha = 0, nfolds = 5) -# family gaussian means that we'll be using square loss -fit_elnet <- cv.glmnet(X, y, family = "gaussian", alpha = .5, nfolds = 5) -``` - -We calculate the R-squared on the small test set that we have - -```{r} -cat( - "lassocv R2 (Test): ", r2_score(predict(fit_lasso_cv, newx = Xtest, s = "lambda.min"), ytest), - "\nridge R2 (Test): ", r2_score(predict(fit_ridge, newx = Xtest, s = "lambda.min"), ytest), - "\nelnet R2 (Test): ", r2_score(predict(fit_elnet, newx = Xtest, s = "lambda.min"), ytest) -) -``` - -We also calculate what the R-squared would be in the population limit (in our case for practical purposes when we have a very very large test sample) - -```{r} -r2_lasso_cv <- r2_score(predict(fit_lasso_cv, newx = Xpop, s = "lambda.min"), ypop) -r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = "lambda.min"), ypop) -r2_elnet <- r2_score(predict(fit_elnet, newx = Xpop, s = "lambda.min"), ypop) - -cat( - "lassocv R2 (Pop): ", r2_lasso_cv, - "\nridge R2 (Pop): ", r2_ridge, - "\nelnet R2 (Pop): ", r2_elnet -) -``` - -#### glmnet failure in Ridge - -**Note**: Ridge regression performs worse relatively to the Ridge in the correponding [Python notebook](https://colab.research.google.com/github/CausalAIBook/MetricsMLNotebooks/blob/main/PM2/python_linear_penalized_regs.ipynb). Even if one were to control for the randomness in the data and use the same data for both, R's glmnet fails. - -To understand why, look at the cross-validated MSE curve with different $\lambda$ (). - -```{r} -plot(fit_ridge) -``` - -From the [glmnet documentation](https://glmnet.stanford.edu/articles/glmnet.html): - - - -> This plots the cross-validation curve (red dotted line) along with upper and lower standard deviation curves along the $\lambda$ sequence (error bars). Two special values along the $\lambda$ sequence are indicated by the vertical dotted lines. ```lambda.min``` is the value of $\lambda$ that gives minimum mean cross-validated error, while ```lambda.1se``` is the value of $\lambda$ that gives the most regularized model such that the cross-validated error is within one standard error of the minimum. - -Notice that the chosen ```lambda.min``` is at the boundary of the sequence. An easy way to check this instead of plotting is to extract the $\lambda$ sequence and the minimum chosen $\lambda_{min}$ from the fitted object. - -```{r} -cat("lambda sequence: ", fit_ridge$lambda) -cat("\nChosen minimum lambda: ", fit_ridge$lambda.min) -``` - -In general, it is good practice to examine the lambda sequence that R produces and searches over in cross-validation. When the penalty is chosen at the boundary like we see here, this indicates the generated penalty sequence is likely misspecified. Thus, we choose to supply our own sequence. In particular, we choose values that match up with those in Python's ```sklearn``` Ridge implementation. 
- - -```glmnet``` minimizes the elastic net loss function as follows: -$$\min_{\beta} \frac{1}{N} \| X\beta - y\|_2^2 + \lambda_{R} \left( \frac{1}{2} (1-\alpha) \|\beta\|_2^2 + \alpha \|\beta\|_1 \right) $$ - -For ridge, $\alpha=0$, so $$\min_{\beta} \frac{1}{N} \| X\beta - y\|_2^2 + \frac{\lambda_{R}}{2} \|\beta\|_2^2 $$ - -Meanwhile, ```sklearn``` minimizes $$\min_{\beta} \frac{1}{N} \|X\beta-y\|_2^2 + \frac{\lambda_{python}}{N} \|\beta\|_2^2$$ where $\lambda_{python}$ is chosen from the grid $(0.1,1,10)$. - -To translate this into R, we must set in glmnet $$\lambda_{R} :=\frac{2}{N} \lambda_{python}$$ - -```{r} -# sklearn lambdas (penalty) -lambdas_sklearn <- c(0.1, 1, 10) # defaults -l_seq <- 2 / nrow(X) * lambdas_sklearn -l_seq # note how different these are to the actual lambdas generated by glmnet -``` - -```{r} -fit_ridge <- cv.glmnet(X, y, family = "gaussian", alpha = 0, nfolds = 5, lambda = l_seq) -r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = "lambda.min"), ypop) -``` - -```{r} -cat( - "lassocv R2 (Pop): ", r2_lasso_cv, - "\nridge R2 (Pop): ", r2_ridge, - "\nelnet R2 (Pop): ", r2_elnet -) -``` - -## Plug-in Hyperparameter Lasso and Post-Lasso OLS - -Here we compute the lasso and ols post lasso using plug-in choices for penalty levels. - -\We use "plug-in" tuning with a theoretically valid choice of penalty $\lambda = 2 \cdot c \hat{\sigma} \sqrt{n} \Phi^{-1}(1-\alpha/2p)$, where $c>1$ and $1-\alpha$ is a confidence level, and $\Phi^{-1}$ denotes the quantile function. Under homoskedasticity, this choice ensures that the Lasso predictor is well behaved, delivering good predictive performance under approximate sparsity. In practice, this formula will work well even in the absence of homoskedasticity, especially when the random variables $\epsilon$ and $X$ in the regression equation decay quickly at the tails. - -In practice, many people choose to use cross-validation, which is perfectly fine for predictive tasks. However, when conducting inference, to make our analysis valid we will require cross-fitting in addition to cross-validation. As we have not yet discussed cross-fitting, we rely on this theoretically-driven penalty in order to allow for accurate inference in the upcoming notebooks. - -We pull an anaue of R's rlasso. Rlasso functionality: it is searching the right set of regressors. This function was made for the case of ***p*** regressors and ***n*** observations where ***p >>>> n***. It assumes that the error is i.i.d. The errors may be non-Gaussian or heteroscedastic.\ -The post lasso function makes OLS with the selected ***T*** regressors. 
-To select those parameters, they use $\lambda$ as variable to penalize\ -**Funny thing: the function rlasso was named like that because it is the "rigorous" Lasso.** - -```{r} -fit_rlasso <- hdm::rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level -fit_rlasso_post <- hdm::rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level -``` - -```{r} -r2_lasso <- r2_score(predict(fit_rlasso, newdata = Xtest), ytest) -r2_lasso_post <- r2_score(predict(fit_rlasso_post, newdata = Xtest), ytest) - -cat( - "rlasso R2 (Test): ", r2_lasso, - "\nrlasso-post R2 (Test): ", r2_lasso_post -) -``` - -```{r} -r2_lasso <- r2_score(predict(fit_rlasso, newdata = (Xpop)), (ypop)) -r2_lasso_post <- r2_score(predict(fit_rlasso_post, newdata = (Xpop)), (ypop)) - -cat( - "rlasso R2 (Pop): ", r2_lasso, - "\nrlasso-post R2 (Pop): ", r2_lasso_post -) -``` - -## LAVA: Dense + Sparse Coefficients - -Next we code up lava, which alternates the fitting of lasso and ridge - -```{r} -# Define function to compute lava estimator. Doing an iterative scheme with fixed -# number of iteration. Could iterate until a convergence criterion is met. -lava_predict <- function(X, Y, newX, lambda1, lambda2, iter = 5) { - - # Need to demean internally - dy <- Y - mean(Y) - dx <- scale(X, scale = FALSE) - - sp1 <- glmnet::glmnet(dx, dy, lambda = lambda1) # lasso step fits "sparse part" - de1 <- glmnet::glmnet(dx, dy - predict(sp1, newx = dx), alpha = 0, lambda = lambda2) - - i <- 1 - while (i <= iter) { - sp1 <- glmnet::glmnet(dx, dy - predict(de1, newx = dx, s = "lambda.min"), lambda = lambda1) - de1 <- glmnet::glmnet(dx, dy - predict(sp1, newx = dx, s = "lambda.min"), alpha = 0, lambda = lambda2) - i <- i + 1 - } - - bhat <- sp1$beta + de1$beta - a0 <- mean(Y) - sum(colMeans(X) * bhat) - - # Need to add intercept to output - - yhat <- newX %*% bhat + a0 - - return(yhat) -} -``` - -```{r} -# define function to get predictions and r2 scores for lava estimator - -lava_yhat_r2 <- function(xtr_mod, xte_mod, ytr, yte, num_folds = 5) { - # 5-fold CV. glmnet does cross-validation internally and - # relatively efficiently. We're going to write out all the steps to make sure - # we're using the same CV folds across all procedures in a transparent way and - # to keep the overall structure clear as well. - - # Setup for brute force K-Fold CV - n <- length(ytr) - Kf <- num_folds # Number of folds - sampleframe <- rep(1:Kf, ceiling(n / Kf)) - cvgroup <- sample(sampleframe, size = n, replace = FALSE) # CV groups - - - ## ------------------------------------------------------------ - # We're going to take a shortcut and use the range of lambda values that come out - # of the default implementation in glmnet for everything. Could do better here - maybe - - ## Fit ridge on grid of lambda values (chosen by default using glmnet) using basic model. - ridge_mod <- glmnet::glmnet(xtr_mod, ytr, alpha = 0) # alpha = 0 gives ridge - ridge_lambda <- ridge_mod$lambda # values of penalty parameter - - ## Fit lasso on grid of lambda values (chosen by default using glmnet) using basic model. - lasso_mod <- glmnet::glmnet(xtr_mod, ytr) # default is lasso (equivalent to alpha = 1) - lasso_lambda <- lasso_mod$lambda # values of penalty parameter - - ## ------------------------------------------------------------ - - - # Lava - Using a double loop over candidate penalty parameter values. 
- - lambda1_lava_mod <- lasso_mod$lambda[seq(5, length(lasso_lambda), 10)] - lambda2_lava_mod <- ridge_mod$lambda[seq(5, length(ridge_lambda), 10)] - - cv_mod_lava <- matrix(0, length(lambda1_lava_mod), length(lambda2_lava_mod)) - - for (k in 1:Kf) { - indk <- cvgroup == k - - k_xtr_mod <- xtr_mod[!indk, ] - k_ytr <- ytr[!indk] - k_xte_mod <- xtr_mod[indk, ] - k_yte <- ytr[indk] - - for (ii in seq_along(lambda1_lava_mod)) { - for (jj in seq_along(lambda2_lava_mod)) { - cv_mod_lava[ii, jj] <- cv_mod_lava[ii, jj] + - sum((k_yte - lava_predict(k_xtr_mod, k_ytr, - newX = k_xte_mod, - lambda1 = lambda1_lava_mod[ii], - lambda2 = lambda2_lava_mod[jj]))^2) - } - } - } - - # Get CV min values of tuning parameters - cvmin_lava_mod <- which(cv_mod_lava == min(cv_mod_lava), arr.ind = TRUE) - cvlambda1_lava_mod <- lambda1_lava_mod[cvmin_lava_mod[1]] - cvlambda2_lava_mod <- lambda2_lava_mod[cvmin_lava_mod[2]] - - cat("Min Lava Lasso CV Penalty: ", cvlambda1_lava_mod) - cat("\nMin Lava Ridge CV Penalty: ", cvlambda2_lava_mod) - - - #### Look at performance on test sample - - # Calculate R^2 in training data and in validation data as measures - # Refit on entire training sample - - - #### CV-min model - - # In sample fit - cvmin_yhat_lava_tr <- lava_predict(xtr_mod, ytr, - newX = xtr_mod, - lambda1 = cvlambda1_lava_mod, - lambda2 = cvlambda2_lava_mod - ) - r2_lava_mod <- 1 - sum((ytr - cvmin_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2) - - # Out of sample fit - cvmin_yhat_lava_test <- lava_predict(xtr_mod, ytr, - newX = xte_mod, - lambda1 = cvlambda1_lava_mod, - lambda2 = cvlambda2_lava_mod - ) - r2v_lava_mod <- 1 - sum((yte - cvmin_yhat_lava_test)^2) / sum((yte - mean(ytr))^2) - - - cat("\nIn sample R2 (CV-min): ", r2_lava_mod) - cat("\nOut of Sample R2 (CV-min): ", r2v_lava_mod) - - - #### Use average model across cv-folds and refit model using all training data - ###### we won't report these results. - ###### Averaging is theoretically more solid, but cv-min is more practical. 
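-  ###### For each fold k below: refit lava on the other folds at the CV-min penalties,
-  ###### predict on the full training and test matrices, then average the K prediction vectors.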
- n_tr <- length(ytr) - n_te <- length(yte) - yhat_tr_lava_mod <- matrix(0, n_tr, Kf) - yhat_te_lava_mod <- matrix(0, n_te, Kf) - - - for (k in 1:Kf) { - indk <- cvgroup == k - - k_xtr_mod <- xtr_mod[!indk, ] - k_ytr <- ytr[!indk] - - # Lava - yhat_tr_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr, - newX = xtr_mod, - lambda1 = cvlambda1_lava_mod, - lambda2 = cvlambda2_lava_mod - )) - yhat_te_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr, - newX = xte_mod, - lambda1 = cvlambda1_lava_mod, - lambda2 = cvlambda2_lava_mod - )) - } - - avg_yhat_lava_tr <- rowMeans(yhat_tr_lava_mod) - avg_yhat_lava_test <- rowMeans(yhat_te_lava_mod) - - r2_cv_ave_lava_mod <- 1 - sum((ytr - avg_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2) - r2v_cv_ave_lava_mod <- 1 - sum((yte - avg_yhat_lava_test)^2) / sum((yte - mean(ytr))^2) - - cat("\nIn sample R2 (Average Across Folds): ", r2_cv_ave_lava_mod) - cat("\nOut of Sample R2 (Average Across Folds): ", r2v_cv_ave_lava_mod) - - return(c( - cvlambda1_lava_mod, - cvlambda2_lava_mod, - cvmin_yhat_lava_tr, # CV_min - cvmin_yhat_lava_test, # CV_min - r2_lava_mod, # CV_min - r2v_lava_mod, # CV_min - avg_yhat_lava_tr, # Average across Folds - avg_yhat_lava_test, # Average across Folds - r2_cv_ave_lava_mod, # Average across Folds - r2v_cv_ave_lava_mod # Average across Folds - )) -} -``` - -```{r} -# Results for Test -cat("Test Results ...\n") -r2_lava_traintest <- lava_yhat_r2(X, Xtest, y, ytest) -``` - -```{r} -# Results for Pop -## note we don't have to re-train the entire model -## this is just due to the way the function is defined above -cat("Population Results ...\n") -r2_lava_pop <- lava_yhat_r2(X, Xpop, y, ypop) -``` - -```{r} -# report R2 using CV min -cat("LAVA R2 (Test): ", r2_lava_traintest[[6]]) -cat("\nLAVA R2 (Pop) ", r2_lava_pop[[6]]) -``` - -## Summarizing Results - -```{r} -table <- matrix(0, 6, 1) -table[1, 1] <- r2_lasso_cv -table[2, 1] <- r2_ridge -table[3, 1] <- r2_elnet -table[4, 1] <- r2_lasso -table[5, 1] <- r2_lasso_post -table[6, 1] <- r2_lava_pop[[6]] - -colnames(table) <- c("R2 (Population)") -rownames(table) <- c( - "Cross-Validated Lasso", "Cross-Validated ridge", "Cross-Validated elnet", - "Lasso", "Post-Lasso", "Lava" -) -tab <- xtable(table, digits = 3) -print(tab, type = "latex") # set type="latex" for printing table in LaTeX -tab -``` - -```{r} -# Creating a data frame with the predicted values for test -data <- data.frame( - gXtest = gXtest, - Ridge = predict(fit_ridge, newx = Xtest, s = "lambda.min"), - ENet = predict(fit_elnet, newx = Xtest, s = "lambda.min"), - RLasso = predict(fit_rlasso, newdata = Xtest), - RLassoPost = predict(fit_rlasso_post, newdata = Xtest), - LassoCV = predict(fit_lasso_cv, newx = Xtest, s = "lambda.min"), - Lava = as.vector(r2_lava_traintest[[4]]) -) -colnames(data) <- c("gXtest", "Ridge", "ENet", "RLasso", "RlassoPost", "LassoCV", "Lava") - -# Reshaping data into longer format for ggplot -data_long <- tidyr::gather(data, Model, Predicted, -gXtest) - -# Plotting -ggplot(data_long, aes(x = gXtest, y = Predicted, color = Model)) + - geom_point(aes(shape = Model)) + - geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") + # gX by gX - scale_color_manual(values = c("brown", "yellow", "red", "green", "blue", "magenta"), - guide = guide_legend(title = "Model")) + - theme_minimal() + - labs( - title = "Comparison of Methods on Predicting gX", - x = "gXtest", - y = "Predictions" - ) + - guides(shape = "none") # Remove the shape legend -``` - -## Data Generating Process: Dense 
Coefficients - -```{r} -set.seed(1) -n <- 100 -p <- 400 -res <- gen_data(n, p, regime = "dense") - -X <- res$X -y <- res$y -gX <- res$gX -Xtest <- res$Xtest -ytest <- res$ytest -gXtest <- res$gXtest -Xpop <- res$Xpop -ypop <- res$ypop -gXpop <- res$gXpop -betas <- res$beta -``` - -```{r} -plot(gX, y, xlab = "g(X)", ylab = "y") # plot V vs g(X) -print(c("theoretical R2:", var(gX) / var(y))) # theoretical R-square in the simulation example -``` - -```{r} -# plot betas -plot(seq_along(betas), abs(betas), - log = "y", pch = 20, col = "blue", - xlab = expression(beta), ylab = "Magnitude (log scale)", - main = expression(paste("Beta Magnitude")) -) -``` - -```{r} -# family gaussian means that we'll be using square loss -fit_lasso_cv <- cv.glmnet(X, y, family = "gaussian", alpha = 1, nfolds = 5) -# family gaussian means that we'll be using square loss -fit_ridge <- cv.glmnet(X, y, family = "gaussian", alpha = 0, nfolds = 5) -# family gaussian means that we'll be using square loss -fit_elnet <- cv.glmnet(X, y, family = "gaussian", alpha = .5, nfolds = 5) -fit_rlasso <- hdm::rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level -fit_rlasso_post <- hdm::rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level - -r2_lasso_cv <- r2_score(predict(fit_lasso_cv, newx = Xpop, s = "lambda.min"), ypop) -r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = "lambda.min"), ypop) -r2_elnet <- r2_score(predict(fit_elnet, newx = Xpop, s = "lambda.min"), ypop) -r2_rlasso <- r2_score(predict(fit_rlasso, newdata = Xpop), ypop) -r2_rlasso_post <- r2_score(predict(fit_rlasso_post, newdata = Xpop), ypop) -r2_lava <- lava_yhat_r2(X, Xpop, y, ypop)[[6]] -``` - -```{r} -table <- matrix(0, 6, 1) -table[1, 1] <- r2_lasso_cv -table[2, 1] <- r2_ridge -table[3, 1] <- r2_elnet -table[4, 1] <- r2_rlasso -table[5, 1] <- r2_rlasso_post -table[6, 1] <- r2_lava - -colnames(table) <- c("R2") -rownames(table) <- c( - "Cross-Validated Lasso", "Cross-Validated ridge", "Cross-Validated elnet", - "Lasso", "Post-Lasso", "Lava" -) -tab <- xtable(table, digits = 3) -print(tab, type = "latex") # set type="latex" for printing table in LaTeX -tab -``` - -```{r} -# get lava prediction on test set for plot below -lava_yhat <- lava_yhat_r2(X, Xtest, y, ytest)[[4]] -``` - -```{r} -# Creating a data frame with the predicted values for test -data <- data.frame( - gXtest = gXtest, - Ridge = predict(fit_ridge, newx = Xtest, s = "lambda.min"), - ENet = predict(fit_elnet, newx = Xtest, s = "lambda.min"), - RLasso = predict(fit_rlasso, newdata = Xtest), - RLassoPost = predict(fit_rlasso_post, newdata = Xtest), - LassoCV = predict(fit_lasso_cv, newx = Xtest, s = "lambda.min"), - Lava = as.vector(lava_yhat) -) -colnames(data) <- c("gXtest", "Ridge", "ENet", "RLasso", "RlassoPost", "LassoCV", "Lava") - -# Reshaping data into longer format for ggplot -data_long <- tidyr::gather(data, Model, Predicted, -gXtest) - -# Plotting -ggplot(data_long, aes(x = gXtest, y = Predicted, color = Model)) + - geom_point(aes(shape = Model)) + - geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") + # gX by gX - scale_color_manual(values = c("brown", "yellow", "red", "green", "blue", "magenta"), - guide = guide_legend(title = "Model")) + - theme_minimal() + - labs( - title = "Comparison of Methods on Predicting gX", - x = "gXtest", - y = "Predictions" - ) + - guides(shape = "none") # Remove the shape legend -``` - -## Data Generating Process: Approximately Sparse + Small Dense Part - -```{r} -set.seed(1) -n <- 100 -p <- 400 -res <- 
gen_data(n, p, regime = "sparsedense") - -X <- res$X -y <- res$y -gX <- res$gX -Xtest <- res$Xtest -ytest <- res$ytest -gXtest <- res$gXtest -Xpop <- res$Xpop -ypop <- res$ypop -gXpop <- res$gXpop -betas <- res$beta -``` - -```{r} -plot(gX, y, xlab = "g(X)", ylab = "y") # plot V vs g(X) -print(c("theoretical R2:", var(gX) / var(y))) # theoretical R-square in the simulation example -``` - -```{r} -# plot betas -plot(seq_along(betas), abs(betas), - log = "y", pch = 20, col = "blue", - xlab = expression(beta), ylab = "Magnitude (log scale)", - main = expression(paste("Beta Magnitude")) -) -``` - -```{r} -# family gaussian means that we'll be using square loss -fit_lasso_cv <- cv.glmnet(X, y, family = "gaussian", alpha = 1, nfolds = 5) -# family gaussian means that we'll be using square loss -fit_ridge <- cv.glmnet(X, y, family = "gaussian", alpha = 0, nfolds = 5) -# family gaussian means that we'll be using square loss -fit_elnet <- cv.glmnet(X, y, family = "gaussian", alpha = .5, nfolds = 5) -fit_rlasso <- rlasso(y ~ X, post = FALSE) # lasso with plug-in penalty level -fit_rlasso_post <- rlasso(y ~ X, post = TRUE) # post-lasso with plug-in penalty level - -r2_lasso_cv <- r2_score(predict(fit_lasso_cv, newx = Xpop, s = "lambda.min"), ypop) -r2_ridge <- r2_score(predict(fit_ridge, newx = Xpop, s = "lambda.min"), ypop) -r2_elnet <- r2_score(predict(fit_elnet, newx = Xpop, s = "lambda.min"), ypop) -r2_rlasso <- r2_score(predict(fit_rlasso, newdata = Xpop), ypop) -r2_rlasso_post <- r2_score(predict(fit_rlasso_post, newdata = Xpop), ypop) -r2_lava <- lava_yhat_r2(X, Xpop, y, ypop)[[6]] -``` - -```{r} -table <- matrix(0, 6, 1) -table[1, 1] <- r2_lasso_cv -table[2, 1] <- r2_ridge -table[3, 1] <- r2_elnet -table[4, 1] <- r2_rlasso -table[5, 1] <- r2_rlasso_post -table[6, 1] <- r2_lava - -colnames(table) <- c("R2") -rownames(table) <- c( - "Cross-Validated Lasso", "Cross-Validated ridge", "Cross-Validated elnet", - "Lasso", "Post-Lasso", "Lava" -) -tab <- xtable(table, digits = 3) -print(tab, type = "latex") # set type="latex" for printing table in LaTeX -tab -``` - -```{r} -# get lava prediction on test set for plot below -lava_yhat <- lava_yhat_r2(X, Xtest, y, ytest)[[4]] -``` - -```{r} -# Creating a data frame with the predicted values for test -data <- data.frame( - gXtest = gXtest, - Ridge = predict(fit_ridge, newx = Xtest, s = "lambda.min"), - ENet = predict(fit_elnet, newx = Xtest, s = "lambda.min"), - RLasso = predict(fit_rlasso, newdata = Xtest), - RLassoPost = predict(fit_rlasso_post, newdata = Xtest), - LassoCV = predict(fit_lasso_cv, newx = Xtest, s = "lambda.min"), - Lava = as.vector(lava_yhat) -) -colnames(data) <- c("gXtest", "Ridge", "ENet", "RLasso", "RlassoPost", "LassoCV", "Lava") - -# Reshaping data into longer format for ggplot -data_long <- tidyr::gather(data, Model, Predicted, -gXtest) - -# Plotting -ggplot(data_long, aes(x = gXtest, y = Predicted, color = Model)) + - geom_point(aes(shape = Model)) + - geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") + # gX by gX - scale_color_manual(values = c("brown", "yellow", "red", "green", "blue", "magenta"), - guide = guide_legend(title = "Model")) + - theme_minimal() + - labs( - title = "Comparison of Methods on Predicting gX", - x = "gXtest", - y = "Predictions" - ) + - guides(shape = "none") # Remove the shape legend -``` - diff --git a/PM2/r_ml_for_wage_prediction.Rmd b/PM2/r_ml_for_wage_prediction.Rmd deleted file mode 100644 index 9856acde..00000000 --- a/PM2/r_ml_for_wage_prediction.Rmd +++ /dev/null @@ 
-1,447 +0,0 @@ ---- -title: An R Markdown document converted from "PM2/r_ml_for_wage_prediction.irnb" -output: html_document ---- - -# A Simple Case Study using Wage Data from 2015 - -We illustrate how to predict an outcome variable $Y$ in a high-dimensional setting, where the number of covariates $p$ is large in relation to the sample size $n$. We use linear prediction rules for estimation, including OLS and the penalized linear methods we've studied. Later, we will also consider nonlinear prediction rules including tree-based methods and neural nets. - -```{r} -install.packages("xtable") -install.packages("hdm") -install.packages("glmnet") -install.packages("MLmetrics") -``` - -```{r} -library(hdm) -library(xtable) -library(glmnet) -library(MLmetrics) -``` - -## Data - -Again, we consider data from the U.S. March Supplement of the Current Population Survey (CPS) in 2015. -The preproccessed sample consists of $5150$ never-married individuals. - -```{r} -file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv" -data <- read.csv(file) -dim(data) -``` - -The outcomes $Y_i$'s are hourly (log) wages of never-married workers living in the U.S. The raw regressors $Z_i$'s consist of a variety of characteristics, including experience, education and industry and occupation indicators. - -```{r} -Z <- subset(data, select = -c(lwage, wage)) # regressors -colnames(Z) -``` - -The following figure shows the weekly wage distribution from the US survey data. - -```{r} -hist(data$wage, xlab = "hourly wage", main = "Empirical wage distribution from the US survey data", breaks = 35) -``` - -Wages show a high degree of skewness. Hence, wages are transformed in almost all studies by -the logarithm. - -## Analysis - -Due to the skewness of the data, we are considering log wages which leads to the following regression model - -$$log(wage) = g(Z) + \epsilon.$$ - -In this notebook, we will evaluate *linear* prediction rules. In later notebooks, we will also utilize nonlinear prediction methods. In linear models, we estimate the prediction rule of the form - -$$\hat g(Z) = \hat \beta'X.$$ - -Again, we generate $X$ in three ways: - -1. Basic Model: $X$ consists of a set of raw regressors (e.g. gender, experience, education indicators, regional indicators). - - -2. Flexible Model: $X$ consists of all raw regressors from the basic model plus occupation and industry indicators, transformations (e.g., $\operatorname{exp}^2$ and $\operatorname{exp}^3$) and additional two-way interactions. - -3. Extra Flexible Model: $X$ takes the flexible model and takes all pairwise interactions. - -To evaluate the out-of-sample performance, we split the data first. - -```{r} -set.seed(1234) -training <- sample(nrow(data), nrow(data) * (3 / 4), replace = FALSE) - -data_train <- data[training, ] -data_test <- data[-training, ] -``` - -```{r} -y_train <- data_train$lwage -y_test <- data_test$lwage -``` - -We are starting by running a simple OLS regression. We fit the basic and flexible model to our training data by running an ols regression and compute the R-squared on the test sample - -As known from our first lab, the basic model consists of $51$ regressors and the flexible model of $246$ regressors. Let us fit our models to the training sample using the two different model specifications. We are starting by running a simple ols regression and computing the mean squared error and $R^2$ on the test sample. 
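-
-Throughout, we summarize out-of-sample fit by regressing the squared prediction errors on a constant: the intercept estimate is the test MSE and its standard error comes along for free. As a small illustrative sketch of this pattern (the helper name `oos_summary` is ours and is not used below):
-
-```{r}
-# Illustrative helper (not used later): regressing squared errors on an intercept
-# returns the test MSE together with its standard error; R^2 then follows from var(y).
-oos_summary <- function(y, yhat) {
-  mse_fit <- summary(lm((y - yhat)^2 ~ 1))$coef[1:2]
-  c(MSE = mse_fit[1], SE = mse_fit[2], R2 = 1 - mse_fit[1] / var(y))
-}
-```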
- -### Low dimensional specification (basic) - -```{r} -x_basic <- "sex + exp1 + shs + hsg+ scl + clg + mw + so + we + C(occ2)+ C(ind2)" -formula_basic <- as.formula(paste("lwage", "~", x_basic)) -model_x_basic_train <- model.matrix(formula_basic, data_train) -model_x_basic_test <- model.matrix(formula_basic, data_test) -p_basic <- dim(model_x_basic_train)[2] -p_basic -``` - -```{r} -# ols (basic model) -fit_lm_basic <- lm(formula_basic, data_train) -# Compute the Out-Of-Sample Performance -yhat_lm_basic <- predict(fit_lm_basic, newdata = data_test) -cat("Basic model MSE (OLS): ", mean((y_test - yhat_lm_basic)^2)) # MSE OLS (basic model) -``` - -To determine the out-of-sample $MSE$ and the standard error in one step, we can use the function *lm*: - -```{r} -mse_lm_basic <- summary(lm((y_test - yhat_lm_basic)^2 ~ 1))$coef[1:2] -mse_lm_basic -``` - -We also compute the out-of-sample $R^2$: - -```{r} -r2_lm_basic <- 1 - mse_lm_basic[1] / var(y_test) -cat("Basic model R^2 (OLS): ", r2_lm_basic) # MSE OLS (basic model) -``` - -### High-dimensional specification (flexible) - -```{r} -x_flex <- paste("sex + exp1 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we ", - "+ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)") -formula_flex <- as.formula(paste("lwage", "~", x_flex)) -model_x_flex_train <- model.matrix(formula_flex, data_train) -model_x_flex_test <- model.matrix(formula_flex, data_test) -p_flex <- dim(model_x_flex_train)[2] -p_flex -``` - -We repeat the same procedure for the flexible model. - -```{r} -# ols (flexible model) -fit_lm_flex <- lm(formula_flex, data_train) -# Compute the Out-Of-Sample Performance -options(warn = -1) -yhat_lm_flex <- predict(fit_lm_flex, newdata = data_test) -mse_lm_flex <- summary(lm((y_test - yhat_lm_flex)^2 ~ 1))$coef[1:2] -r2_lm_flex <- 1 - mse_lm_flex[1] / var(y_test) -cat("Flexible model R^2 (OLS): ", r2_lm_flex) # MSE OLS (flexible model) -``` - -### Penalized regressions (flexible model) - - - - -We observe that ols regression works better for the basic model with smaller $p/n$ ratio. We proceed by running penalized regressions first for the flexible model, tuned via cross-validation. - -```{r} -fit_lasso_cv_flex <- cv.glmnet(model_x_flex_train, y_train, family = "gaussian", alpha = 1) -fit_ridge_flex <- cv.glmnet(model_x_flex_train, y_train, family = "gaussian", alpha = 0) -fit_elnet_flex <- cv.glmnet(model_x_flex_train, y_train, family = "gaussian", alpha = .5) - -yhat_lasso_cv_flex <- predict(fit_lasso_cv_flex, newx = model_x_flex_test) -yhat_ridge_flex <- predict(fit_ridge_flex, newx = model_x_flex_test) -yhat_elnet_flex <- predict(fit_elnet_flex, newx = model_x_flex_test) - -mse_lasso_cv_flex <- summary(lm((y_test - yhat_lasso_cv_flex)^2 ~ 1))$coef[1:2] -mse_ridge_flex <- summary(lm((y_test - yhat_ridge_flex)^2 ~ 1))$coef[1:2] -mse_elnet_flex <- summary(lm((y_test - yhat_elnet_flex)^2 ~ 1))$coef[1:2] - -r2_lasso_cv_flex <- 1 - mse_lasso_cv_flex[1] / var(y_test) -r2_ridge_flex <- 1 - mse_ridge_flex[1] / var(y_test) -r2_elnet_flex <- 1 - mse_elnet_flex[1] / var(y_test) - -# R^2 using cross-validation (flexible model) -cat("Flexible model R^2 (Lasso): ", r2_lasso_cv_flex) -cat("\nFlexible model R^2 (Ridge): ", r2_ridge_flex) -cat("\nFlexible model R^2 (Elastic Net): ", r2_elnet_flex) -``` - -We can also try a variant of the `l1` penalty, where the weight is chosen based on theoretical derivations. 
We use package *hdm* and the function *rlasso*, relying on a theoretical based choice of the penalty level $\lambda$ in the lasso regression. - -Specifically, we use "plug-in" tuning with a theoretically valid choice of penalty $\lambda = 2 \cdot c \hat{\sigma} \sqrt{n} \Phi^{-1}(1-\alpha/2p)$, where $c>1$ and $1-\alpha$ is a confidence level, $\Phi^{-1}$ denotes the quantile function, and $\hat{\sigma}$ is estimated in an iterative manner (see corresponding notes in book). Under homoskedasticity, this choice ensures that the Lasso predictor is well behaved, delivering good predictive performance under approximate sparsity. In practice, this formula will work well even in the absence of homoskedasticity, especially when the random variables $\epsilon$ and $X$ in the regression equation decay quickly at the tails. - -In practice, many people choose to use cross-validation, which is perfectly fine for predictive tasks. However, when conducting inference, to make our analysis valid we will require cross-fitting in addition to cross-validation. As we have not yet discussed cross-fitting, we rely on this theoretically-driven penalty in order to allow for accurate inference in the upcoming notebooks. - -Now, we repeat the same procedure for the flexible model. - -```{r} -fit_rlasso_flex <- hdm::rlasso(formula_flex, data_train, post = FALSE) -fit_rlasso_post_flex <- hdm::rlasso(formula_flex, data_train, post = TRUE) -yhat_rlasso_flex <- predict(fit_rlasso_flex, newdata = data_test) -yhat_rlasso_post_flex <- predict(fit_rlasso_post_flex, newdata = data_test) - -mse_lasso_flex <- summary(lm((y_test - yhat_rlasso_flex)^2 ~ 1))$coef[1:2] -mse_lasso_post_flex <- summary(lm((y_test - yhat_rlasso_post_flex)^2 ~ 1))$coef[1:2] - -r2_lasso_flex <- 1 - mse_lasso_flex[1] / var(y_test) -r2_lasso_post_flex <- 1 - mse_lasso_post_flex[1] / var(y_test) -# R^2 theoretically chosen penalty (flexible model) -cat("Flexible model R^2 (RLasso): ", r2_lasso_flex) -cat("\nFlexible model R^2 (RLasso post): ", r2_lasso_post_flex) -``` - -Finally, we try the combination of a sparse and a dense coefficient using the LAVA method - -```{r} -# Define function to compute lava estimator. Doing an iterative scheme with fixed -# number of iteration. Could iterate until a convergence criterion is met. -lava_predict <- function(X, Y, newX, lambda1, lambda2, iter = 5) { - - # Need to demean internally - dy <- Y - mean(Y) - dx <- scale(X, scale = FALSE) - - sp1 <- glmnet::glmnet(dx, dy, lambda = lambda1) # lasso step fits "sparse part" - de1 <- glmnet::glmnet(dx, dy - predict(sp1, newx = dx), alpha = 0, lambda = lambda2) - - i <- 1 - while (i <= iter) { - sp1 <- glmnet::glmnet(dx, dy - predict(de1, newx = dx, s = "lambda.min"), lambda = lambda1) - de1 <- glmnet::glmnet(dx, dy - predict(sp1, newx = dx, s = "lambda.min"), alpha = 0, lambda = lambda2) - i <- i + 1 - } - - bhat <- sp1$beta + de1$beta - a0 <- mean(Y) - sum(colMeans(X) * bhat) - - # Need to add intercept to output - - yhat <- newX %*% bhat + a0 - - return(yhat) -} -``` - -```{r} -# define function to get predictions and r2 scores for lava estimator -lava_yhat_r2 <- function(xtr_mod, xte_mod, ytr, yte, num_folds = 5) { - # 5-fold CV. glmnet does cross-validation internally and - # relatively efficiently. We're going to write out all the steps to make sure - # we're using the same CV folds across all procedures in a transparent way and - # to keep the overall structure clear as well. 
- - # Setup for brute force K-Fold CV - n <- length(ytr) - Kf <- num_folds # Number of folds - sampleframe <- rep(1:Kf, ceiling(n / Kf)) - cvgroup <- sample(sampleframe, size = n, replace = FALSE) # CV groups - - - ## ------------------------------------------------------------ - # We're going to take a shortcut and use the range of lambda values that come out - # of the default implementation in glmnet for everything. Could do better here - maybe - - ## Fit ridge on grid of lambda values (chosen by default using glmnet) using basic model. - ridge_mod <- glmnet::glmnet(xtr_mod, ytr, alpha = 0) # alpha = 0 gives ridge - ridge_lambda <- ridge_mod$lambda # values of penalty parameter - - ## Fit lasso on grid of lambda values (chosen by default using glmnet) using basic model. - lasso_mod <- glmnet::glmnet(xtr_mod, ytr) # default is lasso (equivalent to alpha = 1) - lasso_lambda <- lasso_mod$lambda # values of penalty parameter - - ## ------------------------------------------------------------ - - - # Lava - Using a double loop over candidate penalty parameter values. - - lambda1_lava_mod <- lasso_mod$lambda[seq(5, length(lasso_lambda), 10)] - lambda2_lava_mod <- ridge_mod$lambda[seq(5, length(ridge_lambda), 10)] - - cv_mod_lava <- matrix(0, length(lambda1_lava_mod), length(lambda2_lava_mod)) - - for (k in 1:Kf) { - indk <- cvgroup == k - - k_xtr_mod <- xtr_mod[!indk, ] - k_ytr <- ytr[!indk] - k_xte_mod <- xtr_mod[indk, ] - k_yte <- ytr[indk] - - for (ii in seq_along(lambda1_lava_mod)) { - for (jj in seq_along(lambda2_lava_mod)) { - cv_mod_lava[ii, jj] <- cv_mod_lava[ii, jj] + - sum((k_yte - lava_predict(k_xtr_mod, k_ytr, - newX = k_xte_mod, - lambda1 = lambda1_lava_mod[ii], - lambda2 = lambda2_lava_mod[jj]))^2) - } - } - } - - # Get CV min values of tuning parameters - cvmin_lava_mod <- which(cv_mod_lava == min(cv_mod_lava), arr.ind = TRUE) - cvlambda1_lava_mod <- lambda1_lava_mod[cvmin_lava_mod[1]] - cvlambda2_lava_mod <- lambda2_lava_mod[cvmin_lava_mod[2]] - - #### Look at performance on test sample - - # Calculate R^2 in training data and in validation data as measures - # Refit on entire training sample - - #### CV-min model - - # In sample fit - cvmin_yhat_lava_tr <- lava_predict(xtr_mod, ytr, - newX = xtr_mod, - lambda1 = cvlambda1_lava_mod, - lambda2 = cvlambda2_lava_mod - ) - r2_lava_mod <- 1 - sum((ytr - cvmin_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2) - - # Out of sample fit - cvmin_yhat_lava_test <- lava_predict(xtr_mod, ytr, - newX = xte_mod, - lambda1 = cvlambda1_lava_mod, - lambda2 = cvlambda2_lava_mod - ) - r2v_lava_mod <- 1 - sum((yte - cvmin_yhat_lava_test)^2) / sum((yte - mean(ytr))^2) - - #### Use average model across cv-folds and refit model using all training data - ###### we won't report these results. - ###### Averaging is theoretically more solid, but cv-min is more practical. 
- n_tr <- length(ytr) - n_te <- length(yte) - yhat_tr_lava_mod <- matrix(0, n_tr, Kf) - yhat_te_lava_mod <- matrix(0, n_te, Kf) - - - for (k in 1:Kf) { - indk <- cvgroup == k - - k_xtr_mod <- xtr_mod[!indk, ] - k_ytr <- ytr[!indk] - - # Lava - yhat_tr_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr, - newX = xtr_mod, - lambda1 = cvlambda1_lava_mod, - lambda2 = cvlambda2_lava_mod - )) - yhat_te_lava_mod[, k] <- as.vector(lava_predict(k_xtr_mod, k_ytr, - newX = xte_mod, - lambda1 = cvlambda1_lava_mod, - lambda2 = cvlambda2_lava_mod - )) - } - - avg_yhat_lava_tr <- rowMeans(yhat_tr_lava_mod) - avg_yhat_lava_test <- rowMeans(yhat_te_lava_mod) - - r2_cv_ave_lava_mod <- 1 - sum((ytr - avg_yhat_lava_tr)^2) / sum((ytr - mean(ytr))^2) - r2v_cv_ave_lava_mod <- 1 - sum((yte - avg_yhat_lava_test)^2) / sum((yte - mean(ytr))^2) - - return(c( - cvlambda1_lava_mod, - cvlambda2_lava_mod, - cvmin_yhat_lava_tr, # CV_min - cvmin_yhat_lava_test, # CV_min - r2_lava_mod, # CV_min - r2v_lava_mod, # CV_min - avg_yhat_lava_tr, # Average across Folds - avg_yhat_lava_test, # Average across Folds - r2_cv_ave_lava_mod, # Average across Folds - r2v_cv_ave_lava_mod # Average across Folds - )) -} -``` - -```{r} -fit_lava_flex <- lava_yhat_r2(model_x_flex_train, model_x_flex_test, y_train, y_test) -cat("Flexible model R^2 (LAVA): ", fit_lava_flex[[6]]) # using CV_min -``` - - - -We find that for this dataset the low dimensional OLS is sufficient. The high-dimensional approaches did not manage to substantively increase predictive power. - -### Extra high-dimensional specification (extra flexible) - -We repeat the same procedure for the extra flexible model. - - - - - - -```{r} -x_extra <- " sex + (exp1 + exp2 + exp3 + exp4 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)^2" -formula_extra <- as.formula(paste("lwage", "~", x_extra)) -model_x_extra_train <- model.matrix(formula_extra, data_train) -model_x_extra_test <- model.matrix(formula_extra, data_test) -p_extra <- dim(model_x_extra_train)[2] -p_extra -``` - -```{r} -# ols (extra flexible model) -fit_lm_extra <- lm(formula_extra, data_train) -options(warn = -1) -yhat_lm_extra <- predict(fit_lm_extra, newdata = data_test) -mse_lm_extra <- summary(lm((y_test - yhat_lm_extra)^2 ~ 1))$coef[1:2] -r2_lm_extra <- 1 - mse_lm_extra[1] / var(y_test) -cat("Extra flexible model R^2 (OLS): ", r2_lm_extra) -``` - -#### Penalized regressions (extra flexible model) - -Now let's repeat our penalized regression analysis for the extra flexible model. Note this block takes a while ~ 1 hour 15 minutes. To reduce time substantially, reduce the number of folds in LAVA. 
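-
-For example, a possible shortcut (a sketch only, not run here) is to pass a smaller `num_folds` to the `lava_yhat_r2` helper defined above:
-
-```{r}
-# Optional (not run): fewer CV folds inside LAVA runs much faster, at the cost of a
-# noisier choice of the two penalty levels.
-# fit_lava_extra <- lava_yhat_r2(model_x_extra_train, model_x_extra_test, y_train, y_test, num_folds = 2)
-```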
- -```{r} -# penalized regressions -fit_lasso_cv_extra <- cv.glmnet(model_x_extra_train, y_train, family = "gaussian", alpha = 1) -fit_ridge_extra <- cv.glmnet(model_x_extra_train, y_train, family = "gaussian", alpha = 0) -fit_elnet_extra <- cv.glmnet(model_x_extra_train, y_train, family = "gaussian", alpha = .5) -fit_rlasso_extra <- hdm::rlasso(formula_extra, data_train, post = FALSE) -fit_rlasso_post_extra <- hdm::rlasso(formula_extra, data_train, post = TRUE) -fit_lava_extra <- lava_yhat_r2(model_x_extra_train, model_x_extra_test, y_train, y_test) - -yhat_lasso_cv_extra <- predict(fit_lasso_cv_extra, newx = model_x_extra_test) -yhat_ridge_extra <- predict(fit_ridge_extra, newx = model_x_extra_test) -yhat_elnet_extra <- predict(fit_elnet_extra, newx = model_x_extra_test) -yhat_rlasso_extra <- predict(fit_rlasso_extra, newdata = data_test) -yhat_rlasso_post_extra <- predict(fit_rlasso_post_extra, newdata = data_test) -yhat_lava_extra <- fit_lava_extra[[4]] - -mse_lasso_cv_extra <- summary(lm((y_test - yhat_lasso_cv_extra)^2 ~ 1))$coef[1:2] -mse_ridge_extra <- summary(lm((y_test - yhat_ridge_extra)^2 ~ 1))$coef[1:2] -mse_elnet_extra <- summary(lm((y_test - yhat_elnet_extra)^2 ~ 1))$coef[1:2] -mse_lasso_extra <- summary(lm((y_test - yhat_rlasso_extra)^2 ~ 1))$coef[1:2] -mse_lasso_post_extra <- summary(lm((y_test - yhat_rlasso_post_extra)^2 ~ 1))$coef[1:2] -mse_lava_extra <- summary(lm(as.vector(y_test - yhat_lava_extra)^2 ~ 1))$coef[1:2] - -r2_lasso_cv_extra <- 1 - mse_lasso_cv_extra[1] / var(y_test) -r2_ridge_extra <- 1 - mse_ridge_extra[1] / var(y_test) -r2_elnet_extra <- 1 - mse_elnet_extra[1] / var(y_test) -r2_lasso_extra <- 1 - mse_lasso_extra[1] / var(y_test) -r2_lasso_post_extra <- 1 - mse_lasso_post_extra[1] / var(y_test) -r2_lava_extra <- 1 - mse_lava_extra[1] / var(y_test) - -# R^2 (extra flexible) -cat("\nExtra flexible model R^2 (Lasso): ", r2_lasso_cv_extra) -cat("\nExtra flexible model R^2 (Ridge): ", r2_ridge_extra) -cat("\nExtra flexible model R^2 (Elastic Net): ", r2_elnet_extra) -cat("\nExtra flexible model R^2 (RLasso): ", r2_lasso_extra) -cat("\nExtra flexible model R^2 (RLasso post): ", r2_lasso_post_extra) -cat("\nExtra flexible model R^2 (LAVA): ", r2_lava_extra) # using CV_min -``` - - - diff --git a/PM2/r_orthogonal_orig.Rmd b/PM2/r_orthogonal_orig.Rmd deleted file mode 100644 index a49aef69..00000000 --- a/PM2/r_orthogonal_orig.Rmd +++ /dev/null @@ -1,116 +0,0 @@ ---- -title: An R Markdown document converted from "PM2/r_orthogonal_orig.irnb" -output: html_document ---- - -# Simulation on Orthogonal Estimation - -We compare the performance of the naive and orthogonal methods in a computational experiment where -$p=n=100$, $\beta_j = 1/j^2$, $(\gamma_{DW})_j = 1/j^2$ and $$Y = 1 \cdot D + \beta' W + \epsilon_Y$$ - -where $W \sim N(0,I)$, $\epsilon_Y \sim N(0,1)$, and $$D = \gamma'_{DW} W + \tilde{D}$$ where $\tilde{D} \sim N(0,1)/4$. - -The true treatment effect here is 1. From the plots produced in this notebook (estimate minus ground truth), we show that the naive single-selection estimator is heavily biased (lack of Neyman orthogonality in its estimation strategy), while the orthogonal estimator based on partialling out, is approximately unbiased and Gaussian. 
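-
-Concretely (notation here is ours, added for clarity), the two procedures compared below are:
-
-$$\hat{\theta}_{\text{naive}}: \text{ Lasso of } Y \text{ on } (D, W), \text{ then OLS of } Y \text{ on } D \text{ and the selected components of } W,$$
-
-$$\hat{\theta}_{\text{orth}}: \text{ OLS of } r_Y \text{ on } r_D, \text{ where } r_Y = Y - \widehat{E}[Y \mid W] \text{ and } r_D = D - \widehat{E}[D \mid W] \text{ are post-Lasso residuals.}$$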
- -```{r} -install.packages("hdm") -install.packages("ggplot2") -``` - -```{r} -library(hdm) -library(ggplot2) -``` - -```{r} -# Initialize constants -B <- 10000 # Number of iterations -n <- 100 # Sample size -p <- 100 # Number of features - -# Initialize arrays to store results -Naive <- rep(0, B) -Orthogonal <- rep(0, B) - - -lambdaYs <- rep(0, B) -lambdaDs <- rep(0, B) - -for (i in 1:B) { - # Generate parameters - beta <- 1 / (1:p)^2 - gamma <- 1 / (1:p)^2 - - # Generate covariates / random data - X <- matrix(rnorm(n * p), n, p) - D <- X %*% gamma + rnorm(n) / 4 - - # Generate Y using DGP - Y <- D + X %*% beta + rnorm(n) - - # Single selection method - rlasso_result <- hdm::rlasso(Y ~ D + X) # Fit lasso regression - sx_ids <- which(rlasso_result$coef[-c(1, 2)] != 0) # Selected covariates - - # Check if any Xs are selected - if (sum(sx_ids) == 0) { - Naive[i] <- lm(Y ~ D)$coef[2] # Fit linear regression with only D if no Xs are selected - } else { - Naive[i] <- lm(Y ~ D + X[, sx_ids])$coef[2] # Fit linear regression with selected X otherwise - } - - # Partialling out / Double Lasso - - fitY <- hdm::rlasso(Y ~ X, post = TRUE) - resY <- fitY$res - - fitD <- hdm::rlasso(D ~ X, post = TRUE) - resD <- fitD$res - - Orthogonal[i] <- lm(resY ~ resD)$coef[2] # Fit linear regression for residuals -} -``` - -## Make a Nice Plot - -```{r} -# Specify ratio -img_width <- 15 -img_height <- img_width / 2 -``` - -```{r} -# Create a data frame for the estimates -df <- data.frame(Method = rep(c("Naive", "Orthogonal"), each = B), - Value = c(Naive - 1, Orthogonal - 1)) - -# Create the histogram using ggplot2 -hist_plot <- ggplot(df, aes(x = Value, fill = Method)) + - geom_histogram(binwidth = 0.1, color = "black", alpha = 0.7) + - facet_wrap(~Method, scales = "fixed") + - labs( - title = "Distribution of Estimates (Centered around Ground Truth)", - x = "Bias", - y = "Frequency" - ) + - scale_x_continuous(breaks = seq(-2, 1.5, 0.5)) + - theme_minimal() + - theme( - plot.title = element_text(hjust = 0.5), # Center the plot title - strip.text = element_text(size = 10), # Increase text size in facet labels - legend.position = "none", # Remove the legend - panel.grid.major = element_blank(), # Make major grid lines invisible - # panel.grid.minor = element_blank(), # Make minor grid lines invisible - strip.background = element_blank() # Make the strip background transparent - ) + - theme(panel.spacing = unit(2, "lines")) # Adjust the ratio to separate subplots wider - -# Set a wider plot size -options(repr.plot.width = img_width, repr.plot.height = img_height) - -# Display the histogram -print(hist_plot) -``` - -As we can see from the above bias plots (estimates minus the ground truth effect of 1), the double lasso procedure concentrates around zero whereas the naive estimator does not. - diff --git a/PM3/Old/automl-for-wage-prediction.irnb.Rmd b/PM3/Old/automl-for-wage-prediction.irnb.Rmd deleted file mode 100644 index 1d9e01c7..00000000 --- a/PM3/Old/automl-for-wage-prediction.irnb.Rmd +++ /dev/null @@ -1,126 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - -This notebook contains an example for teaching. 
- - - -# Automatic Machine Learning with H2O AutoML - - -We illustrate how to predict an outcome variable Y in a high-dimensional setting, using the AutoML package *H2O* that covers the complete pipeline from the raw dataset to the deployable machine learning model. In last few years, AutoML or automated machine learning has become widely popular among data science community. Again, re-analyse the wage prediction problem using data from the U.S. March Supplement of the Current Population Survey (CPS) in 2015. - - -We can use AutoML as a benchmark and compare it to the methods that we used in the previous notebook where we applied one machine learning method after the other. - -```{r} -# load the H2O package -library(h2o) -``` - -```{r} -# load the data set -load("../input/wage2015-inference/wage2015_subsample_inference.Rdata") - -# split the data -set.seed(1234) -training <- sample(nrow(data), nrow(data)*(3/4), replace=FALSE) - -train <- data[training,] -test <- data[-training,] -``` - -```{r} -# start h2o cluster -h2o.init() -``` - -```{r} -# convert data as h2o type -train_h = as.h2o(train) -test_h = as.h2o(test) - -# have a look at the data -h2o.describe(train_h) -``` - -```{r} -# define the variables -y = 'lwage' -x = setdiff(names(data), c('wage','occ2', 'ind2')) - -# run AutoML for 10 base models and a maximal runtime of 100 seconds -aml = h2o.automl(x=x,y = y, - training_frame = train_h, - leaderboard_frame = test_h, - max_models = 10, - seed = 1, - max_runtime_secs = 100 - ) -# AutoML Leaderboard -lb = aml@leaderboard -print(lb, n = nrow(lb)) -``` - -We see that two Stacked Ensembles are at the top of the leaderboard. Stacked Ensembles often outperform a single model. The out-of-sample (test) MSE of the leading model is given by - -```{r} -aml@leaderboard$mse[1] -``` - -The in-sample performance can be evaluated by - -```{r} -aml@leader -``` - -This is in line with our previous results. To understand how the ensemble works, let's take a peek inside the Stacked Ensemble "All Models" model. The "All Models" ensemble is an ensemble of all of the individual models in the AutoML run. This is often the top performing model on the leaderboard. - -```{r} -model_ids <- as.data.frame(aml@leaderboard$model_id)[,1] -# Get the "All Models" Stacked Ensemble model -se <- h2o.getModel(grep("StackedEnsemble_AllModels", model_ids, value = TRUE)[1]) -# Get the Stacked Ensemble metalearner model -metalearner <- se@model$metalearner_model -h2o.varimp(metalearner) -``` - -The table above gives us the variable importance of the metalearner in the ensemble. The AutoML Stacked Ensembles use the default metalearner algorithm (GLM with non-negative weights), so the variable importance of the metalearner is actually the standardized coefficient magnitudes of the GLM. - - -```{r} -h2o.varimp_plot(metalearner) -``` - -## Generating Predictions Using Leader Model - -We can also generate predictions on a test sample using the leader model object. - -```{r} -pred <- as.matrix(h2o.predict(aml@leader,test_h)) # make prediction using x data from the test sample -head(pred) -``` - -This allows us to estimate the out-of-sample (test) MSE and the standard error as well. - -```{r} -y_test <- as.matrix(test_h$lwage) -summary(lm((y_test-pred)^2~1))$coef[1:2] -``` - -We observe both a lower MSE and a lower standard error compared to our previous results (see [here](https://www.kaggle.com/janniskueck/pm3-notebook-newdata)). 
- -```{r} -h2o.shutdown(prompt = F) -``` diff --git a/PM3/r_functional_approximation_by_nn_and_rf.Rmd b/PM3/r_functional_approximation_by_nn_and_rf.Rmd deleted file mode 100644 index aca4a796..00000000 --- a/PM3/r_functional_approximation_by_nn_and_rf.Rmd +++ /dev/null @@ -1,193 +0,0 @@ ---- -title: An R Markdown document converted from "PM3/r_functional_approximation_by_nn_and_rf.irnb" -output: html_document ---- - -# Functional Approximations by Trees and Neural Networks - -Here we show how the function -$$ -x \mapsto exp(4 x) -$$ -can be easily approximated by a tree-based methods (Trees, Random Forest) and a neural network (2 Layered Neural Network) - -```{r} -install.packages("randomForest") -install.packages("rpart") -install.packages("gbm") -install.packages("keras") -``` - -```{r} -library(randomForest) -library(rpart) -library(gbm) -library(keras) -``` - -# Functional Approximation by a Tree - -We play around with the penalty level $cp$ below to illustrate how it affects the complexity of tree. Recall we may use this to prune the tree to improve predictive performance and lessen the noise in our final estimate. A simple penalty would be the number of leaves times a penalty level $\alpha$. - -Specifics on the penalty can be found [here](https://cran.r-project.org/web/packages/rpart/rpart.pdf). - -```{r} -set.seed(1) -x_train <- matrix(runif(1000), 1000, 1) -y_train <- exp(4 * x_train) # Noiseless case Y=g(X) -dim(x_train) - - -# shallow tree -TreeModel <- rpart(y_train ~ x_train, cp = .01) # cp is penalty level -pred_tm <- predict(TreeModel, newx = x_train) -plot(x_train, y_train, type = "p", pch = 19, xlab = "z", ylab = "g(z)") -points(x_train, pred_tm, col = 3, pch = 19) -``` - -```{r} -set.seed(1) -x_train <- matrix(runif(1000), 1000, 1) -y_train <- exp(4 * x_train) # Noiseless case Y=g(X) -dim(x_train) - - -TreeModel <- rpart(y_train ~ x_train, cp = .0005) # cp is penalty level -pred_tm <- predict(TreeModel, newx = x_train) -plot(x_train, y_train, type = "p", pch = 19, xlab = "z", ylab = "g(z)") -points(x_train, pred_tm, col = 3, pch = 19) -``` - -# Functional Approximation by RF - -Here we show how the function -$$ -x \mapsto exp(4 x) -$$ -can be easily approximated by a tree-based method (Random Forest) and a neural network (2 Layered Neural Network) - -```{r} -RFmodel <- randomForest(y_train ~ x_train) -pred_rf <- predict(RFmodel, newdata = x_train) -plot(x_train, y_train, type = "p", pch = 19, xlab = "z", ylab = "g(z)") -points(x_train, pred_rf, col = 4, pch = 19) -``` - -# Boosted Trees - -```{r} -data_train <- as.data.frame(cbind(x_train, y_train)) -BoostTreemodel <- gbm(y_train ~ x_train, - distribution = "gaussian", n.trees = 100, shrinkage = .01, - interaction.depth = 3 -) - -# shrinkage is "learning rate" -# n.trees is the number of boosting steps -# interaction.depth is the max depth of each tree -pred_bt <- predict(BoostTreemodel, newdata = data_train, n.trees = 100) -plot(x_train, y_train, type = "p", pch = 19, xlab = "z", ylab = "g(z)") -points(x_train, pred_bt, col = 4, pch = 19) -``` - -```{r} -data_train <- as.data.frame(cbind(x_train, y_train)) -BoostTreemodel <- gbm(y_train ~ x_train, - distribution = "gaussian", n.trees = 1000, shrinkage = .01, - interaction.depth = 3 -) -# shrinkage is "learning rate" -# n.trees is the number of boosting steps -# interaction.depth is the max depth of each tree -pred_bt <- predict(BoostTreemodel, newdata = data_train, n.trees = 1000) -plot(x_train, y_train, type = "p", pch = 19, xlab = "z", ylab = "g(z)") -points(x_train, 
pred_bt, col = 4, pch = 19) -``` - -# Same Example with a Neural Network - -```{r} -build_model <- function() { - - model <- keras_model_sequential() %>% - layer_dense( - units = 200, activation = "relu", - input_shape = 1 - ) %>% - layer_dense(units = 20, activation = "relu") %>% - layer_dense(units = 1) - - model %>% compile( - optimizer = optimizer_adam(lr = 0.01), - loss = "mse", - metrics = c("mae"), - ) -} -``` - -```{r} -model <- build_model() -summary(model) -``` - -```{r} -num_epochs <- 1 -model %>% fit(x_train, y_train, - epochs = num_epochs, batch_size = 10, verbose = 0 -) -pred_nn <- model %>% predict(x_train) -plot(x_train, y_train, type = "p", pch = 19, xlab = "z", ylab = "g(z)") -points(x_train, pred_nn, col = 4, pch = 19, ) -``` - -```{r} -num_epochs <- 100 -model %>% fit(x_train, y_train, - epochs = num_epochs, batch_size = 10, verbose = 0 -) -pred_nn <- model %>% predict(x_train) -plot(x_train, y_train, type = "p", pch = 19, xlab = "z", ylab = "g(z)") -points(x_train, pred_nn, col = 4, pch = 19, ) -``` - -### Using Early Stopping - -```{r} -# Define the neural network architecture -model <- keras_model_sequential() %>% - layer_dense(units = 200, activation = "relu", input_shape = 1) %>% - layer_dense(units = 20, activation = "relu") %>% - layer_dense(units = 1) # Output layer with 1 unit for regression task - -# Compile the model -model %>% compile( - optimizer = optimizer_adam(lr = 0.01), - loss = "mse", - metrics = c("mae"), -) - -summary(model) -``` - -```{r} -num_epochs <- 100 - -# Define early stopping based on validation set (20%) performance -# Patience set to 5 epochs (default in skorch is 5) -early_stopping <- callback_early_stopping(monitor = "val_loss", patience = 5) - -# Train the model -model %>% fit( - x_train, y_train, - epochs = num_epochs, - batch_size = 10, - validation_split = 0.2, # 20% validation set - verbose = 0, - callbacks = list(early_stopping) -) - -pred_nn <- model %>% predict(x_train) -plot(x_train, y_train, type = "p", pch = 19, xlab = "z", ylab = "g(z)") -points(x_train, pred_nn, col = 4, pch = 19) -``` - diff --git a/PM3/r_ml_wage_prediction.Rmd b/PM3/r_ml_wage_prediction.Rmd deleted file mode 100644 index c5cd0bf7..00000000 --- a/PM3/r_ml_wage_prediction.Rmd +++ /dev/null @@ -1,594 +0,0 @@ ---- -title: An R Markdown document converted from "PM3/r_ml_wage_prediction.irnb" -output: html_document ---- - -# Machine Learning Estimators for Wage Prediction - -We illustrate how to predict an outcome variable Y in a high-dimensional setting, where the number of covariates $p$ is large in relation to the sample size $n$. So far we have used linear prediction rules, e.g. Lasso regression, for estimation. -Now, we also consider nonlinear prediction rules including tree-based methods. - -```{r} -# Import relevant packages -install.packages("xtable") -install.packages("hdm") -install.packages("glmnet") -install.packages("randomForest") -install.packages("rpart") -install.packages("nnet") -install.packages("gbm") -install.packages("rpart.plot") -install.packages("keras") -``` - -```{r} -library(hdm) -library(xtable) -library(glmnet) -library(randomForest) -library(rpart) -library(nnet) -library(gbm) -library(rpart.plot) -library(keras) -``` - -## Data - -Again, we consider data from the U.S. March Supplement of the Current Population Survey (CPS) in 2015. -The preproccessed sample consists of $5150$ never-married individuals. 
- -```{r} -file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv" -data <- read.csv(file) -dim(data) -``` - -The outcomes $Y_i$'s are hourly (log) wages of never-married workers living in the U.S. The raw regressors $Z_i$'s consist of a variety of characteristics, including experience, education and industry and occupation indicators. - -```{r} -Z <- subset(data, select = -c(lwage, wage)) # regressors -colnames(Z) -``` - -The following figure shows the weekly wage distribution from the US survey data. - -```{r} -hist(data$wage, xlab = "hourly wage", main = "Empirical wage distribution from the US survey data", breaks = 35) -``` - -Wages show a high degree of skewness. Hence, wages are transformed in almost all studies by -the logarithm. - -## Analysis - -Due to the skewness of the data, we are considering log wages which leads to the following regression model - -$$log(wage) = g(Z) + \epsilon.$$ - -We will estimate the two sets of prediction rules: Linear and Nonlinear Models. -In linear models, we estimate the prediction rule of the form - -$$\hat g(Z) = \hat \beta'X.$$ -Again, we generate $X$ in two ways: - -1. Basic Model: $X$ consists of a set of raw regressors (e.g. gender, experience, education indicators, regional indicators). - - -2. Flexible Model: $X$ consists of all raw regressors from the basic model plus occupation and industry indicators, transformations (e.g., ${exp}^2$ and ${exp}^3$) and additional two-way interactions. - -To evaluate the out-of-sample performance, we split the data first. - -```{r} -set.seed(1234) -training <- sample(nrow(data), nrow(data) * (3 / 4), replace = FALSE) - -data_train <- data[training, ] -data_test <- data[-training, ] -``` - -We construct the two different model matrices $X_{basic}$ and $X_{flex}$ for both the training and the test sample: - -```{r} -x_basic <- "sex + exp1 + shs + hsg+ scl + clg + mw + so + we + C(occ2)+ C(ind2)" -x_flex <- paste("sex + exp1 + shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we ", - "+ (exp1 + exp2 + exp3 + exp4) * (shs + hsg + scl + clg + C(occ2) + C(ind2) + mw + so + we)") -formula_basic <- as.formula(paste("lwage", "~", x_basic)) -formula_flex <- as.formula(paste("lwage", "~", x_flex)) - -model_x_basic_train <- model.matrix(formula_basic, data_train) -model_x_basic_test <- model.matrix(formula_basic, data_test) -p_basic <- dim(model_x_basic_train)[2] -model_x_flex_train <- model.matrix(formula_flex, data_train) -model_x_flex_test <- model.matrix(formula_flex, data_test) -p_flex <- dim(model_x_flex_train)[2] -``` - -```{r} -y_train <- data_train$lwage -y_test <- data_test$lwage -``` - -```{r} -p_basic -p_flex -``` - -As known from our first lab, the basic model consists of $51$ regressors and the flexible model of $246$ regressors. Let us fit our models to the training sample using the two different model specifications. We are starting by running a simple ols regression. - -### OLS - -We fit the basic model to our training data by running an ols regression and compute the mean squared error on the test sample. 
- -```{r} -# ols (basic model) -fit_lm_basic <- lm(formula_basic, data_train) -``` - -```{r} -# Compute the Out-Of-Sample Performance -yhat_lm_basic <- predict(fit_lm_basic, newdata = data_test) -# MSE OLS (basic model) -cat("The mean squared error (MSE) using the basic model is equal to", mean((y_test - yhat_lm_basic)^2)) -``` - -To determine the out-of-sample $MSE$ and the standard error in one step, we can use the function *lm*: - -```{r} -mse_lm_basic <- summary(lm((y_test - yhat_lm_basic)^2 ~ 1))$coef[1:2] -mse_lm_basic -``` - -We also compute the out-of-sample $R^2$: - -```{r} -r2_lm_basic <- 1 - mse_lm_basic[1] / var(y_test) -# MSE OLS (basic model) -cat("The R^2 using the basic model is equal to", r2_lm_basic) -``` - -We repeat the same procedure for the flexible model. - -```{r} -# ols (flexible model) -fit_lm_flex <- lm(formula_flex, data_train) -# Compute the Out-Of-Sample Performance -options(warn = -1) -yhat_lm_flex <- predict(fit_lm_flex, newdata = data_test) -mse_lm_flex <- summary(lm((y_test - yhat_lm_flex)^2 ~ 1))$coef[1:2] -r2_lm_flex <- 1 - mse_lm_flex[1] / var(y_test) -cat("The R^2 using the flexible model is equal to", r2_lm_flex) # MSE OLS (flexible model) -``` - -We observe that ols regression works better for the basic model with smaller $p/n$ ratio. We now proceed by running lasso regressions and related penalized methods. - -### Lasso, Ridge and Elastic Net - -Considering the basic model, we run a lasso/post-lasso regression first and then we compute the measures for the out-of-sample performance. Note that applying the package *hdm* and the function *rlasso* we rely on a theoretical based choice of the penalty level $\lambda$ in the lasso regression. - -```{r} -# lasso and variants -fit_rlasso <- hdm::rlasso(formula_basic, data_train, post = FALSE) -fit_rlasso_post <- hdm::rlasso(formula_basic, data_train, post = TRUE) -yhat_rlasso <- predict(fit_rlasso, newdata = data_test) -yhat_rlasso_post <- predict(fit_rlasso_post, newdata = data_test) - -mse_lasso <- summary(lm((y_test - yhat_rlasso)^2 ~ 1))$coef[1:2] -mse_lasso_post <- summary(lm((y_test - yhat_rlasso_post)^2 ~ 1))$coef[1:2] - -r2_lasso <- 1 - mse_lasso[1] / var(y_test) -r2_lasso_post <- 1 - mse_lasso_post[1] / var(y_test) -# R^2 lasso/post-lasso (basic model) -cat("The R^2 using the basic model is equal to", r2_lasso, "for lasso and", r2_lasso_post, "for post-lasso") -``` - -Now, we repeat the same procedure for the flexible model. - -```{r} -fit_rlasso_flex <- hdm::rlasso(formula_flex, data_train, post = FALSE) -fit_rlasso_post_flex <- hdm::rlasso(formula_flex, data_train, post = TRUE) -yhat_rlasso_flex <- predict(fit_rlasso_flex, newdata = data_test) -yhat_rlasso_post_flex <- predict(fit_rlasso_post_flex, newdata = data_test) - -mse_lasso_flex <- summary(lm((y_test - yhat_rlasso_flex)^2 ~ 1))$coef[1:2] -mse_lasso_post_flex <- summary(lm((y_test - yhat_rlasso_post_flex)^2 ~ 1))$coef[1:2] - -# R^2 lasso/post-lasso (flexible model) -r2_lasso_flex <- 1 - mse_lasso_flex[1] / var(y_test) -r2_lasso_post_flex <- 1 - mse_lasso_post_flex[1] / var(y_test) -cat("The R^2 using the flexible model is equal to", r2_lasso_flex, - "for lasso and", r2_lasso_post_flex, "for post-lasso") -``` - -In contrast to a theoretical based choice of the tuning parameter $\lambda$ in the lasso regression, we can also use cross-validation to determine the penalty level by applying the package *glmnet* and the function cv.glmnet. In this context, we also run a ridge and a elastic net regression by adjusting the parameter *alpha*. 
- -```{r} -fit_lasso_cv <- cv.glmnet(model_x_basic_train, y_train, family = "gaussian", alpha = 1) -fit_ridge <- cv.glmnet(model_x_basic_train, y_train, family = "gaussian", alpha = 0) -fit_elnet <- cv.glmnet(model_x_basic_train, y_train, family = "gaussian", alpha = .5) - -yhat_lasso_cv <- predict(fit_lasso_cv, newx = model_x_basic_test) -yhat_ridge <- predict(fit_ridge, newx = model_x_basic_test) -yhat_elnet <- predict(fit_elnet, newx = model_x_basic_test) - -mse_lasso_cv <- summary(lm((y_test - yhat_lasso_cv)^2 ~ 1))$coef[1:2] -mse_ridge <- summary(lm((y_test - yhat_ridge)^2 ~ 1))$coef[1:2] -mse_elnet <- summary(lm((y_test - yhat_elnet)^2 ~ 1))$coef[1:2] - -r2_lasso_cv <- 1 - mse_lasso_cv[1] / var(y_test) -r2_ridge <- 1 - mse_ridge[1] / var(y_test) -r2_elnet <- 1 - mse_elnet[1] / var(y_test) - -# R^2 using cross-validation (basic model) -cat("R^2 using cross-validation for lasso, ridge and elastic net in the basic model:", - r2_lasso_cv, r2_ridge, r2_elnet) -``` - -Note that the following calculations for the flexible model need some computation time. - -```{r} -fit_lasso_cv_flex <- cv.glmnet(model_x_flex_train, y_train, family = "gaussian", alpha = 1) -fit_ridge_flex <- cv.glmnet(model_x_flex_train, y_train, family = "gaussian", alpha = 0) -fit_elnet_flex <- cv.glmnet(model_x_flex_train, y_train, family = "gaussian", alpha = .5) - -yhat_lasso_cv_flex <- predict(fit_lasso_cv_flex, newx = model_x_flex_test) -yhat_ridge_flex <- predict(fit_ridge_flex, newx = model_x_flex_test) -yhat_elnet_flex <- predict(fit_elnet_flex, newx = model_x_flex_test) - -mse_lasso_cv_flex <- summary(lm((y_test - yhat_lasso_cv_flex)^2 ~ 1))$coef[1:2] -mse_ridge_flex <- summary(lm((y_test - yhat_ridge_flex)^2 ~ 1))$coef[1:2] -mse_elnet_flex <- summary(lm((y_test - yhat_elnet_flex)^2 ~ 1))$coef[1:2] - -r2_lasso_cv_flex <- 1 - mse_lasso_cv_flex[1] / var(y_test) -r2_ridge_flex <- 1 - mse_ridge_flex[1] / var(y_test) -r2_elnet_flex <- 1 - mse_elnet_flex[1] / var(y_test) - -# R^2 using cross-validation (flexible model) -cat("R^2 using cross-validation for lasso, ridge and elastic net in the flexible model:", - r2_lasso_cv_flex, r2_ridge_flex, r2_elnet_flex) -``` - -The performance of the lasso regression with cross-validated penalty is quite similar to the performance of lasso using a theoretical based choice of the tuning parameter. - -#Non-linear models - -Besides linear regression models, we consider nonlinear regression models to build a predictive model. We are applying regression trees, random forests, boosted trees and neural nets to estimate the regression function $g(X)$. - -## Regression Trees - -We fit a regression tree to the training data using the basic model. The variable *cp* controls the complexity of the regression tree, i.e. how deep we build the tree. - -```{r} -# tree -fit_trees <- rpart(formula_basic, data_train, minbucket = 5, cp = 0.001) -# plotting the tree -prp(fit_trees, leaf.round = 1, space = 2, yspace = 2, split.space = 2, shadow.col = "gray", trace = 1) -``` - -An important method to improve predictive performance is called "Pruning the Tree". This -means the process of cutting down the branches of a tree. We apply pruning to the complex tree above to reduce the depth. Initially, we determine the optimal complexity of the regression tree. - -```{r} -bestcp <- fit_trees$cptable[which.min(fit_trees$cptable[, "xerror"]), "CP"] -bestcp -``` - -Now, we can prune the tree and visualize the prediction rule. 
- -```{r} -fit_prunedtree <- prune(fit_trees, cp = bestcp) -prp(fit_prunedtree, leaf.round = 1, space = 3, yspace = 3, split.space = 7, - shadow.col = "gray", trace = 1, yesno = 1) -``` - -Finally, we calculate the mean-squared error and the $R^2$ on the test sample to evaluate the out-of-sample performance of the pruned tree. - -```{r} -yhat_pt <- predict(fit_prunedtree, newdata = data_test) -mse_pt <- summary(lm((y_test - yhat_pt)^2 ~ 1))$coef[1:2] -r2_pt <- 1 - mse_pt[1] / var(y_test) - -# R^2 of the pruned tree -cat("R^2 of the pruned tree:", r2_pt) -``` - -## Random Forest and Boosted Trees - -In the next step, we apply the more advanced tree-based methods random forest and boosted trees. - -```{r} -# random forest -fit_rf <- randomForest(model_x_basic_train, y_train, ntree = 2000, nodesize = 20, data = data_train) - -## Evaluating the method -yhat_rf <- predict(fit_rf, newdata = model_x_basic_test) # prediction - -mse_rf <- summary(lm((y_test - yhat_rf)^2 ~ 1))$coef[1:2] -r2_rf <- 1 - mse_rf[1] / var(y_test) -``` - -```{r} -# boosting -fit_boost <- gbm(formula_basic, data = data_train, distribution = "gaussian", bag.fraction = .5, - interaction.depth = 2, n.trees = 1000, shrinkage = .01) -best_boost <- gbm.perf(fit_boost, plot.it = FALSE) # cross-validation to determine when to stop - -## Evaluating the method -yhat_boost <- predict(fit_boost, newdata = data_test, n.trees = best_boost) - -mse_boost <- summary(lm((y_test - yhat_boost)^2 ~ 1))$coef[1:2] -r2_boost <- 1 - mse_boost[1] / var(y_test) -``` - -```{r} -# printing R^2 -cat("R^2 of the random forest and boosted trees:", r2_rf, r2_boost) -``` - -## NNets - -First, we need to determine the structure of our network. We are using the R package *keras* to build a simple sequential neural network with three dense layers -- 2 hidden and one output layer. - -```{r} -# Define the neural network architecture -model <- keras_model_sequential() %>% - layer_dense(units = 50, activation = "relu", input_shape = dim(model_x_basic_train)[2]) %>% - layer_dense(units = 50, activation = "relu") %>% - layer_dense(units = 1) # Output layer with 1 unit for regression task - -# Compile the model -model %>% compile( - optimizer = optimizer_adam(lr = 0.01), - loss = "mse", - metrics = c("mae"), -) - -summary(model) -``` - -```{r} -num_epochs <- 100 - -# Define early stopping based on validation set (20%) performance -# Patience set to 5 epochs (default in skorch is 5) -early_stopping <- callback_early_stopping(monitor = "val_loss", patience = 5) - -# Train the model -model %>% fit( - model_x_basic_train, y_train, - epochs = num_epochs, - batch_size = 10, - validation_split = 0.2, # 20% validation set - verbose = 0, - callbacks = list(early_stopping) -) -``` - -```{r} -# evaluating the performance -model %>% evaluate(model_x_basic_test, y_test, verbose = 0) -``` - -```{r} -# Calculating the performance measures -yhat_nn <- model %>% predict(model_x_basic_test) -mse_nn <- summary(lm((y_test - yhat_nn)^2 ~ 1))$coef[1:2] -r2_nn <- 1 - mse_nn[1] / var(y_test) -# printing R^2 -cat("R^2 of the neural network:", r2_nn) -``` - -To conclude, let us have a look at our results. 
- -## Results - -```{r} -table <- matrix(0, 16, 3) -table[1, 1:2] <- mse_lm_basic -table[2, 1:2] <- mse_lm_flex -table[3, 1:2] <- mse_lasso -table[4, 1:2] <- mse_lasso_post -table[5, 1:2] <- mse_lasso_flex -table[6, 1:2] <- mse_lasso_post_flex -table[7, 1:2] <- mse_lasso_cv -table[8, 1:2] <- mse_ridge -table[9, 1:2] <- mse_elnet -table[10, 1:2] <- mse_lasso_cv_flex -table[11, 1:2] <- mse_ridge_flex -table[12, 1:2] <- mse_elnet_flex -table[13, 1:2] <- mse_rf -table[14, 1:2] <- mse_boost -table[15, 1:2] <- mse_pt -table[16, 1:2] <- mse_nn - - -table[1, 3] <- r2_lm_basic -table[2, 3] <- r2_lm_flex -table[3, 3] <- r2_lasso -table[4, 3] <- r2_lasso_post -table[5, 3] <- r2_lasso_flex -table[6, 3] <- r2_lasso_post_flex -table[7, 3] <- r2_lasso_cv -table[8, 3] <- r2_ridge -table[9, 3] <- r2_elnet -table[10, 3] <- r2_lasso_cv_flex -table[11, 3] <- r2_ridge_flex -table[12, 3] <- r2_elnet_flex -table[13, 3] <- r2_rf -table[14, 3] <- r2_boost -table[15, 3] <- r2_pt -table[16, 3] <- r2_nn - - -colnames(table) <- c("MSE", "S.E. for MSE", "R-squared") -rownames(table) <- c( - "Least Squares (basic)", "Least Squares (flexible)", "Lasso", "Post-Lasso", - "Lasso (flexible)", "Post-Lasso (flexible)", - "Cross-Validated lasso", "Cross-Validated ridge", "Cross-Validated elnet", - "Cross-Validated lasso (flexible)", "Cross-Validated ridge (flexible)", "Cross-Validated elnet (flexible)", - "Random Forest", "Boosted Trees", "Pruned Tree", "Neural Net (Early)" -) -tab <- xtable(table, digits = 3) -print(tab, type = "latex") # set type="latex" for printing table in LaTeX -tab -``` - -Above, we displayed the results for a single split of data into the training and testing part. The table shows the test MSE in column 1 as well as the standard error in column 2 and the test $R^2$ -in column 3. We see that most models perform similarly. For most of these methods, test MSEs are within one standard error of each other. Remarkably, OLS with just the basic variables performs extremely well. However, OLS on a flexible model with many regressors performs very poorly giving nearly the highest test MSE. It is worth noticing that, as this is just a simple illustration meant to be relatively quick, the nonlinear models are not tuned. Thus, there is potential to improve the performance of the nonlinear methods we used in the analysis. - -### Ensemble learning - -In the final step, we can build a prediction model by combing the strengths of the models we considered so far. This ensemble method is of the form - $$ f(x) = \sum_{k=1}^K \alpha_k f_k(x) $$ -where the $f_k$'s denote our prediction rules from the table above and the $\alpha_k$'s are the corresponding weights. - -We first estimate the weights without penalization. - -```{r} -ensemble_ols <- summary(lm(y_test ~ yhat_lm_basic + yhat_lm_flex + yhat_rlasso + yhat_rlasso_flex + - yhat_rlasso_post + yhat_rlasso_post_flex + yhat_lasso_cv + yhat_lasso_cv_flex + - yhat_ridge + yhat_ridge_flex + yhat_elnet + yhat_elnet_flex + - yhat_pt + yhat_rf + yhat_boost + yhat_nn)) -ensemble_ols -``` - -Alternatively, we can determine the weights via lasso regression. - -```{r} -ensemble_lasso <- summary(hdm::rlasso(y_test ~ yhat_lm_basic + yhat_lm_flex + yhat_rlasso + yhat_rlasso_flex + - yhat_rlasso_post + yhat_rlasso_post_flex + yhat_lasso_cv + yhat_lasso_cv_flex + - yhat_ridge + yhat_ridge_flex + yhat_elnet + yhat_elnet_flex + - yhat_pt + yhat_rf + yhat_boost + yhat_nn)) -ensemble_lasso -``` - -The estimated weights are shown in the following table. 
- -```{r} -table <- matrix(0, 17, 2) -table[1:17, 1] <- ensemble_ols$coef[1:17] -table[1:17, 2] <- ensemble_lasso$coef[1:17] - -colnames(table) <- c("Weight OLS", "Weight Lasso") - -rownames(table) <- c( - "Constant", "Least Squares (basic)", "Least Squares (flexible)", "Lasso (basic)", - "Lasso (flexible)", "Post-Lasso (basic)", "Post-Lasso (flexible)", "LassoCV (basic)", - "Lasso CV (flexible)", "Ridge CV (basic)", "Ridge CV (flexible)", "ElNet CV (basic)", - "ElNet CV (flexible)", "Pruned Tree", "Random Forest", "Boosted Trees", "Neural Net" -) -tab <- xtable(table, digits = 3) -print(tab, type = "latex") # set type="latex" for printing table in LaTeX -tab -``` - -We note the superior $R^2$ performance of the ensembles. Though for more unbiased performance evaluation, we should have left out a third sample to validate the performance of the stacked model. - -```{r} -# print ensemble R^2 -cat("R^2 of stacking with LS weights:", ensemble_ols$adj.r.squared, "\n") -cat("R^2 of stacking with Lasso weights:", ensemble_lasso$adj.r.squared, "\n") -``` - -# Automatic Machine Learning with H20 AutoML - -We illustrate how to predict an outcome variable Y in a high-dimensional setting, using the AutoML package *H2O* that covers the complete pipeline from the raw dataset to the deployable machine learning model. In last few years, AutoML or automated machine learning has become widely popular among data science community. Again, we reanalyze the wage prediction problem using data from the U.S. March Supplement of the Current Population Survey (CPS) in 2015. - -We can use AutoML as a benchmark and compare it to the methods that we used previously where we applied one machine learning method after the other. - -On a unix system, installation of the package h2o also requires the RCurl package, which requires the libcurl library to have been installed. If the installation of the h2o package fails, try installing the libcurl package first and repeating the cell. You can install the libcurl package for instance by running: -```bash -sudo apt-get install -y libcurl4-openssl-dev -``` - -```{r} -# load the H2O package -install.packages("h2o") -``` - -```{r} -library(h2o) -``` - -```{r} -# start h2o cluster -h2o.init() -``` - -```{r} -# convert data as h2o type -train_h <- as.h2o(data_train) -test_h <- as.h2o(data_test) - -# have a look at the data -h2o.describe(train_h) -``` - -```{r} -y_name <- "lwage" -x_names <- setdiff(names(data), c("lwage", "wage", "occ", "ind")) - -# run AutoML for 10 base models and a maximal runtime of 100 seconds -aml <- h2o.automl( - x = x_names, y = y_name, - training_frame = train_h, - leaderboard_frame = test_h, - max_models = 10, - seed = 1, - max_runtime_secs = 100 -) -# AutoML Leaderboard -lb <- aml@leaderboard -print(lb, n = nrow(lb)) -``` - -We see that two Stacked Ensembles are at the top of the leaderboard. Stacked Ensembles often outperform a single model. The out-of-sample (test) MSE of the leading model is given by - -```{r} -aml@leaderboard$mse[1] -``` - -The in-sample performance can be evaluated by - -```{r} -aml@leader -``` - -This is in line with our previous results. To understand how the ensemble works, let's take a peek inside the Stacked Ensemble "All Models" model. The "All Models" ensemble is an ensemble of all of the individual models in the AutoML run. This is often the top performing model on the leaderboard. 
- -```{r} -model_ids <- as.data.frame(aml@leaderboard$model_id)[, 1] -# Get the "All Models" Stacked Ensemble model -se <- h2o.getModel(grep("StackedEnsemble_AllModels", model_ids, value = TRUE)[1]) -# Get the Stacked Ensemble metalearner model -metalearner <- se@model$metalearner_model -h2o.varimp(metalearner) -``` - -The table above gives us the variable importance of the metalearner in the ensemble. The AutoML Stacked Ensembles use the default metalearner algorithm (GLM with non-negative weights), so the variable importance of the metalearner is actually the standardized coefficient magnitudes of the GLM. - -```{r} -h2o.varimp_plot(metalearner) -``` - -## Generating Predictions Using Leader Model - -We can also generate predictions on a test sample using the leader model object. - -```{r} -pred <- as.matrix(h2o.predict(aml@leader, test_h)) # make prediction using x data from the test sample -head(pred) -``` - -```{r} -y_test <- as.matrix(test_h$lwage) -r2_test <- 1 - summary(lm((y_test - pred)^2 ~ 1))$coef[1] / var(y_test) -cat("MSE, SE, R^2:", summary(lm((y_test - pred)^2 ~ 1))$coef[1:2], r2_test) -``` - -We observe both a similar MSE and $R^2$ relative to the better performing models in our previous results. - -```{r} -h2o.shutdown(prompt = FALSE) # shut down the h20 automatically without prompting user -``` - diff --git a/PM4/Old/deep-neural-networks-for-wage-prediction.irnb.Rmd b/PM4/Old/deep-neural-networks-for-wage-prediction.irnb.Rmd deleted file mode 100644 index 54c64d1b..00000000 --- a/PM4/Old/deep-neural-networks-for-wage-prediction.irnb.Rmd +++ /dev/null @@ -1,139 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - - - - -This notebook contains an example for teaching. - - - - -# Deep Neural Networks for Wage Prediction - - -So far we have considered many machine learning methods such as Lasso and Random Forests for building a predictive model. In this lab, we extend our toolbox by returning to our wage prediction problem and showing how a neural network can be used for prediction. - - -## Data preparation - - -Again, we consider data from the U.S. March Supplement of the Current Population Survey (CPS) in 2015. - -```{r} -load("../input/wage2015-inference/wage2015_subsample_inference.Rdata") -Z <- subset(data,select=-c(lwage,wage)) # regressors -``` - -First, we split the data first and normalize it. 
- -```{r} -# split the data into training and testing sets -set.seed(1234) -training <- sample(nrow(data), nrow(data)*(3/4), replace=FALSE) - -data_train <- data[training,1:16] -data_test <- data[-training,1:16] - -# data_train <- data[training,] -# data_test <- data[-training,] -# X_basic <- "sex + exp1 + exp2+ shs + hsg+ scl + clg + mw + so + we + occ2+ ind2" -# formula_basic <- as.formula(paste("lwage", "~", X_basic)) -# model_X_basic_train <- model.matrix(formula_basic,data_train)[,-1] -# model_X_basic_test <- model.matrix(formula_basic,data_test)[,-1] -# data_train <- as.data.frame(cbind(data_train$lwage,model_X_basic_train)) -# data_test <- as.data.frame(cbind(data_test$lwage,model_X_basic_test)) -# colnames(data_train)[1]<-'lwage' -# colnames(data_test)[1]<-'lwage' -``` - -```{r} -# normalize the data -mean <- apply(data_train, 2, mean) -std <- apply(data_train, 2, sd) -data_train <- scale(data_train, center = mean, scale = std) -data_test <- scale(data_test, center = mean, scale = std) -data_train <- as.data.frame(data_train) -data_test <- as.data.frame(data_test) -``` - -Then, we construct the inputs for our network. - -```{r} -X_basic <- "sex + exp1 + shs + hsg+ scl + clg + mw + so + we" -formula_basic <- as.formula(paste("lwage", "~", X_basic)) -model_X_basic_train <- model.matrix(formula_basic,data_train) -model_X_basic_test <- model.matrix(formula_basic,data_test) - -Y_train <- data_train$lwage -Y_test <- data_test$lwage -``` - -## Neural Networks - - -First, we need to determine the structure of our network. We are using the R package *keras* to build a simple sequential neural network with three dense layers and the ReLU activation function. - -```{r} -library(keras) - -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 20, activation = "relu", # ReLU activation function - input_shape = dim(model_X_basic_train)[2])%>% - layer_dense(units = 10, activation = "relu") %>% - layer_dense(units = 1) - - model %>% compile( - optimizer = optimizer_adam(lr = 0.005), # Adam optimizer - loss = "mse", - metrics = c("mae") - ) -} -``` - -Let us have a look at the structure of our network in detail. - -```{r} -model <- build_model() -summary(model) -``` - -We have $441$ trainable parameters in total. - - -Now, let us train the network. Note that this takes substantial computation time. To speed up the computation time, we use GPU as an accelerator. The extent of computational time improvements varies based on a number of factors, including model architecture, batch-size, input pipeline complexity, etc. - -```{r} -# training the network -num_epochs <- 1000 -model %>% fit(model_X_basic_train, Y_train, - epochs = num_epochs, batch_size = 100, verbose = 0) -``` - -After training the neural network, we can evaluate the performance of our model on the test sample. 
- -```{r} -# evaluating performance -model %>% evaluate(model_X_basic_test, Y_test, verbose = 0) -``` - -```{r} -# calculating the performance measures -pred.nn <- model %>% predict(model_X_basic_test) -MSE.nn = summary(lm((Y_test-pred.nn)^2~1))$coef[1:2] -R2.nn <- 1-MSE.nn[1]/var(Y_test) -# printing R^2 -cat("R^2 of the neural network:",R2.nn) -``` diff --git a/PM4/Old/double-lasso-for-the-convergence-hypothesis.irnb.Rmd b/PM4/Old/double-lasso-for-the-convergence-hypothesis.irnb.Rmd deleted file mode 100644 index 9629895f..00000000 --- a/PM4/Old/double-lasso-for-the-convergence-hypothesis.irnb.Rmd +++ /dev/null @@ -1,178 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - -This notebook contains an example for teaching. - - -# Double Lasso for Testing the Convergence Hypothesis - - -## Introduction - - -We provide an additional empirical example of partialling-out with Lasso to estimate the regression coefficient $\beta_1$ in the high-dimensional linear regression model: - $$ - Y = \beta_1 D + \beta_2'W + \epsilon. - $$ - -Specifically, we are interested in how the rates at which economies of different countries grow ($Y$) are related to the initial wealth levels in each country ($D$) controlling for country's institutional, educational, and other similar characteristics ($W$). - -The relationship is captured by $\beta_1$, the *speed of convergence/divergence*, which measures the speed at which poor countries catch up $(\beta_1< 0)$ or fall behind $(\beta_1> 0)$ rich countries, after controlling for $W$. Our inference question here is: do poor countries grow faster than rich countries, controlling for educational and other characteristics? In other words, is the speed of convergence negative: $ \beta_1 <0?$ This is the Convergence Hypothesis predicted by the Solow Growth Model. This is a structural economic model. Under some strong assumptions that we won't state here, the predictive exercise we are doing here can be given a causal interpretation. - - - -The outcome $Y$ is the realized annual growth rate of a country's wealth (Gross Domestic Product per capita). The target regressor ($D$) is the initial level of the country's wealth. The target parameter $\beta_1$ is the speed of convergence, which measures the speed at which poor countries catch up with rich countries. The controls ($W$) include measures of education levels, quality of institutions, trade openness, and political stability in the country. - - -## Data analysis - - - -We consider the data set GrowthData which is included in the package *hdm*. First, let us load the data set to get familiar with the data. - -```{r} -library(hdm) # package of ``high dimensional models (hdm)" estimators -growth <- GrowthData -attach(growth) -names(growth) -``` - -We determine the dimensions of our data set. - -```{r} -dim(growth) -``` - -The sample contains $90$ countries and $63$ controls. Thus $p \approx 60$, $n=90$ and $p/n$ is not small. We expect the least squares method to provide a poor estimate of $\beta_1$. We expect the method based on partialling-out with Lasso to provide a high quality estimate of $\beta_1$. - - -To check this hypothesis, we analyze the relationship between the country's growth rate $Y$ and the country's other characteristics by running a linear regression in the first step. 
- -```{r} -reg.ols <- lm(Outcome~.-1,data=growth) -``` - -We determine the regression coefficient $\beta_1$ of the target regressor *gdpsh465* (initial wealth level, $D$), its 95% confidence interval and the standard error. - -```{r} -est_ols <- summary(reg.ols)$coef["gdpsh465",1] -# output: estimated regression coefficient corresponding to the target regressor - -std_ols <- summary(reg.ols)$coef["gdpsh465",2] -# output: std. error - -ci_ols <- confint(reg.ols)[2,] -# output: 95% confidence interval - -results_ols <- as.data.frame(cbind(est_ols,std_ols,ci_ols[1],ci_ols[2])) -colnames(results_ols) <-c("estimator","standard error", "lower bound CI", "upper bound CI") -rownames(results_ols) <-c("OLS") -``` - -```{r} -library(xtable) -table <- matrix(0, 1, 4) -table[1,1:4] <- c(est_ols,std_ols,ci_ols[1],ci_ols[2]) -colnames(table) <-c("estimator","standard error", "lower bound CI", "upper bound CI") -rownames(table) <-c("OLS") -tab<- xtable(table, digits = 3) -print(tab,type="html") # set type="latex" for printing table in LaTeX -``` - - - - - - -
| | estimator | standard error | lower bound CI | upper bound CI |
|---|---|---|---|---|
| OLS | -0.009 | 0.030 | -0.071 | 0.052 |
- - -As expected, least squares provides a rather noisy estimate of the speed of convergence, and does not allow us to answer the question about the convergence hypothesis as the confidence interval includes zero. - - -In contrast, we can use the partialling-out approach based on lasso regression ("Double Lasso"). - -```{r} -Y <- growth[, 1, drop = F] # output variable -W <- as.matrix(growth)[, -c(1, 2,3)] # controls -D <- growth[, 3, drop = F] # target regressor -r.Y <- rlasso(x=W,y=Y)$res # creates the "residual" output variable -r.D <- rlasso(x=W,y=D)$res # creates the "residual" target regressor -partial.lasso <- lm(r.Y ~ r.D) -est_lasso <- partial.lasso$coef[2] -std_lasso <- summary(partial.lasso)$coef[2,2] -ci_lasso <- confint(partial.lasso)[2,] - -library(xtable) -table <- matrix(0, 1, 4) -table[1,1:4] <- c(est_lasso,std_lasso,ci_lasso[1],ci_lasso[2]) -colnames(table) <-c("estimator","standard error", "lower bound CI", "upper bound CI") -rownames(table) <-c("Double Lasso") -tab<- xtable(table, digits = 3) -print(tab,type="html") # set type="latex" for printing table in LaTeX -``` - - - - - - -
| | estimator | standard error | lower bound CI | upper bound CI |
|---|---|---|---|---|
| Double Lasso | -0.050 | 0.014 | -0.078 | -0.022 |
- - - -Lasso provides a more precise estimate (lower standard error). The Lasso based -point estimate is about $5\%$ and the $95\%$ confidence interval for the -(annual) rate of convergence is $7.8\%$ to $2.2\%$. This empirical -evidence does support the convergence hypothesis. - - -Note: Alternatively, one could also use the *rlassoEffect* funtion from the *hdm* package that directly applies the partialling-out approach. - -```{r} -lasso.effect = rlassoEffect(x = W, y = Y, d = D, method = "partialling out") -lasso.effect -``` - -## Summary - - - -Finally, let us have a look at the results. - -```{r} -library(xtable) -table <- matrix(0, 2, 4) -table[1,1:4] <- c(est_ols,std_ols,ci_ols[1],ci_ols[2]) -table[2,1:4] <- c(est_lasso,std_lasso,ci_lasso[1],ci_lasso[2]) -colnames(table) <-c("estimator","standard error", "lower bound CI", "upper bound CI") -rownames(table) <-c("OLS","Double Lasso") -tab<- xtable(table, digits = 3) -print(tab,type="html") # set type="latex" for printing table in LaTeX -``` - -The least square method provides a rather noisy estimate of the speed of convergence. We cannot answer the question of whether poor countries grow faster than rich countries. The least square method does not work when the ratio $p/n$ is large. - -In sharp contrast, partialling-out via Lasso provides a more precise estimate. The Lasso-based point estimate is $-5\%$ and the $95\%$ confidence interval for the (annual) rate of convergence $[-7.8\%,-2.2\%]$ only includes negative numbers. This empirical evidence does support the convergence hypothesis. - - - - - - - - - -
| | estimator | standard error | lower bound CI | upper bound CI |
|---|---|---|---|---|
| OLS | -0.009 | 0.030 | -0.071 | 0.052 |
| Double Lasso | -0.050 | 0.014 | -0.078 | -0.022 |
- diff --git a/PM4/Old/identification-analysis-of-401-k-example-w-dags.irnb.Rmd b/PM4/Old/identification-analysis-of-401-k-example-w-dags.irnb.Rmd deleted file mode 100644 index 44af9751..00000000 --- a/PM4/Old/identification-analysis-of-401-k-example-w-dags.irnb.Rmd +++ /dev/null @@ -1,213 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - -# Using Dagitty in the Analysis of Impact of 401(k) on Net Financial Wealth - - -```{r _uuid="051d70d956493feee0c6d64651c6a088724dca2a", _execution_state="idle"} -#install and load package -install.packages("dagitty") -install.packages("ggdag") -library(dagitty) -library(ggdag) - -``` - -# Graphs for 401(K) Analsyis - - - -# Here we have - # * $Y$ -- net financial assets; - # * $X$ -- worker characteristics (income, family size, other retirement plans; see lecture notes for details); - # * $F$ -- latent (unobserved) firm characteristics - # * $D$ -- 401(K) eligibility, deterimined by $F$ and $X$ - - -# State one graph (where F determines X) and plot it - - -```{r} -#generate a DAGs and plot them - -G1 = dagitty('dag{ -Y [outcome,pos="4, 0"] -D [exposure,pos="0, 0"] -X [confounder, pos="2,-2"] -F [uobserved, pos="0, -1"] -D -> Y -X -> D -F -> X -F -> D -X -> Y}') - - -ggdag(G1)+ theme_dag() -``` - -# List minimal adjustment sets to identify causal effecs $D \to Y$ - - - -```{r} -adjustmentSets( G1, "D", "Y",effect="total" ) -``` - -# What is the underlying principle? - -Here condition on X blocks backdoor paths from Y to D (Pearl). Dagitty correctly finds X (and does many more correct decisions, when we consider more elaborate structures. Why do we want to consider more elaborate structures? The very empirical problem requires us to do so! - - -# Another Graph (wherere $X$ determine $F$): - -```{r} -#generate a couple of DAGs and plot them - -G2 = dagitty('dag{ -Y [outcome,pos="4, 0"] -D [exposure,pos="0, 0"] -X [confounder, pos="2,-2"] -F [uobserved, pos="0, -1"] -D -> Y -X -> D -X -> F -F -> D -X -> Y}') - - -ggdag(G2)+ theme_dag() -``` - -```{r} -adjustmentSets( G2, "D", "Y", effect="total" ) - -``` - -# One more graph (encompassing previous ones), where (F, X) are jointly determined by latent factors $A$. We can allow in fact the whole triple (D, F, X) to be jointly determined by latent factors $A$. - - -This is much more realistic graph to consider. - -```{r} -G3 = dagitty('dag{ -Y [outcome,pos="4, 0"] -D [exposure,pos="0, 0"] -X [confounder, pos="2,-2"] -F [unobserved, pos="0, -1"] -A [unobserved, pos="-1, -1"] -D -> Y -X -> D -F -> D -A -> F -A -> X -A -> D -X -> Y}') - -adjustmentSets( G3, "D", "Y", effect="total" ) - -ggdag(G3)+ theme_dag() -``` - -# Threat to Idenitification: What if $F$ also directly affects $Y$? (Note that there are no valid adjustment sets in this case) - -```{r} -G4 = dagitty('dag{ -Y [outcome,pos="4, 0"] -D [exposure,pos="0, 0"] -X [confounder, pos="2,-2"] -F [unobserved, pos="0, -1"] -A [unobserved, pos="-1, -1"] -D -> Y -X -> D -F -> D -A -> F -A -> X -A -> D -F -> Y -X -> Y}') - - -ggdag(G4)+ theme_dag() -``` - -```{r} -adjustmentSets( G4, "D", "Y",effect="total" ) - -``` - -Note that no output means that there is no valid adustment set (among observed variables) - - -# How can F affect Y directly? Is it reasonable? - - -# Introduce Match Amount $M$ (very important mediator, why mediator?). $M$ is not observed. 
Luckily adjusting for $X$ still works if there is no $F \to M$ arrow. - -```{r} -G5 = dagitty('dag{ -Y [outcome,pos="4, 0"] -D [exposure,pos="0, 0"] -X [confounder, pos="2,-2"] -F [unobserved, pos="0, -1"] -A [unobserved, pos="-1, -1"] -M [unobserved, pos="2, -.5"] -D -> Y -X -> D -F -> D -A -> F -A -> X -A -> D -D -> M -M -> Y -X -> M -X -> Y}') - -print( adjustmentSets( G5, "D", "Y",effect="total" ) ) - -ggdag(G5)+ theme_dag() -``` - -# If there is $F \to M$ arrow, then adjusting for $X$ is not sufficient. - -```{r} -G6 = dagitty('dag{ -Y [outcome,pos="4, 0"] -D [exposure,pos="0, 0"] -X [confounder, pos="2,-2"] -F [unobserved, pos="0, -1"] -A [unobserved, pos="-1, -1"] -M [uobserved, pos="2, -.5"] -D -> Y -X -> D -F -> D -A -> F -A -> X -D -> M -F -> M -A -> D -M -> Y -X -> M -X -> Y}') - -print( adjustmentSets( G6, "D", "Y" ),effect="total" ) - -ggdag(G6)+ theme_dag() -``` - - # Question: - -Given the analysis above, do you find the adjustment for workers' characteristics a credible strategy to identify the causal (total effect) of 401 (k) elligibility on net financial wealth? - - * If yes, click an "upvote" button at the top - * If no, please click an "upvote" button at the top diff --git a/PM4/r-dml-401k.Rmd b/PM4/r-dml-401k.Rmd deleted file mode 100644 index ed199079..00000000 --- a/PM4/r-dml-401k.Rmd +++ /dev/null @@ -1,1019 +0,0 @@ ---- -title: An R Markdown document converted from "PM4/r-dml-401k.irnb" -output: html_document ---- - -# Inference on Predictive and Causal Effects in High-Dimensional Nonlinear Models - -## Impact of 401(k) on Financial Wealth - -As a practical illustration of the methods developed in this lecture, we consider estimation of the effect of 401(k) eligibility and participation -on accumulated assets. 401(k) plans are pension accounts sponsored by employers. The key problem in determining the effect of participation in 401(k) plans on accumulated assets is saver heterogeneity coupled with the fact that the decision to enroll in a 401(k) is non-random. It is generally recognized that some people have a higher preference for saving than others. It also seems likely that those individuals with high unobserved preference for saving would be most likely to choose to participate in tax-advantaged retirement savings plans and would tend to have otherwise high amounts of accumulated assets. The presence of unobserved savings preferences with these properties then implies that conventional estimates that do not account for saver heterogeneity and endogeneity of participation will be biased upward, tending to overstate the savings effects of 401(k) participation. - -One can argue that eligibility for enrolling in a 401(k) plan in this data can be taken as exogenous after conditioning on a few observables of which the most important for their argument is income. The basic idea is that, at least around the time 401(k)’s initially became available, people were unlikely to be basing their employment decisions on whether an employer offered a 401(k) but would instead focus on income and other aspects of the job. 
- -```{r} -install.packages("xtable") -install.packages("hdm") -install.packages("sandwich") -install.packages("ggplot2") -install.packages("randomForest") -install.packages("glmnet") -install.packages("rpart") -install.packages("gbm") -``` - -```{r} -library(xtable) -library(hdm) -library(sandwich) -library(ggplot2) -library(randomForest) -library(data.table) -library(glmnet) -library(rpart) -library(gbm) -``` - -### Data - -The raw dataset can be found [here](https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/401k.csv). -The data set can be loaded from the `hdm` package for R directly by typing: - - -```{r} -data(pension) -data <- pension -dim(data) -``` - -See the "Details" section on the description of the data set, which can be accessed by - -```{r} -help(pension) -``` - -The data consist of 9,915 observations at the household level drawn from the 1991 Survey of Income and Program Participation (SIPP). All the variables are referred to 1990. We use net financial assets (*net\_tfa*) as the outcome variable, $Y$, in our analysis. The net financial assets are computed as the sum of IRA balances, 401(k) balances, checking accounts, saving bonds, other interest-earning accounts, other interest-earning assets, stocks, and mutual funds less non mortgage debts. - -Among the $9915$ individuals, $3682$ are eligible to participate in the program. The variable *e401* indicates eligibility and *p401* indicates participation, respectively. - -```{r} -hist_e401 <- ggplot(data, aes(x = e401, fill = factor(e401))) + - geom_bar() -hist_e401 -``` - -Eligibility is highly associated with financial wealth: - -```{r} -dens_net_tfa <- ggplot(data, aes(x = net_tfa, color = factor(e401), fill = factor(e401))) + - geom_density() + - xlim(c(-20000, 150000)) + - facet_wrap(. ~ e401) - -dens_net_tfa -``` - -The unconditional APE of e401 is about $19559$: - -```{r} -e1 <- data[data$e401 == 1, ] -e0 <- data[data$e401 == 0, ] -round(mean(e1$net_tfa) - mean(e0$net_tfa), 0) -``` - -Among the $3682$ individuals that are eligible, $2594$ decided to participate in the program. The unconditional APE of p401 is about $27372$: - -```{r} -p1 <- data[data$p401 == 1, ] -p0 <- data[data$p401 == 0, ] -round(mean(p1$net_tfa) - mean(p0$net_tfa), 0) -``` - -As discussed, these estimates are biased since they do not account for saver heterogeneity and endogeneity of participation. - -```{r} -# outcome variable -y <- data[, "net_tfa"] -# treatment variable -D <- data[, "e401"] -D2 <- data[, "p401"] -D3 <- data[, "a401"] - -columns_to_drop <- c( - "e401", "p401", "a401", "tw", "tfa", "net_tfa", "tfa_he", - "hval", "hmort", "hequity", - "nifa", "net_nifa", "net_n401", "ira", - "dum91", "icat", "ecat", "zhat", - "i1", "i2", "i3", "i4", "i5", "i6", "i7", - "a1", "a2", "a3", "a4", "a5" -) - -# covariates -X <- data[, !(names(data) %in% columns_to_drop)] -``` - -```{r} -# Constructing the controls -x_formula <- paste("~ 0 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) ", - "+ poly(fsize, 2, raw=TRUE) + male + marr + twoearn + db + pira + hown") -X <- as.data.table(model.frame(x_formula, X)) -head(X) -``` - -## Estimating the ATE of 401(k) Eligibility on Net Financial Assets - -We are interested in valid estimators of the average treatment effect of `e401` and `p401` on `net_tfa`. 
We start using ML approaches to estimate the function $g_0$ and $m_0$ in the following PLR model: - -\begin{align} - & Y = D\theta_0 + g_0(X) + \zeta, & E[\zeta \mid D,X]= 0,\\ - & D = m_0(X) + V, & E[V \mid X] = 0. -\end{align} - -## Partially Linear Regression Models (PLR) - -```{r} -dml2_for_plm <- function(x, d, y, dreg, yreg, nfold = 3, method = "regression") { - nobs <- nrow(x) # number of observations - foldid <- rep.int(1:nfold, times = ceiling(nobs / nfold))[sample.int(nobs)] # define folds indices - I <- split(1:nobs, foldid) # split observation indices into folds - ytil <- dtil <- rep(NA, nobs) - cat("fold: ") - for (b in seq_along(I)) { - if (method == "regression") { - dfit <- dreg(x[-I[[b]], ], d[-I[[b]]]) # take a fold out - yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a foldt out - dhat <- predict(dfit, x[I[[b]], ], type = "response") # predict the left-out fold - yhat <- predict(yfit, x[I[[b]], ], type = "response") # predict the left-out fold - dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold - ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold - } else if (method == "randomforest") { - dfit <- dreg(x[-I[[b]], ], as.factor(d)[-I[[b]]]) # take a fold out - yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a fold out - dhat <- predict(dfit, x[I[[b]], ], type = "prob")[, 2] # predict the left-out fold - yhat <- predict(yfit, x[I[[b]], ], type = "response") # predict the left-out fold - dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold - ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold - } else if (method == "decisiontrees") { - dfit <- dreg(x[-I[[b]], ], as.factor(d)[-I[[b]]]) # take a fold out - yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a fold out - dhat <- predict(dfit, x[I[[b]], ])[, 2] # predict the left-out fold - yhat <- predict(yfit, x[I[[b]], ]) # predict the left-out fold - dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold - ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold - } else if (method == "boostedtrees") { - dfit <- dreg(x[-I[[b]], ], d[-I[[b]]]) # take a fold out - yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a fold out - dhat <- predict(dfit, x[I[[b]], ], type = "response") # predict the left-out fold - yhat <- predict(yfit, x[I[[b]], ], type = "response") # predict the left-out fold - dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold - ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold - } - cat(b, " ") - } - rfit <- lm(ytil ~ dtil) # estimate the main parameter by regressing one residual on the other - coef_est <- coef(rfit)[2] # extract coefficient - se <- sqrt(vcovHC(rfit)[2, 2]) # record robust standard error - cat(sprintf("\ncoef (se) = %g (%g)\n", coef_est, se)) # printing output - return(list(coef_est = coef_est, se = se, dtil = dtil, ytil = ytil)) # save output and residuals -} -``` - -```{r} -summaryPLR <- function(point, stderr, resD, resy, name) { - data <- data.frame( - estimate = point, # point estimate - stderr = stderr, # standard error - lower = point - 1.96 * stderr, # lower end of 95% confidence interval - upper = point + 1.96 * stderr, # upper end of 95% confidence interval - `rmse y` = sqrt(mean(resy^2)), # RMSE of model that predicts outcome y - `rmse D` = sqrt(mean(resD^2)), # RMSE of model that predicts treatment D - `accuracy D` = mean(abs(resD) < 0.5) # binary classification accuracy of model for D - ) - rownames(data) <- name - 
return(data) -} -``` - -#### Double Lasso with Cross-Fitting - -```{r} -# DML with LassoCV -set.seed(123) -cat(sprintf("\nDML with Lasso CV \n")) - -dreg_lasso_cv <- function(x, d) { - cv.glmnet(x, d, family = "gaussian", alpha = 1, nfolds = 5) -} -yreg_lasso_cv <- function(x, y) { - cv.glmnet(x, y, family = "gaussian", alpha = 1, nfolds = 5) -} - -dml2_results <- dml2_for_plm(as.matrix(X), D, y, dreg_lasso_cv, yreg_lasso_cv, nfold = 5) - -sum_lasso_cv <- summaryPLR(dml2_results$coef_est, dml2_results$se, dml2_results$dtil, - dml2_results$ytil, name = "LassoCV") -tableplr <- data.frame() -tableplr <- rbind(sum_lasso_cv) -tableplr -``` - -```{r} -# Because residuals are output, reconstruct fitted values for use in ensemble -dhat_lasso <- D - dml2_results$dtil -yhat_lasso <- y - dml2_results$ytil -``` - -#### Using a $\ell_2$ Penalized Logistic Regression for D - -Note we are using the $\ell_2$ penalty here. You can use the $\ell_1$ penalty as well, but computation will take longer. - -```{r} -# DML with Lasso/Logistic -set.seed(123) -cat(sprintf("\nDML with Lasso/Logistic \n")) - -dreg_logistic_cv <- function(x, d) { - cv.glmnet(x, d, family = "binomial", alpha = 0, nfolds = 5) -} -yreg_lasso_cv <- function(x, y) { - cv.glmnet(x, y, family = "gaussian", alpha = 1, nfolds = 5) -} - -dml2_results <- dml2_for_plm(as.matrix(X), D, y, dreg_logistic_cv, yreg_lasso_cv, nfold = 5) -sum_lasso_logistic_cv <- summaryPLR(dml2_results$coef_est, dml2_results$se, dml2_results$dtil, - dml2_results$ytil, name = "LassoCV/LogisticCV") -tableplr <- rbind(tableplr, sum_lasso_logistic_cv) -tableplr -``` - -```{r} -# Because residuals are output, reconstruct fitted values for use in ensemble -dhat_lasso_logistic <- D - dml2_results$dtil -yhat_lasso_logistic <- y - dml2_results$ytil -``` - -#### Random Forests - -```{r} -# DML with Random Forest -set.seed(123) -cat(sprintf("\nDML with Random Forest \n")) - -dreg_rf <- function(x, d) { - randomForest(x, d, ntree = 1000, nodesize = 10) -} # ML method=Forest -yreg_rf <- function(x, y) { - randomForest(x, y, ntree = 1000, nodesize = 10) -} # ML method=Forest - -dml2_results <- dml2_for_plm(as.matrix(X), D, y, dreg_rf, yreg_rf, nfold = 5, method = "randomforest") -sum_rf <- summaryPLR(dml2_results$coef_est, dml2_results$se, dml2_results$dtil, - dml2_results$ytil, name = "Random Forest") -tableplr <- rbind(tableplr, sum_rf) -tableplr -``` - -```{r} -# Because residuals are output, reconstruct fitted values for use in ensemble -dhat_rf <- D - dml2_results$dtil -dhat_rf <- y - dml2_results$ytil -``` - -#### Decision Trees - -```{r} -# DML with Decision Trees -set.seed(123) -cat(sprintf("\nDML with Decision Trees \n")) - -dreg_tr <- function(x, d) { - rpart(as.formula("D~."), cbind(data.frame(D = d), x), method = "class", minbucket = 10, cp = 0.001) -} -dreg_tr <- function(x, y) { - rpart(as.formula("y~."), cbind(data.frame(y = y), x), minbucket = 10, cp = 0.001) -} - -# decision tree takes in X as dataframe, not matrix/array -dml2_results <- dml2_for_plm(X, D, y, dreg_tr, dreg_tr, nfold = 5, method = "decisiontrees") -sum_tr <- summaryPLR(dml2_results$coef_est, dml2_results$se, dml2_results$dtil, - dml2_results$ytil, name = "Decision Trees") -tableplr <- rbind(tableplr, sum_tr) -tableplr -``` - -```{r} -# Because residuals are output, reconstruct fitted values for use in ensemble -dhat_tr <- D - dml2_results$dtil -yhat_tr <- y - dml2_results$ytil -``` - - -Ideally, we would do (semi) cross-fitting with AutoML in order to find good first-stage models and re-run DML with 
these models. Unfortunately this is not easy to do in R. In the case of semi cross-fitting, we can use R's H20 AutoML trained on the entire training set $y\sim X$, $D \sim X$, $Z\sim X$ to determine the best model (eg ensemble), but H20 does not allow you to extract said model so we can re-use that in DML. - -#### Boosted Trees - -```{r} -# DML with Boosted Trees -set.seed(123) -cat(sprintf("\nDML with Boosted Trees \n")) - -# NB: early stopping cannot easily be implemented with gbm -## set n.trees = best, where best <- gbm.perf(dreg_boost, plot.it = FALSE) -dreg_boost <- function(x, d) { - gbm(as.formula("D~."), cbind(data.frame(D = d), x), distribution = "bernoulli", - interaction.depth = 2, n.trees = 100, shrinkage = .1) -} -yreg_boost <- function(x, y) { - gbm(as.formula("y~."), cbind(data.frame(y = y), x), distribution = "gaussian", - interaction.depth = 2, n.trees = 100, shrinkage = .1) -} - -# passing these through regression as type="response", and D should not be factor! -dml2_results <- dml2_for_plm(X, D, y, dreg_boost, yreg_boost, nfold = 5, method = "boostedtrees") -sum_boost <- summaryPLR(dml2_results$coef_est, dml2_results$se, dml2_results$dtil, - dml2_results$ytil, name = "Boosted Trees") -tableplr <- rbind(tableplr, sum_boost) -tableplr -``` - -```{r} -# Because residuals are output, reconstruct fitted values for use in ensemble -dhat_boost <- D - dml2_results$dtil -yhat_boost <- y - dml2_results$ytil -``` - -## Ensembles - -Boosted trees give the best RMSE for both Y and D, so the ensemble based on choosing the best performing prediction rule is identical to boosting in this case. - -```{r} -# Best fit is boosted trees for both D and Y - -sum_best <- summaryPLR(dml2_results$coef_est, dml2_results$se, dml2_results$dtil, - dml2_results$ytil, name = "Best") -tableplr <- rbind(tableplr, sum_best) -tableplr -``` - -We'll form a model average with unconstrained least squares weights. - -```{r} -# Least squares model average - -ma_dtil <- lm(D ~ dhat_lasso + dhat_lasso_logistic + dhat_rf + dhat_tr + dhat_boost)$residuals -ma_ytil <- lm(y ~ yhat_lasso + yhat_lasso_logistic + dhat_rf + yhat_tr + yhat_boost)$residuals - -rfit <- lm(ma_ytil ~ ma_dtil) # estimate the main parameter by regressing one residual on the other -coef_est <- coef(rfit)[2] # extract coefficient -se <- sqrt(vcovHC(rfit)[2, 2]) # record robust standard error - -sum.ma <- summaryPLR(coef_est, se, ma_dtil, ma_ytil, name = "Model Average") -tableplr <- rbind(tableplr, sum.ma) -tableplr -``` - -## Interactive Regression Model (IRM) - -Next, we consider estimation of average treatment effects when treatment effects are fully heterogeneous: - - \begin{align} - & Y = g_0(D, X) + U, & \quad E[U \mid X, D]= 0,\\ - & D = m_0(X) + V, & \quad E[V\mid X] = 0. -\end{align} - -To reduce the disproportionate impact of extreme propensity score weights in the interactive model -we trim the propensity scores which are close to the bounds. 
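Concretely, the cross-fitting function below estimates the ATE by averaging the doubly robust (AIPW) score

$$
\hat{\psi}_i \;=\; \hat{g}(1,X_i) - \hat{g}(0,X_i) \;+\; \frac{D_i\,\big(Y_i - \hat{g}(1,X_i)\big)}{\hat{m}(X_i)} \;-\; \frac{(1-D_i)\,\big(Y_i - \hat{g}(0,X_i)\big)}{1-\hat{m}(X_i)},
$$

where $\hat{g}(d,X_i)$ and $\hat{m}(X_i)$ are the cross-fitted outcome and propensity score predictions, and $\hat{m}(X_i)$ is truncated to $[\varepsilon, 1-\varepsilon]$ with trimming threshold $\varepsilon = 0.01$ by default. The point estimate is $\hat{\theta} = \frac{1}{n}\sum_{i=1}^n \hat{\psi}_i$ and its standard error is $\sqrt{\widehat{\mathrm{Var}}(\hat{\psi}_i)/n}$.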
- -```{r} -dml2_for_irm <- function(x, d, y, dreg, yreg0, yreg1, trimming = 0.01, nfold = 5, method = "regression") { - yhat0 <- rep(0, length(y)) - yhat1 <- rep(0, length(y)) - Dhat <- rep(0, length(d)) - - nobs <- nrow(x) # number of observations - foldid <- rep.int(1:nfold, times = ceiling(nobs / nfold))[sample.int(nobs)] # define folds indices - I <- split(1:nobs, foldid) # split observation indices into folds - ytil <- dtil <- rep(NA, nobs) - - cat("fold: ") - for (b in seq_along(I)) { - # define helpful variables - Dnotb <- d[-I[[b]]] - Xb <- X[I[[b]], ] - Xnotb <- X[-I[[b]], ] - - # training dfs subsetted on the -I[[b]] fold - XD0 <- X[-I[[b]], ][d[-I[[b]]] == 0] - yD0 <- y[-I[[b]]][d[-I[[b]]] == 0] - XD1 <- X[-I[[b]], ][d[-I[[b]]] == 1] - yD1 <- y[-I[[b]]][d[-I[[b]]] == 1] - - if (method == "regression") { - yfit0 <- yreg0(as.matrix(XD0), yD0) - yfit1 <- yreg1(as.matrix(XD1), yD1) - yhat0[I[[b]]] <- predict(yfit0, as.matrix(Xb)) # default is type = "response" for glmnet family gaussian - yhat1[I[[b]]] <- predict(yfit1, as.matrix(Xb)) - } else if (method == "randomforest") { - yfit0 <- yreg0(XD0, yD0) - yfit1 <- yreg1(XD1, yD1) - yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = "response" for rf - yhat1[I[[b]]] <- predict(yfit1, Xb) - } else if (method == "decisiontrees") { - yfit0 <- yreg0(XD0, yD0) - yfit1 <- yreg1(XD1, yD1) - yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = "vector" for decision - yhat1[I[[b]]] <- predict(yfit1, Xb) - } else if (method == "boostedtrees") { - yfit0 <- yreg0(as.data.frame(XD0), yD0) - yfit1 <- yreg1(as.data.frame(XD1), yD1) - yhat0[I[[b]]] <- predict(yfit0, Xb) # default is type = "response" for boosted - yhat1[I[[b]]] <- predict(yfit1, Xb) - } - - # propensity scores: - if (method == "regression") { - dfit_b <- dreg(as.matrix(Xnotb), Dnotb) - dhat_b <- predict(dfit_b, as.matrix(Xb), type = "response") # default is type="link" for family binomial! 
- } else if (method == "randomforest") { - dfit_b <- dreg(Xnotb, as.factor(Dnotb)) - dhat_b <- predict(dfit_b, Xb, type = "prob")[, 2] - } else if (method == "decisiontrees") { - dfit_b <- dreg(Xnotb, Dnotb) - dhat_b <- predict(dfit_b, Xb)[, 2] - } else if (method == "boostedtrees") { - dfit_b <- dreg(as.data.frame(Xnotb), Dnotb) - dhat_b <- predict(dfit_b, Xb, type = "response") - } - dhat_b <- pmax(pmin(dhat_b, 1 - trimming), trimming) # trimming so scores are between [trimming, (1-trimming)] - Dhat[I[[b]]] <- dhat_b - - cat(b, " ") - } - - # Prediction of treatment and outcome for observed instrument - yhat <- yhat0 * (1 - D) + yhat1 * D - # residuals - ytil <- y - yhat - dtil <- D - Dhat - # doubly robust quantity for every sample - drhat <- yhat1 - yhat0 + (y - yhat) * (D / Dhat - (1 - D) / (1 - Dhat)) - coef_est <- mean(drhat) - vari <- var(drhat) - se <- sqrt(vari / nrow(X)) - cat("point", coef_est) - cat("se", se) - return(list(coef_est = coef_est, se = se, ytil = ytil, dtil = dtil, drhat = drhat, - yhat0 = yhat0, yhat1 = yhat1, dhat = Dhat, yhat = yhat)) -} -``` - -```{r} -summaryIRM <- function(coef_est, se, ytil, dtil, drhat, name) { - summary_data <- data.frame( - estimate = coef_est, # point estimate - se = se, # standard error - lower = coef_est - 1.96 * se, # lower end of 95% confidence interval - upper = coef_est + 1.96 * se, # upper end of 95% confidence interval - rmse_y = sqrt(mean(ytil^2)), # res of model that predicts outcome y - rmse_D = sqrt(mean(dtil^2)), # res of model that predicts treatment D - accuracy_D = mean(abs(dtil) < 0.5) # binary classification accuracy of model for D - ) - row.names(summary_data) <- name - return(summary_data) -} -``` - -#### Repeat analysis in the IRM setting. - -```{r} -# DML with Lasso/Logistic -set.seed(123) -cat(sprintf("\nDML with LassoCV/Logistic \n")) - -dreg_lasso_cv <- function(x, d) { - cv.glmnet(x, d, family = "binomial", alpha = 0, nfolds = 5) -} -yreg0_lasso_cv <- function(x, y) { - cv.glmnet(x, y, family = "gaussian", alpha = 1, nfolds = 5) -} -yreg1_lasso_cv <- function(x, y) { - cv.glmnet(x, y, family = "gaussian", alpha = 1, nfolds = 5) -} - -# more folds seems to help stabilize finite sample performance -dml2_results <- dml2_for_irm(X, D, y, dreg_lasso_cv, yreg0_lasso_cv, yreg1_lasso_cv, nfold = 5) -sum_lasso_cv <- summaryIRM(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, - dml2_results$drhat, name = "LassoCVLogistic") -tableirm <- data.frame() -tableirm <- rbind(sum_lasso_cv) -tableirm - -yhat0_lasso <- dml2_results$yhat0 -yhat1_lasso <- dml2_results$yhat1 -dhat_lasso <- dml2_results$dhat -yhat_lasso <- dml2_results$yhat -``` - -```{r} -# DML with Random Forest -set.seed(123) -cat(sprintf("\nDML with Random Forest \n")) - -dreg_rf <- function(x, d) { - randomForest(x, d, ntree = 1000, nodesize = 10) -} # ML method=Forest -yreg0_rf <- function(x, y) { - randomForest(x, y, ntree = 1000, nodesize = 10) -} # ML method=Forest -yreg1_rf <- function(x, y) { - randomForest(x, y, ntree = 1000, nodesize = 10) -} # ML method=Forest - - -dml2_results <- dml2_for_irm(as.matrix(X), D, y, dreg_rf, yreg0_rf, yreg1_rf, nfold = 5, method = "randomforest") -sum_rf <- summaryIRM(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, - dml2_results$drhat, name = "Random Forest") -tableirm <- rbind(tableirm, sum_rf) -tableirm - -yhat0_rf <- dml2_results$yhat0 -yhat1_rf <- dml2_results$yhat1 -dhat_rf <- dml2_results$dhat -dhat_rf <- dml2_results$yhat -``` - -```{r} -# DML with Decision 
Trees -set.seed(123) -cat(sprintf("\nDML with Decision Trees \n")) - -dreg_tr <- function(x, d) { - rpart(as.formula("D~."), cbind(data.frame(D = d), x), method = "class", minbucket = 10, cp = 0.001) -} -yreg0_tr <- function(x, y) { - rpart(as.formula("y~."), cbind(data.frame(y = y), x), minbucket = 10, cp = 0.001) -} -yreg1_tr <- function(x, y) { - rpart(as.formula("y~."), cbind(data.frame(y = y), x), minbucket = 10, cp = 0.001) -} - -dml2_results <- dml2_for_irm(X, D, y, dreg_tr, yreg0_tr, yreg1_tr, nfold = 5, method = "decisiontrees") -sum_tr <- summaryIRM(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, - dml2_results$drhat, name = "Decision Trees") -tableirm <- rbind(tableirm, sum_tr) -tableirm - -yhat0_tr <- dml2_results$yhat0 -yhat1_tr <- dml2_results$yhat1 -dhat_tr <- dml2_results$dhat -yhat_tr <- dml2_results$yhat -``` - -```{r} -# DML with Boosted Trees -set.seed(123) -cat(sprintf("\nDML with Boosted Trees \n")) - -# NB: early stopping cannot easily be implemented with gbm -## set n.trees = best, where best <- gbm.perf(dreg_boost, plot.it = FALSE) -dreg_boost <- function(x, d) { - gbm(as.formula("D~."), cbind(data.frame(D = d), x), distribution = "bernoulli", - interaction.depth = 2, n.trees = 100, shrinkage = .1) -} -yreg0_boost <- function(x, y) { - gbm(as.formula("y~."), cbind(data.frame(y = y), x), distribution = "gaussian", - interaction.depth = 2, n.trees = 100, shrinkage = .1) -} -yreg1_boost <- function(x, y) { - gbm(as.formula("y~."), cbind(data.frame(y = y), x), distribution = "gaussian", - interaction.depth = 2, n.trees = 100, shrinkage = .1) -} - -# passing these through regression as type="response", and D should not be factor! -dml2_results <- dml2_for_irm(X, D, y, dreg_boost, yreg0_boost, yreg1_boost, nfold = 5, method = "boostedtrees") -sum_boost <- summaryIRM(dml2_results$coef_est, dml2_results$se, dml2_results$ytil, dml2_results$dtil, - dml2_results$drhat, name = "Boosted Trees") -tableirm <- rbind(tableirm, sum_boost) -tableirm - -yhat0_boost <- dml2_results$yhat0 -yhat1_boost <- dml2_results$yhat1 -dhat_boost <- dml2_results$dhat -yhat_boost <- dml2_results$yhat -``` - -```{r} -# Ensembles - -# Best -# We'll look at model that does best for Y overall. Could also use different model for Y0 and Y1 -# Here, the best performance for Y is the random forest and for D the boosted tree - -# residuals -ytil <- y - dhat_rf -dtil <- D - dhat_boost -# doubly robust quantity for every sample -drhat <- yhat1_rf - yhat0_rf + (y - dhat_rf) * (D / dhat_boost - (1 - D) / (1 - dhat_boost)) -coef_est <- mean(drhat) -vari <- var(drhat) -se <- sqrt(vari / nrow(X)) - -sum_best <- summaryIRM(coef_est, se, ytil, dtil, drhat, name = "Best") -tableirm <- rbind(tableirm, sum_best) -tableirm -``` - -```{r} -# Least squares model average -# We'll look at weights that do best job for Y overall. 
Could also use different weights for Y0 and Y1 - -ma_dw <- lm(D ~ dhat_lasso + dhat_rf + dhat_tr + dhat_boost)$coef -ma_yw <- lm(y ~ yhat_lasso + dhat_rf + yhat_tr + yhat_boost)$coef - -Dhats <- cbind(as.matrix(rep(1, nrow(X))), dhat_lasso, dhat_rf, dhat_tr, dhat_boost) -Y0s <- cbind(as.matrix(rep(1, nrow(X))), yhat0_lasso, yhat0_rf, yhat0_tr, yhat0_boost) -Y1s <- cbind(as.matrix(rep(1, nrow(X))), yhat1_lasso, yhat1_rf, yhat1_tr, yhat1_boost) - -dhat <- Dhats %*% as.matrix(ma_dw) -yhat0 <- Y0s %*% as.matrix(ma_yw) -yhat1 <- Y1s %*% as.matrix(ma_yw) - -# Prediction of treatment and outcome for observed instrument -yhat <- yhat0 * (1 - D) + yhat1 * D -# residuals -ytil <- y - yhat -dtil <- D - dhat -# doubly robust quantity for every sample -drhat <- yhat1 - yhat0 + (y - yhat) * (D / dhat - (1 - D) / (1 - dhat)) -coef_est <- mean(drhat) -vari <- var(drhat) -se <- sqrt(vari / nrow(X)) - -sum.ma <- summaryIRM(coef_est, se, ytil, dtil, drhat, name = "Model Average") -tableirm <- rbind(tableirm, sum.ma) -tableirm -``` - -## Double ML package - -We are interested in valid estimators of the average treatment effect of `e401` and `p401` on `net_tfa`. There exist nice packages out there that can help us do our estimation with the simple call of a function. Such packages include `EconML` (Python) and `DoubleML` (Python and R). - -We run through PLR and IRM using `DoubleML` below to illustrate. The `DoubleML` package internally builds on `mlr3`. We use the meta package `mlr3` to generate predictions with machine learning methods. A comprehensive introduction and description of the `mlr3` package is provided in the [mlr3book](https://mlr3book.mlr-org.com/). A list of all learners that you can use in `mlr3` can be found [here](https://mlr3extralearners.mlr-org.com/articles/learners/list_learners.html). The entry in the columns *mlr3 Package* and *Packages* indicate which packages must be installed/loaded in your R session. - -You find additional information about `DoubleML` on the package on the package website https://docs.doubleml.org/ and the R documentation page https://docs.doubleml.org/r/stable/. - -```{r} -install.packages("DoubleML") -install.packages("mlr3learners") -install.packages("mlr3") -install.packages("data.table") -install.packages("randomForest") -install.packages("ranger") - -library(DoubleML) -library(mlr3learners) -library(mlr3) -library(data.table) -library(randomForest) -library(ranger) -``` - -```{r} -# Constructing the data (as DoubleMLData) -formula_flex <- paste("net_tfa ~ e401 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) ", - "+ poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + marr + twoearn + db + pira + hown") -model_flex <- as.data.table(model.frame(formula_flex, pension)) -x_cols <- colnames(model_flex)[-c(1, 2)] -data_ml <- DoubleMLData$new(model_flex, y_col = "net_tfa", d_cols = "e401", x_cols = x_cols) - -p <- dim(model_flex)[2] - 2 -p -``` - -As mentioned, in the tutorial we use the meta package `mlr3` to generate predictions with machine learning methods. A comprehensive introduction and description of the `mlr3` package is provided in the [mlr3book](https://mlr3book.mlr-org.com/). A list of all learners that you can use in `mlr3` can be found [here](https://mlr3extralearners.mlr-org.com/articles/learners/list_learners.html). The entry in the columns *mlr3 Package* and *Packages* indicate which packages must be installed/loaded in your R session. 
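As a quick, illustrative alternative to the linked list of learners, you can also browse the learner keys registered with `mlr3` directly from R; the exact keys returned depend on which learner packages (e.g. `mlr3learners`) are loaded in your session.

```{r}
# Illustrative sketch: list learner keys currently registered with mlr3
# (assumes mlr3 and mlr3learners are loaded, as above)
learner_keys <- mlr3::mlr_learners$keys()
grep("glmnet|ranger|rpart", learner_keys, value = TRUE)
```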
- -## Partially Linear Regression Models (PLR) - -```{r} -# Estimating the PLR -lgr::get_logger("mlr3")$set_threshold("warn") -lasso <- lrn("regr.cv_glmnet", nfolds = 5, s = "lambda.min") -lasso_class <- lrn("classif.cv_glmnet", nfolds = 5, s = "lambda.min") - -dml_plr <- DoubleMLPLR$new(data_ml, ml_l = lasso, ml_m = lasso_class, n_folds = 5) -dml_plr$fit(store_predictions = TRUE) -dml_plr$summary() -lasso_plr <- dml_plr$coef -lasso_std_plr <- dml_plr$se -``` - -Let us check the predictive performance of this model. - -```{r} -dml_plr$params_names() -g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o -m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o -``` - -```{r} -# cross-fitted RMSE: outcome -y <- as.matrix(pension$net_tfa) # true observations -theta <- as.numeric(dml_plr$coef) # estimated regression coefficient -d <- as.matrix(pension$e401) -predictions_y <- as.matrix(d * theta) + g_hat # predictions for y -lasso_y_rmse <- sqrt(mean((y - predictions_y)^2)) -lasso_y_rmse -``` - -```{r} -# cross-fitted RMSE: treatment -d <- as.matrix(pension$e401) -lasso_d_rmse <- sqrt(mean((d - m_hat)^2)) -lasso_d_rmse - -# cross-fitted ce: treatment -mean(ifelse(m_hat > 0.5, 1, 0) != d) -``` - -Then, we repeat this procedure for various machine learning methods. - -```{r} -# Random Forest -lgr::get_logger("mlr3")$set_threshold("warn") -randomForest <- lrn("regr.ranger") -random_forest_class <- lrn("classif.ranger") - -dml_plr <- DoubleMLPLR$new(data_ml, ml_l = randomForest, ml_m = random_forest_class, n_folds = 5) -dml_plr$fit(store_predictions = TRUE) # set store_predictions=TRUE to evaluate the model -dml_plr$summary() -forest_plr <- dml_plr$coef -forest_std_plr <- dml_plr$se -``` - -We can compare the accuracy of this model to the model that has been estimated with lasso. 
- -```{r} -# Evaluation predictions -g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o -m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o -theta <- as.numeric(dml_plr$coef) # estimated regression coefficient -predictions_y <- as.matrix(d * theta) + g_hat # predictions for y -forest_y_rmse <- sqrt(mean((y - predictions_y)^2)) -forest_y_rmse - -# cross-fitted RMSE: treatment -forest_d_rmse <- sqrt(mean((d - m_hat)^2)) -forest_d_rmse - -# cross-fitted ce: treatment -mean(ifelse(m_hat > 0.5, 1, 0) != d) -``` - -```{r} -# Trees -lgr::get_logger("mlr3")$set_threshold("warn") - -trees <- lrn("regr.rpart") -trees_class <- lrn("classif.rpart") - -dml_plr <- DoubleMLPLR$new(data_ml, ml_l = trees, ml_m = trees_class, n_folds = 5) -dml_plr$fit(store_predictions = TRUE) -dml_plr$summary() -tree_plr <- dml_plr$coef -tree_std_plr <- dml_plr$se - -# Evaluation predictions -g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o -m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o -theta <- as.numeric(dml_plr$coef) # estimated regression coefficient -predictions_y <- as.matrix(d * theta) + g_hat # predictions for y -tree_y_rmse <- sqrt(mean((y - predictions_y)^2)) -tree_y_rmse - -# cross-fitted RMSE: treatment -tree_d_rmse <- sqrt(mean((d - m_hat)^2)) -tree_d_rmse - -# cross-fitted ce: treatment -mean(ifelse(m_hat > 0.5, 1, 0) != d) -``` - -```{r} -# needed to run boosting -remotes::install_github("mlr-org/mlr3extralearners") -install.packages("mlr3extralearners") -install.packages("mboost") -library(mlr3extralearners) -library(mboost) -``` - -```{r} -# Boosting -boost <- lrn("regr.glmboost") -boost_class <- lrn("classif.glmboost") - -dml_plr <- DoubleMLPLR$new(data_ml, ml_l = boost, ml_m = boost_class, n_folds = 5) -dml_plr$fit(store_predictions = TRUE) -dml_plr$summary() -boost_plr <- dml_plr$coef -boost_std_plr <- dml_plr$se - -# Evaluation predictions -g_hat <- as.matrix(dml_plr$predictions$ml_l) # predictions of g_o -m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o -theta <- as.numeric(dml_plr$coef) # estimated regression coefficient -predictions_y <- as.matrix(d * theta) + g_hat # predictions for y -boost_y_rmse <- sqrt(mean((y - predictions_y)^2)) -boost_y_rmse - -# cross-fitted RMSE: treatment -boost_d_rmse <- sqrt(mean((d - m_hat)^2)) -boost_d_rmse - -# cross-fitted ce: treatment -mean(ifelse(m_hat > 0.5, 1, 0) != d) -``` - -Let's sum up the results: - -```{r} -table <- matrix(0, 4, 4) -table[1, 1:4] <- c(lasso_plr, forest_plr, tree_plr, boost_plr) -table[2, 1:4] <- c(lasso_std_plr, forest_std_plr, tree_std_plr, boost_std_plr) -table[3, 1:4] <- c(lasso_y_rmse, forest_y_rmse, tree_y_rmse, boost_y_rmse) -table[4, 1:4] <- c(lasso_d_rmse, forest_d_rmse, tree_d_rmse, boost_d_rmse) -rownames(table) <- c("Estimate", "Std.Error", "RMSE Y", "RMSE D") -colnames(table) <- c("Lasso", "Random Forest", "Trees", "Boosting") -tab <- xtable(table, digits = 2) -tab -``` - -The best model with lowest RMSE is the PLR model estimated via lasso (or boosting based on the RSME Y). 
It gives the following estimate: - -```{r} -lasso_plr -``` - -## Interactive Regression Model (IRM) - -```{r} -lgr::get_logger("mlr3")$set_threshold("warn") -dml_irm <- DoubleMLIRM$new(data_ml, - ml_g = lasso, - ml_m = lasso_class, - trimming_threshold = 0.01, n_folds = 5 -) -dml_irm$fit(store_predictions = TRUE) -dml_irm$summary() -lasso_irm <- dml_irm$coef -lasso_std_irm <- dml_irm$se - - -# predictions -dml_irm$params_names() -g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) -g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) -g_hat <- d * g1_hat + (1 - d) * g0_hat # predictions of g_0 -m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o -``` - -```{r} -# cross-fitted RMSE: outcome -y <- as.matrix(pension$net_tfa) # true observations -d <- as.matrix(pension$e401) -lasso_y_irm <- sqrt(mean((y - g_hat)^2)) -lasso_y_irm - -# cross-fitted RMSE: treatment -lasso_d_irm <- sqrt(mean((d - m_hat)^2)) -lasso_d_irm - -# cross-fitted ce: treatment -mean(ifelse(m_hat > 0.5, 1, 0) != d) -``` - -```{r} -##### forest ##### - -dml_irm <- DoubleMLIRM$new(data_ml, - ml_g = randomForest, - ml_m = random_forest_class, - trimming_threshold = 0.01, n_folds = 5 -) -dml_irm$fit(store_predictions = TRUE) -dml_irm$summary() -forest_irm <- dml_irm$coef -forest_std_irm <- dml_plr$se - -# predictions -g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) -g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) -g_hat <- d * g1_hat + (1 - d) * g0_hat # predictions of g_0 -m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_0 - -# cross-fitted RMSE: outcome -y <- as.matrix(pension$net_tfa) # true observations -d <- as.matrix(pension$e401) -forest_y_irm <- sqrt(mean((y - g_hat)^2)) -forest_y_irm - -# cross-fitted RMSE: treatment -forest_d_irm <- sqrt(mean((d - m_hat)^2)) -forest_d_irm - -# cross-fitted ce: treatment -mean(ifelse(m_hat > 0.5, 1, 0) != d) - -##### trees ##### - -dml_irm <- DoubleMLIRM$new(data_ml, - ml_g = trees, ml_m = trees_class, - trimming_threshold = 0.01, n_folds = 5 -) -dml_irm$fit(store_predictions = TRUE) -dml_irm$summary() -tree_irm <- dml_irm$coef -tree_std_irm <- dml_irm$se - -# predictions -g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) -g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) -g_hat <- d * g1_hat + (1 - d) * g0_hat # predictions of g_0 -m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o - -# cross-fitted RMSE: outcome -y <- as.matrix(pension$net_tfa) # true observations -d <- as.matrix(pension$e401) -tree_y_irm <- sqrt(mean((y - g_hat)^2)) -tree_y_irm - -# cross-fitted RMSE: treatment -tree_d_irm <- sqrt(mean((d - m_hat)^2)) -tree_d_irm - -# cross-fitted ce: treatment -mean(ifelse(m_hat > 0.5, 1, 0) != d) - - -##### boosting ##### - -dml_irm <- DoubleMLIRM$new(data_ml, - ml_g = boost, ml_m = boost_class, - trimming_threshold = 0.01, n_folds = 5 -) -dml_irm$fit(store_predictions = TRUE) -dml_irm$summary() -boost_irm <- dml_irm$coef -boost_std_irm <- dml_irm$se - -# predictions -g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) -g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) -g_hat <- d * g1_hat + (1 - d) * g0_hat # predictions of g_0 -m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o - -# cross-fitted RMSE: outcome -y <- as.matrix(pension$net_tfa) # true observations -d <- as.matrix(pension$e401) -boost_y_irm <- sqrt(mean((y 
boost_y_irm

# cross-fitted RMSE: treatment
boost_d_irm <- sqrt(mean((d - m_hat)^2))
boost_d_irm

# cross-fitted classification error: treatment
mean(ifelse(m_hat > 0.5, 1, 0) != d)
```

```{r}
table <- matrix(0, 4, 4)
table[1, 1:4] <- c(lasso_irm, forest_irm, tree_irm, boost_irm)
table[2, 1:4] <- c(lasso_std_irm, forest_std_irm, tree_std_irm, boost_std_irm)
table[3, 1:4] <- c(lasso_y_irm, forest_y_irm, tree_y_irm, boost_y_irm)
table[4, 1:4] <- c(lasso_d_irm, forest_d_irm, tree_d_irm, boost_d_irm)
rownames(table) <- c("Estimate", "Std.Error", "RMSE Y", "RMSE D")
colnames(table) <- c("Lasso", "Random Forest", "Trees", "Boosting")
tab <- xtable(table, digits = 2)
tab
```

Here, Random Forest gives the best prediction rule for $g_0$, and Lasso gives the best prediction rule for $m_0$. Let us fit the IRM model using the best ML method for each equation to get a final estimate for the treatment effect of eligibility.

```{r}
lgr::get_logger("mlr3")$set_threshold("warn")
dml_irm <- DoubleMLIRM$new(data_ml,
  ml_g = randomForest,
  ml_m = lasso_class,
  trimming_threshold = 0.01, n_folds = 5
)
dml_irm$fit(store_predictions = TRUE)
dml_irm$summary()
best_irm <- dml_irm$coef
best_std_irm <- dml_irm$se
```

These estimates, which flexibly account for confounding, are substantially attenuated relative to the baseline estimate (*19559*) that does not account for confounding. They suggest much smaller causal effects of 401(k) eligibility on financial asset holdings.

diff --git a/PM4/r-identification-analysis-of-401-k-example-w-dags.Rmd b/PM4/r-identification-analysis-of-401-k-example-w-dags.Rmd
deleted file mode 100644
index e721fe31..00000000
--- a/PM4/r-identification-analysis-of-401-k-example-w-dags.Rmd
+++ /dev/null
@@ -1,198 +0,0 @@
---
title: An R Markdown document converted from "PM4/r-identification-analysis-of-401-k-example-w-dags.irnb"
output: html_document
---

# Using Dagitty in the Analysis of Impact of 401(k) on Net Financial Wealth

```{r}
# install and load package
install.packages("dagitty")
install.packages("ggdag")
```

```{r}
library(dagitty)
library(ggdag)
```

# Graphs for 401(K) Analysis

Here we have
 * $Y$ -- net financial assets;
 * $X$ -- worker characteristics (income, family size, other retirement plans; see lecture notes for details);
 * $F$ -- latent (unobserved) firm characteristics;
 * $D$ -- 401(K) eligibility, determined by $F$ and $X$.

**One graph (where F determines X):**

```{r}
# generate a DAG and plot it

G1 <- dagitty('dag{
Y [outcome,pos="4, 0"]
D [exposure,pos="0, 0"]
X [confounder, pos="2,-2"]
F [unobserved, pos="0, -1"]
D -> Y
X -> D
F -> X
F -> D
X -> Y}')


ggdag(G1) + theme_dag()
```

**List minimal adjustment sets to identify the causal effect $D \to Y$**


```{r}
adjustmentSets(G1, "D", "Y", effect = "total")
```

**What is the underlying principle?**

Here conditioning on X blocks the backdoor paths from Y to D (Pearl). Dagitty correctly finds X (and makes many more correct decisions when we consider more elaborate structures. Why do we want to consider more elaborate structures? The very empirical problem requires us to do so!)
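
To see the principle in action, we can enumerate the paths between $D$ and $Y$ and check which ones are blocked once we condition on $X$. The chunk below is a small sanity check using dagitty's `paths()` helper on the graph `G1` defined above.

```{r}
# All paths between D and Y; `open = TRUE` marks paths that transmit association.
# Unconditionally, the backdoor paths D <- X -> Y and D <- F -> X -> Y are open.
paths(G1, "D", "Y")

# Conditioning on X closes both backdoor paths, leaving only the causal path D -> Y.
paths(G1, "D", "Y", Z = "X")
```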

**Another Graph (where $X$ determines $F$):**

```{r}
# generate another DAG and plot it

G2 <- dagitty('dag{
Y [outcome,pos="4, 0"]
D [exposure,pos="0, 0"]
X [confounder, pos="2,-2"]
F [unobserved, pos="0, -1"]
D -> Y
X -> D
X -> F
F -> D
X -> Y}')


ggdag(G2) + theme_dag()
```

```{r}
adjustmentSets(G2, "D", "Y", effect = "total")
```

**One more graph (encompassing the previous ones), where (F, X) are jointly determined by latent factors $A$.**

In fact, we can allow the whole triple (D, F, X) to be jointly determined by latent factors $A$.

This is a much more realistic graph to consider.

```{r}
G3 <- dagitty('dag{
Y [outcome,pos="4, 0"]
D [exposure,pos="0, 0"]
X [confounder, pos="2,-2"]
F [unobserved, pos="0, -1"]
A [unobserved, pos="-1, -1"]
D -> Y
X -> D
F -> D
A -> F
A -> X
A -> D
X -> Y}')

adjustmentSets(G3, "D", "Y", effect = "total")

ggdag(G3) + theme_dag()
```

# Threat to Identification:

What if $F$ also directly affects $Y$? (Note that there are no valid adjustment sets in this case.)

```{r}
G4 <- dagitty('dag{
Y [outcome,pos="4, 0"]
D [exposure,pos="0, 0"]
X [confounder, pos="2,-2"]
F [unobserved, pos="0, -1"]
A [unobserved, pos="-1, -1"]
D -> Y
X -> D
F -> D
A -> F
A -> X
A -> D
F -> Y
X -> Y}')


ggdag(G4) + theme_dag()
```

```{r}
adjustmentSets(G4, "D", "Y", effect = "total")
```

**Note that no output means that there is no valid adjustment set (among observed variables).**

**How can F affect Y directly? Is it reasonable?**

Introduce Match Amount $M$. The match amount is a potentially important mediator (why a mediator?). $M$ is not observed. Luckily, adjusting for $X$ still works if there is no arrow $F \to M$.

```{r}
G5 <- dagitty('dag{
Y [outcome,pos="4, 0"]
D [exposure,pos="0, 0"]
X [confounder, pos="2,-2"]
F [unobserved, pos="0, -1"]
A [unobserved, pos="-1, -1"]
M [unobserved, pos="2, -.5"]
D -> Y
X -> D
F -> D
A -> F
A -> X
A -> D
D -> M
M -> Y
X -> M
X -> Y}')

print(adjustmentSets(G5, "D", "Y", effect = "total"))

ggdag(G5) + theme_dag()
```

If there is an $F \to M$ arrow, then adjusting for $X$ is not sufficient.

```{r}
G6 <- dagitty('dag{
Y [outcome,pos="4, 0"]
D [exposure,pos="0, 0"]
X [confounder, pos="2,-2"]
F [unobserved, pos="0, -1"]
A [unobserved, pos="-1, -1"]
M [unobserved, pos="2, -.5"]
D -> Y
X -> D
F -> D
A -> F
A -> X
D -> M
F -> M
A -> D
M -> Y
X -> M
X -> Y}')

print(adjustmentSets(G6, "D", "Y", effect = "total"))

ggdag(G6) + theme_dag()
```

Again, note that no output was returned for the adjustment set. There is no valid adjustment set here.

# Question:

Given the analysis above, do you find the adjustment for workers' characteristics a credible strategy to identify the causal (total) effect of 401(k) eligibility on net financial wealth?
- diff --git a/PM4/r_debiased_ml_for_partially_linear_model_growth.Rmd b/PM4/r_debiased_ml_for_partially_linear_model_growth.Rmd deleted file mode 100644 index d35fbafb..00000000 --- a/PM4/r_debiased_ml_for_partially_linear_model_growth.Rmd +++ /dev/null @@ -1,204 +0,0 @@ ---- -title: An R Markdown document converted from "PM4/r_debiased_ml_for_partially_linear_model_growth.irnb" -output: html_document ---- - -# Double/Debiased Machine Learning for the Partially Linear Regression Model - -This is a simple implementation of Debiased Machine Learning for the Partially Linear Regression Model, which provides an application of DML inference to determine the causal effect of countries' intitial wealth on the rate of economic growth. - - -Reference: - -- https://arxiv.org/abs/1608.00060 -- https://www.amazon.com/Business-Data-Science-Combining-Accelerate/dp/1260452778 - -The code is based on the book. - -```{r} -install.packages("xtable") -install.packages("hdm") -install.packages("randomForest") -install.packages("glmnet") -install.packages("sandwich") -``` - -```{r} -library(xtable) -library(randomForest) -library(hdm) -library(glmnet) -library(sandwich) - -set.seed(1) -``` - -```{r} -file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/GrowthData.csv" -data <- read.csv(file) -data <- subset(data, select = -1) # get rid of index column -head(data) -dim(data) -``` - -```{r} -y <- as.matrix(data[, 1]) # outcome: growth rate -d <- as.matrix(data[, 3]) # treatment: initial wealth -x <- as.matrix(data[, -c(1, 2, 3)]) # controls: country characteristics - -# some summary statistics -cat(sprintf("\nThe length of y is %g \n", length(y))) -cat(sprintf("\nThe number of features in x is %g \n", dim(x)[2])) - -lres <- summary(lm(y ~ d + x))$coef[2, 1:2] -cat(sprintf("\nNaive OLS that uses all features w/o cross-fitting Y ~ D+X yields: \ncoef (se) = %g (%g)\n", - lres[1], lres[2])) -``` - -# DML algorithm - -Here we perform estimation and inference of predictive coefficient $\alpha$ in the partially linear statistical model, -$$ -Y = D\alpha + g(X) + U, \quad E (U | D, X) = 0. -$$ -For $\tilde Y = Y- E(Y|X)$ and $\tilde D= D- E(D|X)$, we can write -$$ -\tilde Y = \alpha \tilde D + U, \quad E (U |\tilde D) =0. -$$ -Parameter $\alpha$ is then estimated using cross-fitting approach to obtain the residuals $\tilde D$ and $\tilde Y$. -The algorithm comsumes $Y, D, X$, and machine learning methods for learning the residuals $\tilde Y$ and $\tilde D$, where -the residuals are obtained by cross-validation (cross-fitting). - -The statistical parameter $\alpha$ has a causal interpretation of being the effect of $D$ on $Y$ in the causal DAG $$ D\to Y, \quad X\to (D,Y)$$ or the counterfactual outcome model with conditionally exogenous (conditionally random) assignment of treatment $D$ given $X$: -$$ -Y(d) = d\alpha + g(X) + U(d),\quad U(d) \text{ indep } D |X, \quad Y = Y(D), \quad U = U(D). 
-$$ - -```{r} -dml2_for_plm <- function(x, d, y, dreg, yreg, nfold = 2) { - nobs <- nrow(x) # number of observations - foldid <- rep.int(1:nfold, times = ceiling(nobs / nfold))[sample.int(nobs)] # define folds indices - I <- split(1:nobs, foldid) # split observation indices into folds - ytil <- dtil <- rep(NA, nobs) - cat("fold: ") - for (b in seq_along(I)) { - dfit <- dreg(x[-I[[b]], ], d[-I[[b]]]) # take a fold out - yfit <- yreg(x[-I[[b]], ], y[-I[[b]]]) # take a foldt out - dhat <- predict(dfit, x[I[[b]], ], type = "response") # predict the left-out fold - yhat <- predict(yfit, x[I[[b]], ], type = "response") # predict the left-out fold - dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold - ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold - cat(b, " ") - } - rfit <- lm(ytil ~ dtil) # estimate the main parameter by regressing one residual on the other - coef.est <- coef(rfit)[2] # extract coefficient - se <- sqrt(vcovHC(rfit)[2, 2]) # record robust standard error - cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est, se)) # printing output - return(list(coef.est = coef.est, se = se, dtil = dtil, ytil = ytil)) # save output and residuals -} -``` - -We now run through DML using as first stage models: - 1. OLS - 2. (Rigorous) Lasso - 3. Random Forests - 4. Mix of Random Forest and Lasso - -```{r} -# DML with OLS -cat(sprintf("\nDML with OLS w/o feature selection \n")) -dreg <- function(x, d) { - glmnet(x, d, lambda = 0) -} # ML method= OLS using glmnet; using lm gives bugs -yreg <- function(x, y) { - glmnet(x, y, lambda = 0) -} # ML method = OLS -dml2_ols <- dml2_for_plm(x, d, y, dreg, yreg, nfold = 10) - - -# DML with Lasso: -cat(sprintf("\nDML with Lasso \n")) -dreg <- function(x, d) { - rlasso(x, d, post = FALSE) -} # ML method= lasso from hdm -yreg <- function(x, y) { - rlasso(x, y, post = FALSE) -} # ML method = lasso from hdm -dml2_lasso <- dml2_for_plm(x, d, y, dreg, yreg, nfold = 10) - - -# DML with Random Forest: -cat(sprintf("\nDML with Random Forest \n")) -dreg <- function(x, d) { - randomForest(x, d) -} # ML method=Forest -yreg <- function(x, y) { - randomForest(x, y) -} # ML method=Forest -dml2_rf <- dml2_for_plm(x, d, y, dreg, yreg, nfold = 10) - -# DML MIX: -cat(sprintf("\nDML with Lasso for D and Random Forest for Y \n")) -dreg <- function(x, d) { - rlasso(x, d, post = FALSE) -} # ML method=Forest -yreg <- function(x, y) { - randomForest(x, y) -} # ML method=Forest -dml2_mix <- dml2_for_plm(x, d, y, dreg, yreg, nfold = 10) -``` - -Now we examine the RMSE of D and Y to see which method performs well in the first-stage. 
We print all results below in the following table: - -```{r} -pr_res_d <- c(mean((dml2_ols$dtil)^2), mean((dml2_lasso$dtil)^2), mean((dml2_rf$dtil)^2), mean((dml2_mix$dtil)^2)) -pr_res_y <- c(mean((dml2_ols$ytil)^2), mean((dml2_lasso$ytil)^2), mean((dml2_rf$ytil)^2), mean((dml2_mix$ytil)^2)) -pr_res <- rbind(sqrt(pr_res_d), sqrt(pr_res_y)) -rownames(pr_res) <- c("RMSE D", "RMSE Y") -colnames(pr_res) <- c("OLS", "Lasso", "RF", "Mix") -``` - -```{r} -table <- matrix(0, 4, 4) - -# Point Estimate -table[1, 1] <- as.numeric(dml2_ols$coef.est) -table[2, 1] <- as.numeric(dml2_lasso$coef.est) -table[3, 1] <- as.numeric(dml2_rf$coef.est) -table[4, 1] <- as.numeric(dml2_mix$coef.est) - -# SE -table[1, 2] <- as.numeric(dml2_ols$se) -table[2, 2] <- as.numeric(dml2_lasso$se) -table[3, 2] <- as.numeric(dml2_rf$se) -table[4, 2] <- as.numeric(dml2_mix$se) - -# RMSE Y -table[1, 3] <- as.numeric(pr_res[2, 1]) -table[2, 3] <- as.numeric(pr_res[2, 2]) -table[3, 3] <- as.numeric(pr_res[2, 3]) -table[4, 3] <- as.numeric(pr_res[2, 4]) - -# RMSE D -table[1, 4] <- as.numeric(pr_res[1, 1]) -table[2, 4] <- as.numeric(pr_res[1, 2]) -table[3, 4] <- as.numeric(pr_res[1, 3]) -table[4, 4] <- as.numeric(pr_res[1, 4]) - - - -# print results -colnames(table) <- c("Estimate", "Standard Error", "RMSE Y", "RMSE D") -rownames(table) <- c("OLS", "Lasso", "RF", "RF/Lasso Mix") -table -``` - -```{r} -print(table, digit = 3) -``` - -```{r} -tab <- xtable(table, digits = 3) -print(tab, type = "latex") -``` - diff --git a/PM4/r_dml_inference_for_gun_ownership.Rmd b/PM4/r_dml_inference_for_gun_ownership.Rmd deleted file mode 100644 index 512479d3..00000000 --- a/PM4/r_dml_inference_for_gun_ownership.Rmd +++ /dev/null @@ -1,573 +0,0 @@ ---- -title: An R Markdown document converted from "PM4/r_dml_inference_for_gun_ownership.irnb" -output: html_document ---- - -# A Case Study: The Effect of Gun Ownership on Gun-Homicide Rates - -We consider the problem of estimating the effect of gun ownership on the homicide rate. For this purpose, we perform inference on $\beta$ in the following the partially linear model: -$$ -Y_{j, t}=\beta D_{j,(t-1)}+g\left(X_{j, t}, \bar{X}_j, \bar{X}_t, X_{j, 0}, Y_{j, 0}, t\right)+\epsilon_{j, t} -$$ -$Y_{j, t}$ is the log homicide rate in county $j$ at time $t. D_{j, t-1}$ is the log fraction of suicides committed with a firearm in county $j$ at time $t-1$, which we use as a proxy for gun ownership $G_{j, t}$, which is not observed. $X_{j, t}$ is a set of demographic and economic characteristics of county $j$ at time $t$. We use $\bar{X}_j$ to denote the within county average of $X_{j, t}$ and $\bar{X}_t$ to denote the within time period average of $X_{j, t} . X_{j, 0}$ and $Y_{j, 0}$ denote initial conditions in county $j$. We use $Z_{j, t}$ to denote the set of observed control variables $\left\{X_{j, t}, \bar{X}_j, \bar{X}_t, X_{j, 0}, Y_{j, 0}, t\right\}$, so that our model is - -$$ - Y_{i,t} = \beta D_{i,(t-1)} + g(Z_{i,t}) + \epsilon_{i,t}. -$$ - -## Data - -$Y_{j,t}$ is the log homicide rate in county $j$ at time $t$, $D_{j, t-1}$ is the log fraction of suicides committed with a firearm in county $j$ at time $t-1$, which we use as a proxy for gun ownership, and $Z_{j,t}$ is a set of demographic and economic characteristics of county $j$ at time $t$. Assuming the firearm suicide rate is a good proxy for gun ownership, the parameter $\beta$ is the effect of gun ownership on homicide rates, controlling for county-level demographic and economic characteristics. 
- -The sample covers 195 large United States counties between the years 1980 through 1999, giving us 3900 observations. - -```{r} -install.packages("glmnet") -install.packages("randomForest") -install.packages("xgboost") -install.packages("keras") -install.packages("tensorflow") -install.packages("xtable") -install.packages("dplyr") -install.packages("sandwich") -``` - -```{r} -library(glmnet) -library(randomForest) -library(xgboost) -library(keras) -library(tensorflow) -library(xtable) -library(dplyr) -library(sandwich) -``` - -```{r} -file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/gun_clean.csv" -data <- read.csv(file) -dim(data) -``` - -## Preprocessing - -To attempt to flexibly account for fixed heterogeneity across counties, common time factors, and deterministic time trends, we include county-level averages, time period averages, initial conditions, and the time index as additional control variables. This strategy is related to strategies for addressing latent sources of heterogeneity via conditioning. - -We first reweight time and county variables as the data are population weighted. - -```{r} -# Note: These data are population weighted. Specifically, -# looking at the JBES replication files, they seem to be multiplied -# by sqrt((1/T sum_t population_{j,t})/100000). To get the -# unweighted variables need to divide by this number - which we can -# get from the time effects. We are mostly just going to use the weighted -# variables as inputs - except for time and county. We'll take -# cross-sectional and time series means of these weighted variables -# as well. Note that there is nothing wrong with this, but it does not -# reproduce a weighted regression in a setting where covariates may -# enter nonlinearly and flexibly. - -## County FE -county_vars <- select(data, starts_with("X_J")) - -## Time variables and population weights -# Pull out time variables -time_vars <- select(data, starts_with("X_T")) - -# Use these to construct population weights -pop_weights <- rowSums(time_vars) - -# Unweighted time variables -time_vars <- time_vars / pop_weights - -# For any columns with only zero (like the first one), just drop -time_vars <- time_vars[, colSums(time_vars != 0) > 0] - -# Create time index -time_ind <- rowSums(time_vars * (seq(1:20))) -``` - -Now we create initial conditions, county-level averages, and time period averages. 
- -```{r} -###### Create new data frame with variables we'll use - -# Function to find variable names -var_list <- function(df = NULL, type = c("numeric", "factor", "character"), pattern = "", exclude = NULL) { - vars <- character(0) - if (any(type %in% "numeric")) { - vars <- c(vars, names(df)[sapply(df, is.numeric)]) - } - if (any(type %in% "factor")) { - vars <- c(vars, names(df)[sapply(df, is.factor)]) - } - if (any(type %in% "character")) { - vars <- c(vars, names(df)[sapply(df, is.character)]) - } - vars[(!vars %in% exclude) & grepl(vars, pattern = pattern)] -} - -# census control variables -census <- NULL -census_var <- c("^AGE", "^BN", "^BP", "^BZ", "^ED", "^EL", "^HI", "^HS", "^INC", "^LF", "^LN", - "^PI", "^PO", "^PP", "^PV", "^SPR", "^VS") - -for (i in seq_along(census_var)) { - census <- append(census, var_list(data, pattern = census_var[i])) -} - -# other control variables -X1 <- c("logrobr", "logburg", "burg_missing", "robrate_missing") -X2 <- c("newblack", "newfhh", "newmove", "newdens", "newmal") - -# "treatment" variable -d <- "logfssl" - -# outcome variable -y <- "logghomr" - -# new data frame for time index -usedata <- as.data.frame(time_ind) -colnames(usedata) <- "time_ind" -usedata[, "weights"] <- pop_weights - -var_list <- c(y, d, X1, X2, census) -for (i in seq_along(var_list)) { - usedata[, var_list[i]] <- data[, var_list[i]] -} - -####################### Construct county specific means, -# time specific means, initial conditions - -# Initial conditions -var_list0 <- c(y, X1, X2, census) -for (i in seq_along(var_list0)) { - usedata[, paste(var_list0[i], "0", sep = "")] <- kronecker( - usedata[time_ind == 1, var_list0[i]], - rep(1, 20) - ) -} - -# County means -var_list_j <- c(X1, X2, census) -county_vars <- as.matrix(county_vars) -for (i in seq_along(var_list_j)) { - usedata[, paste(var_list_j[i], "J", sep = "")] <- - county_vars %*% qr.solve(county_vars, as.matrix(usedata[, var_list_j[i]])) -} - -# Time means -time_vars <- as.matrix(time_vars) -for (i in seq_along(var_list_j)) { - usedata[, paste(var_list_j[i], "T", sep = "")] <- - time_vars %*% qr.solve(time_vars, as.matrix(usedata[, var_list_j[i]])) -} -``` - -# Estimation - - -## Baseline OLS Estimates - -After preprocessing the data, as a baseline model, we first look at simple regression of $Y_{j,t}$ on $D_{j,t-1}$ without controls in the full data set. - -```{r} -# Simple regression -lm0 <- lm(logghomr ~ logfssl, data = usedata) -vc0 <- vcovHC(lm0) -cat("Baseline OLS:", lm0$coefficients[2], " (", sqrt(vc0[2, 2]), ")\n") -# Confidence Interval with HC3 covariance -tt <- qt(c(0.025, 0.975), summary(lm0)$df[2]) -se <- sqrt(diag(vc0)) -ci <- coef(lm0) + se %o% tt -cat("2.5%: ", ci[2, 1], "97.5%: ", ci[2, 2]) -``` - -The point estimate is $0.302$ with the confidence interval ranging from 0.277 to 0.327. This -suggests that increases in gun ownership rates are related to gun homicide rates - if gun ownership increases by 1% then the predicted gun homicide rate goes up by 0.3%, without controlling for counties' characteristics. - - -Next we estimate with the baseline set of controls. - -```{r} -# Regression on baseline controls -var_list <- c(d, X1, X2, census) -lmC <- lm(paste("logghomr ~", paste(var_list, collapse = "+")), data = usedata) -vcC <- vcovHC(lmC) -cat("OLS with Controls:", lmC$coefficients["logfssl"], " (", sqrt(vcC["logfssl", "logfssl"]), ")\n") -``` - - - -We can also run our regression with time and space averages as controls. 
- -```{r} -# Regression on time and cross sectional averages -var_list_x <- c(X1, X2, census) -var_list_means <- c(d, X1, X2, census) -for (i in seq_along(var_list_x)) { - var_list_means <- c(var_list_means, paste(var_list_x[i], "J", sep = "")) -} -for (i in seq_along(var_list_x)) { - var_list_means <- c(var_list_means, paste(var_list_x[i], "T", sep = "")) -} -lmM <- lm(paste("logghomr ~", paste(var_list_means, collapse = "+")), data = usedata) -vcM <- vcovHC(lmM) -cat("OLS with Averages:", lmM$coefficients["logfssl"], " (", sqrt(vcM["logfssl", "logfssl"]), ")\n") -``` - -Since our goal is to estimate the effect of gun ownership after controlling for a rich set county characteristics, we now include all controls. - -```{r} -# Regression on all controls -lmA <- lm(logghomr ~ ., data = usedata) -vcA <- vcovHC(lmA) -cat("OLS All:", lmA$coefficients["logfssl"], " (", sqrt(vcA["logfssl", "logfssl"]), ")\n") -``` - -After controlling for a rich set of characteristics, the point estimate of gun ownership attenuates to 0.179. - -***NB***: In the background, `lm()` is dropping variables based on collinearity diagnostics. These depend on system linear algebra routines and can lead to large differences in high-dimensional or other ill-conditioned settings when using otherwise identical code across languages and/or machines. - -Now we turn to our double machine learning framework, employing linear and flexible estimation methods with cross-fitting. - -## DML Estimates - -We perform inference on $\beta$ in the following the partially linear model: - $$ -Y_{j, t}=\beta D_{j,(t-1)}+g(Z_{j,t})+\epsilon_{j, t}. -$$ -In the first stage, using cross-fitting, we employ modern regression methods to build estimators $\hat \ell(Z_{j,t})$ and $\hat m(Z_{j,t})$, where -- $\ell(Z_{j,t}):=E(Y_{j,t}|Z_{j,t})$ -- $m(Z_{j,t}):=E(D_{j,t}|Z_{j,t})$ - -Using these, we obtain the estimates of the residualized quantities -- $\tilde Y_{j,t} = Y_{j,t}- E(Y_{j,t}|Z_{j,t})$ -- $\tilde D_{j,t}= D_{j,t}- E(D_{j,t}|Z_{j,t})$ - -Using these residualized quantities, we note our model can be written as -$$ -\tilde Y_{j,t} = \beta \tilde D_{j,t} + \epsilon_{j,t}, \quad E (\epsilon_{j,t} |\tilde D_{j,t}) =0. -$$ -In the final stage, using ordinary least squares of $\tilde Y_{j,t}$ on $\tilde D_{j,t}$, we obtain the -estimate of $\beta$. - -In the following, we consider 10 different methods for the first-stage models for $\ell(\cdot)$ and $m(\cdot)$ covering linear, penalized linear, and flexible methods. We also report the first-stage RMSE scores for estimating $Y$ and $D$. - -```{r} -# NB: this cell takes > 3 hours to runon colab. To reduce computation time, -# reduce the number of cross-fitting folds. Note this may affect stability -# of estimates. 
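# Roadmap for this cell: observations are split into Kf = 5 cross-fitting folds.
# For each fold, ten learners are fit on the remaining folds to predict the
# outcome (logghomr) and the treatment (logfssl), and the held-out residuals
# are stored column-by-column in yhat_r and dhat_r:
#   (1) no controls, (2) baseline controls, (3) all controls,
#   (4) Lasso (CV), (5) Ridge (CV), (6) Elastic Net (alpha = .5),
#   (7) random forest, (8) boosted trees (depth 4),
#   (9) DNN with dropout, (10) DNN with early stopping.
# After the loop, beta is estimated for each learner by regressing the outcome
# residuals on the treatment residuals (without an intercept) with
# heteroskedasticity-robust standard errors; a "best" combination and a least
# squares model average are added as two further rows.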
- -set.seed(123) - -# Cross-fitting -n <- nrow(usedata) -Kf <- 5 # Number of cross-fitting folds -sampleframe <- rep(1:Kf, ceiling(n / Kf)) -cvgroup <- sample(sampleframe, size = n, replace = FALSE) # Cross-fitting groups - -# Initialize variables for cross-fit predictions -yhat_r <- matrix(NA, n, 10) # Going to consider 10 learners -dhat_r <- matrix(NA, n, 10) - -# Cross-fitting loop -for (k in 1:Kf) { - cat("fold: ", k, "\n") - indk <- cvgroup == k - - ktrain <- usedata[!indk, ] - ktest <- usedata[indk, ] - - #### Simple regression models #### - - # Simple regression - yhat_r[indk, 1] <- ktest$logghomr - mean(ktrain$logghomr) - dhat_r[indk, 1] <- ktest$logfssl - mean(ktrain$logfssl) - - # Baseline controls - var_list <- c(X1, X2, census) - lmyk_c <- lm(paste("logghomr ~", paste(var_list, collapse = "+")), data = ktrain) - yhat_r[indk, 2] <- ktest$logghomr - predict(lmyk_c, ktest) - lmdk_c <- lm(paste("logfssl ~", paste(var_list, collapse = "+")), data = ktrain) - dhat_r[indk, 2] <- ktest$logfssl - predict(lmdk_c, ktest) - - # All controls - lmyk_a <- lm(logghomr ~ . - logfssl, data = ktrain) - yhat_r[indk, 3] <- ktest$logghomr - predict(lmyk_a, ktest) - lmdk_a <- lm(logfssl ~ . - logghomr, data = ktrain) - dhat_r[indk, 3] <- ktest$logfssl - predict(lmdk_a, ktest) - - #### Penalized Linear Models #### - - # Lasso - default CV tuning - ytrain <- as.matrix(usedata[!indk, "logghomr"]) - dtrain <- as.matrix(usedata[!indk, "logfssl"]) - xtrain <- as.matrix(usedata[!indk, !names(usedata) %in% - c("logghomr", "logfssl")]) - ytest <- as.matrix(usedata[indk, "logghomr"]) - dtest <- as.matrix(usedata[indk, "logfssl"]) - xtest <- as.matrix(usedata[indk, !names(usedata) %in% - c("logghomr", "logfssl")]) - - lassoyk <- cv.glmnet(xtrain, ytrain) - yhat_r[indk, 4] <- ytest - predict(lassoyk, newx = xtest, s = "lambda.min") - - lassodk <- cv.glmnet(xtrain, dtrain) - dhat_r[indk, 4] <- dtest - predict(lassodk, newx = xtest, s = "lambda.min") - - # Ridge - ridgeyk <- cv.glmnet(xtrain, ytrain, alpha = 0) - yhat_r[indk, 5] <- ytest - predict(ridgeyk, newx = xtest, s = "lambda.min") - - ridgedk <- cv.glmnet(xtrain, dtrain, alpha = 0) - dhat_r[indk, 5] <- dtest - predict(ridgedk, newx = xtest, s = "lambda.min") - - # EN, .5 - no cv over alpha - enyk <- cv.glmnet(xtrain, ytrain, alpha = .5) - yhat_r[indk, 6] <- ytest - predict(enyk, newx = xtest, s = "lambda.min") - - endk <- cv.glmnet(xtrain, dtrain, alpha = .5) - dhat_r[indk, 6] <- dtest - predict(endk, newx = xtest, s = "lambda.min") - - #### Flexible regression models #### - - # Random forest - rfyk <- randomForest(logghomr ~ . - logfssl, data = ktrain) - yhat_r[indk, 7] <- ktest$logghomr - predict(rfyk, ktest) - rfdk <- randomForest(logfssl ~ . 
- logghomr, data = ktrain) - dhat_r[indk, 7] <- ktest$logfssl - predict(rfdk, ktest) - - # Boosted tree - depth 4 - xgb_train_y <- xgb.DMatrix( - data = as.matrix(usedata[!indk, !names(usedata) %in% - c("logghomr", "logfssl")]), - label = as.matrix(usedata[!indk, "logghomr"]) - ) - xgb_test_y <- xgb.DMatrix( - data = as.matrix(usedata[indk, !names(usedata) %in% - c("logghomr", "logfssl")]), - label = as.matrix(usedata[indk, "logghomr"]) - ) - xgb_train_d <- xgb.DMatrix( - data = as.matrix(usedata[!indk, !names(usedata) %in% - c("logghomr", "logfssl")]), - label = as.matrix(usedata[!indk, "logfssl"]) - ) - xgb_test_d <- xgb.DMatrix( - data = as.matrix(usedata[indk, !names(usedata) %in% - c("logghomr", "logfssl")]), - label = as.matrix(usedata[indk, "logfssl"]) - ) - - byk <- xgb.cv( - data = xgb_train_y, - nrounds = 1000, verbose = 0, eta = .1, max_depth = 4, nfold = 5 - ) - best_iter <- which.min(as.matrix(byk$evaluation_log[, 4])) - byk <- xgboost( - data = xgb_train_y, - nrounds = 1000, verbose = 0, eta = .1, max_depth = 4 - ) - yhat_r[indk, 8] <- ktest$logghomr - predict(byk, - newdata = xgb_test_y, - iterationrange = c(1, (best_iter + 1)) - ) - - bdk <- xgb.cv( - data = xgb_train_d, - nrounds = 1000, verbose = 0, eta = .1, max_depth = 4, nfold = 5 - ) - best_iter <- which.min(as.matrix(bdk$evaluation_log[, 4])) - bdk <- xgboost( - data = xgb_train_d, - nrounds = 1000, verbose = 0, eta = .1, max_depth = 4 - ) - dhat_r[indk, 8] <- ktest$logfssl - predict(bdk, - newdata = xgb_test_d, - iterationrange = c(1, (best_iter + 1)) - ) - - #### Neural Networks #### - - # normalize the covariate data - mean <- apply(xtrain, 2, mean) - std <- apply(xtrain, 2, sd) - xtrainNN <- scale(xtrain, center = mean, scale = std) - xtestNN <- scale(xtest, center = mean, scale = std) - - xtestNN <- xtestNN[, which(!is.nan(colMeans(xtrainNN)))] - xtrainNN <- xtrainNN[, which(!is.nan(colMeans(xtrainNN)))] - - # DNN 50/50/50/50, .5 dropout - NNmodely <- keras_model_sequential() - NNmodely %>% - layer_dense(units = 50, activation = "relu", input_shape = c(ncol(xtrainNN))) %>% - layer_dropout(rate = .5) %>% - layer_dense(units = 50, activation = "relu") %>% - layer_dropout(rate = .5) %>% - layer_dense(units = 50, activation = "relu") %>% - layer_dropout(rate = .5) %>% - layer_dense(units = 50, activation = "relu") %>% - layer_dropout(rate = .5) %>% - layer_dense(units = 1) - - NNmodely %>% compile( - loss = "mse", - optimizer = optimizer_rmsprop() - ) - - fit_nn_model_y <- NNmodely %>% fit( - xtrainNN, ytrain, - epochs = 200, batch_size = 200, - validation_split = .2, verbose = 0 - ) - yhat_r[indk, 9] <- ktest$logghomr - predict(NNmodely, xtestNN) - - NNmodeld <- keras_model_sequential() - NNmodeld %>% - layer_dense(units = 50, activation = "relu", input_shape = c(ncol(xtrainNN))) %>% - layer_dropout(rate = .5) %>% - layer_dense(units = 50, activation = "relu") %>% - layer_dropout(rate = .5) %>% - layer_dense(units = 50, activation = "relu") %>% - layer_dropout(rate = .5) %>% - layer_dense(units = 50, activation = "relu") %>% - layer_dropout(rate = .5) %>% - layer_dense(units = 1) - - NNmodeld %>% compile( - loss = "mse", - optimizer = optimizer_rmsprop() - ) - - fit_nn_model_d <- NNmodeld %>% fit( - xtrainNN, dtrain, - epochs = 200, batch_size = 200, - validation_split = .2, verbose = 0 - ) - dhat_r[indk, 9] <- ktest$logfssl - predict(NNmodeld, xtestNN) - - # DNN 50/50/50/50, early stopping - NNmodely <- keras_model_sequential() - NNmodely %>% - layer_dense(units = 50, activation = "relu", input_shape = 
c(ncol(xtrainNN))) %>% - layer_dense(units = 50, activation = "relu") %>% - layer_dense(units = 50, activation = "relu") %>% - layer_dense(units = 50, activation = "relu") %>% - layer_dense(units = 1) - - NNmodely %>% compile( - loss = "mse", - optimizer = optimizer_rmsprop() - ) - - early_stop <- callback_early_stopping( - monitor = "val_loss", patience = 25, - restore_best_weights = TRUE - ) - - fit_nn_model_y <- NNmodely %>% fit( - xtrainNN, ytrain, - epochs = 200, batch_size = 200, - validation_split = .2, verbose = 0, - callbacks = list(early_stop) - ) - yhat_r[indk, 10] <- ktest$logghomr - predict(NNmodely, xtestNN) - - NNmodeld <- keras_model_sequential() - NNmodeld %>% - layer_dense(units = 50, activation = "relu", input_shape = c(ncol(xtrainNN))) %>% - layer_dense(units = 50, activation = "relu") %>% - layer_dense(units = 50, activation = "relu") %>% - layer_dense(units = 50, activation = "relu") %>% - layer_dense(units = 1) - - NNmodeld %>% compile( - loss = "mse", - optimizer = optimizer_rmsprop() - ) - - early_stop <- callback_early_stopping( - monitor = "val_loss", patience = 25, - restore_best_weights = TRUE - ) - - fit_nn_model_d <- NNmodeld %>% fit( - xtrainNN, dtrain, - epochs = 200, batch_size = 200, - validation_split = .2, verbose = 0, - callbacks = list(early_stop) - ) - dhat_r[indk, 10] <- ktest$logfssl - predict(NNmodeld, xtestNN) -} - -################################################################################ -# Predictions done, now DML - -rmse_y <- sqrt(colMeans(yhat_r^2)) -rmse_d <- sqrt(colMeans(dhat_r^2)) - -# dml coefficient estimates -b_dml <- rep(NA, 10) -s_dml <- rep(NA, 10) -for (k in 1:10) { - lm_k <- lm(yhat_r[, k] ~ dhat_r[, k] - 1) - v_k <- vcovHC(lm_k) - b_dml[k] <- lm_k$coefficients - s_dml[k] <- sqrt(v_k) -} - -# "best" coefficient estimate -lm_k <- lm(yhat_r[, which.min(rmse_y)] ~ dhat_r[, which.min(rmse_d)] - 1) -v_k <- vcovHC(lm_k) -b_dml[11] <- lm_k$coefficients -s_dml[11] <- sqrt(v_k) - -# ls model average -yhat <- usedata$logghomr - yhat_r -dhat <- usedata$logfssl - dhat_r - -ma_y <- lm(usedata$logghomr ~ yhat - 1) -ma_d <- lm(usedata$logfssl ~ dhat - 1) -weights_y <- ma_y$coefficients -weights_d <- ma_d$coefficients -lm_k <- lm(ma_y$residuals ~ ma_d$residuals - 1) -v_k <- vcovHC(lm_k) -b_dml[12] <- lm_k$coefficients -s_dml[12] <- sqrt(v_k) - -## Display results -table1 <- matrix(0, 10, 2) -table1[, 1] <- rmse_y -table1[, 2] <- rmse_d -colnames(table1) <- c("RMSE Y", "RMSE D") -rownames(table1) <- c( - "OLS - No Controls", "OLS - Basic", "OLS - All", - "Lasso (CV)", "Ridge (CV)", "Elastic Net (.5,CV)", - "Random Forest", "Boosted trees - depth 4", - "DNN - 50/50/50/50, dropout", "DNN - 50/50/50/50, early stopping" -) -tab1 <- xtable(table1, digits = c(0, 4, 4)) -tab1 - -table2 <- matrix(0, 12, 2) -table2[, 1] <- b_dml -table2[, 2] <- s_dml -colnames(table2) <- c("Point Estimate", "Std. 
Error") -rownames(table2) <- c( - "OLS - No Controls", "OLS - Basic", "OLS - All", - "Lasso (CV)", "Ridge (CV)", "Elastic Net (.5,CV)", - "Random Forest", "Boosted trees - depth 4", - "DNN - 50/50/50/50, dropout", "DNN - 50/50/50/50, early stopping", - "Best", "Least Squares Model Average" -) -tab2 <- xtable(table2, digits = c(0, 4, 4)) -tab2 -``` - -```{r} -print(xtable(table1, type = "latex")) -print(xtable(table2, type = "latex")) -``` - diff --git a/PM5/Autoencoders.Rmd b/PM5/Autoencoders.Rmd deleted file mode 100644 index 95a288c4..00000000 --- a/PM5/Autoencoders.Rmd +++ /dev/null @@ -1,262 +0,0 @@ ---- -title: An R Markdown document converted from "PM5/Autoencoders.irnb" -output: html_document ---- - -# Autoencoders - -In this notebook, we'll introduce and explore "autoencoders," which are a very successful family of models in modern deep learning. In particular we will: - - -1. Illustrate the connection between autoencoders and classical *Principal Component Analysis (PCA)* -3. Train a non-linear auto-encoder that uses a deep neural network - -### Overview -As explained in the text, autoencoders are a way of discovering *latent, low-dimensional structure* in a dataset. In particular, a random data vector $X \in \mathbb{R}^d$ can be said to have low-dimensional structure if we can find some functions $f: \mathbb{R}^d \to \mathbb{R}^k$ and $g: \mathbb{R}^k \to \mathbb{R}^d$, with $k \ll d$, such that -$$g(f(X)) \approx X.$$ - -In other words, $f(X)$ is a parsimonious, $k$-dimensional representation of $X$ that contains all of the information necessary to approximately reconstruct the full vector $X$. Traditionally, $f(X)$ is called an *encoding* of $X$. - -It turns out that this is meaningless unless we restrict what kinds of functions $f$ and $g$ are allowed to be, because it's possible to write down some (completely ugly) one-to-one function $\mathbb{R}^d \to \mathbb{R}^1$ for any $d$. This gives rise to the notion of *autoencoders* where, given some sets of reasonable functions $F$ and $G$, we aim to minimize -$$\mathbb{E}[\mathrm{loss}(X, f(g(X))]$$ -over functions $f \in F$ and $g \in G$. As usual, this is done by minimizing the sample analog. - - - -## Linear Autoencoders and PCA: Practice - -It turns out that linear autoencoders are the same as PCA. Let's do a small sanity check to verify this. In particular, let's perform PCA two ways: first using a standard (linear algebra) toolkit, and second as a linear autoencoder using a neural network library. -If all goes well, they should give you the same reconstructions! - -To make it a bit more fun, we will use the [*Labeled Faces in the Wild*](https://www.kaggle.com/jessicali9530/celeba-dataset) dataset which consists of standardized images of roughly 5,000 celebrities' faces. In this data, PCA amounts to looking for a small number of "proto-faces" such that a linear combination of them can accurately reconstruct any celebrity's face. 
- -```{r} -install.packages("keras") -``` - -```{r} -install.packages("reticulate") -install.packages("abind") -install.packages("grid") -install.packages("gridExtra") -install.packages("dplyr") -install.packages("purrr") -install.packages("reshape2") -install.packages("ggplot2") -``` - -```{r} -library(reticulate) - -# Import Python's sklearn.datasets -sklearn <- import("sklearn.datasets") - -# Fetch the dataset -faces <- sklearn$fetch_lfw_people() - -# Access the images and reshape the data similar to Python's reshape method -n_examples <- dim(faces$images)[1] -height <- dim(faces$images)[2] -width <- dim(faces$images)[3] -design_matrix <- array_reshape(faces$images, c(n_examples, height * width)) - -n_features <- dim(design_matrix)[2] - -# Print the dataset details -cat(sprintf( - paste("Labeled Faces in the Wild Dataset:\n Number of examples: %d\n ", - "Number of features: %d\n Image height: %d\n Image width: %d"), - n_examples, n_features, height, width -)) -``` - -```{r} -library(ggplot2) -library(gridExtra) -library(grid) - -# Find indices where the label is 'Arnold Schwarzenegger' -# faces$target uses python style indexing that starts at 0 rather than R style -# indexing that starts at 1, so we subtract 1 so the indexing lines up -arnold_labels <- which(faces$target_names == "Arnold Schwarzenegger") - 1 -# Get indices of all images corresponding to Arnold -arnold_pics <- which(faces$target %in% arnold_labels) - -plot_faces <- function(images, n_row = 2, n_col = 3, width, height) { - par(mfrow = c(n_row, n_col), mar = c(0.5, 0.5, 0.5, 0.5)) - for (i in seq_len(n_row * n_col)) { - if (i <= length(images)) { - # image needs to be transposed for and then flipped for correct orientation - # using R "image" - tmp <- t(images[[i]]) - tmp <- tmp[, rev(seq_len(ncol(tmp)))] - image(tmp, col = gray.colors(256), axes = FALSE, xlab = "", ylab = "") - } - } -} - -# Ensure arnold_images contains the right amount of data and is not NULL -arnold_images <- lapply(arnold_pics[seq_len(min(6, length(arnold_pics)))], function(idx) { - faces$images[idx, , ] -}) - -plot_faces(arnold_images, n_row = 2, n_col = 3, height = 62, width = 47) -``` - -```{r} -library(stats) - -# Perform PCA on the design matrix -pca <- prcomp(design_matrix, rank. = 128, retx = TRUE, center = TRUE, scale. = FALSE) - -# Extract the principal components (eigenfaces) -eigenfaces <- pca$rotation -``` - -```{r} -# 2. 
Plot the first 6 "eigenfaces," the six images whose linear span best explains the variation in our dataset -pca_images <- lapply(1:6, function(idx) { - array_reshape(eigenfaces[, idx], c(height, width)) -}) - -plot_faces(pca_images, height = height, width = width) -# we check the first six eigenvectors/projection axes, reshaped (the eigenvectors that -# captured the highest variation in our dataset of images) -# here, eigenvector1 orthog to eigenvector2 and all the others => decorrelation -# (there's no way to reconstruct eigenvector1 using a linear combination of all the other eigenvectors) -``` - -```{r} -reconstruct <- function(image_vector, n_components, eigenfaces) { - components <- eigenfaces[, 1:n_components, drop = FALSE] - compimage <- components %*% (t(components) %*% image_vector) - return(array_reshape(compimage, c(height, width))) -} - -# Select an Arnold image for reconstruction -face_vector <- t(design_matrix[arnold_pics[1], , drop = FALSE]) - -# Perform reconstructions with varying number of components -reconstructions <- lapply(c(1, 2, 8, 32, 64, 128), function(k) { - reconstruct(face_vector, k, eigenfaces) -}) - -# Plot the reconstructed faces -plot_faces(reconstructions, height = height, width = width) -``` - -```{r} -# 4. Train linear autoencoder with 64 neurons using Keras -# 5. Compare reconstructions of Arnold's face both using MSE and visually -``` - -```{r} -library(keras) - -encoding_dimension <- 64 -input_image <- layer_input(shape = n_features) -encoded <- layer_dense(units = encoding_dimension, activation = "linear")(input_image) -decoded <- layer_dense(units = n_features, activation = "linear")(encoded) -autoencoder <- keras_model(inputs = input_image, outputs = decoded) -autoencoder %>% compile( - optimizer = "adam", - loss = "mse" -) -autoencoder %>% fit( - design_matrix, - design_matrix, - epochs = 50, - batch_size = 256, - shuffle = TRUE, - verbose = 0 -) -``` - -```{r} -autoencoder %>% fit( - design_matrix, - design_matrix, - epochs = 50, - batch_size = 256, - shuffle = TRUE, - verbose = 0 -) -``` - -```{r} -library(ggplot2) -library(gridExtra) -library(reshape2) - -# Compute neural reconstruction -face_vector_flat <- as.numeric(face_vector) -reconstruction <- predict(autoencoder, matrix(face_vector_flat, nrow = 1)) - -# Do visual comparison -image_height <- 62 -image_width <- 47 -image1 <- matrix(reconstructions[[4]], nrow = image_height, ncol = image_width) -image2 <- t(matrix(reconstruction, nrow = image_width, ncol = image_height)) - -images <- list(image1, image2) -plot_faces(images, n_row = 1, n_col = 2, width = image_width, height = image_height) - - -# Do numeric comparison -# We also normalize the black/white gradient to take values in [0,1] (divide by 255) -img1 <- as.numeric(reconstructions[[4]]) / 255 -img2 <- as.numeric(reconstruction) / 255 -mse <- mean((img1 - img2)^2) -mse -``` - -## Neural Autoencoders - -Finally, let's train a nonlinear autoencoder for the same data where $F$ and $G$ are neural networks, and we restrict the dimension to be $k=64$. 
- -```{r} -library(tensorflow) - -# Use a nonlinear neural network -n_features <- 2914 -encoding_dimension <- 64 - -input_image <- layer_input(shape = n_features) -encoded <- input_image %>% - layer_dense(units = encoding_dimension, activation = "relu") %>% - layer_dense(units = encoding_dimension, activation = "relu") - -decoded <- encoded %>% - layer_dense(units = encoding_dimension, activation = "relu") %>% - layer_dense(units = n_features, activation = "relu") - -autoencoder <- keras_model(inputs = input_image, outputs = decoded) - -autoencoder %>% compile( - optimizer = "adam", - loss = "mse" -) -autoencoder %>% fit( - design_matrix, - design_matrix, - epochs = 50, - batch_size = 256, - shuffle = TRUE, - verbose = 0 -) - -# Compute neural reconstruction -reconstruction <- predict(autoencoder, matrix(face_vector, nrow = 1)) - -# Do visual comparison -plot_faces(list(reconstructions[[4]], t(matrix(reconstruction, nrow = image_width, ncol = image_height))), - n_row = 1, n_col = 2, width = image_width, height = image_height) - -# Do numeric comparison -# We also normalize the black/white gradient to take values in [0,1] (divide by 255) -img1 <- as.numeric(reconstructions[[4]]) / 255 -img2 <- as.numeric(reconstruction) / 255 -mse <- mean((img1 - img2)^2) -mse -``` - diff --git a/T/T-3 Diff-in-Diff Minimum Wage Example.Rmd b/T/T-3 Diff-in-Diff Minimum Wage Example.Rmd deleted file mode 100644 index 501ba7b7..00000000 --- a/T/T-3 Diff-in-Diff Minimum Wage Example.Rmd +++ /dev/null @@ -1,667 +0,0 @@ ---- -title: An R Markdown document converted from "T/T-3 Diff-in-Diff Minimum Wage Example.irnb" -output: html_document ---- - -# Minimum Wage Example Notebook with DiD - -This notebook implements Difference-in-Differences in an application on -the effect of minimum wage changes on teen employment. We use data from -[Callaway -(2022)](https://bcallaway11.github.io/files/Callaway-Chapter-2022/main.pdf). The data are annual county level data from the United States covering 2001 to 2007. The outcome variable is log county-level teen employment, and the treatment variable is an indicator for whether the county has a minimum wage above the federal minimum wage. Note that this definition of the treatment variable makes the analysis straightforward but ignores the nuances of the exact value of the minimum wage in each county and how far those values are from the federal minimum. The data also include county population and county average annual pay. -See [Callaway and Sant’Anna -(2021)](https://www.sciencedirect.com/science/article/abs/pii/S0304407620303948) -for additional details on the data. - -First, we will load some libraries. - -*(The installation of the packages might take up to 5 minutes)* - -```{r} -dependencies <- c("BMisc", "glmnet", "randomForest", "rpart", "xtable", "data.table") -install.packages(dependencies) -``` - -```{r} -lapply(dependencies, library, character.only = TRUE) -``` - -```{r} -set.seed(772023) -options(warn = -1) -``` - -## Loading the data - -```{r} -data <- read.csv("https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/minwage_data.csv", - row.names = 1) -data <- data.table(data) -``` - -```{r} -head(data) -``` - -### Data Preparation - -We remove observations that are already treated in the first observed period (2001). We drop all variables that we won't use in our analysis. 
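
The reconstructions are only half of the story; the learned 64-dimensional codes $f(X)$ are often of interest themselves. Below is a brief sketch (assuming `input_image`, `encoded`, and `design_matrix` from the cell above are still in scope) of how one can extract them by wrapping the encoder half in its own Keras model, which shares the trained weights.

```{r}
# Build a model from the same layer graph up to the bottleneck; it reuses the
# weights learned while training the full autoencoder.
encoder <- keras_model(inputs = input_image, outputs = encoded)

# One 64-dimensional code per image
codes <- predict(encoder, design_matrix)
dim(codes)
```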
- -```{r} -data <- subset(data, (G == 0) | (G > 2001)) -data <- data[, -c( - "countyreal", "state_name", "FIPS", "emp0A01_BS", - "quarter", "censusdiv", "pop", "annual_avg_pay", - "state_mw", "fed_mw", "ever_treated" -)] -``` - -Next, we create the treatment groups. We focus our analysis exclusively on the set of counties that had wage increases away from the federal minimum wage in 2004. That is, we treat 2003 and earlier as the pre-treatment period. - -```{r} -treat1 <- subset(data, (G == 2004) & (year == 2001)) -treat2 <- subset(data, (G == 2004) & (year == 2002)) -treat3 <- subset(data, (G == 2004) & (year == 2003)) -treat4 <- subset(data, (G == 2004) & (year == 2004)) -treat5 <- subset(data, (G == 2004) & (year == 2005)) -treat6 <- subset(data, (G == 2004) & (year == 2006)) -treat7 <- subset(data, (G == 2004) & (year == 2007)) - -cont1 <- subset(data, (G == 0 | G > 2001) & (year == 2001)) -cont2 <- subset(data, (G == 0 | G > 2002) & (year == 2002)) -cont3 <- subset(data, (G == 0 | G > 2003) & (year == 2003)) -cont4 <- subset(data, (G == 0 | G > 2004) & (year == 2004)) -cont5 <- subset(data, (G == 0 | G > 2005) & (year == 2005)) -cont6 <- subset(data, (G == 0 | G > 2006) & (year == 2006)) -cont7 <- subset(data, (G == 0 | G > 2007) & (year == 2007)) -``` - -We assume that the basic assumptions, particularly parallel trends, hold after conditioning on pre-treatment variables: 2001 population, 2001 average pay and 2001 teen employment, as well as the region in which the county is located. (The region is characterized by four -categories.) - -Consequently, we want to extract the control variables for both treatment and control group in 2001. - -```{r} -treat1 <- treat1[, -c("year", "G", "region", "treated")] - -cont1 <- cont1[, -c("year", "G", "region", "treated")] -``` - -2003 serves as the pre-treatment period for both counties that do receive the treatment in 2004 and those that do not. - -```{r} -treatB <- merge(treat3, treat1, by = "id", suffixes = c(".pre", ".0")) -treatB <- treatB[, -c("treated", "lpop.pre", "lavg_pay.pre", "year", "G")] - -contB <- merge(cont3, cont1, by = "id", suffixes = c(".pre", ".0")) -contB <- contB[, -c("treated", "lpop.pre", "lavg_pay.pre", "year", "G")] -``` - -We estimate the ATET in 2004-2007, which corresponds to the effect in the year of treatment as well as in the three years after the treatment. The control observations are the observations that still have the federal minimum wage in each year. (The control group is shrinking in each year as additional units receive treatment). 
- -```{r} -treat4 <- treat4[, -c("lpop", "lavg_pay", "year", "G", "region")] -treat5 <- treat5[, -c("lpop", "lavg_pay", "year", "G", "region")] -treat6 <- treat6[, -c("lpop", "lavg_pay", "year", "G", "region")] -treat7 <- treat7[, -c("lpop", "lavg_pay", "year", "G", "region")] - -cont4 <- cont4[, -c("lpop", "lavg_pay", "year", "G", "region")] -cont5 <- cont5[, -c("lpop", "lavg_pay", "year", "G", "region")] -cont6 <- cont6[, -c("lpop", "lavg_pay", "year", "G", "region")] -cont7 <- cont7[, -c("lpop", "lavg_pay", "year", "G", "region")] - -tdid04 <- merge(treat4, treatB, by = "id") -dy <- tdid04$lemp - tdid04$lemp.pre -tdid04$dy <- dy -tdid04 <- tdid04[, -c("id", "lemp", "lemp.pre")] - -tdid05 <- merge(treat5, treatB, by = "id") -dy <- tdid05$lemp - tdid05$lemp.pre -tdid05$dy <- dy -tdid05 <- tdid05[, -c("id", "lemp", "lemp.pre")] - -tdid06 <- merge(treat6, treatB, by = "id") -dy <- tdid06$lemp - tdid06$lemp.pre -tdid06$dy <- dy -tdid06 <- tdid06[, -c("id", "lemp", "lemp.pre")] - -tdid07 <- merge(treat7, treatB, by = "id") -dy <- tdid07$lemp - tdid07$lemp.pre -tdid07$dy <- dy -tdid07 <- tdid07[, -c("id", "lemp", "lemp.pre")] - -cdid04 <- merge(cont4, contB, by = "id") -dy <- cdid04$lemp - cdid04$lemp.pre -cdid04$dy <- dy -cdid04 <- cdid04[, -c("id", "lemp", "lemp.pre")] - -cdid05 <- merge(cont5, contB, by = "id") -dy <- cdid05$lemp - cdid05$lemp.pre -cdid05$dy <- dy -cdid05 <- cdid05[, -c("id", "lemp", "lemp.pre")] - -cdid06 <- merge(cont6, contB, by = "id") -dy <- cdid06$lemp - cdid06$lemp.pre -cdid06$dy <- dy -cdid06 <- cdid06[, -c("id", "lemp", "lemp.pre")] - -cdid07 <- merge(cont7, contB, by = "id") -dy <- cdid07$lemp - cdid07$lemp.pre -cdid07$dy <- dy -cdid07 <- cdid07[, -c("id", "lemp", "lemp.pre")] -``` - -### Estimation of the ATET with DML - -We estimate the ATET of the county level minimum wage being larger than the federal minimum with the DML algorithm presented in Section 16.3 in the book. This requires estimation of the nuisance functions $E[Y|D=0,X]$, $E[D|X]$ as well as $P(D = 1)$. For the conditional expectation functions, we will consider different modern ML regression methods, namely: Constant (= no controls); a linear combination of the controls; an expansion of the raw control variables including all third order interactions; Lasso (CV); Ridge (CV); Random Forest; Shallow Tree; Deep Tree; and CV Tree. -The methods indicated with CV have their tuning parameter selected by cross-validation. - -The following code block implements the DML cross-fitting procedure. 
- -```{r} -att <- matrix(NA, 4, 10) -se_att <- matrix(NA, 4, 10) -rmse_d <- matrix(NA, 4, 9) -rmse_y <- matrix(NA, 4, 9) -trimmed <- matrix(NA, 4, 9) - -print("DML estimation starting, please wait") -for (ii in 1:4) { # ii refer to the 4 investigated post-treatment periods - - tdata <- get(paste("tdid0", (3 + ii), sep = "")) # Treatment data - cdata <- get(paste("cdid0", (3 + ii), sep = "")) # Control data - usedata <- rbind(tdata, cdata) - - #----------------------------------------------------------------------------- - # Cross-fit setup - n <- nrow(usedata) - Kf <- 5 # Number of folds - sampleframe <- rep(1:Kf, ceiling(n / Kf)) - cfgroup <- sample(sampleframe, size = n, replace = FALSE) # Cross-fitting groups - - # Initialize variables for CV predictions - y_gd0x_fit <- matrix(NA, n, 9) - dgx_fit <- matrix(NA, n, 9) - pd_fit <- matrix(NA, n, 1) - - #----------------------------------------------------------------------------- - # Cross-fit loop - for (k in 1:Kf) { - cat("year: ", ii + 2003, "; fold: ", k, "\n") - indk <- cfgroup == k - - ktrain <- usedata[!indk, ] - ktest <- usedata[indk, ] - - # Build some matrices for later - ytrain <- as.matrix(usedata[!indk, "dy"]) - ytest <- as.matrix(usedata[indk, "dy"]) - dtrain <- as.matrix(usedata[!indk, "treated"]) - dtest <- as.matrix(usedata[indk, "treated"]) - - # Expansion for lasso/ridge (region specific cubic polynomial) - Xexpand <- model.matrix( - ~ region * (polym(lemp.0, lpop.0, lavg_pay.0, - degree = 3, raw = TRUE - )), - data = usedata - ) - - xtrain <- as.matrix(Xexpand[!indk, ]) - xtest <- as.matrix(Xexpand[indk, ]) - - #----------------------------------------------------------------------------- - # Estimating P(D = 1) - pd_fit[indk, 1] <- mean(ktrain$treated) - - #----------------------------------------------------------------------------- - # Estimating E[D|X] - - # 1) Constant - dgx_fit[indk, 1] <- mean(ktrain$treated) - - # 2) Baseline controls - glmXdk <- glm(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0, - family = "binomial", data = ktrain - ) - dgx_fit[indk, 2] <- predict(glmXdk, newdata = ktest, type = "response") - - # 3) Region specific linear index - glmRXdk <- glm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0), - family = "binomial", data = ktrain - ) - dgx_fit[indk, 3] <- predict(glmRXdk, newdata = ktest, type = "response") - - # 4) Lasso - expansion - default CV tuning - lassoXdk <- cv.glmnet(xtrain, dtrain, family = "binomial", type.measure = "mse") - dgx_fit[indk, 4] <- predict(lassoXdk, - newx = xtest, type = "response", - s = "lambda.min" - ) - - # 5) Ridge - expansion - default CV tuning - ridgeXdk <- cv.glmnet(xtrain, dtrain, - family = "binomial", - type.measure = "mse", alpha = 0 - ) - dgx_fit[indk, 5] <- predict(ridgeXdk, - newx = xtest, type = "response", - s = "lambda.min" - ) - - # 6) Random forest - rfXdk <- randomForest(as.factor(treated) ~ region + lemp.0 + lpop.0 + lavg_pay.0, - data = ktrain, mtry = 4, ntree = 1000 - ) - dgx_fit[indk, 6] <- predict(rfXdk, ktest, type = "prob")[, 2] - - # 7) Tree (start big) - btXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0, - data = ktrain, method = "anova", - control = rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10) - ) - # xval is the number of cross-validation splits. E.g. 
xval = 5 is five fold CV - dgx_fit[indk, 7] <- predict(btXdk, ktest) - - # 8) Tree (small tree) - stXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0, - data = ktrain, method = "anova", - control = rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10) - ) - # xval is the number of cross-validation splits. E.g. xval = 5 is five fold CV - dgx_fit[indk, 8] <- predict(stXdk, ktest) - - # 9) Tree (cv) - bestcp <- btXdk$cptable[which.min(btXdk$cptable[, "xerror"]), "CP"] - cvXdk <- prune(btXdk, cp = bestcp) - dgx_fit[indk, 9] <- predict(cvXdk, ktest) - - #----------------------------------------------------------------------------- - # Estimating E[Y|D=0,X] - - # subset to D = 0 - ktrain0 <- ktrain[ktrain$treated == 0, ] - - ytrain0 <- ytrain[ktrain$treated == 0, ] - xtrain0 <- xtrain[ktrain$treated == 0, ] - - # 1) Constant - y_gd0x_fit[indk, 1] <- mean(ktrain0$dy) - - # 2) Baseline controls - lmXyk <- lm(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, data = ktrain0) - y_gd0x_fit[indk, 2] <- predict(lmXyk, newdata = ktest) - - # 3) Region specific linear index - lmRXyk <- lm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0), - data = ktrain - ) - y_gd0x_fit[indk, 3] <- predict(lmRXyk, newdata = ktest) - - # 4) Lasso - expansion - default CV tuning - lassoXyk <- cv.glmnet(xtrain0, ytrain0) - y_gd0x_fit[indk, 4] <- predict(lassoXyk, newx = xtest, s = "lambda.min") - - # 5) Ridge - expansion - default CV tuning - ridgeXyk <- cv.glmnet(xtrain0, ytrain0, alpha = 0) - y_gd0x_fit[indk, 5] <- predict(ridgeXyk, newx = xtest, s = "lambda.min") - - # 6) Random forest - rfXyk <- randomForest(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, - data = ktrain0, mtry = 4, ntree = 1000 - ) - y_gd0x_fit[indk, 6] <- predict(rfXyk, ktest) - - # 7) Tree (start big) - btXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, - data = ktrain0, method = "anova", - control = rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10) - ) - y_gd0x_fit[indk, 7] <- predict(btXyk, ktest) - - # 8) Tree (small tree) - stXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, - data = ktrain, method = "anova", - control = rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10) - ) - y_gd0x_fit[indk, 8] <- predict(stXyk, ktest) - - # 9) Tree (cv) - bestcp <- btXyk$cptable[which.min(btXyk$cptable[, "xerror"]), "CP"] - cvXyk <- prune(btXyk, cp = bestcp) - y_gd0x_fit[indk, 9] <- predict(cvXyk, ktest) - } - - rmse_d[ii, ] <- sqrt(colMeans((usedata$treated - dgx_fit)^2)) - rmse_y[ii, ] <- sqrt(colMeans((usedata$dy[usedata$treated == 0] - - y_gd0x_fit[usedata$treated == 0, ])^2)) - - # trim propensity scores of 1 to .95 - for (r in 1:9) { - trimmed[ii, r] <- sum(dgx_fit[, r] > .95) - dgx_fit[dgx_fit[, r] > .95, r] <- .95 - } - - att_num <- c( - colMeans(((usedata$treated - dgx_fit) / ((pd_fit %*% matrix(1, 1, 9)) * (1 - dgx_fit))) * - (usedata$dy - y_gd0x_fit)), - mean(((usedata$treated - dgx_fit[, which.min(rmse_d[ii, ])]) - / (pd_fit * (1 - dgx_fit[, which.min(rmse_d[ii, ])]))) * - (usedata$dy - y_gd0x_fit[, which.min(rmse_y[ii, ])])) - ) - att_den <- mean(usedata$treated / pd_fit) - - att[ii, ] <- att_num / att_den - - phihat <- cbind( - ((usedata$treated - dgx_fit) / ((pd_fit %*% matrix(1, 1, 9)) * (1 - dgx_fit))) * - (usedata$dy - y_gd0x_fit), - ((usedata$treated - dgx_fit[, which.min(rmse_d[ii, ])]) - / (pd_fit * (1 - dgx_fit[, which.min(rmse_d[ii, ])]))) * - (usedata$dy - y_gd0x_fit[, which.min(rmse_y[ii, ])]) - ) / att_den - se_att[ii, ] <- sqrt(colMeans((phihat^2)) / n) -} -``` - -We start by reporting the 
RMSE obtained during cross-fitting for each learner in each period. - -```{r} -table1y <- matrix(0, 9, 4) -table1y <- t(rmse_y) -colnames(table1y) <- c("2004", "2005", "2006", "2007") -rownames(table1y) <- c( - "No Controls", "Basic", "Expansion", - "Lasso (CV)", "Ridge (CV)", - "Random Forest", "Deep Tree", - "Shallow Tree", "Tree (CV)" -) -table1y -``` - -```{r} -table1d <- matrix(0, 9, 4) -table1d <- t(rmse_d) -colnames(table1d) <- c("2004", "2005", "2006", "2007") -rownames(table1d) <- c( - "No Controls", "Basic", "Expansion", - "Lasso (CV)", "Ridge (CV)", - "Random Forest", "Deep Tree", - "Shallow Tree", "Tree (CV)" -) -table1d -``` - -Here we see that the Deep Tree systematically performs worse in terms of cross-fit predictions than the other learners for both tasks and that Expansion performs similarly poorly for the outcome prediction. It also appears there is some signal in the regressors, especially for the propensity score, as all methods outside of Deep Tree and Expansion produce smaller RMSEs than the No Controls baseline. The other methods all produce similar RMSEs, with a small edge going to Ridge and Lasso. While it would be hard to reliably conclude which of the relatively good performing methods is statistically best here, one could exclude Expansion and Deep Tree from further consideration on the basis of out-of-sample performance suggesting -they are doing a poor job approximating the nuisance functions. Best (or a different ensemble) provides a good baseline that is principled in the sense that one could pre-commit to using the best learners without having first looked at the subsequent estimation results. - -We report estimates of the ATET in each period in the following table. - -```{r} -table2 <- matrix(0, 20, 4) -table2[seq(1, 20, 2), ] <- t(att) -table2[seq(2, 20, 2), ] <- t(se_att) -colnames(table2) <- c("2004", "2005", "2006", "2007") -rownames(table2) <- c( - "No Controls", "s.e.", "Basic", "s.e.", - "Expansion", "s.e.", "Lasso (CV)", "s.e.", - "Ridge (CV)", "s.e.", "Random Forest", "s.e.", - "Deep Tree", "s.e.", "Shallow Tree", "s.e.", - "Tree (CV)", "s.e.", "Best", "s.e." -) -table2 -``` - -Here, we see that the majority of methods provide point estimates that suggest the effect of the minimum wage increase leads to decreases in youth employment with small effects in the initial period that become larger in the years following the treatment. This pattern seems economically plausible as it may take time for firms to adjust employment and other input choices in response to a minimum wage change. The methods that produce estiamtes that are not consistent with this pattern are Deep Tree and Expansion which are both suspect as they systematically underperform in terms of having poor cross-fit prediction performance. In terms of point estimates, the other pattern that emerges is that all estimates that use the covariates produce ATET estimates that are systematically smaller in magnitude than the No Controls baseline, suggesting that failing to include the controls may lead to overstatement of treatment effects in this example. - -Turning to inference, we would reject the hypothesis of no minimum wage effect two or more years after the change at the 5% level, even after multiple testing correction, if we were to focus on the row "Best" (or many of the other individual rows). Focusing on "Best" is a reasonable ex ante strategy that could be committed to prior to conducting any analysis. 
It is, of course, reassuring that this broad conclusion is also obtained using many of the individual learners suggesting some robustness to the exact choice of learner made. - -### Assess pre-trends - -Because we have data for the period 2001-2007, we can perform a so-called pre-trends test to provide some evidence about the plausibility of the conditional parallel trends assumption. Specifically, we can continue to use 2003 as the reference period but now consider 2002 to be the treatment period. Sensible economic mechanisms underlying the assumption would then typically suggest that the ATET in 2002 - before the 2004 minimum wage change we are considering - should be zero. Finding evidence that the ATET in 2002 is non-zero then calls into question the validity of the assumption. - -We change the treatment status of those observations, which received treatment in 2004 in the 2002 data and create a placebo treatment as well as control group. - -```{r} -treat2 <- treat2[, -c("lpop", "lavg_pay", "year", "G", "region")] -treat2$treated <- 1 # Code these observations as treated - -tdid02 <- merge(treat2, treatB, by = "id") -dy <- tdid02$lemp - tdid02$lemp.pre -tdid02$dy <- dy -tdid02 <- tdid02[, -c("id", "lemp", "lemp.pre")] - -cont2 <- cont2[, -c("lpop", "lavg_pay", "year", "G", "region")] - -cdid02 <- merge(cont2, contB, by = "id") -dy <- cdid02$lemp - cdid02$lemp.pre -cdid02$dy <- dy -cdid02 <- cdid02[, -c("id", "lemp", "lemp.pre")] -``` - -We repeat the exercise for obtaining our ATET estimates and standard error for 2004-2007. Particularly, we also use all the learners as mentioned above. - -```{r} -att_p <- matrix(NA, 1, 10) -se_att_p <- matrix(NA, 1, 10) -rmse_d_p <- matrix(NA, 1, 9) -rmse_y_p <- matrix(NA, 1, 9) -trimmed_p <- matrix(NA, 1, 9) -for (ii in 1) { - tdata <- get(paste("tdid0", (3 - ii), sep = "")) # Treatment data - cdata <- get(paste("cdid0", (3 - ii), sep = "")) # Control data - usedata <- rbind(tdata, cdata) - - #----------------------------------------------------------------------------- - # Cross-fit setup - n <- nrow(usedata) - Kf <- 5 # Number of folds - sampleframe <- rep(1:Kf, ceiling(n / Kf)) - cfgroup <- sample(sampleframe, size = n, replace = FALSE) # Cross-fitting groups - - # Initialize variables for CV predictions - y_gd0x_fit <- matrix(NA, n, 9) - dgx_fit <- matrix(NA, n, 9) - pd_fit <- matrix(NA, n, 1) - - #----------------------------------------------------------------------------- - # Cross-fit loop - for (k in 1:Kf) { - cat("year: ", ii + 2001, "; fold: ", k, "\n") - indk <- cfgroup == k - - ktrain <- usedata[!indk, ] - ktest <- usedata[indk, ] - - # Build some matrices for later - ytrain <- as.matrix(usedata[!indk, "dy"]) - ytest <- as.matrix(usedata[indk, "dy"]) - dtrain <- as.matrix(usedata[!indk, "treated"]) - dtest <- as.matrix(usedata[indk, "treated"]) - - # Expansion for lasso/ridge (region specific cubic polynomial) - Xexpand <- model.matrix( - ~ region * (polym(lemp.0, lpop.0, lavg_pay.0, - degree = 3, raw = TRUE - )), - data = usedata - ) - - xtrain <- as.matrix(Xexpand[!indk, ]) - xtest <- as.matrix(Xexpand[indk, ]) - - #----------------------------------------------------------------------------- - # Estimating P(D = 1) - pd_fit[indk, 1] <- mean(ktrain$treated) - - #----------------------------------------------------------------------------- - # Estimating E[D|X] - - # 1) Constant - dgx_fit[indk, 1] <- mean(ktrain$treated) - - # 2) Baseline controls - glmXdk <- glm(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0, - family = "binomial", 
data = ktrain - ) - dgx_fit[indk, 2] <- predict(glmXdk, newdata = ktest, type = "response") - - # 3) Region specific linear index - glmRXdk <- glm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0), - family = "binomial", data = ktrain - ) - dgx_fit[indk, 3] <- predict(glmRXdk, newdata = ktest, type = "response") - - # 4) Lasso - expansion - default CV tuning - lassoXdk <- cv.glmnet(xtrain, dtrain, family = "binomial", type.measure = "mse") - dgx_fit[indk, 4] <- predict(lassoXdk, - newx = xtest, type = "response", - s = "lambda.min" - ) - - # 5) Ridge - expansion - default CV tuning - ridgeXdk <- cv.glmnet(xtrain, dtrain, - family = "binomial", - type.measure = "mse", alpha = 0 - ) - dgx_fit[indk, 5] <- predict(ridgeXdk, - newx = xtest, type = "response", - s = "lambda.min" - ) - - # 6) Random forest - rfXdk <- randomForest(as.factor(treated) ~ region + lemp.0 + lpop.0 + lavg_pay.0, - data = ktrain, mtry = 4, ntree = 1000 - ) - dgx_fit[indk, 6] <- predict(rfXdk, ktest, type = "prob")[, 2] - - # 7) Tree (start big) - btXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0, - data = ktrain, method = "anova", - control = rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10) - ) - # xval is the number of cross-validation splits. E.g. xval = 5 is five fold CV - dgx_fit[indk, 7] <- predict(btXdk, ktest) - - # 8) Tree (small tree) - stXdk <- rpart(treated ~ region + lemp.0 + lpop.0 + lavg_pay.0, - data = ktrain, method = "anova", - control = rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10) - ) - # xval is the number of cross-validation splits. E.g. xval = 5 is five fold CV - dgx_fit[indk, 8] <- predict(stXdk, ktest) - - # 9) Tree (cv) - bestcp <- btXdk$cptable[which.min(btXdk$cptable[, "xerror"]), "CP"] - cvXdk <- prune(btXdk, cp = bestcp) - dgx_fit[indk, 9] <- predict(cvXdk, ktest) - - #----------------------------------------------------------------------------- - # Estimating E[Y|D=0,X] - - # subset to D = 0 - ktrain0 <- ktrain[ktrain$treated == 0, ] - - ytrain0 <- ytrain[ktrain$treated == 0, ] - xtrain0 <- xtrain[ktrain$treated == 0, ] - - # 1) Constant - y_gd0x_fit[indk, 1] <- mean(ktrain0$dy) - - # 2) Baseline controls - lmXyk <- lm(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, data = ktrain0) - y_gd0x_fit[indk, 2] <- predict(lmXyk, newdata = ktest) - - # 3) Region specific linear index - lmRXyk <- lm(treated ~ region * (lemp.0 + lpop.0 + lavg_pay.0), - data = ktrain - ) - y_gd0x_fit[indk, 3] <- predict(lmRXyk, newdata = ktest) - - # 4) Lasso - expansion - default CV tuning - lassoXyk <- cv.glmnet(xtrain0, ytrain0) - y_gd0x_fit[indk, 4] <- predict(lassoXyk, newx = xtest, s = "lambda.min") - - # 5) Ridge - expansion - default CV tuning - ridgeXyk <- cv.glmnet(xtrain0, ytrain0, alpha = 0) - y_gd0x_fit[indk, 5] <- predict(ridgeXyk, newx = xtest, s = "lambda.min") - - # 6) Random forest - rfXyk <- randomForest(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, - data = ktrain0, mtry = 4, ntree = 1000 - ) - y_gd0x_fit[indk, 6] <- predict(rfXyk, ktest) - - # 7) Tree (start big) - btXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, - data = ktrain0, method = "anova", - control = rpart.control(maxdepth = 15, cp = 0, xval = 5, minsplit = 10) - ) - y_gd0x_fit[indk, 7] <- predict(btXyk, ktest) - - # 8) Tree (small tree) - stXyk <- rpart(dy ~ region + lemp.0 + lpop.0 + lavg_pay.0, - data = ktrain, method = "anova", - control = rpart.control(maxdepth = 3, cp = 0, xval = 0, minsplit = 10) - ) - y_gd0x_fit[indk, 8] <- predict(stXyk, ktest) - - # 9) Tree (cv) - bestcp <- 
btXyk$cptable[which.min(btXyk$cptable[, "xerror"]), "CP"] - cvXyk <- prune(btXyk, cp = bestcp) - y_gd0x_fit[indk, 9] <- predict(cvXyk, ktest) - } - - rmse_d_p[ii, ] <- sqrt(colMeans((usedata$treated - dgx_fit)^2)) - rmse_y_p[ii, ] <- sqrt(colMeans((usedata$dy[usedata$treated == 0] - - y_gd0x_fit[usedata$treated == 0, ])^2)) - - # trim propensity scores of 1 to .95 - for (r in 1:9) { - trimmed_p[ii, r] <- sum(dgx_fit[, r] > .95) - dgx_fit[dgx_fit[, r] > .95, r] <- .95 - } - - att_num <- c( - colMeans(((usedata$treated - dgx_fit) / ((pd_fit %*% matrix(1, 1, 9)) * (1 - dgx_fit))) * - (usedata$dy - y_gd0x_fit)), - mean(((usedata$treated - dgx_fit[, which.min(rmse_d[ii, ])]) - / (pd_fit * (1 - dgx_fit[, which.min(rmse_d[ii, ])]))) * - (usedata$dy - y_gd0x_fit[, which.min(rmse_y[ii, ])])) - ) - att_den <- mean(usedata$treated / pd_fit) - - att_p[ii, ] <- att_num / att_den - - phihat <- cbind( - ((usedata$treated - dgx_fit) / ((pd_fit %*% matrix(1, 1, 9)) * (1 - dgx_fit))) * - (usedata$dy - y_gd0x_fit), - ((usedata$treated - dgx_fit[, which.min(rmse_d[ii, ])]) - / (pd_fit * (1 - dgx_fit[, which.min(rmse_d[ii, ])]))) * - (usedata$dy - y_gd0x_fit[, which.min(rmse_y[ii, ])]) - ) / att_den - se_att_p[ii, ] <- sqrt(colMeans((phihat^2)) / n) -} -``` - -We report the results in the following table. - -```{r} -tableP <- matrix(0, 4, 10) -tableP[1, ] <- c(rmse_y_p, min(rmse_y_p)) -tableP[2, ] <- c(rmse_d_p, min(rmse_d_p)) -tableP[3, ] <- att_p -tableP[4, ] <- se_att_p -rownames(tableP) <- c("RMSE Y", "RMSE D", "ATET", "s.e.") -colnames(tableP) <- c( - "No Controls", "Basic", "Expansion", - "Lasso (CV)", "Ridge (CV)", - "Random Forest", "Deep Tree", - "Shallow Tree", "Tree (CV)", "Best" -) -tableP <- t(tableP) -tableP -``` - -Here we see broad agreement across all methods in the sense of returning point estimates that are small in magnitude and small relative to standard errors. In no case would we reject the hypothesis that the pre-event effect in 2002 is different from zero at usual levels of significance. We note that failing to reject the hypothesis of no pre-event effects certainly does not imply that the conditional DiD assumption is in fact satisfied. For example, confidence intervals include values that would be consistent with relatively large pre-event effects. However, it is reassuring to see that there is not strong evidence of a violation of the underlying identifying assumption. - diff --git a/T/T_4_Regression_Discontinuity_on_Progresa_Data.Rmd b/T/T_4_Regression_Discontinuity_on_Progresa_Data.Rmd deleted file mode 100644 index 8c4564e4..00000000 --- a/T/T_4_Regression_Discontinuity_on_Progresa_Data.Rmd +++ /dev/null @@ -1,601 +0,0 @@ ---- -title: An R Markdown document converted from "T/T_4_Regression_Discontinuity_on_Progresa_Data.irnb" -output: html_document ---- - -# Regression Discontinuity -This notebook illustrates the use of Regression Discontinuity in an empirical study. We analyze the effect of the antipoverty program *Progresa/Oportunidades* on the consumption behavior of families in Mexico in the early 2000s. - -The program was intended for families in extreme poverty and included financial incentives for participation in measures that improved the family's health, nutrition and children's education. The effect of this program is a widely studied problem in social and economic sciences and, according to the WHO, was a very successful measure in terms of reducing extreme poverty in Mexico. 
- -Eligibility for the program was determined based on a pre-intervention household poverty-index. Individuals above a certain threshold received the treatment (participation in the program) while individuals below the threshold were excluded and recorded as a control group. All observations above the threshold participated in the program, which makes the analysis fall into the standard (sharp) regression discontinuity design. - -First, we need to install and load some packages. This can take up to 15 minutes. - -```{r} -dependencies <- c("rdrobust", "fastDummies", "randomForest", "hdm", "gbm", "rdd") -install.packages(dependencies) -``` - -```{r} -lapply(dependencies, library, character.only = TRUE) -``` - -We use a dataset assembled by [Calonico et al. (2014)](https://rdpackages.github.io/references/Calonico-Cattaneo-Titiunik_2014_ECMA--Supplemental.pdf) and follow the analysis in [Noack et al. (2023)](https://arxiv.org/pdf/2107.07942.pdf). - -First, we open the data and remove any observations that have NaN values. - -```{r} -df <- read.csv("https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/progresa.csv", - row.names = 1) -comp <- complete.cases(df) -df <- df[comp, ] -print("Shape of Data:") -print(dim(df)) -print("Variable Names:") -print(colnames(df)) -head(df) -``` - -The data set contains 1,944 observations for which full covariate information of 27 variables is available. - -We want to measure the local average treatment effect of program participation on four outcome variables. The outcome variables are food and non-food consumption of the recorded families at two points in time, one year and two years after the implementation of the program. - -The baseline covariates, recorded prior to program implementation, include the household's size; household head's age, sex, years of education and employment status; spouse's age and years of education; number of children not older than five years and their sex, and physical characteristics of the house: whether the house has cement floors, water connection, water connection inside the house, a bathroom, electricity, number of rooms, pre-intervention consumption, and an identifier of the urban locality in which the house is located. - -The data fits to the pattern of a sharp RD design, namely, all individuals that were below the cut-off index received no intervention, and all individuals above the cut-off were eligible to join the *progresa* program and thus participated. - -## Estimation without Covariates - -First, we will perform a very simple RD estimation with a weighted linear regression. We use a triangular kernel, which assigns weights to observations based on their distance from the cutoff point. The weights decrease linearly as the distance from the cutoff point increases. - -```{r} -triangular_kernel <- function(index, h) { - weights <- 1 - abs(index) / h - weights[weights < 0] <- 0 - return(weights) -} -``` - -The parameter `h` is the bandwidth that controls the range of observations that receive non-zero weights. We use the `IKbandwidth` function from the `rdd` package that implements the *Imbens-Kalyanaraman* method. Another standard approach would be to use the standard deviation of `index`. - -```{r} -h <- IKbandwidth(X = df$index, Y = df$conspcfood_t1, cutpoint = 0) -``` - -We use the triangular kernel function to calculate weights for each observation. After that, we can fit two seperate linear regressions for both treatment and control groups. 
- -```{r} -weights <- triangular_kernel(df$index, h) -model_treated <- lm(conspcfood_t1 ~ index, data = df[df$index > 0, ], weights = weights[df$index > 0]) -model_control <- lm(conspcfood_t1 ~ index, data = df[df$index < 0, ], weights = weights[df$index < 0]) -``` - -The treatment effect at the cutoff is estimated as the difference between the predictions of the two models at that point. - -```{r} -cutoff <- 0 -treatment_effect <- predict(model_treated, newdata = data.frame(index = cutoff)) - -  predict(model_control, newdata = data.frame(index = cutoff)) -treatment_effect -``` - -We estimate that participation in the program reduced food consumption by $22.1$ units in the first year after the intervention. We can repeat the estimation using the `rdrobust` package, which yields an estimate as well as a confidence band calculated according to the formulas presented in the book. We look at all four targets. - -```{r} -result <- c() -for (outcome in c("conspcfood_t1", "conspcnonfood_t1", "conspcfood_t2", "conspcnonfood_t2")) { -  rdd_result <- rdrobust(df[[outcome]], df$index, c = cutoff, rho = 1) -  result <- rbind(result, c(rdd_result$coef[1], rdd_result$se["Robust", ])) -} -resframe <- as.data.frame(result) -colnames(resframe) <- c("LATE", "s.e.") -rownames(resframe) <- c("Food T_1", "Non-Food T_1", "Food T_2", "Non-Food T_2") -print(resframe) -``` - -While the effects in the first year after the intervention are negative, we observe significant positive effects in the second year after an individual or household was accepted into the *Progresa* program. This is in accordance with the previous analysis of this dataset. One possible explanation is that program households have more money and can thus afford to consume more, which was the desired effect of a program designed to combat hunger and extreme poverty. - -The following plot visualizes the two weighted regressions at the cut-off for the first outcome variable (food consumption in `t1`). We can clearly see the "jump" at the cut-off, which is our LATE. - -```{r} -rdplot(df$conspcfood_t1, df$index, c = 0, x.lim = c(-1, 1), y.lim = c(250, 400)) -``` - -## Estimation with Covariates - -For identification and estimation of the average treatment effect at the cutoff, no covariate information other than the running variable is required; nevertheless, in many applications additional covariates are collected and can be exploited in the analysis. - - -The standard approach is simply to include the regressors in the weighted least squares regression. 
- -```{r} -model_treated <- lm(conspcfood_t1 ~ index + hhownhouse + headage + heademp + headeduc, - data = df[df$index > 0, ], weights = weights[df$index > 0]) -model_control <- lm(conspcfood_t1 ~ index + hhownhouse + headage + heademp + headeduc, - data = df[df$index < 0, ], weights = weights[df$index < 0]) -prediction_treated <- predict(model_treated, newdata = data.frame( - index = cutoff, - hhownhouse = weighted.mean(df[df$index > 0, ]$hhownhouse, w = weights[df$index > 0]), - headage = weighted.mean(df[df$index > 0, ]$headage, w = weights[df$index > 0]), - heademp = weighted.mean(df[df$index > 0, ]$heademp, w = weights[df$index > 0]), - headeduc = weighted.mean(df[df$index > 0, ]$headeduc, w = weights[df$index > 0]) -)) -prediction_control <- predict(model_control, newdata = data.frame( - index = cutoff, - hhownhouse = weighted.mean(df[df$index < 0, ]$hhownhouse, w = weights[df$index < 0]), - headage = weighted.mean(df[df$index < 0, ]$headage, w = weights[df$index < 0]), - heademp = weighted.mean(df[df$index < 0, ]$heademp, w = weights[df$index < 0]), - headeduc = weighted.mean(df[df$index < 0, ]$headeduc, w = weights[df$index < 0]) -)) -treatment_effect <- prediction_treated - prediction_control -treatment_effect -``` - -Including these selected covariates does not have a significant impact on the LATE estimation. - -Again, we can also use `rdrobust` to repeat the estimation with all other outcomes. - -```{r} -result <- c() -for (outcome in c("conspcfood_t1", "conspcnonfood_t1", "conspcfood_t2", "conspcnonfood_t2")) { - rdd_result <- rdrobust(df[[outcome]], df$index, c = cutoff, rho = 1, covs = df[, c(1:8, 10:17, 19, 22)]) - result <- rbind(result, c(rdd_result$coef[1], rdd_result$se["Robust", ])) -} -resframe_adj <- as.data.frame(result) -colnames(resframe_adj) <- c("LATE", "s.e.") -rownames(resframe_adj) <- c("Food T_1", "Non-Food T_1", "Food T_2", "Non-Food T_2") -resframe_adj["% reduction"] <- (resframe_adj["s.e."] - resframe[, 2]) * 100 / resframe[, 2] -print(resframe_adj) -``` - -Overall, the adjustment by only a few covariates has not changed the estimated coefficient much from the result without covariates. However, including covariates does reduce the standard errors. - -## Estimation using ML - -As discussed in the book, including many covariates in RDD estimation can be beneficial for multiple reasons: -1. **Efficiency and power improvements**: As in randomized control trials, using covariates can increase efficiency and improve power. -2. **Auxiliary information**: In RDD the score determines the treatment assignment and measurement errors in the running variable can distort the results. Additional covariates can be exploited to overcome these issues or to deal with missing data problems. -3. **Treatment effect heterogeneity**: Covariates can be used to define subgroups in which the treatment effects differ. -4. **Other parameters of interest and extrapolation**: As the identified treatment effect in RDD is local at the cutoff, additional covariates might help for extrapolation of the treatment effects or identify other causal parameters. - -However, including a high number of covariates also comes with additional challenges, such as variables selection, non-linearities or interactions between covariates. The best way to overcome these is the use of modern ML methods. - -There are multiple ways to implement the estimators presented in the book, we will closely follow the analysis of [Noack et al. (2023)](https://arxiv.org/pdf/2107.07942.pdf). 
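To fix ideas before the implementation (this is our shorthand summary, not notation from the original notebook): the first stage cross-fits an adjustment function $\hat{\eta}(Z)$ from the baseline covariates $Z$ by averaging outcome predictions fit separately on the two sides of the cutoff, and the second stage applies the usual local-linear RD estimator to the adjusted outcome,

$$
M_i = Y_i - \hat{\eta}(Z_i), \qquad \hat{\eta}(z) = \frac{1}{2}\left(\hat{\mu}_{+}(z) + \hat{\mu}_{-}(z)\right),
$$

where $\hat{\mu}_{+}$ and $\hat{\mu}_{-}$ are cross-fitted predictions of the outcome trained on observations above and below the cutoff (within a first-stage bandwidth). The covariates thus enter only through $\hat{\eta}$, leaving the identification argument at the cutoff unchanged while potentially reducing variance.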
We set up running variable and outcome as above. The baseline covariates will be all the other variables in the data. - -```{r} -# Running Variable and Outcome -df_ml <- df -investigated_outcome <- "conspcfood_t1" -names(df_ml)[names(df_ml) == "index"] <- "X" -names(df_ml)[names(df_ml) == investigated_outcome] <- "Y" - -# Baseline covariates including consumption -b_covs <- names(df_ml[, c(1:8, 10:17, 19, 22)]) - -# Fixed effects for localities -i_fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE)) - -# Flexible covariates including localities indicators -f_covs <- as.matrix(model.matrix(~ .^2, data = df_ml[b_covs]))[, -1] -Zlasso <- as.matrix(cbind(i_fe, f_covs)) -``` - -We will use the package `rdrobust` for the RD estimation. Before starting the DML procedure, we have to estimate a bandwidth to restrict the samples in the first stage estimation. - -```{r} -h_fs <- 2 * rdrobust(df_ml$Y, df_ml$X, rho = 1)$bws[[1]] -``` - -The next chunk sets up the crossfitting and estimates the function $\eta(Z)$, which we will use to adjust $Y$ for the second stage. We use Random Forest, a Boosting implementation, Linear Regression and Lasso with both a baseline and flexible covariate structure. - -```{r} -first_stage <- function() { - # Set up the cross-fitting - n <- nrow(df_ml) - Kf <- 5 # Number of folds - sampleframe <- rep(1:Kf, ceiling(n / Kf)) - cfgroup <- sample(sampleframe, size = n, replace = FALSE) - - # Matrix to store eta predictions - eta_fit <- matrix(NA, n, 5) - - # Create vector of observations to be considered in the first stage model - weights <- (abs(df_ml$X) < h_fs) - - for (k in 1:Kf) { - fold <- (cfgroup == k) - - data_treated <- df_ml[df_ml$X > 0 & !fold & weights > 0, ] - data_control <- df_ml[df_ml$X < 0 & !fold & weights > 0, ] - - data_fold <- df_ml[fold, ] - - model <- as.formula(paste("Y~", paste(b_covs, collapse = "+"))) - - rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000) - rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000) - eta_fit[fold, 1] <- (predict(rf1, data_fold) + predict(rf0, data_fold)) / 2 - - gbm1 <- gbm(model, - data = data_treated, n.trees = 100, interaction.depth = 1, - shrinkage = .1, distribution = "gaussian" - ) - gbm0 <- gbm(model, - data = data_control, n.trees = 100, interaction.depth = 1, - shrinkage = .1, distribution = "gaussian" - ) - eta_fit[fold, 2] <- (predict(gbm1, data_fold, n.trees = 100) + predict(gbm0, data_fold, n.trees = 100)) / 2 - - lm1 <- lm(model, data = data_treated) - lm0 <- lm(model, data = data_control) - eta_fit[fold, 3] <- (predict(lm1, data_fold) + predict(lm0, data_fold)) / 2 - - las_base1 <- rlasso(model, data = data_treated) - las_base0 <- rlasso(model, data = data_control) - eta_fit[fold, 4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold)) / 2 - - data_treated_extended <- cbind(Zlasso[rownames(data_treated), ], data_treated) - data_control_extended <- cbind(Zlasso[rownames(data_control), ], data_control) - data_fold_extended <- cbind(Zlasso[rownames(data_fold), ], data_fold) - model_flex <- as.formula(paste("Y~", paste(c(b_covs, colnames(Zlasso)), collapse = "+"))) - - las_flex1 <- rlasso(model_flex, data = data_treated_extended) - las_flex0 <- rlasso(model_flex, data = data_control_extended) - eta_fit[fold, 5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended)) / 2 - } - return(eta_fit) -} - -eta_fit <- first_stage() -``` - -With the estimated $\hat{\eta}(Z)$ we can correct for confounding in $Y$ and now run the 
RDD estimation as second stage again. - -```{r} -methods <- c("Random Forest", "Gradient Boosting", "Linear Regression", "Lasso Baseline", "Lasso Flexible") - -second_stage <- function(eta_fit) { -  adj_results <- NULL - -  for (i in seq_along(methods)) { -    m_y <- df_ml$Y - eta_fit[, i] -    rdd_result <- rdrobust(m_y, df$index, c = cutoff, rho = 1) -    adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se["Robust", ])) -  } -  return(adj_results) -} - -adj_frame <- as.data.frame(second_stage(eta_fit)) -rownames(adj_frame) <- methods -colnames(adj_frame) <- c("LATE", "s.e.") -print(adj_frame) -``` - -Finally, we run a small simulation study with only $R=20$ repetitions to show the variance-reducing effect of including the ML-based covariate adjustments. The next block takes up to ten minutes to run. - -```{r} -estimates <- adj_frame[, 1] -std_err <- adj_frame[, 2] -R <- 19 - -for (i in 1:R) { -  eta_fit <- first_stage() -  adj_results <- second_stage(eta_fit) -  estimates <- cbind(estimates, adj_results[, 1]) -  std_err <- cbind(std_err, adj_results[, 2]) -} -``` - -We aggregate the median of the estimates and the mean of the standard errors, and also calculate the mean reduction in standard error compared to the "no covariates" estimation. We see that including covariates can reduce the standard error of estimation by around 15-20%. - -```{r} -med_est <- apply(estimates, 1, median) -mean_se <- apply(std_err, 1, mean) -adj_frame <- as.data.frame(cbind(med_est, mean_se)) -rownames(adj_frame) <- methods -colnames(adj_frame) <- c("LATE", "s.e.") -adj_frame["% reduction"] <- (adj_frame["s.e."] - resframe[1, 2]) * 100 / resframe[1, 2] -adj_frame["Linear Adjusted (no cross-fit)", ] <- resframe_adj[1, ] -print(adj_frame) -``` - -## We now repeat the exercise for the other outcomes (excluding the simulation). 
- -Non-Food Consumption (Year 1) - -```{r} -# Running Variable and Outcome -df_ml <- df -investigated_outcome <- "conspcnonfood_t1" -names(df_ml)[names(df_ml) == "index"] <- "X" -names(df_ml)[names(df_ml) == investigated_outcome] <- "Y" - -# Baseline covariates including consumption -b_covs <- names(df_ml[, c(1:8, 10:17, 19, 22)]) - -# Fixed effects for localities -i_fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE)) - -# Flexible covariates including localities indicators -f_covs <- as.matrix(model.matrix(~ .^2, data = df_ml[b_covs]))[, -1] -Zlasso <- as.matrix(cbind(i_fe, f_covs)) - -h_fs <- 2 * rdrobust(df_ml$Y, df_ml$X, rho = 1)$bws[[1]] - -first_stage <- function() { - # Set up the cross-fitting - n <- nrow(df_ml) - Kf <- 5 # Number of folds - sampleframe <- rep(1:Kf, ceiling(n / Kf)) - cfgroup <- sample(sampleframe, size = n, replace = FALSE) - - # Matrix to store eta predictions - eta_fit <- matrix(NA, n, 5) - - # Create vector of observations to be considered in the first stage model - weights <- (abs(df_ml$X) < h_fs) - - for (k in 1:Kf) { - fold <- (cfgroup == k) - - data_treated <- df_ml[df_ml$X > 0 & !fold & weights > 0, ] - data_control <- df_ml[df_ml$X < 0 & !fold & weights > 0, ] - - data_fold <- df_ml[fold, ] - - model <- as.formula(paste("Y~", paste(b_covs, collapse = "+"))) - - rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000) - rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000) - eta_fit[fold, 1] <- (predict(rf1, data_fold) + predict(rf0, data_fold)) / 2 - - gbm1 <- gbm(model, - data = data_treated, n.trees = 100, interaction.depth = 1, - shrinkage = .1, distribution = "gaussian" - ) - gbm0 <- gbm(model, - data = data_control, n.trees = 100, interaction.depth = 1, - shrinkage = .1, distribution = "gaussian" - ) - eta_fit[fold, 2] <- (predict(gbm1, data_fold, n.trees = 100) + predict(gbm0, data_fold, n.trees = 100)) / 2 - - lm1 <- lm(model, data = data_treated) - lm0 <- lm(model, data = data_control) - eta_fit[fold, 3] <- (predict(lm1, data_fold) + predict(lm0, data_fold)) / 2 - - las_base1 <- rlasso(model, data = data_treated) - las_base0 <- rlasso(model, data = data_control) - eta_fit[fold, 4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold)) / 2 - - data_treated_extended <- cbind(Zlasso[rownames(data_treated), ], data_treated) - data_control_extended <- cbind(Zlasso[rownames(data_control), ], data_control) - data_fold_extended <- cbind(Zlasso[rownames(data_fold), ], data_fold) - model_flex <- as.formula(paste("Y~", paste(c(b_covs, colnames(Zlasso)), collapse = "+"))) - - las_flex1 <- rlasso(model_flex, data = data_treated_extended) - las_flex0 <- rlasso(model_flex, data = data_control_extended) - eta_fit[fold, 5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended)) / 2 - } - return(eta_fit) -} - -eta_fit <- first_stage() - -methods <- c("Random Forest", "Gradient Boosting", "Linear Regression", "Lasso Baseline", "Lasso Flexible") - -second_stage <- function(eta_fit) { - adj_results <- NULL - - for (i in seq_along(methods)) { - m_y <- df_ml$Y - eta_fit[, i] - rdd_result <- rdrobust(m_y, df$index, c = cutoff, rho = 1) - adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se["Robust", ])) - } - return(adj_results) -} - -adj_frame <- as.data.frame(second_stage(eta_fit)) -rownames(adj_frame) <- methods -colnames(adj_frame) <- c("LATE", "s.e.") -print(adj_frame) -``` - -Food Consumption (Year 2) - -```{r} -# Running Variable and Outcome -df_ml <- df 
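# Note: the rest of this chunk repeats the first-stage cross-fitting and the rdrobust second stage
# from the food-consumption analysis above verbatim, only swapping in a different outcome variable.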
-investigated_outcome <- "conspcfood_t2" -names(df_ml)[names(df_ml) == "index"] <- "X" -names(df_ml)[names(df_ml) == investigated_outcome] <- "Y" - -# Baseline covariates including consumption -b_covs <- names(df_ml[, c(1:8, 10:17, 19, 22)]) - -# Fixed effects for localities -i_fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE)) - -# Flexible covariates including localities indicators -f_covs <- as.matrix(model.matrix(~ .^2, data = df_ml[b_covs]))[, -1] -Zlasso <- as.matrix(cbind(i_fe, f_covs)) - -h_fs <- 2 * rdrobust(df_ml$Y, df_ml$X, rho = 1)$bws[[1]] - -first_stage <- function() { - # Set up the cross-fitting - n <- nrow(df_ml) - Kf <- 5 # Number of folds - sampleframe <- rep(1:Kf, ceiling(n / Kf)) - cfgroup <- sample(sampleframe, size = n, replace = FALSE) - - # Matrix to store eta predictions - eta_fit <- matrix(NA, n, 5) - - # Create vector of observations to be considered in the first stage model - weights <- (abs(df_ml$X) < h_fs) - - for (k in 1:Kf) { - fold <- (cfgroup == k) - - data_treated <- df_ml[df_ml$X > 0 & !fold & weights > 0, ] - data_control <- df_ml[df_ml$X < 0 & !fold & weights > 0, ] - - data_fold <- df_ml[fold, ] - - model <- as.formula(paste("Y~", paste(b_covs, collapse = "+"))) - - rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000) - rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000) - eta_fit[fold, 1] <- (predict(rf1, data_fold) + predict(rf0, data_fold)) / 2 - - gbm1 <- gbm(model, - data = data_treated, n.trees = 100, interaction.depth = 1, - shrinkage = .1, distribution = "gaussian" - ) - gbm0 <- gbm(model, - data = data_control, n.trees = 100, interaction.depth = 1, - shrinkage = .1, distribution = "gaussian" - ) - eta_fit[fold, 2] <- (predict(gbm1, data_fold, n.trees = 100) + predict(gbm0, data_fold, n.trees = 100)) / 2 - - lm1 <- lm(model, data = data_treated) - lm0 <- lm(model, data = data_control) - eta_fit[fold, 3] <- (predict(lm1, data_fold) + predict(lm0, data_fold)) / 2 - - las_base1 <- rlasso(model, data = data_treated) - las_base0 <- rlasso(model, data = data_control) - eta_fit[fold, 4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold)) / 2 - - data_treated_extended <- cbind(Zlasso[rownames(data_treated), ], data_treated) - data_control_extended <- cbind(Zlasso[rownames(data_control), ], data_control) - data_fold_extended <- cbind(Zlasso[rownames(data_fold), ], data_fold) - model_flex <- as.formula(paste("Y~", paste(c(b_covs, colnames(Zlasso)), collapse = "+"))) - - las_flex1 <- rlasso(model_flex, data = data_treated_extended) - las_flex0 <- rlasso(model_flex, data = data_control_extended) - eta_fit[fold, 5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended)) / 2 - } - return(eta_fit) -} - -eta_fit <- first_stage() - -methods <- c("Random Forest", "Gradient Boosting", "Linear Regression", "Lasso Baseline", "Lasso Flexible") - -second_stage <- function(eta_fit) { - adj_results <- NULL - - for (i in seq_along(methods)) { - m_y <- df_ml$Y - eta_fit[, i] - rdd_result <- rdrobust(m_y, df$index, c = cutoff, rho = 1) - adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se["Robust", ])) - } - return(adj_results) -} - -adj_frame <- as.data.frame(second_stage(eta_fit)) -rownames(adj_frame) <- methods -colnames(adj_frame) <- c("LATE", "s.e.") -print(adj_frame) -``` - -Non-Food Consumption (Year 2) - -```{r} -# Running Variable and Outcome -df_ml <- df -investigated_outcome <- "conspcnonfood_t2" -names(df_ml)[names(df_ml) == "index"] <- "X" 
-names(df_ml)[names(df_ml) == investigated_outcome] <- "Y" - -# Baseline covariates including consumption -b_covs <- names(df_ml[, c(1:8, 10:17, 19, 22)]) - -# Fixed effects for localities -i_fe <- as.matrix(dummy_cols(df_ml$clus, remove_first_dummy = TRUE)) - -# Flexible covariates including localities indicators -f_covs <- as.matrix(model.matrix(~ .^2, data = df_ml[b_covs]))[, -1] -Zlasso <- as.matrix(cbind(i_fe, f_covs)) - -h_fs <- 2 * rdrobust(df_ml$Y, df_ml$X, rho = 1)$bws[[1]] - -first_stage <- function() { - # Set up the cross-fitting - n <- nrow(df_ml) - Kf <- 5 # Number of folds - sampleframe <- rep(1:Kf, ceiling(n / Kf)) - cfgroup <- sample(sampleframe, size = n, replace = FALSE) - - # Matrix to store eta predictions - eta_fit <- matrix(NA, n, 5) - - # Create vector of observations to be considered in the first stage model - weights <- (abs(df_ml$X) < h_fs) - - for (k in 1:Kf) { - fold <- (cfgroup == k) - - data_treated <- df_ml[df_ml$X > 0 & !fold & weights > 0, ] - data_control <- df_ml[df_ml$X < 0 & !fold & weights > 0, ] - - data_fold <- df_ml[fold, ] - - model <- as.formula(paste("Y~", paste(b_covs, collapse = "+"))) - - rf1 <- randomForest(model, data = data_treated, mtry = 4, ntree = 1000) - rf0 <- randomForest(model, data = data_control, mtry = 4, ntree = 1000) - eta_fit[fold, 1] <- (predict(rf1, data_fold) + predict(rf0, data_fold)) / 2 - - gbm1 <- gbm(model, - data = data_treated, n.trees = 100, interaction.depth = 1, - shrinkage = .1, distribution = "gaussian" - ) - gbm0 <- gbm(model, - data = data_control, n.trees = 100, interaction.depth = 1, - shrinkage = .1, distribution = "gaussian" - ) - eta_fit[fold, 2] <- (predict(gbm1, data_fold, n.trees = 100) + predict(gbm0, data_fold, n.trees = 100)) / 2 - - lm1 <- lm(model, data = data_treated) - lm0 <- lm(model, data = data_control) - eta_fit[fold, 3] <- (predict(lm1, data_fold) + predict(lm0, data_fold)) / 2 - - las_base1 <- rlasso(model, data = data_treated) - las_base0 <- rlasso(model, data = data_control) - eta_fit[fold, 4] <- (predict(las_base1, data_fold) + predict(las_base0, data_fold)) / 2 - - data_treated_extended <- cbind(Zlasso[rownames(data_treated), ], data_treated) - data_control_extended <- cbind(Zlasso[rownames(data_control), ], data_control) - data_fold_extended <- cbind(Zlasso[rownames(data_fold), ], data_fold) - model_flex <- as.formula(paste("Y~", paste(c(b_covs, colnames(Zlasso)), collapse = "+"))) - - las_flex1 <- rlasso(model_flex, data = data_treated_extended) - las_flex0 <- rlasso(model_flex, data = data_control_extended) - eta_fit[fold, 5] <- (predict(las_flex1, data_fold_extended) + predict(las_flex0, data_fold_extended)) / 2 - } - return(eta_fit) -} - -eta_fit <- first_stage() - -methods <- c("Random Forest", "Gradient Boosting", "Linear Regression", "Lasso Baseline", "Lasso Flexible") - -second_stage <- function(eta_fit) { - adj_results <- NULL - - for (i in seq_along(methods)) { - m_y <- df_ml$Y - eta_fit[, i] - rdd_result <- rdrobust(m_y, df$index, c = cutoff, rho = 1) - adj_results <- rbind(adj_results, c(rdd_result$coef[1], rdd_result$se["Robust", ])) - } - return(adj_results) -} - -adj_frame <- as.data.frame(second_stage(eta_fit)) -rownames(adj_frame) <- methods -colnames(adj_frame) <- c("LATE", "s.e.") -print(adj_frame) -``` - diff --git a/T/deprecated/debiased-ml-for-partially-linear-iv-model-in-r.irnb.Rmd b/T/deprecated/debiased-ml-for-partially-linear-iv-model-in-r.irnb.Rmd deleted file mode 100644 index 7d506973..00000000 --- 
a/T/deprecated/debiased-ml-for-partially-linear-iv-model-in-r.irnb.Rmd +++ /dev/null @@ -1,264 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - - -# Double/Debiased ML for Partially Linear IV Model - -References: - -https://arxiv.org/abs/1608.00060 - - -https://www.amazon.com/Business-Data-Science-Combining-Accelerate/dp/1260452778 - -The code is based on the book. - - - - - -# Partially Linear IV Model - -We consider the partially linear structural equation model: -\begin{eqnarray} - & Y - D\theta_0 = g_0(X) + \zeta, & E[\zeta \mid Z,X]= 0,\\ - & Z = m_0(X) + V, & E[V \mid X] = 0. -\end{eqnarray} - - -Note that this model is not a regression model unless $Z=D$. The model is a canonical -model in causal inference, going back to P. Wright's work on IV methods for estimaing demand/supply equations, with the modern difference being that $g_0$ and $m_0$ are nonlinear, potentially complicated functions of high-dimensional $X$. - - -The idea of this model is that there is a structural or causal relation between $Y$ and $D$, captured by $\theta_0$, and $g_0(X) + \zeta$ is the stochastic error, partly explained by covariates $X$. $V$ and $\zeta$ are stochastic errors that are not explained by $X$. Since $Y$ and $D$ are jointly determined, we need an external factor, commonly referred to as an instrument, $Z$ to create exogenous variation -in $D$. Note that $Z$ should affect $D$. The $X$ here serve again as confounding factors, so we can think of variation in $Z$ as being exogenous only conditional on $X$. - - -The causal DAG this model corresponds to is given by: -$$ -Z \to D, X \to (Y, Z, D), L \to (Y,D), -$$ -where $L$ is the latent confounder affecting both $Y$ and $D$, but not $Z$. - - - ---- - -# Example - -A simple contextual example is from biostatistics, where $Y$ is a health outcome and $D$ is indicator of smoking. Thus, $\theta_0$ is captures the effect of smoking on health. Health outcome $Y$ and smoking behavior $D$ are treated as being jointly determined. $X$ represents patient characteristics, and $Z$ could be a doctor's advice not to smoke (or another behavioral treatment) that may affect the outcome $Y$ only through shifting the behavior $D$, conditional on characteristics $X$. - ----- - - - -# PLIVM in Residualized Form - -The PLIV model above can be rewritten in the following residualized form: -$$ - \tilde Y = \tilde D \theta_0 + \zeta, \quad E[\zeta \mid V,X]= 0, -$$ -where -$$ - \tilde Y = (Y- \ell_0(X)), \quad \ell_0(X) = E[Y \mid X] \\ - \tilde D = (D - r_0(X)), \quad r_0(X) = E[D \mid X] \\ - \tilde Z = (Z- m_0(X)), \quad m_0(X) = E[Z \mid X]. -$$ - The tilded variables above represent original variables after taking out or "partialling out" - the effect of $X$. Note that $\theta_0$ is identified from this equation if $V$ - and $U$ have non-zero correlation, which automatically means that $U$ and $V$ - must have non-zero variation. - - - ------ - -# DML for PLIV Model - -Given identification, DML proceeds as follows - -Compute the estimates $\hat \ell_0$, $\hat r_0$, and $\hat m_0$ , which amounts -to solving the three problems of predicting $Y$, $D$, and $Z$ using -$X$, using any generic ML method, giving us estimated residuals -$$ -\tilde Y = Y - \hat \ell_0(X), \\ \tilde D= D - \hat r_0(X), \\ \tilde Z = Z- \hat m_0(X). 
-$$ -The estimates should be of a cross-validated form, as detailed in the algorithm below. - -Estimate $\theta_0$ by the instrumental -variable regression of $\tilde Y$ on $\tilde D$ using $\tilde Z$ as an instrument. -Use the conventional inference for the IV regression estimator, ignoring -the estimation error in these residuals. - -The reason we work with this residualized form is that it eliminates the bias -arising when solving the prediction problem in stage 1. The role of cross-validation -is to avoid another source of bias due to potential overfitting. - -The estimator is adaptive, -in the sense that the first stage estimation errors do not affect the second -stage errors. - - - -```{r _kg_hide-output=TRUE} -install.packages("hdm") -install.packages("AER") -install.packages("randomForest") -``` - -```{r} - -library(AER)  #applied econometrics library -library(randomForest)  #random Forest library -library(hdm) #high-dimensional econometrics library -library(glmnet) #glm net - - -# DML for PLIVM - -DML2.for.PLIVM <- function(x, d, z, y, dreg, yreg, zreg, nfold=2) { -  # this implements the DML2 algorithm, where the moments are estimated via DML, before constructing -  # the pooled estimate of theta; randomly split data into folds -  nobs <- nrow(x) -  foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] -  I <- split(1:nobs, foldid) -  # create residualized objects to fill -  ytil <- dtil <- ztil<- rep(NA, nobs) -  # obtain cross-fitted residuals -  cat("fold: ") -  for(b in 1:length(I)){ -    dfit <- dreg(x[-I[[b]],], d[-I[[b]]])  #take a fold out -    zfit <- zreg(x[-I[[b]],], z[-I[[b]]])  #take a fold out -    yfit <- yreg(x[-I[[b]],], y[-I[[b]]])  # take a fold out -    dhat <- predict(dfit, x[I[[b]],], type="response")  #predict the fold out -    zhat <- predict(zfit, x[I[[b]],], type="response")  #predict the fold out -    yhat <- predict(yfit, x[I[[b]],], type="response")  #predict the fold out -    dtil[I[[b]]] <- (d[I[[b]]] - dhat)    #record residual -    ztil[I[[b]]] <- (z[I[[b]]] - zhat)    #record residual -    ytil[I[[b]]] <- (y[I[[b]]] - yhat)    #record residual -    cat(b," ") -  } -  ivfit= tsls(y=ytil,d=dtil, x=NULL, z=ztil, intercept=FALSE) -  coef.est <-  ivfit$coef          #extract coefficient -  se <-  ivfit$se                  #record standard error -  cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) -  return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil, ztil=ztil) ) -} - - -``` - - ------ - -# Empirical Example: Acemoglu, Johnson, Robinson (AER). 
- - -* Y is log GDP; -* D is a measure of Protection from Expropriation, a proxy for -quality of insitutions; -* Z is the log of Settler's mortality; -* W are geographical variables (latitude, latitude squared, continent dummies as well as interactions) - - - - -```{r} - -data(AJR); - -y = AJR$GDP; -d = AJR$Exprop; -z = AJR$logMort -xraw= model.matrix(~ Latitude+ Africa+Asia + Namer + Samer, data=AJR) -x = model.matrix(~ -1 + (Latitude + Latitude2 + Africa + - Asia + Namer + Samer)^2, data=AJR) -dim(x) - -# DML with Random Forest -cat(sprintf("\n DML with Random Forest \n")) - -dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest -yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest -zreg<- function(x,z){ randomForest(x, z)} #ML method=Forest - -set.seed(1) -DML2.RF = DML2.for.PLIVM(xraw, d, z, y, dreg, yreg, zreg, nfold=20) - -# DML with PostLasso -cat(sprintf("\n DML with Post-Lasso \n")) - -dreg <- function(x,d){ rlasso(x, d) } #ML method=lasso -yreg <- function(x,y){ rlasso(x, y) } #ML method=lasso -zreg<- function(x,z){ rlasso(x, z)} #ML method=lasso - -set.seed(1) -DML2.lasso = DML2.for.PLIVM(x, d, z, y, dreg, yreg, zreg, nfold=20) - - -# Compare Forest vs Lasso - -comp.tab= matrix(NA, 3, 2) -comp.tab[1,] = c( sqrt(mean((DML2.RF$ytil)^2)), sqrt(mean((DML2.lasso$ytil)^2)) ) -comp.tab[2,] = c( sqrt(mean((DML2.RF$dtil)^2)), sqrt(mean((DML2.lasso$dtil)^2)) ) -comp.tab[3,] = c( sqrt(mean((DML2.RF$ztil)^2)), sqrt(mean((DML2.lasso$ztil)^2)) ) -rownames(comp.tab) = c("RMSE for Y:", "RMSE for D:", "RMSE for Z:") -colnames(comp.tab) = c("RF", "LASSO") -print(comp.tab, digits=3) -``` - -# Examine if we have weak instruments - -```{r} -install.packages("lfe") -library(lfe) -summary(felm(DML2.lasso$dtil~DML2.lasso$ztil), robust=T) -summary(felm(DML2.RF$dtil~DML2.RF$ztil), robust=T) -``` - -# We do have weak instruments, because t-stats in regression $\tilde D \sim \tilde Z$ are less than 4 in absolute value - - -So let's carry out DML inference combined with Anderson-Rubin Idea - -```{r} -# DML-AR (DML with Anderson-Rubin) - -DML.AR.PLIV<- function(rY, rD, rZ, grid, alpha=.05){ - n = length(rY) - Cstat = rep(0, length(grid)) - for (i in 1:length(grid)) { - Cstat[i] <- n* (mean( (rY - grid[i]*rD)*rZ) )^2/var ( (rY - grid[i]*rD) * rZ ) - }; - LB<- min(grid[ Cstat < qchisq(1-alpha,1)]); - UB <- max(grid[ Cstat < qchisq(1-alpha,1)]); - plot(range(grid),range(c( Cstat)) , type="n",xlab="Effect of institutions", ylab="Statistic", main=" "); - lines(grid, Cstat, lty = 1, col = 1); - abline(h=qchisq(1-alpha,1), lty = 3, col = 4); - abline(v=LB, lty = 3, col = 2); - abline(v=UB, lty = 3, col = 2); - return(list(UB=UB, LB=LB)) - } - -``` - -```{r} -DML.AR.PLIV(rY = DML2.lasso$ytil, rD= DML2.lasso$dtil, rZ= DML2.lasso$ztil, - grid = seq(-2, 2, by =.01)) - - -DML.AR.PLIV(rY = DML2.RF$ytil, rD= DML2.RF$dtil, rZ= DML2.RF$ztil, - grid = seq(-2, 2, by =.01)) - -``` diff --git a/T/deprecated/dml-for-ate-and-late-of-401-k-on-wealth.irnb.Rmd b/T/deprecated/dml-for-ate-and-late-of-401-k-on-wealth.irnb.Rmd deleted file mode 100644 index f5daee27..00000000 --- a/T/deprecated/dml-for-ate-and-late-of-401-k-on-wealth.irnb.Rmd +++ /dev/null @@ -1,677 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - -# Inference on Predictive and Causal Effects in High-Dimensional Nonlinear Models - - -## Impact of 401(k) on Financial Wealth - -As a 
practical illustration of the methods developed in this lecture, we consider estimation of the effect of 401(k) eligibility and participation -on accumulated assets. 401(k) plans are pension accounts sponsored by employers. The key problem in determining the effect of participation in 401(k) plans on accumulated assets is saver heterogeneity coupled with the fact that the decision to enroll in a 401(k) is non-random. It is generally recognized that some people have a higher preference for saving than others. It also seems likely that those individuals with high unobserved preference for saving would be most likely to choose to participate in tax-advantaged retirement savings plans and would tend to have otherwise high amounts of accumulated assets. The presence of unobserved savings preferences with these properties then implies that conventional estimates that do not account for saver heterogeneity and endogeneity of participation will be biased upward, tending to overstate the savings effects of 401(k) participation. - -One can argue that eligibility for enrolling in a 401(k) plan in this data can be taken as exogenous after conditioning on a few observables of which the most important for their argument is income. The basic idea is that, at least around the time 401(k)’s initially became available, people were unlikely to be basing their employment decisions on whether an employer offered a 401(k) but would instead focus on income and other aspects of the job. - - -### Data - -The data set can be loaded from the `hdm` package for R by typing - - - -```{r} -library(hdm) -library(ggplot2) -data(pension) -data <- pension -dim(data) -``` - -See the "Details" section on the description of the data set, which can be accessed by - - -```{r} -help(pension) -``` - -The data consist of 9,915 observations at the household level drawn from the 1991 Survey of Income and Program Participation (SIPP). All the variables are referred to 1990. We use net financial assets (*net\_tfa*) as the outcome variable, $Y$, in our analysis. The net financial assets are computed as the sum of IRA balances, 401(k) balances, checking accounts, saving bonds, other interest-earning accounts, other interest-earning assets, stocks, and mutual funds less non mortgage debts. - - -Among the $9915$ individuals, $3682$ are eligible to participate in the program. The variable *e401* indicates eligibility and *p401* indicates participation, respectively. - -```{r} -hist_e401 = ggplot(data, aes(x = e401, fill = factor(e401))) + geom_bar() -hist_e401 -``` - -Eligibility is highly associated with financial wealth: - -```{r} -dens_net_tfa = ggplot(data, aes(x = net_tfa, color = factor(e401), fill = factor(e401)) ) + - geom_density() + xlim(c(-20000, 150000)) + - facet_wrap(.~e401) - -dens_net_tfa -``` - -The unconditional APE of e401 is about $19559$: - -```{r} -e1 <- data[data$e401==1,] -e0 <- data[data$e401==0,] -round(mean(e1$net_tfa)-mean(e0$net_tfa),0) -``` - -Among the $3682$ individuals that are eligible, $2594$ decided to participate in the program. The unconditional APE of p401 is about $27372$: - -```{r} -p1 <- data[data$p401==1,] -p0 <- data[data$p401==0,] -round(mean(p1$net_tfa)-mean(p0$net_tfa),0) -``` - -As discussed, these estimates are biased since they do not account for saver heterogeneity and endogeneity of participation. - - -## Double ML package - - -We are interested in valid estimators of the average treatment effect of `e401` and `p401` on `net_tfa`. 
To get those estimators, we use the `DoubleML` package that internally builds on mlr3. You find additional information on the package on the package website https://docs.doubleml.org/ and the R documentation page https://docs.doubleml.org/r/stable/. - -```{r} -# installing Double ML -remotes::install_github("DoubleML/doubleml-for-r",quiet=TRUE) - - -# loading the packages -library(DoubleML) -library(mlr3learners) -library(mlr3) -library(data.table) -library(randomForest) - -``` - -As mentioned, in the tutorial we use the meta package `mlr3` to generate predictions with machine learning methods. A comprehensive introduction and description of the `mlr3` package is provided in the [mlr3book](https://mlr3book.mlr-org.com/). A list of all learners that you can use in `mlr3` can be found [here](https://mlr3extralearners.mlr-org.com/articles/learners/list_learners.html). The entry in the columns *mlr3 Package* and *Packages* indicate which packages must be installed/loaded in your R session. - - -## Estimating the ATE of 401(k) Eligibility on Net Financial Assets - - -We first look at the treatment effect of e401 on net total financial assets. We give estimates of the ATE and ATT that corresponds to the linear model - -\begin{equation*} -Y = D \alpha + f(X)'\beta+ \epsilon, -\end{equation*} - -where $f(X)$ includes indicators of marital status, two-earner status, defined benefit pension status, IRA participation status, and home ownership status, and orthogonal polynomials of degrees 2, 4, 6 and 8 in family size, education, age and income, respectively. The dimensions of $f(X)$ is 25. - -In the first step, we report estimates of the average treatment effect (ATE) of 401(k) eligibility on net financial assets both in the partially linear regression (PLR) model and in the interactive regression model (IRM) allowing for heterogeneous treatment effects. - - -```{r} -# Constructing the data (as DoubleMLData) -formula_flex = "net_tfa ~ e401 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + marr + twoearn + db + pira + hown" -model_flex = as.data.table(model.frame(formula_flex, pension)) -x_cols = colnames(model_flex)[-c(1,2)] -data_ml = DoubleMLData$new(model_flex, y_col = "net_tfa", d_cols = "e401", x_cols=x_cols) - - -p <- dim(model_flex)[2]-2 -p - -# complex model with two-way interactions -#data_interactions = fetch_401k(polynomial_features = TRUE, instrument = FALSE) - -``` - -## Partially Linear Regression Models (PLR) - - -We start using lasso to estimate the function $g_0$ and $m_0$ in the following PLR model: - - -\begin{eqnarray} - & Y = D\theta_0 + g_0(X) + \zeta, & E[\zeta \mid D,X]= 0,\\ - & D = m_0(X) + V, & E[V \mid X] = 0. -\end{eqnarray} - -```{r} -# Estimating the PLR -lgr::get_logger("mlr3")$set_threshold("warn") -set.seed(123) -lasso <- lrn("regr.cv_glmnet",nfolds = 5, s = "lambda.min") -lasso_class <- lrn("classif.cv_glmnet", nfolds = 5, s = "lambda.min") - -dml_plr <- DoubleMLPLR$new(data_ml, ml_g = lasso, ml_m = lasso_class, n_folds=3) -dml_plr$fit(store_predictions=TRUE) -dml_plr$summary() -lasso_plr <- dml_plr$coef -lasso_std_plr <- dml_plr$se -``` - -Let us check the predictive performance of this model. 
- -```{r} -dml_plr$params_names() -g_hat <- as.matrix(dml_plr$predictions$ml_g) # predictions of g_o -m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o -``` - -```{r} -# cross-fitted RMSE: outcome -y <- as.matrix(pension$net_tfa) # true observations -theta <- as.numeric(dml_plr$coef) # estimated regression coefficient -d <- as.matrix(pension$e401) -predictions_y <- as.matrix(d*theta)+g_hat # predictions for y -lasso_y_rmse <- sqrt(mean((y-predictions_y)^2)) -lasso_y_rmse -``` - -```{r} -# cross-fitted RMSE: treatment -d <- as.matrix(pension$e401) -lasso_d_rmse <- sqrt(mean((d-m_hat)^2)) -lasso_d_rmse - -# cross-fitted ce: treatment -mean(ifelse(m_hat > 0.5, 1, 0) != d) -``` - -Then, we repeat this procedure for various machine learning methods. - -```{r} -# Random Forest -lgr::get_logger("mlr3")$set_threshold("warn") -randomForest <- lrn("regr.ranger") -randomForest_class <- lrn("classif.ranger") - -dml_plr <- DoubleMLPLR$new(data_ml, ml_g = randomForest, ml_m = randomForest_class, n_folds=3) -dml_plr$fit(store_predictions=TRUE) # set store_predictions=TRUE to evaluate the model -dml_plr$summary() -forest_plr <- dml_plr$coef -forest_std_plr <- dml_plr$se -``` - -We can compare the accuracy of this model to the model that has been estimated with lasso. - -```{r} -# Evaluation predictions -g_hat <- as.matrix(dml_plr$predictions$ml_g) # predictions of g_o -m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o -theta <- as.numeric(dml_plr$coef) # estimated regression coefficient -predictions_y <- as.matrix(d*theta)+g_hat # predictions for y -forest_y_rmse <- sqrt(mean((y-predictions_y)^2)) -forest_y_rmse - -# cross-fitted RMSE: treatment -forest_d_rmse <- sqrt(mean((d-m_hat)^2)) -forest_d_rmse - -# cross-fitted ce: treatment -mean(ifelse(m_hat > 0.5, 1, 0) != d) -``` - -```{r} -# Trees -lgr::get_logger("mlr3")$set_threshold("warn") - -trees <- lrn("regr.rpart") -trees_class <- lrn("classif.rpart") - -dml_plr <- DoubleMLPLR$new(data_ml, ml_g = trees, ml_m = trees_class, n_folds=3) -dml_plr$fit(store_predictions=TRUE) -dml_plr$summary() -tree_plr <- dml_plr$coef -tree_std_plr <- dml_plr$se - -# Evaluation predictions -g_hat <- as.matrix(dml_plr$predictions$ml_g) # predictions of g_o -m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o -theta <- as.numeric(dml_plr$coef) # estimated regression coefficient -predictions_y <- as.matrix(d*theta)+g_hat # predictions for y -tree_y_rmse <- sqrt(mean((y-predictions_y)^2)) -tree_y_rmse - -# cross-fitted RMSE: treatment -tree_d_rmse <- sqrt(mean((d-m_hat)^2)) -tree_d_rmse - -# cross-fitted ce: treatment -mean(ifelse(m_hat > 0.5, 1, 0) != d) -``` - -```{r} -# Boosting -lgr::get_logger("mlr3")$set_threshold("warn") -boost<- lrn("regr.xgboost",objective="reg:squarederror") -boost_class <- lrn("classif.xgboost",objective = "binary:logistic",eval_metric ="logloss") - -dml_plr <- DoubleMLPLR$new(data_ml, ml_g = boost, ml_m = boost_class, n_folds=3) -dml_plr$fit(store_predictions=TRUE) -dml_plr$summary() -boost_plr <- dml_plr$coef -boost_std_plr <- dml_plr$se - -# Evaluation predictions -g_hat <- as.matrix(dml_plr$predictions$ml_g) # predictions of g_o -m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o -theta <- as.numeric(dml_plr$coef) # estimated regression coefficient -predictions_y <- as.matrix(d*theta)+g_hat # predictions for y -boost_y_rmse <- sqrt(mean((y-predictions_y)^2)) -boost_y_rmse - -# cross-fitted RMSE: treatment -boost_d_rmse <- sqrt(mean((d-m_hat)^2)) -boost_d_rmse - -# cross-fitted ce: 
treatment -mean(ifelse(m_hat > 0.5, 1, 0) != d) -``` - -Let's sum up the results: - -```{r} -library(xtable) -table <- matrix(0, 4, 4) -table[1,1:4] <- c(lasso_plr,forest_plr,tree_plr,boost_plr) -table[2,1:4] <- c(lasso_std_plr,forest_std_plr,tree_std_plr,boost_std_plr) -table[3,1:4] <- c(lasso_y_rmse,forest_y_rmse,tree_y_rmse,boost_y_rmse) -table[4,1:4] <- c(lasso_d_rmse,forest_d_rmse,tree_d_rmse,boost_d_rmse) -rownames(table) <- c("Estimate","Std.Error","RMSE Y","RMSE D") -colnames(table) <- c("Lasso","Random Forest","Trees","Boosting") -tab<- xtable(table, digits = 2) -tab -``` - -The best model with lowest RMSE in both equation is the PLR model estimated via lasso. It gives the following estimate: - -```{r} -lasso_plr -``` - -## Interactive Regression Model (IRM) - - -Next, we consider estimation of average treatment effects when treatment effects are fully heterogeneous: - - - \begin{eqnarray}\label{eq: HetPL1} - & Y = g_0(D, X) + U, & \quad E[U \mid X, D]= 0,\\ - & D = m_0(X) + V, & \quad E[V\mid X] = 0. -\end{eqnarray} - - -To reduce the disproportionate impact of extreme propensity score weights in the interactive model -we trim the propensity scores which are close to the bounds. - -```{r} -set.seed(123) -lgr::get_logger("mlr3")$set_threshold("warn") -dml_irm = DoubleMLIRM$new(data_ml, ml_g = lasso, - ml_m = lasso_class, - trimming_threshold = 0.01, n_folds=3) -dml_irm$fit(store_predictions=TRUE) -dml_irm$summary() -lasso_irm <- dml_irm$coef -lasso_std_irm <- dml_irm$se - - -# predictions -dml_irm$params_names() -g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) -g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) -g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0 -m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o - -``` - -```{r} -# cross-fitted RMSE: outcome -y <- as.matrix(pension$net_tfa) # true observations -d <- as.matrix(pension$e401) -lasso_y_irm <- sqrt(mean((y-g_hat)^2)) -lasso_y_irm - -# cross-fitted RMSE: treatment -lasso_d_irm <- sqrt(mean((d-m_hat)^2)) -lasso_d_irm - -# cross-fitted ce: treatment -mean(ifelse(m_hat > 0.5, 1, 0) != d) -``` - -```{r} -##### forest ##### - -dml_irm = DoubleMLIRM$new(data_ml, ml_g = randomForest, - ml_m = randomForest_class, - trimming_threshold = 0.01, n_folds=3) -dml_irm$fit(store_predictions=TRUE) -dml_irm$summary() -forest_irm <- dml_irm$coef -forest_std_irm <- dml_plr$se - -# predictions -g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) -g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) -g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0 -m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o - -# cross-fitted RMSE: outcome -y <- as.matrix(pension$net_tfa) # true observations -d <- as.matrix(pension$e401) -forest_y_irm <- sqrt(mean((y-g_hat)^2)) -forest_y_irm - -# cross-fitted RMSE: treatment -forest_d_irm <- sqrt(mean((d-m_hat)^2)) -forest_d_irm - -# cross-fitted ce: treatment -mean(ifelse(m_hat > 0.5, 1, 0) != d) - -##### trees ##### - -dml_irm <- DoubleMLIRM$new(data_ml, ml_g = trees, ml_m = trees_class, - trimming_threshold = 0.01, n_folds=3) -dml_irm$fit(store_predictions=TRUE) -dml_irm$summary() -tree_irm <- dml_irm$coef -tree_std_irm <- dml_irm$se - -# predictions -g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) -g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) -g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0 -m_hat <- 
as.matrix(dml_irm$predictions$ml_m) # predictions of m_o - -# cross-fitted RMSE: outcome -y <- as.matrix(pension$net_tfa) # true observations -d <- as.matrix(pension$e401) -tree_y_irm <- sqrt(mean((y-g_hat)^2)) -tree_y_irm - -# cross-fitted RMSE: treatment -tree_d_irm <- sqrt(mean((d-m_hat)^2)) -tree_d_irm - -# cross-fitted ce: treatment -mean(ifelse(m_hat > 0.5, 1, 0) != d) - - -##### boosting ##### - -dml_irm <- DoubleMLIRM$new(data_ml, ml_g = boost, ml_m = boost_class, - trimming_threshold = 0.01, n_folds=3) -dml_irm$fit(store_predictions=TRUE) -dml_irm$summary() -boost_irm <- dml_irm$coef -boost_std_irm <- dml_irm$se - -# predictions -g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) -g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) -g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0 -m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o - -# cross-fitted RMSE: outcome -y <- as.matrix(pension$net_tfa) # true observations -d <- as.matrix(pension$e401) -boost_y_irm <- sqrt(mean((y-g_hat)^2)) -boost_y_irm - -# cross-fitted RMSE: treatment -boost_d_irm <- sqrt(mean((d-m_hat)^2)) -boost_d_irm - -# cross-fitted ce: treatment -mean(ifelse(m_hat > 0.5, 1, 0) != d) -``` - -```{r} -library(xtable) -table <- matrix(0, 4, 4) -table[1,1:4] <- c(lasso_irm,forest_irm,tree_irm,boost_irm) -table[2,1:4] <- c(lasso_std_irm,forest_std_irm,tree_std_irm,boost_std_irm) -table[3,1:4] <- c(lasso_y_irm,forest_y_irm,tree_y_irm,boost_y_irm) -table[4,1:4] <- c(lasso_d_irm,forest_d_irm,tree_d_irm,boost_d_irm) -rownames(table) <- c("Estimate","Std.Error","RMSE Y","RMSE D") -colnames(table) <- c("Lasso","Random Forest","Trees","Boosting") -tab<- xtable(table, digits = 2) -tab -``` - -Here, Random Forest gives the best prediction rule for $g_0$ and Lasso the best prediction rule for $m_0$, respectively. Let us fit the IRM model using the best ML method for each equation to get a final estimate for the treatment effect of eligibility. - -```{r} -set.seed(123) -lgr::get_logger("mlr3")$set_threshold("warn") -dml_irm = DoubleMLIRM$new(data_ml, ml_g = randomForest, - ml_m = lasso_class, - trimming_threshold = 0.01, n_folds=3) -dml_irm$fit(store_predictions=TRUE) -dml_irm$summary() -best_irm <- dml_irm$coef -best_std_irm <- dml_irm$se -``` - -These estimates that flexibly account for confounding are -substantially attenuated relative to the baseline estimate (*19559*) that does not account for confounding. They suggest much smaller causal effects of 401(k) eligiblity on financial asset holdings. - - -## Local Average Treatment Effects of 401(k) Participation on Net Financial Assets - - -## Interactive IV Model (IIVM) - - -Now, we consider estimation of local average treatment effects (LATE) of participation with the binary instrument `e401`. As before, $Y$ denotes the outcome `net_tfa`, and $X$ is the vector of covariates. Here the structural equation model is: - -\begin{eqnarray} -& Y = g_0(Z,X) + U, &\quad E[U\mid Z,X] = 0,\\ -& D = r_0(Z,X) + V, &\quad E[V\mid Z, X] = 0,\\ -& Z = m_0(X) + \zeta, &\quad E[\zeta \mid X] = 0. 
-\end{eqnarray} - -```{r} -# Constructing the data (as DoubleMLData) -formula_flex2 = "net_tfa ~ p401+ e401 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + marr + twoearn + db + pira + hown" -model_flex2 = as.data.table(model.frame(formula_flex2, data)) -x_cols = colnames(model_flex2)[-c(1,2,3)] -data_IV = DoubleMLData$new(model_flex2, y_col = "net_tfa", d_cols = "p401", z_cols ="e401",x_cols=x_cols) -``` - -```{r} -set.seed(123) -lgr::get_logger("mlr3")$set_threshold("warn") -dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = lasso, - ml_m = lasso_class, ml_r = lasso_class,n_folds=3, subgroups = list(always_takers = FALSE, - never_takers = TRUE)) -dml_MLIIVM$fit(store_predictions=TRUE) -dml_MLIIVM$summary() -lasso_MLIIVM <- dml_MLIIVM$coef -lasso_std_MLIIVM <- dml_MLIIVM$se -``` - -The confidence interval for the local average treatment effect of participation is given by - -```{r} -dml_MLIIVM$confint(level = 0.95) -``` - -Here we can also check the accuracy of the model: - -```{r} -# variables -y <- as.matrix(pension$net_tfa) # true observations -d <- as.matrix(pension$p401) -z <- as.matrix(pension$e401) - -# predictions -dml_MLIIVM$params_names() -g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(z=0, X) -g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(z=1, X) -g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0 -r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(z=0, X) -r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(z=1, X) -r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0 -m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o -``` - -```{r} -# cross-fitted RMSE: outcome -lasso_y_MLIIVM <- sqrt(mean((y-g_hat)^2)) -lasso_y_MLIIVM - -# cross-fitted RMSE: treatment -lasso_d_MLIIVM <- sqrt(mean((d-r_hat)^2)) -lasso_d_MLIIVM - -# cross-fitted RMSE: instrument -lasso_z_MLIIVM <- sqrt(mean((z-m_hat)^2)) -lasso_z_MLIIVM - -``` - -Again, we repeat the procedure for the other machine learning methods: - -```{r} -### random forest ### - -set.seed(123) -lgr::get_logger("mlr3")$set_threshold("warn") -dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = randomForest, - ml_m = randomForest_class, ml_r = randomForest_class,n_folds=3, subgroups = list(always_takers = FALSE, - never_takers = TRUE)) -dml_MLIIVM$fit(store_predictions=TRUE) -dml_MLIIVM$summary() -forest_MLIIVM <- dml_MLIIVM$coef -forest_std_MLIIVM <- dml_MLIIVM$se - -# predictions -g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(Z=0, X) -g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(Z=1, X) -g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0 -r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(Z=0, X) -r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(Z=1, X) -r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0 -m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o - -# cross-fitted RMSE: outcome -forest_y_MLIIVM <- sqrt(mean((y-g_hat)^2)) -forest_y_MLIIVM - -# cross-fitted RMSE: treatment -forest_d_MLIIVM <- sqrt(mean((d-r_hat)^2)) -forest_d_MLIIVM - -# cross-fitted RMSE: instrument -forest_z_MLIIVM <- sqrt(mean((z-m_hat)^2)) -forest_z_MLIIVM - -### trees ### - -dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = trees, - ml_m = trees_class, ml_r = trees_class,n_folds=3, subgroups = list(always_takers = FALSE, - never_takers = TRUE)) -dml_MLIIVM$fit(store_predictions=TRUE) 
-dml_MLIIVM$summary() -tree_MLIIVM <- dml_MLIIVM$coef -tree_std_MLIIVM <- dml_MLIIVM$se - -# predictions -g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(Z=0, X) -g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(Z=1, X) -g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0 -r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(Z=0, X) -r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(Z=1, X) -r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0 -m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o - -# cross-fitted RMSE: outcome -tree_y_MLIIVM <- sqrt(mean((y-g_hat)^2)) -tree_y_MLIIVM - -# cross-fitted RMSE: treatment -tree_d_MLIIVM <- sqrt(mean((d-r_hat)^2)) -tree_d_MLIIVM - -# cross-fitted RMSE: instrument -tree_z_MLIIVM <- sqrt(mean((z-m_hat)^2)) -tree_z_MLIIVM - - -### boosting ### -dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = boost, - ml_m = boost_class, ml_r = boost_class,n_folds=3, subgroups = list(always_takers = FALSE, - never_takers = TRUE)) -dml_MLIIVM$fit(store_predictions=TRUE) -dml_MLIIVM$summary() -boost_MLIIVM <- dml_MLIIVM$coef -boost_std_MLIIVM <- dml_MLIIVM$se - -# predictions -g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(Z=0, X) -g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(Z=1, X) -g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0 -r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(Z=0, X) -r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(Z=1, X) -r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0 -m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o - -# cross-fitted RMSE: outcome -boost_y_MLIIVM <- sqrt(mean((y-g_hat)^2)) -boost_y_MLIIVM - -# cross-fitted RMSE: treatment -boost_d_MLIIVM <- sqrt(mean((d-r_hat)^2)) -boost_d_MLIIVM - -# cross-fitted RMSE: instrument -boost_z_MLIIVM <- sqrt(mean((z-m_hat)^2)) -boost_z_MLIIVM -``` - -```{r} -library(xtable) -table <- matrix(0, 5, 4) -table[1,1:4] <- c(lasso_MLIIVM,forest_MLIIVM,tree_MLIIVM,boost_MLIIVM) -table[2,1:4] <- c(lasso_std_MLIIVM,forest_std_MLIIVM,tree_std_MLIIVM,boost_std_MLIIVM) -table[3,1:4] <- c(lasso_y_MLIIVM,forest_y_MLIIVM,tree_y_MLIIVM,boost_y_MLIIVM) -table[4,1:4] <- c(lasso_d_MLIIVM,forest_d_MLIIVM,tree_d_MLIIVM,boost_d_MLIIVM) -table[5,1:4] <- c(lasso_z_MLIIVM,forest_z_MLIIVM,tree_z_MLIIVM,boost_z_MLIIVM) -rownames(table) <- c("Estimate","Std.Error","RMSE Y","RMSE D","RMSE Z") -colnames(table) <- c("Lasso","Random Forest","Trees","Boosting") -tab<- xtable(table, digits = 2) -tab -``` - -We report results based on four ML methods for estimating the nuisance functions used in -forming the orthogonal estimating equations. We find again that the estimates of the treatment effect are stable across ML methods. The estimates are highly significant, hence we would reject the hypothesis -that the effect of 401(k) participation has no effect on financial health. 
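The "highly significant" claim can be made concrete with a small sketch (not part of the original notebook) that turns the stored coefficients and standard errors into t-statistics and two-sided p-values under the usual normal approximation:

```{r}
# t-statistics and p-values implied by the IIVM estimates collected above
iivm_est <- c(lasso_MLIIVM, forest_MLIIVM, tree_MLIIVM, boost_MLIIVM)
iivm_se <- c(lasso_std_MLIIVM, forest_std_MLIIVM, tree_std_MLIIVM, boost_std_MLIIVM)
iivm_tab <- cbind(estimate = iivm_est, se = iivm_se,
                  t = iivm_est / iivm_se,
                  p = 2 * pnorm(-abs(iivm_est / iivm_se)))
rownames(iivm_tab) <- c("Lasso", "Random Forest", "Trees", "Boosting")
round(iivm_tab, 3)
```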
- - -We might rerun the model using the best ML method for each equation to get a final estimate for the treatment effect of participation: - -```{r} -set.seed(123) -lgr::get_logger("mlr3")$set_threshold("warn") -dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = randomForest, - ml_m = lasso_class, ml_r = lasso_class,n_folds=3, subgroups = list(always_takers = FALSE, - never_takers = TRUE)) -dml_MLIIVM$fit(store_predictions=TRUE) -dml_MLIIVM$summary() -best_MLIIVM <- dml_MLIIVM$coef -best_std_MLIIVM <- dml_MLIIVM$se -``` diff --git a/T/deprecated/dml-inference-for-gun-ownership.irnb.Rmd b/T/deprecated/dml-inference-for-gun-ownership.irnb.Rmd deleted file mode 100644 index 67145e54..00000000 --- a/T/deprecated/dml-inference-for-gun-ownership.irnb.Rmd +++ /dev/null @@ -1,377 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - -This notebook contains an example for teaching. - - -# A Case Study: The Effect of Gun Ownership on Gun-Homicide Rates - - -We consider the problem of estimating the effect of gun -ownership on the homicide rate. For this purpose, we estimate the following partially -linear model - -$$ - Y_{j,t} = \beta D_{j,(t-1)} + g(Z_{j,t}) + \epsilon_{j,t}. -$$ - - -## Data - - -$Y_{j,t}$ is the log homicide rate in county $j$ at time $t$, $D_{j, t-1}$ is the log fraction of suicides committed with a firearm in county $j$ at time $t-1$, which we use as a proxy for gun ownership, and $Z_{j,t}$ is a set of demographic and economic characteristics of county $j$ at time $t$. The parameter $\beta$ is the effect of gun ownership on homicide rates, controlling for county-level demographic and economic characteristics. - -The sample covers 195 large United States counties between the years 1980 through 1999, giving us 3900 observations. - -```{r} -data <- read.csv("../input/gun-example/gun_clean.csv") -dim(data)[1] -``` - -### Preprocessing - - -To account for heterogeneity across counties and time trends in all variables, we remove from them county-specific and time-specific effects in the following preprocessing. 
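Before the variable-by-variable loop below, here is a tiny self-contained illustration (simulated data, not the gun dataset) of what this within-transformation does: regressing a variable on county and year dummies and keeping the residual removes both sets of fixed effects.

```{r}
set.seed(1)
county <- factor(rep(1:10, each = 20))   # 10 illustrative counties
year <- factor(rep(1:20, times = 10))    # 20 illustrative years
v <- rnorm(200) + as.numeric(county) + as.numeric(year) / 10
v_tilde <- lm(v ~ county + year)$residuals
# residuals average to (numerically) zero within every county and every year
c(max_county_mean = max(abs(tapply(v_tilde, county, mean))),
  max_year_mean = max(abs(tapply(v_tilde, year, mean))))
```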
- -```{r} -##################### Find Variable Names from Dataset ###################### - -varlist <- function (df=NULL,type=c("numeric","factor","character"), pattern="", exclude=NULL) { - vars <- character(0) - if (any(type %in% "numeric")) { - vars <- c(vars,names(df)[sapply(df,is.numeric)]) - } - if (any(type %in% "factor")) { - vars <- c(vars,names(df)[sapply(df,is.factor)]) - } - if (any(type %in% "character")) { - vars <- c(vars,names(df)[sapply(df,is.character)]) - } - vars[(!vars %in% exclude) & grepl(vars,pattern=pattern)] -} - -############################# Create Variables ############################## - -# dummy variables for year and county fixed effects -fixed <- grep("X_Jfips", names(data), value=TRUE, fixed=TRUE) -year <- varlist(data, pattern="X_Tyear") - -# census control variables -census <- NULL -census_var <- c("^AGE", "^BN", "^BP", "^BZ", "^ED", "^EL","^HI", "^HS", "^INC", "^LF", "^LN", "^PI", "^PO", "^PP", "^PV", "^SPR", "^VS") - -for(i in 1:length(census_var)){ - census <- append(census, varlist(data, pattern=census_var[i])) -} - -################################ Variables ################################## -# treatment variable -d <- "logfssl" - -# outcome variable -y <- "logghomr" - -# other control variables -X1 <- c("logrobr", "logburg", "burg_missing", "robrate_missing") -X2 <- c("newblack", "newfhh", "newmove", "newdens", "newmal") - -######################## Partial out Fixed Effects ########################## - -# new dataset for partialled-out variables -rdata <- as.data.frame(data$CountyCode) -colnames(rdata) <- "CountyCode" - -# variables to partial out -varlist <- c(y, d,X1, X2, census) - -# partial out year and county fixed effect from variables in varlist -for(i in 1:length(varlist)){ - form <- as.formula(paste(varlist[i], "~", paste(paste(year,collapse="+"), paste(fixed,collapse="+"), sep="+"))) - rdata[, varlist[i]] <- lm(form, data)$residuals -} -``` - -Now, we can construct the treatment variable, the outcome variable and the matrix $Z$ that includes the control variables. - -```{r} -# treatment variable -D <- rdata[which(colnames(rdata) == d)] - -# outcome variable -Y <- rdata[which(colnames(rdata) == y)] - -# construct matrix Z -Z <- rdata[which(colnames(rdata) %in% c(X1,X2,census))] -dim(Z) -``` - -We have 195 control variables in total. The control variables $Z_{j,t}$ are from the U.S. Census Bureau and contain demographic and economic characteristics of the counties such as the age distribution, the income distribution, crime rates, federal spending, home ownership rates, house prices, educational attainment, voting paterns, employment statistics, and migration rates. - -```{r} -clu <- rdata[which(colnames(rdata) == "CountyCode")] # for clustering the standard errors -data <- data.frame(cbind(Y, D, Z,as.matrix(clu))) -``` - -```{r} -library(lfe) # linear group fixed effects package -``` - -## The effect of gun ownership - - -### OLS - - -After preprocessing the data, as a baseline model, we first look at simple regression of $Y_{j,t}$ on $D_{j,t-1}$ without controls. - -```{r} -# baseline_formula <- as.formula(paste(y, "~", d )) -# baseline.ols <- lm(baseline_formula,data=rdata) - -baseline.ols <- felm(logghomr ~ logfssl |0|0| CountyCode,data=data) # ols with clustered standard errors -est_baseline <- summary(baseline.ols)$coef[2,] -confint(baseline.ols)[2,] -est_baseline -``` - -The point estimate is $0.282$ with the confidence interval ranging from 0.155 to 0.41. 
This -suggests that increases in gun ownership rates are related to gun homicide rates - if gun ownership increases by 1% relative -to a trend then the predicted gun homicide rate goes up by 0.28%, without controlling for counties' characteristics. - -Since our goal is to estimate the effect of gun ownership after controlling for a rich set county characteristics, we next include the controls. First, we estimate the model by ols and then by an array of the modern regression methods using the double machine learning approach. - -```{r} -control_formula <- as.formula(paste("logghomr", "~", paste("logfssl",paste(colnames(Z),collapse="+"), - sep="+"),"|0|0| CountyCode")) -control.ols <- felm(control_formula,data=data) # fixed effects lm function -est_ols <- summary(control.ols)$coef[2,] -confint(control.ols)[2,] -est_ols -``` - -After controlling for a rich set of characteristics, the point estimate of gun ownership reduces to $0.19$. - - -# DML algorithm - -Here we perform inference on the predictive coefficient $\beta$ in our partially linear statistical model, - -$$ -Y = D\beta + g(Z) + \epsilon, \quad E (\epsilon | D, Z) = 0, -$$ - -using the **double machine learning** approach. - -For $\tilde Y = Y- E(Y|Z)$ and $\tilde D= D- E(D|Z)$, we can write -$$ -\tilde Y = \alpha \tilde D + \epsilon, \quad E (\epsilon |\tilde D) =0. -$$ - -Using cross-fitting, we employ modern regression methods -to build estimators $\hat \ell(Z)$ and $\hat m(Z)$ of $\ell(Z):=E(Y|Z)$ and $m(Z):=E(D|Z)$ to obtain the estimates of the residualized quantities: - -$$ -\tilde Y_i = Y_i - \hat \ell (Z_i), \quad \tilde D_i = D_i - \hat m(Z_i), \quad \text{ for each } i = 1,\dots,n. -$$ - -Finally, using ordinary least squares of $\tilde Y_i$ on $\tilde D_i$, we obtain the -estimate of $\beta$. - - -The following algorithm comsumes $Y, D, Z$, and a machine learning method for learning the residuals $\tilde Y$ and $\tilde D$, where the residuals are obtained by cross-validation (cross-fitting). Then, it prints the estimated coefficient $\beta$ and the corresponding standard error from the final OLS regression. 
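The final residual-on-residual regression is an application of the Frisch-Waugh-Lovell theorem. A self-contained check on simulated data (illustrative only, unrelated to the gun data) shows why regressing one residual on the other recovers the coefficient of interest:

```{r}
set.seed(42)
n <- 500
Zsim <- matrix(rnorm(n * 3), n, 3)                        # controls
Dsim <- drop(Zsim %*% c(1, -1, 0.5)) + rnorm(n)           # treatment
Ysim <- 2 * Dsim + drop(Zsim %*% rep(0.5, 3)) + rnorm(n)  # outcome
d_tilde <- lm(Dsim ~ Zsim)$residuals
y_tilde <- lm(Ysim ~ Zsim)$residuals
# the two coefficients on D agree exactly
c(full_regression = unname(coef(lm(Ysim ~ Dsim + Zsim))["Dsim"]),
  residual_on_residual = unname(coef(lm(y_tilde ~ d_tilde))["d_tilde"]))
```

The function defined next does the same thing, except that the residuals are formed with cross-fitted machine learning fits rather than in-sample OLS, which is what protects the final estimate from regularization and overfitting bias.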
- -```{r} -DML2.for.PLM <- function(z, d, y, dreg, yreg, nfold=2, clu) { - nobs <- nrow(z) # number of observations - foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] # define folds indices - I <- split(1:nobs, foldid) # split observation indices into folds - ytil <- dtil <- rep(NA, nobs) - cat("fold: ") - for(b in 1:length(I)){ - dfit <- dreg(z[-I[[b]],], d[-I[[b]]]) # take a fold out - yfit <- yreg(z[-I[[b]],], y[-I[[b]]]) # take a fold out - dhat <- predict(dfit, z[I[[b]],], type="response") # predict the left-out fold - yhat <- predict(yfit, z[I[[b]],], type="response") # predict the left-out fold - dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold - ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold - cat(b," ") - } - #rfit <- lm(ytil ~ dtil) # estimate the main parameter by regressing one residual on the other - data <- data.frame(cbind(ytil, dtil, as.matrix(clu))) - rfit <- felm(ytil ~ dtil|0|0|CountyCode,data=data) - coef.est <- coef(rfit)[2] # extract coefficient - #HC <- vcovHC(rfit) - se <- summary(rfit,robust=T)$coefficients[2,2] # record robust standard error by county - cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) # print output - return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil, rfit=rfit) ) # save output and residuals -} -``` - -Now, we apply the Double Machine Learning (DML) approach with different machine learning methods. First, we load the relevant libraries. - -```{r} -library(hdm) -library(glmnet) -library(sandwich) -library(randomForest) -``` - -Let us, construct the input matrices. - -```{r} -y <- as.matrix(Y) -d <- as.matrix(D) -z <- as.matrix(Z) -clu <- rdata[which(colnames(rdata) == "CountyCode")] -head(data.frame(cbind(y,d,as.matrix(clu)))) -``` - -In the following, we apply the DML approach with the different versions of lasso. 
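One detail worth noting before running the models (an aside based on glmnet's documented defaults): when a `cv.glmnet` object is passed to `DML2.for.PLM`, the generic `predict()` call inside the cross-fitting loop uses the penalty `s = "lambda.1se"` by default; the CV-minimizing penalty `lambda.min` would have to be requested explicitly.

```{r}
# quick look at the two candidate penalties on simulated inputs (illustrative)
xs <- matrix(rnorm(100 * 5), 100, 5)
ys <- rnorm(100)
fit_cv <- cv.glmnet(xs, ys, family = "gaussian", alpha = 1)
c(lambda_1se = fit_cv$lambda.1se, lambda_min = fit_cv$lambda.min)
# predict(fit_cv, newx = xs) uses lambda.1se;
# predict(fit_cv, newx = xs, s = "lambda.min") uses the CV-minimizing penalty
```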
- - - -## Lasso - -```{r} -# DML with Lasso: -set.seed(123) -dreg <- function(z,d){ rlasso(z,d, post=FALSE) } # ML method= lasso from hdm -yreg <- function(z,y){ rlasso(z,y, post=FALSE) } # ML method = lasso from hdm -DML2.lasso = DML2.for.PLM(z, d, y, dreg, yreg, nfold=10,clu) -``` - -```{r} -# DML with Post-Lasso: -dreg <- function(z,d){ rlasso(z,d, post=T) } # ML method= lasso from hdm -yreg <- function(z,y){ rlasso(z,y, post=T) } # ML method = lasso from hdm -DML2.post = DML2.for.PLM(z, d, y, dreg, yreg, nfold=10, clu) -``` - -```{r} -# DML with cross-validated Lasso: -dreg <- function(z,d){ cv.glmnet(z,d,family="gaussian", alpha=1) } # ML method = lasso from glmnet -yreg <- function(z,y){ cv.glmnet(z,y,family="gaussian", alpha=1) } # ML method = lasso from glmnet -DML2.lasso.cv = DML2.for.PLM(z, d, y, dreg, yreg, nfold=10, clu) - -dreg <- function(z,d){ cv.glmnet(z,d,family="gaussian", alpha=0.5) } # ML method = elastic net from glmnet -yreg <- function(z,y){ cv.glmnet(z,y,family="gaussian", alpha=0.5) } # ML method = elastic net from glmnet -DML2.elnet = DML2.for.PLM(z, d, y, dreg, yreg, nfold=10, clu) - -dreg <- function(z,d){ cv.glmnet(z,d,family="gaussian", alpha=0) } # ML method = ridge from glmnet -yreg <- function(z,y){ cv.glmnet(z,y,family="gaussian", alpha=0) } # ML method = ridge from glmnet -DML2.ridge = DML2.for.PLM(z, d, y, dreg, yreg, nfold=10, clu) -``` - -Here we also compute DML with OLS used as the ML method - -```{r} -dreg <- function(z,d){ glmnet(z,d,family="gaussian", lambda=0) } # ML method = ols from glmnet -yreg <- function(z,y){ glmnet(z,y,family="gaussian", lambda=0) } # ML method = ols from glmnet -DML2.ols = DML2.for.PLM(z, d, y, dreg, yreg, nfold=10, clu) -``` - -Next, we also apply Random Forest for comparison purposes. - - -### Random Forest - - -```{r} -# DML with Random Forest: -dreg <- function(z,d){ randomForest(z, d) } # ML method = random forest -yreg <- function(z,y){ randomForest(z, y) } # ML method = random forest -set.seed(1) -DML2.RF = DML2.for.PLM(z, d, y, dreg, yreg, nfold=2, clu) # set folds to 2 to limit computation time -``` - -We conclude that the gun ownership rates are related to gun homicide rates - if gun ownership increases by 1% relative -to a trend then the predicted gun homicide rate goes up by about 0.20% controlling for counties' characteristics. - - -Finally, let's see which method is best. We compute RMSE for predicting D and Y, and see which -of the methods works better. - - -```{r} -mods<- list(DML2.ols, DML2.lasso, DML2.post, DML2.lasso.cv, DML2.ridge, DML2.elnet, DML2.RF) - -RMSE.mdl<- function(mdl) { -RMSEY <- sqrt(mean(mdl$ytil)^2) -RMSED <- sqrt(mean(mdl$dtil)^2) -return( list(RMSEY=RMSEY, RMSED=RMSED)) -} - -#RMSE.mdl(DML2.lasso) -#DML2.lasso$ytil - -Res<- lapply(mods, RMSE.mdl) - -prRes.Y<- c( Res[[1]]$RMSEY,Res[[2]]$RMSEY, Res[[3]]$RMSEY, Res[[4]]$RMSEY, Res[[5]]$RMSEY, Res[[6]]$RMSEY, Res[[7]]$RMSEY) -prRes.D<- c( Res[[1]]$RMSED,Res[[2]]$RMSED, Res[[3]]$RMSED, Res[[4]]$RMSED, Res[[5]]$RMSED, Res[[6]]$RMSED, Res[[7]]$RMSED) - -prRes<- rbind(prRes.Y, prRes.D); -rownames(prRes)<- c("RMSE D", "RMSE Y"); -colnames(prRes)<- c("OLS", "Lasso", "Post-Lasso", "CV Lasso", "CV Ridge", "CV Elnet", "RF") -print(prRes,digit=6) -``` - -It looks like the best method for predicting D is Lasso, and the best method for predicting Y is CV Ridge. 
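One caveat before relying on the comparison above: the `RMSE.mdl` helper computes `sqrt(mean(resid)^2)`, i.e. the absolute value of the mean residual, rather than the root mean squared residual, and the row labels in the printed table are swapped (the first row actually holds the outcome residuals). A corrected sketch, reusing the `mods` list defined above:

```{r}
rmse_corrected <- function(mdl) {
  c(RMSE_Y = sqrt(mean(mdl$ytil^2)), RMSE_D = sqrt(mean(mdl$dtil^2)))
}
pr_res <- sapply(mods, rmse_corrected)
colnames(pr_res) <- c("OLS", "Lasso", "Post-Lasso", "CV Lasso", "CV Ridge", "CV Elnet", "RF")
print(pr_res, digits = 6)
```

The choice of "best" learners in the next chunk should be read against these corrected residual summaries.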
- - -```{r} -dreg <- function(z,d){ rlasso(z,d, post=T) } # ML method = lasso from hdm -yreg <- function(z,y){ cv.glmnet(z,y,family="gaussian", alpha=0) } # ML method = ridge from glmnet -DML2.best= DML2.for.PLM(z, d, y, dreg, yreg, nfold=10, clu) -``` - -Let's organize the results in a table. - -```{r} -library(xtable) - -table <- matrix(0,9,2) -table[1,1] <- as.numeric(est_baseline[1]) -table[2,1] <- as.numeric(est_ols[1]) -table[3,1] <- as.numeric(DML2.lasso$coef.est) -table[4,1] <- as.numeric(DML2.post$coef.est) -table[5,1] <-as.numeric(DML2.lasso.cv$coef.est) -table[6,1] <-as.numeric(DML2.elnet$coef.est) -table[7,1] <-as.numeric(DML2.ridge$coef.est) -table[8,1] <-as.numeric(DML2.RF$coef.est) -table[9,1] <-as.numeric(DML2.best$coef.est) -table[1,2] <- as.numeric(est_baseline[2]) -table[2,2] <- as.numeric(est_ols[2]) -table[3,2] <- as.numeric(DML2.lasso$se) -table[4,2] <- as.numeric(DML2.post$se) -table[5,2] <-as.numeric(DML2.lasso.cv$se) -table[6,2] <-as.numeric(DML2.elnet$se) -table[7,2] <-as.numeric(DML2.ridge$se) -table[8,2] <-as.numeric(DML2.RF$se) -table[9,2] <-as.numeric(DML2.best$se) - -# print results -colnames(table) <- c("Estimate","Standard Error") -rownames(table) <- c("Baseline OLS", "Least Squares with controls", "Lasso", "Post-Lasso", "CV Lasso","CV Elnet", "CV Ridge", "Random Forest", - "Best") -table -``` - -```{r} -print(table, digit=3) -``` - -```{r} -tab<- xtable(table, digits=3) -print(tab, type="latex") -``` diff --git a/T/deprecated/dml-inference-using-nn-for-gun-ownership.irnb.Rmd b/T/deprecated/dml-inference-using-nn-for-gun-ownership.irnb.Rmd deleted file mode 100644 index 621999bf..00000000 --- a/T/deprecated/dml-inference-using-nn-for-gun-ownership.irnb.Rmd +++ /dev/null @@ -1,172 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - - -This notebook contains an example for teaching. - - -# The Effect of Gun Ownership on Gun-Homicide Rates using DML for neural nets - - -In this lab, we estimate the effect of gun ownership on the homicide rate using a neural network. - -```{r} -library(keras) -library(lfe) -``` - -First, we need to load and preprocess the data. 
- -```{r} -# read in dataset -data <- read.csv("../input/gun-example/gun_clean.csv") - - -################## Find Variable Names from the Dataset ################### - -varlist <- function (df=NULL,type=c("numeric","factor","character"), pattern="", exclude=NULL) { - vars <- character(0) - if (any(type %in% "numeric")) { - vars <- c(vars,names(df)[sapply(df,is.numeric)]) - } - if (any(type %in% "factor")) { - vars <- c(vars,names(df)[sapply(df,is.factor)]) - } - if (any(type %in% "character")) { - vars <- c(vars,names(df)[sapply(df,is.character)]) - } - vars[(!vars %in% exclude) & grepl(vars,pattern=pattern)] -} - -########################### Create Variables ############################## - -# dummy variables for year and county fixed effects -fixed <- grep("X_Jfips", names(data), value=TRUE, fixed=TRUE) -year <- varlist(data, pattern="X_Tyear") - -# census control variables -census <- NULL -census_var <- c("^AGE", "^BN", "^BP", "^BZ", "^ED", "^EL","^HI", "^HS", "^INC", "^LF", "^LN", "^PI", "^PO", "^PP", "^PV", "^SPR", "^VS") - -for(i in 1:length(census_var)){ - census <- append(census, varlist(data, pattern=census_var[i])) -} - -############################### Variables ################################# - -# treatment variable -d <- "logfssl" - -# outcome variable -y <- "logghomr" - -# other control variables -X1 <- c("logrobr", "logburg", "burg_missing", "robrate_missing") -X2 <- c("newblack", "newfhh", "newmove", "newdens", "newmal") - -###################### Partial-out Fixed Effects ######################### - -# new dataset for partialled-out variables -rdata <- as.data.frame(data$CountyCode) -colnames(rdata) <- "CountyCode" - -# variables to partial-out -varlist <- c(y, d,X1, X2, census) - -# partial out year and county fixed effects from variables in varlist -for(i in 1:length(varlist)){ - form <- as.formula(paste(varlist[i], "~", paste(paste(year,collapse="+"), paste(fixed,collapse="+"), sep="+"))) - rdata[, varlist[i]] <- lm(form, data)$residuals -} -``` - -# DML for neural nets - - - -The following algorithm consumes $Y$,$D$ and $Z$, and learns the residuals $\tilde{Y}$ and $\tilde{D}$ via a neural network, where the residuals are obtained by cross-validation (cross-fitting). Then, it prints the estimated coefficient $\beta$ and the clustered standard error from the final OLS regression. 
- -```{r} -DML2.for.NN <- function(z, d, y, nfold=2, clu, num_epochs, batch_size) { - nobs <- nrow(z) # number of observations - foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] # define folds indices - I <- split(1:nobs, foldid) # split observation indices into folds - ytil <- dtil <- rep(NA, nobs) - cat("fold: ") - for(b in 1:length(I)){ - # normalize the data - mean <- apply(z[-I[[b]],], 2, mean) - std <- apply(z[-I[[b]],], 2, sd) - z[-I[[b]],] <- scale(z[-I[[b]],], center = mean, scale = std) - z[I[[b]],] <- scale(z[I[[b]],], center = mean, scale = std) - # building the model with 3 layers, the ReLU activation function, mse loss and rmsprop optimizer - build_model <- function(){ - model <- keras_model_sequential() %>% - layer_dense(units = 16, activation = "relu", - input_shape = dim(z[-I[[b]],][2]))%>% - layer_dense(units = 16, activation = "relu") %>% - layer_dense(units = 1) - - model %>% compile( - optimizer = "rmsprop", - loss = "mse", - metrics = c("mae") - ) - } - model.Y <- build_model() - model.D <- build_model() - # fitting the model - model.D %>% fit(z[-I[[b]],], d[-I[[b]]], - epochs = num_epochs, batch_size = batch_size, verbose = 0) - model.Y %>% fit(z[-I[[b]],], y[-I[[b]]], - epochs = num_epochs, batch_size = batch_size, verbose = 0) - dhat <- model.D %>% predict(z[I[[b]],]) - yhat <- model.Y %>% predict(z[I[[b]],]) - dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold - ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold - cat(b," ") - } - #rfit <- lm(ytil ~ dtil) # estimate the main parameter by regressing one residual on the other - data <- data.frame(cbind(ytil, dtil, as.matrix(clu))) - rfit <- felm(ytil ~ dtil|0|0|CountyCode,data=data) - coef.est <- coef(rfit)[2] # extract the coefficient - #HC <- vcovHC(rfit) - se <- summary(rfit,robust=T)$coefficients[2,2] # record robust standard error by county - cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) # print the output - return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil, rfit=rfit) ) # save the output and residuals -} -``` - -# Estimating the effect with DML for neural nets - -```{r} -# treatment variable -D <- rdata[which(colnames(rdata) == d)] -# outcome variable -Y <- rdata[which(colnames(rdata) == y)] -# construct matrix Z -Z <- rdata[which(colnames(rdata) %in% c(X1,X2,census))] - -# inputs -y_nn <- as.matrix(Y) -d_nn <- as.matrix(D) -z_nn <- as.matrix(Z) -clu <- rdata[which(colnames(rdata) == "CountyCode")] -``` - -```{r} -# DML with a NN -set.seed(123) -DML2.nn = DML2.for.NN(z_nn, d_nn, y_nn, nfold=2, clu, 100, 10) -``` diff --git a/T/deprecated/r-weak-iv-experiments.irnb.Rmd b/T/deprecated/r-weak-iv-experiments.irnb.Rmd deleted file mode 100644 index 68758644..00000000 --- a/T/deprecated/r-weak-iv-experiments.irnb.Rmd +++ /dev/null @@ -1,92 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - -# A Simple Example of Properties of IV estimator when Instruments are Weak - - -Simulation Design - -```{r} -# Simulation Design - -library(hdm) -set.seed(1) -B= 10000 # trials -IVEst = rep(0, B) -n=100 -beta = .25 # .2 weak IV -#beta = 1 # 1 strong IV - - -U = rnorm(n) -Z = rnorm(n) #generate instrument -D = beta*Z + U #generate endogenougs variable -Y = D+ U # the true causal effect is 1 - - -summary(lm(D~Z)) # first stage is very weak here - -summary(tsls(x=NULL, d=D, y=Y, 
z=Z)) # - -``` - -Note that the instrument is weak here (contolled by $\beta$) -- the t-stat is less than 4. - - -# Run 1000 trials to evaluate distribution of the IV estimator - -```{r} -# Simulation Design - -set.seed(1) -B= 10000 # trials -IVEst = rep(0, B) - -for(i in 1:B){ -U = rnorm(n) -Z = rnorm(n) #generate instrument -D = beta*Z + U #generate endogenougs variable -Y = D+ U # the true causal effect is 1 -IVEst[i] = coef(tsls(x=NULL, d=D, y=Y, z=Z))[1,1] -} - - -``` - -# Plot the Actual Distribution against the Normal Approximation (based on Strong Instrument Assumption) - -```{r} -plot(density(IVEst-1, n=1000, from=-5, to=5),col=4, xlim= c(-5, 5), - xlab= "IV Estimator -True Effect", main="Actual Distribution vs Gaussian") - -val=seq(-5, 5, by=.05) -var = (1/beta^2)*(1/100) # theoretical variance of IV -sd = sqrt(var) -lines(val, dnorm(val, sd=sd), col=2, lty=2) - -rejection.frequency = sum(( abs(IVEst-1)/sd > 1.96))/B - -cat(c("Rejection Frequency is ", rejection.frequency, " while we expect it to be .05")) - -``` - -# Some Help Functions - -```{r} -help(tsls) -``` - -```{r} -help(density) -``` diff --git a/T/dml-for-conditional-average-treatment-effect.Rmd b/T/dml-for-conditional-average-treatment-effect.Rmd deleted file mode 100644 index 2dad6251..00000000 --- a/T/dml-for-conditional-average-treatment-effect.Rmd +++ /dev/null @@ -1,610 +0,0 @@ ---- -title: An R Markdown document converted from "T/dml-for-conditional-average-treatment-effect.irnb" -output: html_document ---- - -# DML for CATE - -This is a simple demonstration of Debiased Machine Learning estimator for the Conditional Average Treatment Effect. -Goal is to estimate the effect of 401(k) eligibility on net financial assets for each value of income. -The method is based on the following paper. - -* Title: Debiased Machine Learning of Conditional Average Treatment Effect and Other Causal Functions - -* Authors: Semenova, Vira and Chernozhukov, Victor. - -* Arxiv version: https://arxiv.org/pdf/1702.06240.pdf - -* Published version with replication code: https://academic.oup.com/ectj/advance-article/doi/10.1093/ectj/utaa027/5899048 - -Background - -The target function is Conditional Average Treatment Effect, defined as - -$$ g(x)=E [ Y(1) - Y(0) |X=x], $$ - -where $Y(1)$ and $Y(0)$ are potential outcomes in treated and control group. In our case, $Y(1)$ is the potential Net Financial Assets if a subject is eligible for 401(k), and $Y(0)$ is the potential Net Financial Assets if a subject is ineligible. $X$ is a covariate of interest, in this case, income. -$ g(x)$ shows expected effect of eligibility on NET TFA for a subject whose income level is $x$. - - - -If eligibility indicator is independent of $Y(1), Y(0)$, given pre-401-k assignment characteristics $Z$, the function can expressed in terms of observed data (as opposed to hypothetical, or potential outcomes). Observed data consists of realized NET TFA $Y = D Y(1) + (1-D) Y(0)$, eligibility indicator $D$, and covariates $Z$ which includes $X$, income. The expression for $g(x)$ is - -$$ g(x) = E [ Y (\eta_0) \mid X=x], $$ -where the transformed outcome variable is - -$$Y (\eta) = \dfrac{D}{s(Z)} \left( Y - \mu(1,Z) \right) - \dfrac{1-D}{1-s(Z)} \left( Y - \mu(0,Z) \right) + \mu(1,Z) - \mu(0,Z),$$ - -the probability of eligibility is - -$$s_0(z) = Pr (D=1 \mid Z=z),$$ - -the expected net financial asset given $D =d \in \{1,0\}$ and $Z=z$ is - -$$ \mu(d,z) = E[ Y \mid Z=z, D=d]. $$ - -Our goal is to estimate $g(x)$. 
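To connect the transformed outcome above to the code that follows, the same expression can be written as a small R helper (a sketch; the argument names mirror the estimated nuisance objects `s_hat`, `mu1_hat`, `mu0_hat` constructed later in the notebook):

```{r}
# doubly robust / orthogonal signal Y(eta) for given nuisance estimates
aipw_signal <- function(y, d, s_hat, mu1_hat, mu0_hat) {
  d / s_hat * (y - mu1_hat) - (1 - d) / (1 - s_hat) * (y - mu0_hat) + mu1_hat - mu0_hat
}
```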
- - -In step 1, we estimate the unknown functions $s_0(z), \mu(1,z), \mu(0,z)$ and plug them into $Y (\eta)$. - - -In step 2, we approximate the function $g(x)$ by a linear combination of basis functions: - -$$ g(x) = p(x)' \beta_0, $$ - - -where $p(x)$ is a vector of polynomials or splines and - -$$ \beta_0 = (E p(X) p(X))^{-1} E p(X) Y (\eta_0) $$ - -is the best linear predictor. We report - -$$ -\widehat{g}(x) = p(x)' \widehat{\beta}, -$$ - -where $\widehat{\beta}$ is the ordinary least squares estimate of $\beta_0$ defined on the random sample $(X_i, D_i, Y_i)_{i=1}^N$ - -$$ - \widehat{\beta} :=\left( \dfrac{1}{N} \sum_{i=1}^N p(X_i) p(X_i)' \right)^{-1} \dfrac{1}{N} \sum_{i=1}^N p(X_i)Y_i(\widehat{\eta}) -$$ - - - - - - - -```{r} -## load packages -install.packages("foreign") -install.packages("quantreg") -install.packages("splines") -install.packages("lattice") -install.packages("Hmisc") -install.packages("fda") -install.packages("hdm") -install.packages("randomForest") -install.packages("ranger") -install.packages("sandwich") -install.packages("ggplot2") -``` - -```{r} -library(foreign) -library(quantreg) -library(splines) -library(lattice) -library(Hmisc) -library(fda) -library(hdm) -library(randomForest) -library(ranger) -library(sandwich) -library(ggplot2) -``` - -```{r} -## 401k dataset -data(pension) -pension$net_tfa <- pension$net_tfa / 10000 -## covariate of interest -- log income -- -pension$inc <- log(pension$inc) -pension <- pension[!is.na(pension$inc) & pension$inc != -Inf & pension$inc != Inf, ] - - -## outcome variable -- total net financial assets -Y <- pension$net_tfa -## binary treatment -- indicator of 401(k) eligibility -D <- pension$e401 - - -X <- pension$inc -## target parameter is CATE = E[ Y(1) - Y(0) | X] - - -## raw covariates so that Y(1) and Y(0) are independent of D given Z -Z <- pension[, c( - "age", "inc", "fsize", "educ", "male", "db", "marr", "twoearn", "pira", "hown", "hval", "hequity", "hmort", - "nohs", "hs", "smcol" -)] - - -y_name <- "net_tfa" -d_name <- "e401" -form_z <- paste("(poly(age, 6) + poly(inc, 8) + poly(educ, 4) + poly(fsize,2) + as.factor(marr) + ", - "as.factor(twoearn) + as.factor(db) + as.factor(pira) + as.factor(hown))^2") -cat(sprintf("\n sample size is %g \n", length(Y))) -cat(sprintf("\n num raw covariates z is %g \n", dim(Z)[2])) -``` - -In Step 1, we estimate three functions: - -1. probability of treatment assignment $s_0(z)$ - -2.-3. regression functions $\mu_0(1,z)$ and $\mu_0(0,z)$. - -We use the cross-fitting procedure with $K=2$ holds. For definition of cross-fitting with $K$ folds, check the sample splitting in ```DML2.for.PLM``` function defined in https://www.kaggle.com/victorchernozhukov/debiased-ml-for-partially-linear-model-in-r - -For each function, we try random forest. 
- -First Stage: estimate $\mu_0(1,z)$ and $\mu_0(0,z)$ and $s_0(z)$ by lasso - -```{r} -first_stage_lasso <- function(data, d_name, y_name, form_z, seed = 1) { - # Sample size - N <- dim(data)[1] - # Estimated regression function in control group - mu0_hat <- rep(1, N) - # Estimated regression function in treated group - mu1_hat <- rep(1, N) - # Propensity score - s_hat <- rep(1, N) - seed <- 1 - ## define sample splitting - set.seed(seed) - inds_train <- sample(1:N, floor(N / 2)) - inds_eval <- setdiff(1:N, inds_train) - - print("Estimate treatment probability, first half") - ## conditional probability of 401 k eligibility (i.e., propensity score) based on random forest - fitted_lasso_pscore <- rlassologit(as.formula(paste0(d_name, "~", form_z)), data = data[inds_train, ]) - - s_hat[inds_eval] <- predict(fitted_lasso_pscore, data[inds_eval, ], type = "response") - print("Estimate treatment probability, second half") - fitted_lasso_pscore <- rlassologit(as.formula(paste0(d_name, "~", form_z)), data = data[inds_eval, ]) - s_hat[inds_train] <- predict(fitted_lasso_pscore, data[inds_train, ], type = "response") - - - data1 <- data - data1[, d_name] <- 1 - - data0 <- data - data0[, d_name] <- 0 - - print("Estimate expectation function, first half") - fitted_lasso_mu <- rlasso(as.formula(paste0(y_name, "~", d_name, "+(", form_z, ")")), data = data[inds_train, ]) - mu1_hat[inds_eval] <- predict(fitted_lasso_mu, data1[inds_eval, ]) - mu0_hat[inds_eval] <- predict(fitted_lasso_mu, data0[inds_eval, ]) - - print("Estimate expectation function, second half") - fitted_lasso_mu <- rlasso(as.formula(paste0(y_name, "~", d_name, "+(", form_z, ")")), data = data[inds_eval, ]) - mu1_hat[inds_train] <- predict(fitted_lasso_mu, data1[inds_train, ]) - mu0_hat[inds_train] <- predict(fitted_lasso_mu, data0[inds_train, ]) - - return(list( - mu1_hat = mu1_hat, - mu0_hat = mu0_hat, - s_hat = s_hat - )) -} -``` - -First Stage: estimate $\mu_0(1,z)$ and $\mu_0(0,z)$ and $s_0(z)$ by random forest - -```{r} -first_stage_rf <- function(Y, D, Z, seed = 1) { - # Sample size - N <- length(D) - # Estimated regression function in control group - mu0_hat <- rep(1, N) - # Estimated regression function in treated group - mu1_hat <- rep(1, N) - # Propensity score - s_hat <- rep(1, N) - - - ## define sample splitting - set.seed(seed) - inds_train <- sample(1:N, floor(N / 2)) - inds_eval <- setdiff(1:N, inds_train) - - print("Estimate treatment probability, first half") - ## conditional probability of 401 k eligibility (i.e., propensity score) based on random forest - Df <- as.factor(as.character(D)) - fitted_rf_pscore <- randomForest(Z, Df, subset = inds_train) - s_hat[inds_eval] <- predict(fitted_rf_pscore, Z[inds_eval, ], type = "prob")[, 2] - print("Estimate treatment probability, second half") - fitted_rf <- randomForest(Z, Df, subset = inds_eval) - s_hat[inds_train] <- predict(fitted_rf_pscore, Z[inds_train, ], type = "prob")[, 2] - - ## conditional expected net financial assets (i.e., regression function) based on random forest - - covariates <- cbind(Z, D) - - covariates1 <- cbind(Z, D = rep(1, N)) - covariates0 <- cbind(Z, D = rep(0, N)) - - print("Estimate expectation function, first half") - fitted_rf_mu <- randomForest(cbind(Z, D), Y, subset = inds_train) - mu1_hat[inds_eval] <- predict(fitted_rf_mu, covariates1[inds_eval, ]) - mu0_hat[inds_eval] <- predict(fitted_rf_mu, covariates0[inds_eval, ]) - - print("Estimate expectation function, second half") - fitted_rf_mu <- randomForest(cbind(Z, D), Y, subset = inds_eval) - 
mu1_hat[inds_train] <- predict(fitted_rf_mu, covariates1[inds_train, ]) - mu0_hat[inds_train] <- predict(fitted_rf_mu, covariates0[inds_train, ]) - - return(list( - mu1_hat = mu1_hat, - mu0_hat = mu0_hat, - s_hat = s_hat - )) -} -``` - -In Step 2, we approximate $Y(\eta_0)$ by a vector of basis functions. There are two use cases: -**** -2.A. Group Average Treatment Effect, described above - - -2.B. Average Treatment Effect conditional on income value. There are three smoothing options: - -1. splines offered in ```least_squares_splines``` - -2. orthogonal polynomials with the highest degree chosen by cross-validation ```least_squares_series``` - -3. standard polynomials with the highest degree input by user ```least_squares_series_old``` - - -The default option is option 3. - -2.A. The simplest use case of Conditional Average Treatment Effect is GATE, or Group Average Treatment Effect. Partition the support of income as - -$$ - \infty = \ell_0 < \ell_1 < \ell_2 \dots \ell_K = \infty $$ - -define intervals $I_k = [ \ell_{k-1}, \ell_{k})$. Let $X$ be income covariate. For $X$, define a group indicator - -$$ G_k(X) = 1[X \in I_k], $$ - -and the vector of basis functions - -$$ p(X) = (G_1(X), G_2(X), \dots, G_K(X)) $$ - -Then, the Best Linear Predictor $\beta_0$ vector shows the average treatment effect for each group. - -```{r} -## estimate first stage functions by random forest -## may take a while -fs_hat_rf <- first_stage_rf(Y, D, Z) -``` - -```{r} -X <- pension$inc -fs_hat <- fs_hat_rf -min_cutoff <- 0.01 -# regression function -mu1_hat <- fs_hat[["mu1_hat"]] -mu0_hat <- fs_hat[["mu0_hat"]] -# propensity score -s_hat <- fs_hat[["s_hat"]] -s_hat <- sapply(s_hat, max, min_cutoff) - -### Construct Orthogonal Signal -RobustSignal <- (Y - mu1_hat) * D / s_hat - (Y - mu0_hat) * (1 - D) / (1 - s_hat) + mu1_hat - mu0_hat -``` - -```{r} -qtmax <- function(C, S = 10000, alpha) { - p <- nrow(C) - tmaxs <- apply(abs(matrix(rnorm(p * S), nrow = p, ncol = S)), 2, max) - return(quantile(tmaxs, 1 - alpha)) -} - -# This function computes the square root of a symmetric matrix using the spectral decomposition; - -group_average_treatment_effect <- function(X, Y, max_grid = 5, alpha = 0.05, B = 10000) { - grid <- quantile(X, probs = c((0:max_grid) / max_grid)) - Xraw <- matrix(NA, nrow = length(Y), ncol = length(grid) - 1) - - for (k in 2:((length(grid)))) { - Xraw[, k - 1] <- sapply(X, function(x) ifelse(x >= grid[k - 1] & x < grid[k], 1, 0)) - } - k <- length(grid) - Xraw[, k - 1] <- sapply(X, function(x) ifelse(x >= grid[k - 1] & x <= grid[k], 1, 0)) - - ols_fit <- lm(Y ~ Xraw - 1) - coefs <- coef(ols_fit) - vars <- names(coefs) - hcv_coefs <- vcovHC(ols_fit, type = "HC") - coefs_se <- sqrt(diag(hcv_coefs)) # White std errors - ## this is an identity matrix - ## qtmax is simplified - c_coefs <- (diag(1 / sqrt(diag(hcv_coefs)))) %*% hcv_coefs %*% (diag(1 / sqrt(diag(hcv_coefs)))) - - - tes <- coefs - tes_se <- coefs_se - tes_cor <- c_coefs - crit_val <- qtmax(tes_cor, B, alpha) - - tes_ucb <- tes + crit_val * tes_se - tes_lcb <- tes - crit_val * tes_se - - tes_uci <- tes + qnorm(1 - alpha / 2) * tes_se - tes_lci <- tes + qnorm(alpha / 2) * tes_se - - - return(list( - beta_hat = coefs, ghat_lower_point = tes_lci, ghat_upper_point = tes_uci, - ghat_lower = tes_lcb, ghat_upper = tes_ucb, crit_val = crit_val - )) -} -``` - -```{r} -res <- group_average_treatment_effect(X = X, Y = RobustSignal) -``` - -```{r} -## this code is taken from L1 14.382 taught at MIT -## author: Mert Demirer -options(repr.plot.width = 
10, repr.plot.height = 8) - -tes <- res$beta_hat -tes_lci <- res$ghat_lower_point -tes_uci <- res$ghat_upper_point - -tes_lcb <- res$ghat_lower -tes_ucb <- res$ghat_upper -tes_lev <- c("0%-20%", "20%-40%", "40%-60%", "60%-80%", "80%-100%") - -plot(c(1, 5), las = 2, xlim = c(0.6, 5.4), ylim = c(.05, 2.09), type = "n", xlab = "Income group", - ylab = "Average Effect on NET TFA (per 10 K)", - main = "Group Average Treatment Effects on NET TFA", - xaxt = "n") -axis(1, at = 1:5, labels = tes_lev) -for (i in 1:5) { - rect(i - 0.2, tes_lci[i], i + 0.2, tes_uci[i], col = NA, border = "red", lwd = 3) - rect(i - 0.2, tes_lcb[i], i + 0.2, tes_ucb[i], col = NA, border = 4, lwd = 3) - segments(i - 0.2, tes[i], i + 0.2, tes[i], lwd = 5) -} -abline(h = 0) - -legend(2.5, 2.0, - c("Regression Estimate", "95% Simultaneous Confidence Interval", "95% Pointwise Confidence Interval"), - col = c(1, 4, 2), lwd = c(4, 3, 3), horiz = FALSE, bty = "n", cex = 0.8) - -dev.off() -``` - -```{r} -least_squares_splines <- function(X, Y, max_knot = 9, norder, nderiv, ...) { - ## Create technical regressors - cv_bsp <- rep(0, max_knot - 1) - for (knot in 2:max_knot) { - breaks <- quantile(X, c(0:knot) / knot) - formula.bsp <- Y ~ bsplineS(X, breaks = breaks, norder = norder, nderiv = nderiv)[, -1] - fit <- lm(formula.bsp) - cv_bsp[knot - 1] <- sum((fit$res / (1 - hatvalues(fit)))^2) - } - ## Number of knots chosen by cross-validation - cv_knot <- which.min(cv_bsp) + 1 - breaks <- quantile(X, c(0:cv_knot) / cv_knot) - formula.bsp <- Y ~ bsplineS(X, breaks = breaks, norder = norder, nderiv = 0)[, -1] - fit <- lm(formula.bsp) - - return(list(cv_knot = cv_knot, fit = fit)) -} - - -least_squares_series <- function(X, Y, max_degree, ...) { - cv_pol <- rep(0, max_degree) - for (degree in 1:max_degree) { - formula.pol <- Y ~ poly(X, degree) - fit <- lm(formula.pol) - cv_pol[degree] <- sum((fit$res / (1 - hatvalues(fit)))^2) - } - ## Number of knots chosen by cross-validation - cv_degree <- which.min(cv_pol) - ## Estimate coefficients - formula.pol <- Y ~ poly(X, cv_degree) - fit <- lm(formula.pol) - - return(list(fit = fit, cv_degree = cv_degree)) -} -``` - -```{r} -msqrt <- function(C) { - Ceig <- eigen(C) - return(Ceig$vectors %*% diag(sqrt(Ceig$values)) %*% solve(Ceig$vectors)) -} - -tboot <- function(regressors_grid, omega_hat, alpha, B = 10000) { - numerator_grid <- regressors_grid %*% msqrt(omega_hat) - denominator_grid <- sqrt(diag(regressors_grid %*% omega_hat %*% t(regressors_grid))) - - norm_numerator_grid <- numerator_grid - for (k in seq_len(dim(numerator_grid)[1])) { - norm_numerator_grid[k, ] <- numerator_grid[k, ] / denominator_grid[k] - } - - tmaxs <- apply(abs(norm_numerator_grid %*% matrix(rnorm(dim(numerator_grid)[2] * B), - nrow = dim(numerator_grid)[2], ncol = B)), 2, max) - return(quantile(tmaxs, 1 - alpha)) -} -``` - -```{r} -second_stage <- function(fs_hat, Y, D, X, max_degree = 3, norder = 4, nderiv = 0, - ss_method = "poly", min_cutoff = 0.01, alpha = 0.05, eps = 0.1, ...) 
{ - x_grid <- seq(min(X), max(X), eps) - mu1_hat <- fs_hat[["mu1_hat"]] - mu0_hat <- fs_hat[["mu0_hat"]] - # propensity score - s_hat <- fs_hat[["s_hat"]] - s_hat <- sapply(s_hat, max, min_cutoff) - ### Construct Orthogonal Signal - - RobustSignal <- (Y - mu1_hat) * D / s_hat - (Y - mu0_hat) * (1 - D) / (1 - s_hat) + mu1_hat - mu0_hat - - # Estimate the target function using least squares series - if (ss_method == "ortho_poly") { - res <- least_squares_series(X = X, Y = RobustSignal, eps = 0.1, max_degree = max_degree) - fit <- res$fit - cv_degree <- res$cv_degree - regressors_grid <- cbind(rep(1, length(x_grid)), poly(x_grid, cv_degree)) - } - if (ss_method == "splines") { - res <- least_squares_splines(X = X, Y = RobustSignal, eps = 0.1, norder = norder, nderiv = nderiv) - fit <- res$fit - cv_knot <- res$cv_knot - breaks <- quantile(X, c(0:cv_knot) / cv_knot) - regressors_grid <- cbind(rep(1, length(x_grid)), - bsplineS(x_grid, breaks = breaks, norder = norder, nderiv = nderiv)[, -1]) - degree <- cv_knot - } - - g_hat <- regressors_grid %*% coef(fit) - - hcv_coefs <- vcovHC(fit, type = "HC") - standard_error <- sqrt(diag(regressors_grid %*% hcv_coefs %*% t(regressors_grid))) - - ### Lower Pointwise CI - ghat_lower_point <- g_hat + qnorm(alpha / 2) * standard_error - ### Upper Pointwise CI - ghat_upper_point <- g_hat + qnorm(1 - alpha / 2) * standard_error - - max_tstat <- tboot(regressors_grid = regressors_grid, omega_hat = hcv_coefs, alpha = alpha) - - ## Lower Uniform CI - ghat_lower <- g_hat - max_tstat * standard_error - ## Upper Uniform CI - ghat_upper <- g_hat + max_tstat * standard_error - return(list( - ghat_lower = ghat_lower, g_hat = g_hat, ghat_upper = ghat_upper, fit = fit, - ghat_lower_point = ghat_lower_point, ghat_upper_point = ghat_upper_point, x_grid = x_grid - )) -} -``` - -```{r} -make_plot <- function(res, lowy, highy, degree, ss_method = "series", uniform = TRUE, ...) 
{ - title <- paste0("Effect of 401(k) on Net TFA, ", ss_method) - x_grid <- res$x_grid - len <- length(x_grid) - - - if (uniform) { - group <- c(rep("UCI", len), rep("PCI", len), rep("Estimate", len), rep("PCIL", len), rep("UCIL", len)) - group_type <- c(rep("CI", len), rep("CI", len), rep("Estimate", len), rep("CI", len), rep("CI", len)) - group_ci_type <- c(rep("Uniform", len), rep("Point", len), - rep("Uniform", len), rep("Point", len), rep("Uniform", len)) - - df <- data.frame(income = rep(x_grid, 5), - outcome = c(res$ghat_lower, res$ghat_lower_point, - res$g_hat, res$ghat_upper_point, res$ghat_upper), - group = group, group_col = group_type, group_line = group_ci_type) - p <- ggplot(data = df) + - aes(x = exp(income), y = outcome, colour = group) + - theme_bw() + - xlab("Income") + - ylab("Net TFA, (thousand dollars)") + - scale_colour_manual(values = c("black", "blue", "blue", "blue", "blue")) + - theme(plot.title = element_text(hjust = 0.5), text = element_text(size = 20, family = "serif")) + - theme(legend.title = element_blank()) + - theme(legend.position = "none") + - ylim(low = lowy, high = highy) + - geom_line(aes(linetype = group_line), size = 1.5) + - scale_linetype_manual(values = c("dashed", "solid")) + - ggtitle(title) - } - - if (!uniform) { - group <- c(rep("PCI", len), rep("Estimate", len), rep("PCIL", len)) - group_type <- c(rep("CI", len), rep("Estimate", len), rep("CI", len)) - group_ci_type <- c(rep("Point", len), rep("Uniform", len), rep("Point", len)) - - df <- data.frame(income = rep(x_grid, 3), - outcome = c(res$ghat_lower_point, res$g_hat, res$ghat_upper_point), - group = group, group_col = group_type, group_line = group_ci_type) - - p <- ggplot(data = df) + - aes(x = exp(income), y = outcome, colour = group) + - theme_bw() + - xlab("Income") + - ylab("Net TFA, (thousand dollars)") + - scale_colour_manual(values = c("black", "blue", "blue", "blue", "blue")) + - theme(plot.title = element_text(hjust = 0.5), text = element_text(size = 20, family = "serif")) + - theme(legend.title = element_blank()) + - theme(legend.position = "none") + - ylim(low = lowy, high = highy) + - geom_line(aes(linetype = group_line), size = 1.5) + - scale_linetype_manual(values = c("dashed", "solid")) + - ggtitle(title) - } - - return(p) -} -``` - -```{r} -res_ortho_rf_splines <- second_stage(fs_hat = fs_hat_rf, X = X, D = D, Y = Y, - ss_method = "splines", max_degree = 3) -``` - -```{r} -res_ortho_rf_ortho_poly <- second_stage(fs_hat = fs_hat_rf, X = X, D = D, Y = Y, - ss_method = "ortho_poly", max_degree = 3) -``` - -plot findings - --- black solid line shows estimated function $p(x)' \widehat{\beta}$ - --- blue dashed lines show pointwise confidence bands for this function - -```{r} -p <- make_plot(res_ortho_rf_ortho_poly, ss_method = "ortho_poly", uniform = FALSE, lowy = -10, highy = 20) -options(repr.plot.width = 15, repr.plot.height = 10) -print(p) -``` - -plot findings: - --- black solid line shows estimated function $p(x)' \widehat{\beta}$ - --- blue dashed lines show pointwise confidence bands for this function. I.e., for each fixed point $x_0$, i.e., $x_0=1$, they cover $p(x_0)'\beta_0$ with probability 0.95 - --- blue solid lines show uniform confidence bands for this function. 
I.e., they cover the whole function $x \rightarrow p(x)'\beta_0$ with probability 0.95 on some compact range - -```{r} -p <- make_plot(res_ortho_rf_ortho_poly, ss_method = "ortho polynomials", uniform = TRUE, lowy = -10, highy = 25) -options(repr.plot.width = 15, repr.plot.height = 10) -print(p) -``` - -```{r} -p <- make_plot(res_ortho_rf_splines, ss_method = "splines", uniform = FALSE, lowy = -15, highy = 10) -options(repr.plot.width = 15, repr.plot.height = 10) -print(p) -``` - -```{r} -p <- make_plot(res_ortho_rf_splines, ss_method = "splines", uniform = TRUE, lowy = -20, highy = 20) -options(repr.plot.width = 15, repr.plot.height = 10) -print(p) -``` - diff --git a/deprecated/CM4/Old/debiased-ml-for-partially-linear-iv-model-in-r.irnb.Rmd b/deprecated/CM4/Old/debiased-ml-for-partially-linear-iv-model-in-r.irnb.Rmd deleted file mode 100644 index 7d506973..00000000 --- a/deprecated/CM4/Old/debiased-ml-for-partially-linear-iv-model-in-r.irnb.Rmd +++ /dev/null @@ -1,264 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - - -# Double/Debiased ML for Partially Linear IV Model - -References: - -https://arxiv.org/abs/1608.00060 - - -https://www.amazon.com/Business-Data-Science-Combining-Accelerate/dp/1260452778 - -The code is based on the book. - - - - - -# Partially Linear IV Model - -We consider the partially linear structural equation model: -\begin{eqnarray} - & Y - D\theta_0 = g_0(X) + \zeta, & E[\zeta \mid Z,X]= 0,\\ - & Z = m_0(X) + V, & E[V \mid X] = 0. -\end{eqnarray} - - -Note that this model is not a regression model unless $Z=D$. The model is a canonical -model in causal inference, going back to P. Wright's work on IV methods for estimaing demand/supply equations, with the modern difference being that $g_0$ and $m_0$ are nonlinear, potentially complicated functions of high-dimensional $X$. - - -The idea of this model is that there is a structural or causal relation between $Y$ and $D$, captured by $\theta_0$, and $g_0(X) + \zeta$ is the stochastic error, partly explained by covariates $X$. $V$ and $\zeta$ are stochastic errors that are not explained by $X$. Since $Y$ and $D$ are jointly determined, we need an external factor, commonly referred to as an instrument, $Z$ to create exogenous variation -in $D$. Note that $Z$ should affect $D$. The $X$ here serve again as confounding factors, so we can think of variation in $Z$ as being exogenous only conditional on $X$. - - -The causal DAG this model corresponds to is given by: -$$ -Z \to D, X \to (Y, Z, D), L \to (Y,D), -$$ -where $L$ is the latent confounder affecting both $Y$ and $D$, but not $Z$. - - - ---- - -# Example - -A simple contextual example is from biostatistics, where $Y$ is a health outcome and $D$ is indicator of smoking. Thus, $\theta_0$ is captures the effect of smoking on health. Health outcome $Y$ and smoking behavior $D$ are treated as being jointly determined. $X$ represents patient characteristics, and $Z$ could be a doctor's advice not to smoke (or another behavioral treatment) that may affect the outcome $Y$ only through shifting the behavior $D$, conditional on characteristics $X$. 
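Before moving to the estimation machinery, a self-contained toy simulation may help fix ideas (all numbers are illustrative and, for readability, $g_0$ and $m_0$ are taken linear; the point of the DML procedure below is precisely to allow them to be nonlinear). OLS of $Y$ on $D$ and $X$ is biased because of the latent confounder, while using $Z$ as an instrument recovers $\theta_0$:

```{r}
# toy PLIV data generating process (assumes the AER package used below is installed)
set.seed(1)
n <- 5000
theta0 <- 0.5
X <- rnorm(n)
L <- rnorm(n)                          # latent confounder: enters D and Y, not Z
Z <- X + rnorm(n)                      # instrument, exogenous given X
D <- 0.8 * Z + X + L + rnorm(n)        # endogenous treatment
Y <- theta0 * D + X + L + rnorm(n)     # structural outcome equation
c(ols = unname(coef(lm(Y ~ D + X))["D"]),                     # biased upward
  tsls = unname(coef(AER::ivreg(Y ~ D + X | Z + X))["D"]))    # close to theta0
```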
- ----- - - - -# PLIVM in Residualized Form - -The PLIV model above can be rewritten in the following residualized form: -$$ - \tilde Y = \tilde D \theta_0 + \zeta, \quad E[\zeta \mid V,X]= 0, -$$ -where -$$ - \tilde Y = (Y- \ell_0(X)), \quad \ell_0(X) = E[Y \mid X] \\ - \tilde D = (D - r_0(X)), \quad r_0(X) = E[D \mid X] \\ - \tilde Z = (Z- m_0(X)), \quad m_0(X) = E[Z \mid X]. -$$ - The tilded variables above represent original variables after taking out or "partialling out" - the effect of $X$. Note that $\theta_0$ is identified from this equation if $V$ - and $U$ have non-zero correlation, which automatically means that $U$ and $V$ - must have non-zero variation. - - - ------ - -# DML for PLIV Model - -Given identification, DML proceeds as follows - -Compute the estimates $\hat \ell_0$, $\hat r_0$, and $\hat m_0$ , which amounts -to solving the three problems of predicting $Y$, $D$, and $Z$ using -$X$, using any generic ML method, giving us estimated residuals -$$ -\tilde Y = Y - \hat \ell_0(X), \\ \tilde D= D - \hat r_0(X), \\ \tilde Z = Z- \hat m_0(X). -$$ -The estimates should be of a cross-validated form, as detailed in the algorithm below. - -Estimate $\theta_0$ by the the intstrumental -variable regression of $\tilde Y$ on $\tilde D$ using $\tilde Z$ as an instrument. -Use the conventional inference for the IV regression estimator, ignoring -the estimation error in these residuals. - -The reason we work with this residualized form is that it eliminates the bias -arising when solving the prediction problem in stage 1. The role of cross-validation -is to avoid another source of bias due to potential overfitting. - -The estimator is adaptive, -in the sense that the first stage estimation errors do not affect the second -stage errors. - - - -```{r _kg_hide-output=TRUE} -install.packages("hdm") -install.packages("AER") -install.packages("randomForest") -``` - -```{r} - -library(AER) #applied econometrics library -library(randomForest) #random Forest library -library(hdm) #high-dimensional econometrics library -library(glmnet) #glm net - - -# DML for PLIVM - -DML2.for.PLIVM <- function(x, d, z, y, dreg, yreg, zreg, nfold=2) { - # this implements DML2 algorithm, where there moments are estimated via DML, before constructing - # the pooled estimate of theta randomly split data into folds - nobs <- nrow(x) - foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] - I <- split(1:nobs, foldid) - # create residualized objects to fill - ytil <- dtil <- ztil<- rep(NA, nobs) - # obtain cross-fitted residuals - cat("fold: ") - for(b in 1:length(I)){ - dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out - zfit <- zreg(x[-I[[b]],], z[-I[[b]]]) #take a fold out - yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a folot out - dhat <- predict(dfit, x[I[[b]],], type="response") #predict the fold out - zhat <- predict(zfit, x[I[[b]],], type="response") #predict the fold out - yhat <- predict(yfit, x[I[[b]],], type="response") #predict the fold out - dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual - ztil[I[[b]]] <- (z[I[[b]]] - zhat) #record residual - ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial - cat(b," ") - } - ivfit= tsls(y=ytil,d=dtil, x=NULL, z=ztil, intercept=FALSE) - coef.est <- ivfit$coef #extract coefficient - se <- ivfit$se #record standard error - cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) - return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil, ztil=ztil) ) -} - - -``` - - ------ - -# Emprical Example: Acemoglu, Jonsohn, Robinson (AER). 
- - -* Y is log GDP; -* D is a measure of Protection from Expropriation, a proxy for -quality of insitutions; -* Z is the log of Settler's mortality; -* W are geographical variables (latitude, latitude squared, continent dummies as well as interactions) - - - - -```{r} - -data(AJR); - -y = AJR$GDP; -d = AJR$Exprop; -z = AJR$logMort -xraw= model.matrix(~ Latitude+ Africa+Asia + Namer + Samer, data=AJR) -x = model.matrix(~ -1 + (Latitude + Latitude2 + Africa + - Asia + Namer + Samer)^2, data=AJR) -dim(x) - -# DML with Random Forest -cat(sprintf("\n DML with Random Forest \n")) - -dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest -yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest -zreg<- function(x,z){ randomForest(x, z)} #ML method=Forest - -set.seed(1) -DML2.RF = DML2.for.PLIVM(xraw, d, z, y, dreg, yreg, zreg, nfold=20) - -# DML with PostLasso -cat(sprintf("\n DML with Post-Lasso \n")) - -dreg <- function(x,d){ rlasso(x, d) } #ML method=lasso -yreg <- function(x,y){ rlasso(x, y) } #ML method=lasso -zreg<- function(x,z){ rlasso(x, z)} #ML method=lasso - -set.seed(1) -DML2.lasso = DML2.for.PLIVM(x, d, z, y, dreg, yreg, zreg, nfold=20) - - -# Compare Forest vs Lasso - -comp.tab= matrix(NA, 3, 2) -comp.tab[1,] = c( sqrt(mean((DML2.RF$ytil)^2)), sqrt(mean((DML2.lasso$ytil)^2)) ) -comp.tab[2,] = c( sqrt(mean((DML2.RF$dtil)^2)), sqrt(mean((DML2.lasso$dtil)^2)) ) -comp.tab[3,] = c( sqrt(mean((DML2.RF$ztil)^2)), sqrt(mean((DML2.lasso$ztil)^2)) ) -rownames(comp.tab) = c("RMSE for Y:", "RMSE for D:", "RMSE for Z:") -colnames(comp.tab) = c("RF", "LASSO") -print(comp.tab, digits=3) -``` - -# Examine if we have weak instruments - -```{r} -install.packages("lfe") -library(lfe) -summary(felm(DML2.lasso$dtil~DML2.lasso$ztil), robust=T) -summary(felm(DML2.RF$dtil~DML2.RF$ztil), robust=T) -``` - -# We do have weak instruments, because t-stats in regression $\tilde D \sim \tilde Z$ are less than 4 in absolute value - - -So let's carry out DML inference combined with Anderson-Rubin Idea - -```{r} -# DML-AR (DML with Anderson-Rubin) - -DML.AR.PLIV<- function(rY, rD, rZ, grid, alpha=.05){ - n = length(rY) - Cstat = rep(0, length(grid)) - for (i in 1:length(grid)) { - Cstat[i] <- n* (mean( (rY - grid[i]*rD)*rZ) )^2/var ( (rY - grid[i]*rD) * rZ ) - }; - LB<- min(grid[ Cstat < qchisq(1-alpha,1)]); - UB <- max(grid[ Cstat < qchisq(1-alpha,1)]); - plot(range(grid),range(c( Cstat)) , type="n",xlab="Effect of institutions", ylab="Statistic", main=" "); - lines(grid, Cstat, lty = 1, col = 1); - abline(h=qchisq(1-alpha,1), lty = 3, col = 4); - abline(v=LB, lty = 3, col = 2); - abline(v=UB, lty = 3, col = 2); - return(list(UB=UB, LB=LB)) - } - -``` - -```{r} -DML.AR.PLIV(rY = DML2.lasso$ytil, rD= DML2.lasso$dtil, rZ= DML2.lasso$ztil, - grid = seq(-2, 2, by =.01)) - - -DML.AR.PLIV(rY = DML2.RF$ytil, rD= DML2.RF$dtil, rZ= DML2.RF$ztil, - grid = seq(-2, 2, by =.01)) - -``` diff --git a/deprecated/CM4/Old/sensitivity-analysis-with-sensmakr-and-debiased-ml.irnb.Rmd b/deprecated/CM4/Old/sensitivity-analysis-with-sensmakr-and-debiased-ml.irnb.Rmd deleted file mode 100644 index d0691fcf..00000000 --- a/deprecated/CM4/Old/sensitivity-analysis-with-sensmakr-and-debiased-ml.irnb.Rmd +++ /dev/null @@ -1,265 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.2' - jupytext_version: 1.13.7 - kernelspec: - display_name: R - language: R - name: ir ---- - - -# Sensititivy Analysis for Unobserved Confounder with DML and Sensmakr - - -## Here we 
experiment with using package "sensemakr" in conjunction with debiased ML - - -![Screen%20Shot%202021-04-02%20at%204.53.15%20PM.png](attachment:Screen%20Shot%202021-04-02%20at%204.53.15%20PM.png) - - -![Screen%20Shot%202021-04-02%20at%205.01.36%20PM.png](attachment:Screen%20Shot%202021-04-02%20at%205.01.36%20PM.png) - - -## We will - -## * mimic the partialling out procedure with machine learning tools, - -## * and invoke Sensmakr to compute $\phi^2$ and plot sensitivity results. - - -```{r _uuid="051d70d956493feee0c6d64651c6a088724dca2a", _execution_state="idle"} -# loads package -install.packages("sensemakr") -library(sensemakr) - -# loads data -data("darfur") - -``` - -Data is described here -https://cran.r-project.org/web/packages/sensemakr/vignettes/sensemakr.html - -The main outcome is attitude towards peace -- the peacefactor. -The key variable of interest is whether the responders were directly harmed (directlyharmed). -We want to know if being directly harmed in the conflict causes people to support peace-enforcing measures. -The measured confounders include female indicator, age, farmer, herder, voted in the past, and household size. -There is also a village indicator, which we will treat as fixed effect and partial it out before conducting -the analysis. The standard errors will be clustered at the village level. - - -# Take out village fixed effects and run basic linear analysis - -```{r} -#get rid of village fixed effects - -attach(darfur) -library(lfe) - -peacefactorR<- lm(peacefactor~village)$res -directlyharmedR<- lm(directlyharmed~village)$res -femaleR<- lm(female~village)$res -ageR<- lm(age~village)$res -farmerR<- lm(farmer_dar~village)$res -herderR<- lm(herder_dar~village)$res -pastvotedR<- lm(pastvoted~village)$res -hhsizeR<- lm(hhsize_darfur~village)$res - - -# Preliminary linear model analysis - -summary(felm(peacefactorR~ directlyharmedR+ femaleR + - ageR + farmerR+ herderR + pastvotedR + hhsizeR |0|0|village)) - -# here we are clustering standard errors at the village level - - -summary(felm(peacefactorR~ femaleR + - ageR + farmerR+ herderR + pastvotedR + hhsizeR |0|0|village)) - -# here we are clustering standard errors at the village level - - - -summary(felm(directlyharmedR~ femaleR + - ageR + farmerR+ herderR + pastvotedR + hhsizeR |0|0|village)) - -# here we are clustering standard errors at the village level - - -``` - -# We first use Lasso for Partilling Out Controls - -```{r} -library(hdm) - - -resY = rlasso(peacefactorR ~ (femaleR + - ageR + farmerR+ herderR + pastvotedR + hhsizeR)^3, post=F)$res - -resD = rlasso(directlyharmedR ~ (femaleR + - ageR + farmerR + herderR + pastvotedR + hhsizeR)^3 , post=F)$res - - -print(c("Controls explain the following fraction of variance of Outcome", 1-var(resY)/var(peacefactorR))) - - -print(c("Controls explain the following fraction of variance of Treatment", 1-var(resD)/var(directlyharmedR))) - - - -library(lfe) - - -dml.darfur.model= felm(resY ~ resD|0|0|village) # cluster SEs by village - -summary(dml.darfur.model,robust=T) #culster SE by village - -dml.darfur.model= lm(resY ~ resD) #lineaer model to use as input in sensemakr - - - - -``` - -# Manual Bias Analysis - -```{r} -# Main estimate - -beta = dml.darfur.model$coef[2] - -# Hypothetical values of partial R2s - -R2.YC = .16; R2.DC = .01 - -# Elements of the formal - -kappa<- (R2.YC * R2.DC)/(1- R2.DC) - -varianceRatio<- mean(dml.darfur.model$res^2)/mean(dml.darfur.model$res^2) - -# Compute square bias - -BiasSq <- kappa*varianceRatio - -# Compute absolute value 
of the bias - -print(sqrt(BiasSq)) - - -# plotting - -gridR2.DC<- seq(0,.3, by=.001) - -gridR2.YC<- kappa*(1 - gridR2.DC)/gridR2.DC - -gridR2.YC<- ifelse(gridR2.YC> 1, 1, gridR2.YC); - - - -plot(gridR2.DC, gridR2.YC, type="l", col=4, xlab="Partial R2 of Treatment with Confounder", - ylab="Partial R2 of Outcome with Confounder", - main= c("Combo of R2 such that |Bias|< ", round(sqrt(BiasSq), digits=4)) -) - - - - - - -``` - -# Bias Analysis with Sensemakr - -```{r} - -dml.darfur.sensitivity <- sensemakr(model = dml.darfur.model, - treatment = "resD") -summary(dml.darfur.sensitivity) - -plot(dml.darfur.sensitivity, nlevels = 15) - -``` - -# Next We use Random Forest as ML tool for Partialling Out - - -The following code does DML with clsutered standard errors by ClusterID - -```{r} -DML2.for.PLM <- function(x, d, y, dreg, yreg, nfold=2, clusterID) { - nobs <- nrow(x) #number of observations - foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices - I <- split(1:nobs, foldid) #split observation indices into folds - ytil <- dtil <- rep(NA, nobs) - cat("fold: ") - for(b in 1:length(I)){ - dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out - yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a foldt out - dhat <- predict(dfit, x[I[[b]],], type="response") #predict the left-out fold - yhat <- predict(yfit, x[I[[b]],], type="response") #predict the left-out fold - dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold - ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold - cat(b," ") - } - rfit <- felm(ytil ~ dtil |0|0|clusterID) #get clustered standard errors using felm - rfitSummary<- summary(rfit) - coef.est <- rfitSummary$coef[2] #extract coefficient - se <- rfitSummary$coef[2,2] #record robust standard error - cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) #printing output - return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil) ) #save output and residuals -} -``` - -```{r} -library(randomForest) #random Forest library - -``` - -```{r} - -x= model.matrix(~ femaleR + ageR + farmerR + herderR + pastvotedR + hhsizeR) - -dim(x) - -d= directlyharmedR - -y = peacefactorR; - -#DML with Random Forest: -dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest -yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest -set.seed(1) -DML2.RF = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10, clusterID=village) - - -resY = DML2.RF$ytil - -resD = DML2.RF$dtil - - -print(c("Controls explain the following fraction of variance of Outcome", max(1-var(resY)/var(peacefactorR),0))) - - -print(c("Controls explain the following fraction of variance of Treatment", max(1-var(resD)/var(directlyharmedR),0))) - - - -dml.darfur.model= lm(resY~resD) - - -dml.darfur.sensitivity <- sensemakr(model = dml.darfur.model, - treatment = "resD") -summary(dml.darfur.sensitivity) - -plot(dml.darfur.sensitivity,nlevels = 15) - - -``` From 70324bddd06b291ac215a9e4b1dc09ac0c81feef Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 05:09:23 -0700 Subject: [PATCH 228/261] Rename check-and-transform-R-notebooks.yml to check-R-notebooks.yml --- ...{check-and-transform-R-notebooks.yml => check-R-notebooks.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{check-and-transform-R-notebooks.yml => check-R-notebooks.yml} (100%) diff --git a/.github/workflows/check-and-transform-R-notebooks.yml b/.github/workflows/check-R-notebooks.yml similarity index 100% rename from 
.github/workflows/check-and-transform-R-notebooks.yml rename to .github/workflows/check-R-notebooks.yml From 8661df371f4b7c42c414c6d47a4503bd68bc3b49 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 05:09:44 -0700 Subject: [PATCH 229/261] Rename python-notebooks.yml to check-python-notebooks.yml --- .../{python-notebooks.yml => check-python-notebooks.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{python-notebooks.yml => check-python-notebooks.yml} (100%) diff --git a/.github/workflows/python-notebooks.yml b/.github/workflows/check-python-notebooks.yml similarity index 100% rename from .github/workflows/python-notebooks.yml rename to .github/workflows/check-python-notebooks.yml From 842cd183bc752f4db1af9904c9d6e0a97fe47aed Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 05:09:59 -0700 Subject: [PATCH 230/261] Rename strip-python-notebooks.yml to transform-notebooks.yml --- .../{strip-python-notebooks.yml => transform-notebooks.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{strip-python-notebooks.yml => transform-notebooks.yml} (100%) diff --git a/.github/workflows/strip-python-notebooks.yml b/.github/workflows/transform-notebooks.yml similarity index 100% rename from .github/workflows/strip-python-notebooks.yml rename to .github/workflows/transform-notebooks.yml From c38efaa1f426f2d77d1532b1e6da0b2045a6fcf7 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 05:43:14 -0700 Subject: [PATCH 231/261] re-committing old deprecated .Rmd files --- ...yzing-rct-reemployment-experiment.irnb.Rmd | 198 +++++++++++++ ...book-analyzing-rct-with-precision.irnb.Rmd | 120 ++++++++ CM1/Old/r-notebook-some-rct-examples.irnb.Rmd | 259 ++++++++++++++++++ 3 files changed, 577 insertions(+) create mode 100644 CM1/Old/analyzing-rct-reemployment-experiment.irnb.Rmd create mode 100644 CM1/Old/r-notebook-analyzing-rct-with-precision.irnb.Rmd create mode 100644 CM1/Old/r-notebook-some-rct-examples.irnb.Rmd diff --git a/CM1/Old/analyzing-rct-reemployment-experiment.irnb.Rmd b/CM1/Old/analyzing-rct-reemployment-experiment.irnb.Rmd new file mode 100644 index 00000000..4cd8137f --- /dev/null +++ b/CM1/Old/analyzing-rct-reemployment-experiment.irnb.Rmd @@ -0,0 +1,198 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + +# Analyzing RCT data with Precision Adjustment + + +## Data + +In this lab, we analyze the Pennsylvania re-employment bonus experiment, which was previously studied in "Sequential testing of duration data: the case of the Pennsylvania ‘reemployment bonus’ experiment" (Bilias, 2000), among others. These experiments were conducted in the 1980s by the U.S. Department of Labor to test the incentive effects of alternative compensation schemes for unemployment insurance (UI). In these experiments, UI claimants were randomly assigned either to a control group or one of five treatment groups. Actually, there are six treatment groups in the experiments. Here we focus on treatment group 4, but feel free to explore other treatment groups. In the control group the current rules of the UI applied. Individuals in the treatment groups were offered a cash bonus if they found a job within some pre-specified period of time (qualification period), provided that the job was retained for a specified duration. 
The treatments differed in the level of the bonus, the length of the qualification period, and whether the bonus was declining over time in the qualification period; see http://qed.econ.queensu.ca/jae/2000-v15.6/bilias/readme.b.txt for further details on data. + + +```{r} +## loading the data +Penn <- as.data.frame(read.table("../input/reemployment-experiment/penn_jae.dat", header=T )) +n <- dim(Penn)[1] +p_1 <- dim(Penn)[2] +Penn<- subset(Penn, tg==4 | tg==0) +attach(Penn) +``` + +```{r} +T4<- (tg==4) +summary(T4) +``` + +```{r} +head(Penn) +``` + + +### Model +To evaluate the impact of the treatments on unemployment duration, we consider the linear regression model: + +$$ +Y = D \beta_1 + W'\beta_2 + \varepsilon, \quad E \varepsilon (D,W')' = 0, +$$ + +where $Y$ is the log of duration of unemployment, $D$ is a treatment indicators, and $W$ is a set of controls including age group dummies, gender, race, number of dependents, quarter of the experiment, location within the state, existence of recall expectations, and type of occupation. Here $\beta_1$ is the ATE, if the RCT assumptions hold rigorously. + + +We also consider interactive regression model: + +$$ +Y = D \alpha_1 + D W' \alpha_2 + W'\beta_2 + \varepsilon, \quad E \varepsilon (D,W', DW')' = 0, +$$ +where $W$'s are demeaned (apart from the intercept), so that $\alpha_1$ is the ATE, if the RCT assumptions hold rigorously. + + +Under RCT, the projection coefficient $\beta_1$ has +the interpretation of the causal effect of the treatment on +the average outcome. We thus refer to $\beta_1$ as the average +treatment effect (ATE). Note that the covariates, here are +independent of the treatment $D$, so we can identify $\beta_1$ by +just linear regression of $Y$ on $D$, without adding covariates. +However we do add covariates in an effort to improve the +precision of our estimates of the average treatment effect. + + +### Analysis + +We consider + +* classical 2-sample approach, no adjustment (CL) +* classical linear regression adjustment (CRA) +* interactive regression adjusment (IRA) + +and carry out robust inference using the *estimatr* R packages. + + +# Carry out covariate balance check + + +This is done using "lm_robust" command which unlike "lm" in the base command automatically does the correct Eicher-Huber-White standard errors, instead othe classical non-robus formula based on the homoscdedasticity command. + +```{r} + +m <- lm(T4~(female+black+othrace+factor(dep)+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)^2) +library(lmtest) +library(sandwich) +coeftest(m, vcov = vcovHC(m, type="HC1")) + +``` + +We see that that even though this is a randomized experiment, balance conditions are failed. + + +# Model Specification + +```{r} +# model specifications + + +# no adjustment (2-sample approach) +formula_cl <- log(inuidur1)~T4 + +# adding controls +formula_cra <- log(inuidur1)~T4+ (female+black+othrace+factor(dep)+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)^2 +# Omitted dummies: q1, nondurable, muld + + +ols.cl <- lm(formula_cl) +ols.cra <- lm(formula_cra) + + +ols.cl = coeftest(ols.cl, vcov = vcovHC(ols.cl, type="HC1")) +ols.cra = coeftest(ols.cra, vcov = vcovHC(ols.cra, type="HC1")) + +print(ols.cl) +print(ols.cra) + + +``` + +The interactive specificaiton corresponds to the approach introduced in Lin (2013). 
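A quick way to see why the covariates are demeaned in the interactive specification: under random assignment $W$ is independent of $D$, so averaging the model over $W$ gives
$$
E[Y \mid D=1] - E[Y \mid D=0] = \alpha_1 + E[W]'\alpha_2 = \alpha_1,
$$
since demeaning makes $E[W]=0$. The interaction terms therefore leave the estimand unchanged; they only absorb outcome heterogeneity and can improve precision.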
+ +```{r} + +#interactive regression model; + +X = model.matrix (~ (female+black+othrace+factor(dep)+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)^2)[,-1] +dim(X) +demean<- function(x){ x - mean(x)} +X = apply(X, 2, demean) + +ols.ira = lm(log(inuidur1) ~ T4*X) +ols.ira= coeftest(ols.ira, vcov = vcovHC(ols.ira, type="HC1")) +print(ols.ira) + + + +``` + +Next we try out partialling out with lasso + +```{r} +library(hdm) + +T4 = demean(T4) + +DX = model.matrix(~T4*X)[,-1] + +rlasso.ira = summary(rlassoEffects(DX, log(inuidur1), index = 1)) + + +print(rlasso.ira) + + +``` + +### Results + +```{r} +str(ols.ira) +ols.ira[2,1] +``` + +```{r} +library(xtable) +table<- matrix(0, 2, 4) +table[1,1]<- ols.cl[2,1] +table[1,2]<- ols.cra[2,1] +table[1,3]<- ols.ira[2,1] +table[1,4]<- rlasso.ira[[1]][1] + +table[2,1]<- ols.cl[2,2] +table[2,2]<- ols.cra[2,2] +table[2,3]<- ols.ira[2,2] +table[2,4]<- rlasso.ira[[1]][2] + + +colnames(table)<- c("CL","CRA","IRA", "IRA w Lasso") +rownames(table)<- c("estimate", "standard error") +tab<- xtable(table, digits=5) +tab + +print(tab, type="latex", digits=5) +``` + + +Treatment group 4 experiences an average decrease of about $7.8\%$ in the length of unemployment spell. + + +Observe that regression estimators delivers estimates that are slighly more efficient (lower standard errors) than the simple 2 mean estimator, but essentially all methods have very similar standard errors. From IRA results we also see that there is not any statistically detectable heterogeneity. We also see the regression estimators offer slightly lower estimates -- these difference occur perhaps to due minor imbalance in the treatment allocation, which the regression estimators try to correct. + + + + diff --git a/CM1/Old/r-notebook-analyzing-rct-with-precision.irnb.Rmd b/CM1/Old/r-notebook-analyzing-rct-with-precision.irnb.Rmd new file mode 100644 index 00000000..667b07d3 --- /dev/null +++ b/CM1/Old/r-notebook-analyzing-rct-with-precision.irnb.Rmd @@ -0,0 +1,120 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + +# Analyzing RCT with Precision by Adjusting for Baseline Covariates + + +# Jonathan Roth's DGP + +Here we set up a DGP with heterogenous effects. In this example, which is due to Jonathan Roth, we have +$$ +E [Y(0) | Z] = - Z, \quad E [Y(1) |Z] = Z, \quad Z \sim N(0,1). +$$ +The CATE is +$$ +E [Y(1) - Y(0) | Z ]= 2 Z. +$$ +and the ATE is +$$ +2 E Z = 0. +$$ + +We would like to estimate the ATE as precisely as possible. + +An economic motivation for this example could be provided as follows: Let D be the treatment of going to college, and $Z$ academic skills. Suppose that academic skills cause lower earnings Y(0) in jobs that don't require college degree, and cause higher earnings Y(1) in jobs that require college degrees. This type of scenario is reflected in the DGP set-up above. 
+ + + +```{r} +# generate the simulated dataset +set.seed(123) # set MC seed +n = 1000 # sample size +Z = rnorm(n) # generate Z +Y0 = -Z + rnorm(n) # conditional average baseline response is -Z +Y1 = Z + rnorm(n) # conditional average treatment effect is +Z +D = (runif(n)<.2) # treatment indicator; only 20% get treated +Y = Y1*D + Y0*(1-D) # observed Y +D = D - mean(D) # demean D +Z = Z-mean(Z) # demean Z +``` + +# Analyze the RCT data with Precision Adjustment + +Consider the follow regression models: + +* classical 2-sample approach, no adjustment (CL) +* classical linear regression adjustment (CRA) +* interactive regression adjusment (IRA) + +We carry out inference using heteroskedasticity robust inference, using the sandwich formulas for variance (Eicker-Huber-White). + +We observe that the CRA delivers estimates that are less efficient than the CL (pointed out by Freedman), whereas the IRA delivers a more efficient approach (pointed out by Lin). In order for the CRA to be more efficient than the CL, we need the CRA to be a correct model of the conditional expectation function of Y given D and X, which is not the case here. + +```{r} +# implement each of the models on the simulated data +CL = lm(Y ~ D) +CRA = lm(Y ~ D+ Z) #classical +IRA = lm(Y ~ D+ Z+ Z*D) #interactive approach + +# we are interested in the coefficients on variable "D". +library(sandwich) # heterokedasticity robust standard errors +library(lmtest) # coefficient testing +coeftest(CL, vcov = vcovHC(CL, type="HC1")) +coeftest(CRA, vcov = vcovHC(CRA, type="HC1")) +coeftest(IRA, vcov = vcovHC(IRA, type="HC1")) +``` + +# Using classical standard errors (non-robust) is misleading here. + +We don't teach non-robust standard errors in econometrics courses, but the default statistical inference for lm() procedure in R, summary.lm(), still uses 100-year old concepts, perhaps in part due to historical legacy. + +Here the non-robust standard errors suggest that there is not much difference between the different approaches, contrary to the conclusions reached using the robust standard errors. + + +```{r} +summary(CL) +summary(CRA) +summary(IRA) +``` + +# Verify Asymptotic Approximations Hold in Finite-Sample Simulation Experiment + +```{r _uuid="051d70d956493feee0c6d64651c6a088724dca2a", _execution_state="idle"} +set.seed(123) +n = 1000 +B= 1000 + +CLs = rep(0, B) +CRAs = rep(0, B) +IRAs = rep(0, B) + +for ( i in 1:B){ + Z = rnorm(n) + Y0 = -Z + rnorm(n) + Y1 = Z + rnorm(n) + Z = Z - mean(Z) + D = (runif(n)<.1) + D = D- mean(D) + Y = Y1*D + Y0*(1-D) + CLs[i]= lm(Y ~ D)$coef[2] + CRAs[i] = lm(Y ~ D+ Z)$coef[2] + IRAs[i] = lm(Y ~ D+ Z+ Z*D)$coef[2] + } + +print("Standard deviations for estimators") + +sqrt(mean(CLs^2)) +sqrt(mean(CRAs^2)) +sqrt(mean(IRAs^2)) +``` diff --git a/CM1/Old/r-notebook-some-rct-examples.irnb.Rmd b/CM1/Old/r-notebook-some-rct-examples.irnb.Rmd new file mode 100644 index 00000000..baed1868 --- /dev/null +++ b/CM1/Old/r-notebook-some-rct-examples.irnb.Rmd @@ -0,0 +1,259 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + +This notebook contains some RCT examples for teaching. + + + +# Polio RCT + +One of the earliest randomized experiments were the Polio vaccination trials conducted by the Public Health Service in 1954. The question was whether Salk vaccine prevented polio. 
Children in the study were randomly assigned either a treatment (polio vaccine shot) or a placebo (saline solution shot), without knowing which one they received. The doctors in the study, making the diagnosis, did not know whether a child received a vaccine or not. In other words, the trial was a double-blind, randomized controlled trial. The trial had to be large because the rate at which polio occurred in the population was 50 per 100,000. The treatment group saw 33 polio cases per 200,745; the control group saw 115 cases per 201,229. The estimated average treatment effect is about +$$ +-40 +$$ +with the 95% confidence band (based on approximate normality of the two sample means and their differences): +$$[-52, -28].$$ +As this is an RCT, the confidence band suggests that the Polio vaccine **caused** the reduction in the risk of polio. + +The interesting thing here is that we don't need the underlying individual data to evaluate the effectiveness of the vaccine. This is because the outcomes are Bernoulli random variables, and we have enough information to compute the estimate of the ATE as well as the confidence intervals from the group case counts. + +We also compute the Vaccine Efficacy metric, which refers to the following measure according to the [CDC](https://www.cdc.gov/csels/dsepd/ss1978/lesson3/section6.html): +$$ +VE = \frac{\text{Risk for Unvaccinated - Risk for Vaccinated}}{\text{Risk for Unvaccinated}}. +$$ +It describes the relative reduction in risk caused by vaccination. + + +It is straightforward to get the VE estimate by just plugging in the numbers, but how do we get the approximate variance estimate? I am too lazy to do calculations for the delta method, so I will just use a simulation (a form of approximate bootstrap) to obtain the confidence intervals. 
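For the record, the delta-method calculation that the simulation below sidesteps takes only a few lines. This is a sketch assuming the two arms are independent binomial samples; writing $VE = 1 - R_V/R_U$, the gradient with respect to $(R_V, R_U)$ is $(-1/R_U, \; R_V/R_U^2)$:

```{r}
# Delta-method standard error for VE = 1 - RV/RU (sketch; independent arms assumed)
NV <- 200745; NU <- 201229        # group sizes from the text above
RV <- 33 / NV; RU <- 115 / NU     # case rates from the text above
VE <- 1 - RV / RU
Var.RV <- RV * (1 - RV) / NV
Var.RU <- RU * (1 - RU) / NU
Var.VE <- Var.RV / RU^2 + (RV^2 / RU^4) * Var.RU   # delta method
CI.VE.delta <- VE + c(-1.96, 1.96) * sqrt(Var.VE)
round(c(VE = VE, lower = CI.VE.delta[1], upper = CI.VE.delta[2]), 4)
```

The simulation-based interval computed below should come out close to this analytical one.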
+ + + + +```{r _uuid="8f2839f25d086af736a60e9eeb907d3b93b6e0e5", _cell_guid="b1076dfc-b9ad-4769-8c92-a6c4dae69d19"} +NV = 200745 # number of vaccinated (treated) +NU = 201229 # number of unvaccinated (control) +RV= 33/NV # average outcome for vaccinated +RU =115/NU # average outcome for unvaccinated +VE = (RU - RV)/RU; # vaccine efficacy + +# incidence per 100000 +Incidence.RV=RV*100000 +Incidence.RU=RU*100000 + +print(paste("Incidence per 100000 among treated:", round(Incidence.RV,4))) + +print(paste("Incidence per 100000 among controlled:", round(Incidence.RU,4))) + +# treatment effect - estimated reduction in incidence per 100000 people +delta.hat = 100000*(RV-RU) + +print(paste("Estimated ATE of occurances per 100,000 is", round(delta.hat,4))) + +# variance, standard deviation and confidence interval of ATE using that outcomes are Bernoulli +Var.RV = RV*(1-RV)/NV +Var.RU = RU*(1-RU)/NU +Var.delta.hat = 100000^2*(Var.RV + Var.RU) +Std.delta.hat = sqrt(Var.delta.hat) + +print(paste("Standard deviation for ATE is", round(Std.delta.hat,4))) + +CI.delta = c(delta.hat -1.96*sqrt(Var.delta.hat), + delta.hat +1.96*sqrt(Var.delta.hat)) + +print(paste("95% confidence interval of ATE is [", round(CI.delta[1],4), ",", + round(CI.delta[2],4), "]" )) + +print(paste("Overall VE is", round(VE,4) )) + +# we use an approximate bootstrap to find the confidence interval of vaccine efficacy +# via Monte Carlo draws +set.seed(1) +B = 10000 # number of bootstraps +RVs = RV + rnorm(B)*sqrt(Var.RV) +RUs = RU + rnorm(B)*sqrt(Var.RU) +VEs= (RUs - RVs)/RUs + +plot(density(VEs), col=2, main="Approximate Distribution of VE estimates") + +CI.VE = quantile(VEs, c(.025, .975)) # use the empirical quantiles from the bootstraps + +print(paste("95% confidence interval of VE is [", round(CI.VE[1],4), ",", + round(CI.VE[2],4), "]")) +``` + +# Pfizer/BNTX Covid-19 RCT + +Here is a link to the FDA [briefing](https://www.fda.gov/media/144245/download) and an interesting [discussion]( +https://garycornell.com/2020/12/09/statistics-in-the-pfizer-data-how-good-is-the-vaccine/?fbclid=IwAR282lS0Vl3tWmicQDDhIJAQCMO8NIsCXyWbUWwTtPuKcnuJ2v0VWXRDQac), as well as data. + +Pfizer/BNTX was the first vaccine approved for emergency use to reduce the risk of Covid-19 decease. In studies to assess vaccine efficacy, volunteers were randomly assigned to receive either a treatment (2-dose vaccination) or a placebo, without knowing which they recieved. The doctors making the diagnoses did not know now whether a given volunteer received a vaccination or not. The results of the study are given in the following table. + +![](https://lh6.googleusercontent.com/oiO6gYom1UZyrOhgpFx2iq8ike979u3805JHiVygP-Efh1Yaz2ttyPcgWKlT1AqHDM4v46th3EPIkOvRLyXA0fNUloPL-mL9eOFmSAzfbNOHyCZSQ0DyzMhcFUtQuZ520R5Qd2lj): + +Here we see both the overall effects and the effects by age group. The confidence intervals for the overall ATE are tight and suggest high effectiveness of the vaccine. The confidence intervals for the age group 65-75 are much wider due to the relatively small number of people in this group. We could group 65-75 and >75 groups to evaluate the effectiveness of the vaccine for this broader age group and narrow the width of the confidence band. + +We use the same approach as that for the Polio example. This gives slightly different results than the FDA result, because the FDA used inversion of exact binomial tests to construct confidence intervals. 
We use asymptotic approches based on approximate normality, which is more crude, but delivers a rather similar result. + + +```{r _uuid="d629ff2d2480ee46fbb7e2d37f6b5fab8052498a", _cell_guid="79c7e3d0-c299-4dcb-8224-4455121ee9b0"} +NV = 19965; # number vaccinated +NU = 20172; # number unvaccinated +RV = 9/NV; # average outcome for vaccinated +RU = 169/NU; # average outcome for unvaccinated +VE = (RU - RV)/RU; # vaccine efficacy + +# incidence per 100000 +Incidence.RV=RV*100000 +Incidence.RU=RU*100000 + +print(paste("Incidence per 100000 among vaccinated:", round(Incidence.RV,4))) + +print(paste("Incidence per 100000 among unvaccinated:", round(Incidence.RU,4))) + +# treatment effect - estimated reduction in incidence per 100000 people +delta.hat = 100000*(RV-RU) + +print(paste("Estimated ATE of occurances per 100,000 is", round(delta.hat,4))) + +# variance, standard deviation and confidence interval of ATE using that outcomes are Bernoulli +Var.RV = RV*(1-RV)/NV +Var.RU = RU*(1-RU)/NU +Var.delta.hat = 100000^2*(Var.RV + Var.RU) +Std.delta.hat = sqrt(Var.delta.hat) + +print(paste("Standard deviation for ATE is", round(Std.delta.hat,4))) + +CI.delta = c(delta.hat -1.96*sqrt(Var.delta.hat), + delta.hat +1.96*sqrt(Var.delta.hat)) + +print(paste("95% confidence interval of ATE is [", round(CI.delta[1],4), ",", + round(CI.delta[2],4), "]" )) + +print(paste("Overall VE is", round(VE,4) )) + +# we use an approximate bootstrap to find the VE confidence interval +# using Monte Carlo draws as before +set.seed(1) +B = 10000 +RVs = RV + rnorm(B)*sqrt(Var.RV) +RUs = RU + rnorm(B)*sqrt(Var.RU) +VEs= (RUs - RVs)/RUs + +plot(density(VEs), col=2, main="Approximate Distribution of VE estimates") + +CI.VE = quantile(VEs, c(.025, .975)) + +print(paste("95% confidence interval of VE is [", round(CI.VE[1],4), ",", + round(CI.VE[2],4), "]" )) +``` + +In the code cell below we calculate the effectiveness of the vaccine for the two groups that are 65 or older + +```{r} +# Here we calculate the overall effectiveness of the vaccine for the two groups that are 65 or older +NV = 3239+805; +NU = 3255+812; +RV = 1/NV; +RU = (14+5)/NU; +VE = (RU - RV)/RU; + +print(paste("Overall VE is", round(VE,4)) ) + +Var.RV = RV*(1-RV)/NV +Var.RU = RU*(1-RU)/NU + +# As before, we use an approximate bootstrap to find the confidence intervals +# using Monte Carlo draws + +set.seed(1) +B = 10000 + RVs = RV + rnorm(B)*sqrt(Var.RV)+ 10^(-10) + RUs = RU + rnorm(B)*sqrt(Var.RU)+ 10^(-10) + VEs= (RUs - RVs)/RUs + +plot(density(VEs), col=2, main="Approximate Distribution of VE estimates") + +CI.VE = quantile(VEs, c(.025, .975)) + +print(paste("two-sided 95 % confidence interval is [", CI.VE[1], ",", + CI.VE[2], "]" )) + +OneSidedCI.VE = quantile(VEs, c(.05)) + +print(paste("one-sided 95 % confidence interval is [", OneSidedCI.VE[1], ",", + 1, "]" )) +``` + +Let's try the parametric boostrap next, using the fact that the outcome is Bernouli. 
+ +```{r} +NV = 3239+805; +NU = 3255+812; +RV = 1/NV; +RU = (14+5)/NU; +VE = (RU - RV)/RU; + +print(paste("Overall VE is", VE) ) + +set.seed(1) +B = 10000 #number of simulation draw + RVs = rbinom(100000, size= NV, prob = RV) + RUs = rbinom(100000, size= NU, prob = RU) + VEs= (RUs - RVs)/RUs + +plot(density(VEs), col=2, main="Approximate Distribution of VE estimates") + +CI.VE = quantile(VEs, c(.025, .975)) + +print(paste("two-sided 95 % confidence interval is [", CI.VE[1], ",", + CI.VE[2], "]" )) + +OneSidedCI.VE = quantile(VEs, c(.05)) + +print(paste("one-sided 95 % confidence interval is [", OneSidedCI.VE[1], ",", 1, "]" )) +``` + +# Exact Binomial Test Inversion + +It is possible to use exact inference by inverting tests based on the exact binomial nature of the outcome variable. Here, we perform the Cornfield Procedure to find the exact confidence interval on the estimate of vaccine efficacy. + +```{r} +# Exact CI exploiting Bernoulli outcome using the Cornfield Procedure +install.packages("ORCI") +library(ORCI) + +NV = 19965; +NU = 20172; +RV = 9/NV; +RU = 169/NU; +VE = (RU - RV)/RU; + +1- Cornfieldexact.CI(9, NV, 169, NU, conf = 0.95, interval = c(1e-08, 1e+08)) +``` + +Note that this exactly recovers the result in the FDA table (first row). + + +Next we repeat the cornfield procedure to find the exact confidence interval on vaccine effectiveness for the two groups that are 65 or older. Here we see a big discrepancy between various asymptotic approaches and the exact finite-sample inference. This occurs because the binomial counts are too low for central limit theorems to work successfully. + +```{r} +# Exact CI exploiting Bernoulli outcome for the two groups that are 65 or older +NV = 3239+805; +NU = 3255+812; +RV = 1/NV; +RU = (14+5)/NU; +VE = (RU - RV)/RU; + +1- Cornfieldexact.CI(1, NV, 19, NU, conf = 0.95, interval = c(1e-08, 1e+08)) +``` From 7f30063d35784cc302f4d269394ec35e8ad98a3b Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 05:44:01 -0700 Subject: [PATCH 232/261] Create r-colliderbias-hollywood.irnb.Rmd --- CM2/Old/r-colliderbias-hollywood.irnb.Rmd | 66 +++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 CM2/Old/r-colliderbias-hollywood.irnb.Rmd diff --git a/CM2/Old/r-colliderbias-hollywood.irnb.Rmd b/CM2/Old/r-colliderbias-hollywood.irnb.Rmd new file mode 100644 index 00000000..5c57b0b3 --- /dev/null +++ b/CM2/Old/r-colliderbias-hollywood.irnb.Rmd @@ -0,0 +1,66 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + +# Collider Bias + + +Here is a simple mnemonic example to illustate the collider or M-bias. + +Here the idea is that people who get to Hollywood have to have high congenility = talent + beauty. Funnily enough this induces a negative correlation between talents and looks, when we condition on the set of actors or celebrities. This simple example explains an anecdotal observation that "talent and beaty are negatively correlated" for celebrities. 
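A short calculation shows where the negative association comes from in the linear-Gaussian version of this story (a back-of-the-envelope sketch, separate from the simulation below). With $T, B \sim N(0,1)$ independent and $C = T + B + \varepsilon$, $\varepsilon \sim N(0,1)$, conditioning on $C$ gives
$$
Cov(T, B \mid C) = 0 - \frac{Cov(T,C)\,Cov(B,C)}{Var(C)} = -\frac{1}{3},
$$
so the partial correlation of talent and beauty given congeniality is $-1/2$. Selecting on $C>0$, as in the code below, induces a negative association of the same kind.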
+ +```{r} +install.packages("dagitty") +library(dagitty) +``` + +```{r} +g <- dagitty( "dag{ T -> C <- B }" ) +plot(g) +``` + +```{r _uuid="8f2839f25d086af736a60e9eeb907d3b93b6e0e5", _cell_guid="b1076dfc-b9ad-4769-8c92-a6c4dae69d19"} +#collider bias +n=1000000 +T = rnorm(n) #talent +B = rnorm(n) #beaty +C = T+B + rnorm(n) #congeniality +T.H= subset(T, C>0) # condition on C>0 +B.H= subset(B, C>0) # condition on C>0 + +summary(lm(T~ B)) #regression of T on B +summary(lm(T~ B +C)) #regression of T on B and C +summary(lm(T.H~ B.H)) #regression of T on B, conditional on C>0. + + + +``` + +We can also use package Dagitty to illustrate collider bias, also known as M-bias. + +```{r _uuid="d629ff2d2480ee46fbb7e2d37f6b5fab8052498a", _cell_guid="79c7e3d0-c299-4dcb-8224-4455121ee9b0"} +## If we want to infer causal effec of B on T, +## we can apply the command to figure out +## variables we should condition on: + +adjustmentSets( g, "T", "B" ) + +## empty set -- we should not condition on the additional +## variable C. + +## Generate data where C = .5T + .5B +set.seed( 123); d <- simulateSEM( g, .5 ) +confint( lm( T ~ B, d ) )["B",] # includes 0 +confint( lm( T ~ B + C, d ) )["B",] # does not include 0 + +``` From 0e04131567fb5b1040450fcf33b03bd11b7e9159 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 05:44:35 -0700 Subject: [PATCH 233/261] recommitting old --- .../r-colliderbias-hollywood.irnb.Rmd | 0 CM3/Old/notebook-dagitty.irnb.Rmd | 220 ++++++++++++++++ CM3/Old/notebook-dosearch.irnb.Rmd | 235 ++++++++++++++++++ 3 files changed, 455 insertions(+) rename CM2/{Old => }/r-colliderbias-hollywood.irnb.Rmd (100%) create mode 100644 CM3/Old/notebook-dagitty.irnb.Rmd create mode 100644 CM3/Old/notebook-dosearch.irnb.Rmd diff --git a/CM2/Old/r-colliderbias-hollywood.irnb.Rmd b/CM2/r-colliderbias-hollywood.irnb.Rmd similarity index 100% rename from CM2/Old/r-colliderbias-hollywood.irnb.Rmd rename to CM2/r-colliderbias-hollywood.irnb.Rmd diff --git a/CM3/Old/notebook-dagitty.irnb.Rmd b/CM3/Old/notebook-dagitty.irnb.Rmd new file mode 100644 index 00000000..73737f28 --- /dev/null +++ b/CM3/Old/notebook-dagitty.irnb.Rmd @@ -0,0 +1,220 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + +# Causal Identification in DAGs using Backdoor and Swigs, Equivalence Classes, Falsifiability Tests + + +```{r _uuid="051d70d956493feee0c6d64651c6a088724dca2a", _execution_state="idle"} +#install and load package +install.packages("dagitty") +install.packages("ggdag") +library(dagitty) +library(ggdag) + +``` + +# Graph Generation and Plotting + + +The following DAG is due to Judea Pearl + +```{r} +#generate a couple of DAGs and plot them + +G = dagitty('dag{ +Z1 [pos="-2,-1.5"] +X1 [pos="-2,0"] +Z2 [pos="1.5,-1.5"] +X3 [pos="1.5, 0"] +Y [outcome,pos="1.5,1.5"] +D [exposure,pos="-2,1.5"] +M [mediator, pos="0,1.5"] +X2 [pos="0,0"] +Z1 -> X1 +X1 -> D +Z1 -> X2 +Z2 -> X3 +X3 -> Y +Z2 -> X2 +D -> Y +X2 -> Y +X2 -> D +M->Y +D->M +}') + + +ggdag(G)+ theme_dag() +``` + +# Report Relatives of X2 + +```{r} +print(parents(G, "X2")) +print(children(G, "X2")) +print(ancestors(G, "X2")) +print(descendants(G, "X2")) + + +``` + +# Find Paths Between D and Y + + + +```{r} +paths(G, "D", "Y") +``` + +# List All Testable Implications of the Model + +```{r} +print( impliedConditionalIndependencies(G) ) +``` + +# Identification by Backdoor: List minimal adjustment sets to 
identify causal effecs $D \to Y$ + +```{r} +print( adjustmentSets( G, "D", "Y" ) ) +``` + +# Identification via SWIG and D-separation + +```{r} +SWIG = dagitty('dag{ +Z1 [pos="-2,-1.5"] +X1 [pos="-2,0"] +Z2 [pos="1.5,-1.5"] +X3 [pos="1.5, 0"] +Yd [outcome,pos="1.5,1.5"] +D [exposure,pos="-2,1.5"] +d [pos="-1, 1.5"] +Md [mediator, pos="0,1.5"] +X2 [pos="0,0"] +Z1 -> X1 +X1 -> D +Z1 -> X2 +Z2 -> X3 +X3 -> Yd +Z2 -> X2 +X2 -> Yd +X2 -> D +X3-> Yd +Md-> Yd +d-> Md +}') + +ggdag(SWIG)+ theme_dag() + +``` + + +# Deduce Conditional Exogeneity or Ignorability by D-separation + + +```{r} +print( impliedConditionalIndependencies(SWIG)[5:8] ) + +``` + +This coincides with the backdoor criterion for this graph. + + +# Print All Average Effects Identifiable by Conditioning + +```{r} +for( n in names(G) ){ + for( m in children(G,n) ){ + a <- adjustmentSets( G, n, m ) + if( length(a) > 0 ){ + cat("The effect ",n,"->",m, + " is identifiable by controlling for:\n",sep="") + print( a, prefix=" * " ) + } + } +} +``` + +# Equivalence Classes + +```{r} +P=equivalenceClass(G) +plot(P) +#equivalentDAGs(G,10) +``` + +Next Consider the elemntary Triangular Model: +$$ +D \to Y, \quad X \to (D,Y). +$$ +This model has not testable implications and is Markov-equivalent to any other DAG difined on names $(X, D, Y)$. + +```{r} +G3<- dagitty('dag{ +D -> Y +X -> D +X -> Y +} +') + +ggdag(G3)+ theme_dag() + +print(impliedConditionalIndependencies(G3)) + + +``` + +```{r} +P=equivalenceClass(G3) +plot(P) +equivalentDAGs(G3,10) + + +``` + + +# Example of Testing DAG Validity + +Next we simulate the data from a Linear SEM associated to DAG G, and perform a test of conditional independence restrictions, exploting linearity. + + +There are many other options for nonlinear models and discrete categorical variabales. Type help(localTests). + + +```{r} +set.seed(1) +x <- simulateSEM(G) +head(x) +#cov(x) +localTests(G, data = x, type = c("cis")) + + +``` + +Next we replaced $D$ by $\bar D$ generated differently: +$$ +\bar D= (D + Y)/2 +$$ +So basically $\bar D$ is an average of $D$ and $Y$ generated by $D$. We then test if the resulting collection of random variables satisifes conditional indepdendence restrictions, exploiting linearity. We end up rejectiong these restrictions and thefore the validity of this model for the data generated in this way. This makes sense, because the new data no longer obeys the previous DAG structure. + + + +```{r} +x.R = x +x.R$D = (x$D+ x$Y)/2 + +localTests(G, data = x.R, type = c("cis")) + + +``` diff --git a/CM3/Old/notebook-dosearch.irnb.Rmd b/CM3/Old/notebook-dosearch.irnb.Rmd new file mode 100644 index 00000000..bd5b8e15 --- /dev/null +++ b/CM3/Old/notebook-dosearch.irnb.Rmd @@ -0,0 +1,235 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + + +# Dosearch for Causal Identification in DAGs. + + +This a simple notebook for teaching that illustrates capabilites of the "dosearch" package, which is a great tool. + +NB. In my experience, the commands are sensitive to syntax ( e.g. spacing when -> are used), so be careful when changing to other examples. 
+ + +```{r _uuid="8f2839f25d086af736a60e9eeb907d3b93b6e0e5", _cell_guid="b1076dfc-b9ad-4769-8c92-a6c4dae69d19"} +install.packages("dosearch") +library("dosearch") + + +``` + +We start with the simplest graph, with the simplest example +where $D$ is policy, $Y$ is outcomes, $X$ is a confounder: +$$ +D\to Y, \quad X \to (D,Y) +$$ + + + +Now suppose we want conditional average policy effect. + +```{r} +data <- "p(y,d,x)" #data structure + +query <- "p(y | do(d),x)" #query -- target parameter + +graph <- "x -> y + x -> d + d -> y" + +dosearch(data, query, graph) +``` + +This recovers the correct identification formula for law of the counterfactual $Y(d)$ induced by $do(D=d)$: +$$ +p_{Y(d)|X}(y|x) := p(y|do(d),x) = p(y|d,x). +$$ + +```{r} +data <- "p(y,d,x)" + +query <- "p(y | do(d))" + +graph <- "x -> y + x -> d + d -> y" + + +dosearch(data, query, graph) + +``` + +This recovers the correct identification formula: +$$ +p_{Y(d)}(y) := p(y: do(d)) = \sum_{x}\left(p(x)p(y|d,x)\right) +$$ +We integreate out $x$ in the previous formula. + + + +Suppose we don't observe the confounder. The effect is generally not identified. + + +```{r} + +data <- "p(y,d)" + +query <- "p(y | do(d))" + +graph <- "x -> y + x -> d + d -> y" + +dosearch(data, query, graph) + +``` + +The next graph is an example of J. Pearl (different notation), where the graph is considerably more complicated. We are interested in $D \to Y$. + +![image.png](attachment:image.png) + +Here we try conditioning on $X_2$. This would block one backdoor path from $D$ to $Y$, but would open another path on which $X_2$ is a collider, so this shouldn't work. The application below gave a correct answer (after I put the spacings carefully). + + +```{r} + +data <- "p(y,d,x2)" #observed only (Y, D, X_2) + +query<- "p(y|do(d))" #target parameter + +graph<- "z1 -> x1 +z1 -> x2 +z2 -> x2 +z2 -> x3 +x2 -> d +x2 -> y +x3 -> y +x1 -> d +d -> m +m -> y +" + +dosearch(data, query, graph) + +``` + +Intuitively, we should add more common causes. For example, adding $X_3$ and using $S = (X_2, X_3)$ should work. + +```{r} + +data <- "p(y,d,x2,x3)" + +conditional.query<- "p(y|do(d),x2, x3)" #can ID conditional average effect? +query<- "p(y|do(d))" #can ID unconditional effect? + +graph<- "z1 -> x1 +z1 -> x2 +z2 -> x2 +z2 -> x3 +x2 -> d +x2 -> y +x3 -> y +x1 -> d +d -> m +m -> y +" + +print(dosearch(data, conditional.query, graph)) +print(dosearch(data, query, graph)) + +``` + +This retrives correct formulas for counterfactual distributions of $Y(d)$ induced by $Do(D=d)$: + +The conditional distribution is identified by: +$$ +p_{Y(d) \mid X_2, X_3}(y) := p(y |x_2, x_3: do(d)) = p(y|x_2,x_3,d). +$$ + +The unconditional distribution is obtained by integration out $x_2$ and $x_3$: + +$$ +p_{Y(d) }(y) := p(y do(d)) = \sum_{x2,x3}\left(p(x_2,x_3)p(y|x_2,x_3,d)\right). +$$ + + + + + +Next we suppose that we observe only $(Y,D, M)$. Can we identify the effect $D \to Y$? Can we use back-door-criterion twice to get $D \to M$ and $M \to Y$ affect? Yes, that's called front-door criterion -- so we just need to remember only the back-door and the fact that we can use it iteratively. 
+ +```{r} +data <- "p(y,d, m)" + +query.dm<- "p(m|do(d))" +query.md<- "p(y|do(m))" +query<- "p(y|do(d))" + +graph<- "z1 -> x1 +z1 -> x2 +z2 -> x2 +z2 -> x3 +x2 -> d +x2 -> y +x3 -> y +x1 -> d +d -> m +m -> y +" +print(dosearch(data, query.dm, graph)) +print(dosearch(data, query.md, graph)) +print(dosearch(data, query, graph)) + +``` + +So we get identification results: +First, +$$ +p_{M(d)}(m) := p(m|do(d)) = p(m|d). +$$ +Second, +$$ +p_{Y(m)}(y) := p(y|do(m)) = \sum_{d}\left(p(d)p(y|d,m)\right), +$$ +and the last by integrating the product of these two formulas: +$$ +p_{Y(d)}(y) := p(y|do(d)) = \sum_{m}\left(p(m|d)\sum_{d}\left(p(d)p(y|d,m)\right)\right) +$$ + + + +The package is very rich and allows identification analysis, when the data comes from multiple sources. Suppose we observe marginal distributions $(Y,D)$ and $(D,M)$ only. Can we identify the effect of $D \to Y$. The answer is (guess) and the package correctly recovers it. + +```{r} +data <- "p(y,m) + p(m,d)" + +query.dm<- "p(m|do(d))" +query.md<- "p(y|do(m))" +query<- "p(y|do(d))" + +graph<- "z1 -> x1 +z1 -> x2 +z2 -> x2 +z2 -> x3 +x2 -> d +x2 -> y +x3 -> y +x1 -> d +d -> m +m -> y +" +print(dosearch(data, query.dm, graph)) +print(dosearch(data, query.md, graph)) +print(dosearch(data, query, graph)) +``` From ac8b293eb195de9a4784dcef68261277a0c8bc21 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 05:45:31 -0700 Subject: [PATCH 234/261] recommitting old --- CM2/{ => Old}/r-colliderbias-hollywood.irnb.Rmd | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename CM2/{ => Old}/r-colliderbias-hollywood.irnb.Rmd (100%) diff --git a/CM2/r-colliderbias-hollywood.irnb.Rmd b/CM2/Old/r-colliderbias-hollywood.irnb.Rmd similarity index 100% rename from CM2/r-colliderbias-hollywood.irnb.Rmd rename to CM2/Old/r-colliderbias-hollywood.irnb.Rmd From a29c315550178f7d28d7d6bf4a6c09d2736ab85a Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 05:46:48 -0700 Subject: [PATCH 235/261] =?UTF-8?q?=CE=BF=CE=BB=CE=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...or-partially-linear-iv-model-in-r.irnb.Rmd | 264 +++++++++++++++++ ...sis-with-sensmakr-and-debiased-ml.irnb.Rmd | 265 ++++++++++++++++++ 2 files changed, 529 insertions(+) create mode 100644 deprecated/CM4/Old/debiased-ml-for-partially-linear-iv-model-in-r.irnb.Rmd create mode 100644 deprecated/CM4/Old/sensitivity-analysis-with-sensmakr-and-debiased-ml.irnb.Rmd diff --git a/deprecated/CM4/Old/debiased-ml-for-partially-linear-iv-model-in-r.irnb.Rmd b/deprecated/CM4/Old/debiased-ml-for-partially-linear-iv-model-in-r.irnb.Rmd new file mode 100644 index 00000000..7d506973 --- /dev/null +++ b/deprecated/CM4/Old/debiased-ml-for-partially-linear-iv-model-in-r.irnb.Rmd @@ -0,0 +1,264 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + + +# Double/Debiased ML for Partially Linear IV Model + +References: + +https://arxiv.org/abs/1608.00060 + + +https://www.amazon.com/Business-Data-Science-Combining-Accelerate/dp/1260452778 + +The code is based on the book. + + + + + +# Partially Linear IV Model + +We consider the partially linear structural equation model: +\begin{eqnarray} + & Y - D\theta_0 = g_0(X) + \zeta, & E[\zeta \mid Z,X]= 0,\\ + & Z = m_0(X) + V, & E[V \mid X] = 0. +\end{eqnarray} + + +Note that this model is not a regression model unless $Z=D$. 
The model is a canonical +model in causal inference, going back to P. Wright's work on IV methods for estimating demand/supply equations, with the modern difference being that $g_0$ and $m_0$ are nonlinear, potentially complicated functions of high-dimensional $X$. + + +The idea of this model is that there is a structural or causal relation between $Y$ and $D$, captured by $\theta_0$, and $g_0(X) + \zeta$ is the stochastic error, partly explained by covariates $X$. $V$ and $\zeta$ are stochastic errors that are not explained by $X$. Since $Y$ and $D$ are jointly determined, we need an external factor, commonly referred to as an instrument, $Z$, to create exogenous variation +in $D$. Note that $Z$ should affect $D$. The $X$ here serve again as confounding factors, so we can think of variation in $Z$ as being exogenous only conditional on $X$. + + +The causal DAG this model corresponds to is given by: +$$ +Z \to D, X \to (Y, Z, D), L \to (Y,D), +$$ +where $L$ is the latent confounder affecting both $Y$ and $D$, but not $Z$. + + + +--- + +# Example + +A simple contextual example is from biostatistics, where $Y$ is a health outcome and $D$ is an indicator of smoking. Thus, $\theta_0$ captures the effect of smoking on health. Health outcome $Y$ and smoking behavior $D$ are treated as being jointly determined. $X$ represents patient characteristics, and $Z$ could be a doctor's advice not to smoke (or another behavioral treatment) that may affect the outcome $Y$ only through shifting the behavior $D$, conditional on characteristics $X$. + +---- + + + +# PLIVM in Residualized Form + +The PLIV model above can be rewritten in the following residualized form: +$$ + \tilde Y = \tilde D \theta_0 + \zeta, \quad E[\zeta \mid V,X]= 0, +$$ +where +$$ + \tilde Y = (Y- \ell_0(X)), \quad \ell_0(X) = E[Y \mid X] \\ + \tilde D = (D - r_0(X)), \quad r_0(X) = E[D \mid X] \\ + \tilde Z = (Z- m_0(X)), \quad m_0(X) = E[Z \mid X]. +$$ + The tilded variables above represent the original variables after taking out or "partialling out" + the effect of $X$. Note that $\theta_0$ is identified from this equation if $\tilde Z$ + and $\tilde D$ have non-zero correlation, which automatically means that both + must have non-zero variation. + + + +----- + +# DML for PLIV Model + +Given identification, DML proceeds as follows. + +Compute the estimates $\hat \ell_0$, $\hat r_0$, and $\hat m_0$, which amounts +to solving the three problems of predicting $Y$, $D$, and $Z$ using +$X$, using any generic ML method, giving us estimated residuals +$$ +\tilde Y = Y - \hat \ell_0(X), \\ \tilde D= D - \hat r_0(X), \\ \tilde Z = Z- \hat m_0(X). +$$ +The estimates should be of a cross-validated form, as detailed in the algorithm below. + +Estimate $\theta_0$ by the instrumental +variable regression of $\tilde Y$ on $\tilde D$ using $\tilde Z$ as an instrument. +Use the conventional inference for the IV regression estimator, ignoring +the estimation error in these residuals. + +The reason we work with this residualized form is that it eliminates the bias +arising when solving the prediction problem in stage 1. The role of cross-validation +is to avoid another source of bias due to potential overfitting. + +The estimator is adaptive, +in the sense that the first stage estimation errors do not affect the second +stage errors. 
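Concretely, once the cross-fitted residuals are in hand, the final stage solves the empirical analogue of $E[(\tilde Y - \theta \tilde D)\tilde Z] = 0$. Written for the just-identified case without an intercept (a sketch of the computation that the `tsls` call in the code below performs):
$$
\hat\theta = \frac{\sum_{i} \tilde Z_i \tilde Y_i}{\sum_{i} \tilde Z_i \tilde D_i},
$$
with the usual IV standard error, for example the heteroskedasticity-robust version $\sqrt{\sum_i \tilde Z_i^2 \hat\zeta_i^2}\,/\,\big|\sum_i \tilde Z_i \tilde D_i\big|$, where $\hat\zeta_i = \tilde Y_i - \hat\theta \tilde D_i$.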
+ + + +```{r _kg_hide-output=TRUE} +install.packages("hdm") +install.packages("AER") +install.packages("randomForest") +``` + +```{r} + +library(AER) #applied econometrics library +library(randomForest) #random Forest library +library(hdm) #high-dimensional econometrics library +library(glmnet) #glm net + + +# DML for PLIVM + +DML2.for.PLIVM <- function(x, d, z, y, dreg, yreg, zreg, nfold=2) { + # this implements DML2 algorithm, where there moments are estimated via DML, before constructing + # the pooled estimate of theta randomly split data into folds + nobs <- nrow(x) + foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] + I <- split(1:nobs, foldid) + # create residualized objects to fill + ytil <- dtil <- ztil<- rep(NA, nobs) + # obtain cross-fitted residuals + cat("fold: ") + for(b in 1:length(I)){ + dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out + zfit <- zreg(x[-I[[b]],], z[-I[[b]]]) #take a fold out + yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a folot out + dhat <- predict(dfit, x[I[[b]],], type="response") #predict the fold out + zhat <- predict(zfit, x[I[[b]],], type="response") #predict the fold out + yhat <- predict(yfit, x[I[[b]],], type="response") #predict the fold out + dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual + ztil[I[[b]]] <- (z[I[[b]]] - zhat) #record residual + ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial + cat(b," ") + } + ivfit= tsls(y=ytil,d=dtil, x=NULL, z=ztil, intercept=FALSE) + coef.est <- ivfit$coef #extract coefficient + se <- ivfit$se #record standard error + cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) + return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil, ztil=ztil) ) +} + + +``` + + +----- + +# Emprical Example: Acemoglu, Jonsohn, Robinson (AER). + + +* Y is log GDP; +* D is a measure of Protection from Expropriation, a proxy for +quality of insitutions; +* Z is the log of Settler's mortality; +* W are geographical variables (latitude, latitude squared, continent dummies as well as interactions) + + + + +```{r} + +data(AJR); + +y = AJR$GDP; +d = AJR$Exprop; +z = AJR$logMort +xraw= model.matrix(~ Latitude+ Africa+Asia + Namer + Samer, data=AJR) +x = model.matrix(~ -1 + (Latitude + Latitude2 + Africa + + Asia + Namer + Samer)^2, data=AJR) +dim(x) + +# DML with Random Forest +cat(sprintf("\n DML with Random Forest \n")) + +dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest +yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest +zreg<- function(x,z){ randomForest(x, z)} #ML method=Forest + +set.seed(1) +DML2.RF = DML2.for.PLIVM(xraw, d, z, y, dreg, yreg, zreg, nfold=20) + +# DML with PostLasso +cat(sprintf("\n DML with Post-Lasso \n")) + +dreg <- function(x,d){ rlasso(x, d) } #ML method=lasso +yreg <- function(x,y){ rlasso(x, y) } #ML method=lasso +zreg<- function(x,z){ rlasso(x, z)} #ML method=lasso + +set.seed(1) +DML2.lasso = DML2.for.PLIVM(x, d, z, y, dreg, yreg, zreg, nfold=20) + + +# Compare Forest vs Lasso + +comp.tab= matrix(NA, 3, 2) +comp.tab[1,] = c( sqrt(mean((DML2.RF$ytil)^2)), sqrt(mean((DML2.lasso$ytil)^2)) ) +comp.tab[2,] = c( sqrt(mean((DML2.RF$dtil)^2)), sqrt(mean((DML2.lasso$dtil)^2)) ) +comp.tab[3,] = c( sqrt(mean((DML2.RF$ztil)^2)), sqrt(mean((DML2.lasso$ztil)^2)) ) +rownames(comp.tab) = c("RMSE for Y:", "RMSE for D:", "RMSE for Z:") +colnames(comp.tab) = c("RF", "LASSO") +print(comp.tab, digits=3) +``` + +# Examine if we have weak instruments + +```{r} +install.packages("lfe") +library(lfe) +summary(felm(DML2.lasso$dtil~DML2.lasso$ztil), robust=T) 
+summary(felm(DML2.RF$dtil~DML2.RF$ztil), robust=T) +``` + +# We do have weak instruments, because t-stats in regression $\tilde D \sim \tilde Z$ are less than 4 in absolute value + + +So let's carry out DML inference combined with Anderson-Rubin Idea + +```{r} +# DML-AR (DML with Anderson-Rubin) + +DML.AR.PLIV<- function(rY, rD, rZ, grid, alpha=.05){ + n = length(rY) + Cstat = rep(0, length(grid)) + for (i in 1:length(grid)) { + Cstat[i] <- n* (mean( (rY - grid[i]*rD)*rZ) )^2/var ( (rY - grid[i]*rD) * rZ ) + }; + LB<- min(grid[ Cstat < qchisq(1-alpha,1)]); + UB <- max(grid[ Cstat < qchisq(1-alpha,1)]); + plot(range(grid),range(c( Cstat)) , type="n",xlab="Effect of institutions", ylab="Statistic", main=" "); + lines(grid, Cstat, lty = 1, col = 1); + abline(h=qchisq(1-alpha,1), lty = 3, col = 4); + abline(v=LB, lty = 3, col = 2); + abline(v=UB, lty = 3, col = 2); + return(list(UB=UB, LB=LB)) + } + +``` + +```{r} +DML.AR.PLIV(rY = DML2.lasso$ytil, rD= DML2.lasso$dtil, rZ= DML2.lasso$ztil, + grid = seq(-2, 2, by =.01)) + + +DML.AR.PLIV(rY = DML2.RF$ytil, rD= DML2.RF$dtil, rZ= DML2.RF$ztil, + grid = seq(-2, 2, by =.01)) + +``` diff --git a/deprecated/CM4/Old/sensitivity-analysis-with-sensmakr-and-debiased-ml.irnb.Rmd b/deprecated/CM4/Old/sensitivity-analysis-with-sensmakr-and-debiased-ml.irnb.Rmd new file mode 100644 index 00000000..d0691fcf --- /dev/null +++ b/deprecated/CM4/Old/sensitivity-analysis-with-sensmakr-and-debiased-ml.irnb.Rmd @@ -0,0 +1,265 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + + +# Sensititivy Analysis for Unobserved Confounder with DML and Sensmakr + + +## Here we experiment with using package "sensemakr" in conjunction with debiased ML + + +![Screen%20Shot%202021-04-02%20at%204.53.15%20PM.png](attachment:Screen%20Shot%202021-04-02%20at%204.53.15%20PM.png) + + +![Screen%20Shot%202021-04-02%20at%205.01.36%20PM.png](attachment:Screen%20Shot%202021-04-02%20at%205.01.36%20PM.png) + + +## We will + +## * mimic the partialling out procedure with machine learning tools, + +## * and invoke Sensmakr to compute $\phi^2$ and plot sensitivity results. + + +```{r _uuid="051d70d956493feee0c6d64651c6a088724dca2a", _execution_state="idle"} +# loads package +install.packages("sensemakr") +library(sensemakr) + +# loads data +data("darfur") + +``` + +Data is described here +https://cran.r-project.org/web/packages/sensemakr/vignettes/sensemakr.html + +The main outcome is attitude towards peace -- the peacefactor. +The key variable of interest is whether the responders were directly harmed (directlyharmed). +We want to know if being directly harmed in the conflict causes people to support peace-enforcing measures. +The measured confounders include female indicator, age, farmer, herder, voted in the past, and household size. +There is also a village indicator, which we will treat as fixed effect and partial it out before conducting +the analysis. The standard errors will be clustered at the village level. 
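For reference, the object that both the manual calculation and sensemakr bound further below is the omitted-variable bias written in partial-$R^2$ form (stated here in the notation of Cinelli and Hazlett, 2020):
$$
|\text{bias}| = \sqrt{\frac{R^2_{Y \sim C \mid D,X}\; R^2_{D \sim C \mid X}}{1 - R^2_{D \sim C \mid X}}} \cdot \frac{sd(Y^{\perp D,X})}{sd(D^{\perp X})},
$$
where $C$ is the unobserved confounder and $\perp$ denotes residualization on the indicated variables. The `varianceRatio` term in the manual calculation below plays the role of the squared standard-deviation ratio in this formula.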
+ + +# Take out village fixed effects and run basic linear analysis + +```{r} +#get rid of village fixed effects + +attach(darfur) +library(lfe) + +peacefactorR<- lm(peacefactor~village)$res +directlyharmedR<- lm(directlyharmed~village)$res +femaleR<- lm(female~village)$res +ageR<- lm(age~village)$res +farmerR<- lm(farmer_dar~village)$res +herderR<- lm(herder_dar~village)$res +pastvotedR<- lm(pastvoted~village)$res +hhsizeR<- lm(hhsize_darfur~village)$res + + +# Preliminary linear model analysis + +summary(felm(peacefactorR~ directlyharmedR+ femaleR + + ageR + farmerR+ herderR + pastvotedR + hhsizeR |0|0|village)) + +# here we are clustering standard errors at the village level + + +summary(felm(peacefactorR~ femaleR + + ageR + farmerR+ herderR + pastvotedR + hhsizeR |0|0|village)) + +# here we are clustering standard errors at the village level + + + +summary(felm(directlyharmedR~ femaleR + + ageR + farmerR+ herderR + pastvotedR + hhsizeR |0|0|village)) + +# here we are clustering standard errors at the village level + + +``` + +# We first use Lasso for Partilling Out Controls + +```{r} +library(hdm) + + +resY = rlasso(peacefactorR ~ (femaleR + + ageR + farmerR+ herderR + pastvotedR + hhsizeR)^3, post=F)$res + +resD = rlasso(directlyharmedR ~ (femaleR + + ageR + farmerR + herderR + pastvotedR + hhsizeR)^3 , post=F)$res + + +print(c("Controls explain the following fraction of variance of Outcome", 1-var(resY)/var(peacefactorR))) + + +print(c("Controls explain the following fraction of variance of Treatment", 1-var(resD)/var(directlyharmedR))) + + + +library(lfe) + + +dml.darfur.model= felm(resY ~ resD|0|0|village) # cluster SEs by village + +summary(dml.darfur.model,robust=T) #culster SE by village + +dml.darfur.model= lm(resY ~ resD) #lineaer model to use as input in sensemakr + + + + +``` + +# Manual Bias Analysis + +```{r} +# Main estimate + +beta = dml.darfur.model$coef[2] + +# Hypothetical values of partial R2s + +R2.YC = .16; R2.DC = .01 + +# Elements of the formal + +kappa<- (R2.YC * R2.DC)/(1- R2.DC) + +varianceRatio<- mean(dml.darfur.model$res^2)/mean(dml.darfur.model$res^2) + +# Compute square bias + +BiasSq <- kappa*varianceRatio + +# Compute absolute value of the bias + +print(sqrt(BiasSq)) + + +# plotting + +gridR2.DC<- seq(0,.3, by=.001) + +gridR2.YC<- kappa*(1 - gridR2.DC)/gridR2.DC + +gridR2.YC<- ifelse(gridR2.YC> 1, 1, gridR2.YC); + + + +plot(gridR2.DC, gridR2.YC, type="l", col=4, xlab="Partial R2 of Treatment with Confounder", + ylab="Partial R2 of Outcome with Confounder", + main= c("Combo of R2 such that |Bias|< ", round(sqrt(BiasSq), digits=4)) +) + + + + + + +``` + +# Bias Analysis with Sensemakr + +```{r} + +dml.darfur.sensitivity <- sensemakr(model = dml.darfur.model, + treatment = "resD") +summary(dml.darfur.sensitivity) + +plot(dml.darfur.sensitivity, nlevels = 15) + +``` + +# Next We use Random Forest as ML tool for Partialling Out + + +The following code does DML with clsutered standard errors by ClusterID + +```{r} +DML2.for.PLM <- function(x, d, y, dreg, yreg, nfold=2, clusterID) { + nobs <- nrow(x) #number of observations + foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices + I <- split(1:nobs, foldid) #split observation indices into folds + ytil <- dtil <- rep(NA, nobs) + cat("fold: ") + for(b in 1:length(I)){ + dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out + yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a foldt out + dhat <- predict(dfit, x[I[[b]],], type="response") #predict the left-out fold + yhat <- 
predict(yfit, x[I[[b]],], type="response") #predict the left-out fold + dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold + ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold + cat(b," ") + } + rfit <- felm(ytil ~ dtil |0|0|clusterID) #get clustered standard errors using felm + rfitSummary<- summary(rfit) + coef.est <- rfitSummary$coef[2] #extract coefficient + se <- rfitSummary$coef[2,2] #record robust standard error + cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) #printing output + return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil) ) #save output and residuals +} +``` + +```{r} +library(randomForest) #random Forest library + +``` + +```{r} + +x= model.matrix(~ femaleR + ageR + farmerR + herderR + pastvotedR + hhsizeR) + +dim(x) + +d= directlyharmedR + +y = peacefactorR; + +#DML with Random Forest: +dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest +yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest +set.seed(1) +DML2.RF = DML2.for.PLM(x, d, y, dreg, yreg, nfold=10, clusterID=village) + + +resY = DML2.RF$ytil + +resD = DML2.RF$dtil + + +print(c("Controls explain the following fraction of variance of Outcome", max(1-var(resY)/var(peacefactorR),0))) + + +print(c("Controls explain the following fraction of variance of Treatment", max(1-var(resD)/var(directlyharmedR),0))) + + + +dml.darfur.model= lm(resY~resD) + + +dml.darfur.sensitivity <- sensemakr(model = dml.darfur.model, + treatment = "resD") +summary(dml.darfur.sensitivity) + +plot(dml.darfur.sensitivity,nlevels = 15) + + +``` From 730134ecb15e989e34f467ec600737fffb6db9de Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 05:47:42 -0700 Subject: [PATCH 236/261] =?UTF-8?q?=CE=BF=CE=BB=CE=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...-and-lasso-for-wage-gap-inference.irnb.Rmd | 419 +++++++++++++++++ ...ols-and-lasso-for-wage-prediction.irnb.Rmd | 440 ++++++++++++++++++ ...notebook-linear-model-overfitting.irnb.Rmd | 79 ++++ 3 files changed, 938 insertions(+) create mode 100644 PM1/Old/ols-and-lasso-for-wage-gap-inference.irnb.Rmd create mode 100644 PM1/Old/ols-and-lasso-for-wage-prediction.irnb.Rmd create mode 100644 PM1/Old/r-notebook-linear-model-overfitting.irnb.Rmd diff --git a/PM1/Old/ols-and-lasso-for-wage-gap-inference.irnb.Rmd b/PM1/Old/ols-and-lasso-for-wage-gap-inference.irnb.Rmd new file mode 100644 index 00000000..396faa4a --- /dev/null +++ b/PM1/Old/ols-and-lasso-for-wage-gap-inference.irnb.Rmd @@ -0,0 +1,419 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + +This notebook contains an example for teaching. + + +# An inferential problem: The Wage Gap + + +In the previous lab, we analyzed data from the March Supplement of the U.S. Current Population Survey (2015) and answered the question of how to use job-relevant characteristics, such as education and experience, to best predict wages. Now, we focus on the following inference question: + +What is the difference in predicted wages between men and women with the same job-relevant characteristics? + +Thus, we analyze if the differences in the wages of groups defined by the "sex" variable in the data. 
This wage gap may partly reflect *discrimination* against women, both in the labor market and in settings that affect future labor marketability such as in education, and may partly reflect a *selection effect*, namely that women are relatively more likely to take on occupations that pay somewhat less (for example, school teaching). + + +To investigate the wage gap, we consider the following log-linear regression model + +\begin{align} +\log(Y) &= \beta'X + \epsilon\\ +&= \beta_1 D + \beta_2' W + \epsilon, +\end{align} + +where $Y$ is hourly wage, $D$ is the indicator of being female ($1$ if female and $0$ otherwise) and the +$W$'s are a vector of worker characteristics explaining variation in wages. Considering transformed wages by the logarithm, we are analyzing the relative difference in the payment of men and women. + + +## Data analysis + + +We consider the same subsample of the U.S. Current Population Survey (2015) as in the previous lab. Let us load the data set. + +```{r} +load("../data/wage2015_subsample_inference.Rdata") +attach(data) +dim(data) +``` + +To start our (causal) analysis, we compare the sample means given the "sex" variable: + +```{r} +library(xtable) + +Z <- data[which(colnames(data) %in% c("lwage","sex","shs","hsg","scl","clg","ad","ne","mw","so","we","exp1"))] + +data_female <- data[data$sex==1,] +Z_female <- data_female[which(colnames(data) %in% c("lwage","sex","shs","hsg","scl","clg","ad","ne","mw","so","we","exp1"))] + +data_male <- data[data$sex==0,] +Z_male <- data_male[which(colnames(data) %in% c("lwage","sex","shs","hsg","scl","clg","ad","ne","mw","so","we","exp1"))] + +table <- matrix(0, 12, 3) +table[1:12,1] <- as.numeric(lapply(Z,mean)) +table[1:12,2] <- as.numeric(lapply(Z_male,mean)) +table[1:12,3] <- as.numeric(lapply(Z_female,mean)) +rownames(table) <- c("Log Wage","Sex","Less then High School","High School Graduate","Some College","College Graduate","Advanced Degree", "Northeast","Midwest","South","West","Experience") +colnames(table) <- c("All","Men","Women") +tab<- xtable(table, digits = 4) +tab +``` + +```{r, results = FALSE} +print(tab,type="html") # set type="latex" for printing table in LaTeX +``` + + + + + + + + + + + + + + + + + +
All Men Women
Log Wage 2.9708 2.9878 2.9495
Sex 0.4445 0.0000 1.0000
Less than High School 0.0233 0.0318 0.0127
High School Graduate 0.2439 0.2943 0.1809
Some College 0.2781 0.2733 0.2840
College Graduate 0.3177 0.2940 0.3473
Advanced Degree 0.1371 0.1066 0.1752
Northeast 0.2596 0.2590 0.2604
Midwest 0.2965 0.2981 0.2945
South 0.2161 0.2209 0.2101
West 0.2278 0.2220 0.2350
Experience 13.7606 13.7840 13.7313
+ + +In particular, the table above shows that the difference in average *logwage* between men and women is equal to $0.038$ + +```{r} +mean(data_female$lwage)-mean(data_male$lwage) +``` + +Thus, the unconditional wage gap is about $3.8$\% for the group of never married workers (women get paid less on average in our sample). We also observe that never married working women are relatively more educated than working men and have lower working experience. + + +This unconditional (predictive) effect of the variable "sex" equals the coefficient $\beta$ in the univariate ols regression of $Y$ on $D$: + +\begin{align} +\log(Y) &=\beta D + \epsilon. +\end{align} + + +We verify this by running an ols regression in R. + +```{r} +library(sandwich) # a package used to compute robust standard errors +nocontrol.fit <- lm(lwage ~ sex) +nocontrol.est <- summary(nocontrol.fit)$coef["sex",1] +HCV.coefs <- vcovHC(nocontrol.fit, type = 'HC'); # HC HEW - "heteroskedasticity consistent" +nocontrol.se <- sqrt(diag(HCV.coefs))[2] # Estimated std errors + +# print unconditional effect of sex and the corresponding standard error +cat ("The estimated coefficient on the dummy for \"sex\" is",nocontrol.est," and the corresponding robust standard error is",nocontrol.se) + +``` + +Note that the standard error is computed with the *R* package *sandwich* to be robust to heteroskedasticity. + + + +Next, we run an ols regression of $Y$ on $(D,W)$ to control for the effect of covariates summarized in $W$: + +\begin{align} +\log(Y) &=\beta_1 D + \beta_2' W + \epsilon. +\end{align} + +Here, we are considering the flexible model from the previous lab. Hence, $W$ controls for experience, education, region, and occupation and industry indicators plus transformations and two-way interactions. + + +Let us run the ols regression with controls. + +```{r} +# ols regression with controls + +flex <- lwage ~ sex + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we) + +# Note that ()*() operation in formula objects in R creates a formula of the sort: +# (exp1+exp2+exp3+exp4)+ (shs+hsg+scl+clg+occ2+ind2+mw+so+we) + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we) +# This is not intuitive at all, but that's what it does. + +control.fit <- lm(flex, data=data) +control.est <- summary(control.fit)$coef[2,1] + +summary(control.fit) + +cat("Coefficient for OLS with controls", control.est) + +HCV.coefs <- vcovHC(control.fit, type = 'HC'); +control.se <- sqrt(diag(HCV.coefs))[2] # Estimated std errors +JCV.coefs <- vcovHC(control.fit, type = 'HC3'); # Jackknife estimate is more appropriate in moderate dimensional settings +control.Jse <- sqrt(diag(JCV.coefs))[2] # Estimated std errors +``` + +We now show how the conditional gap and the remainder decompose the marginal wage gap into the parts explained and unexplained by the additional controls. (Note that this does *not* explain why there is a difference in the controls to begin with in the two groups.) 
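
Concretely, writing $\hat\beta_1$ for the fitted coefficient on the female indicator and $\hat\beta_2$ for the fitted coefficients on the controls, the in-sample identity that the next chunk verifies numerically is

\begin{align}
\underbrace{\overline{\log Y}_{female} - \overline{\log Y}_{male}}_{\text{marginal gap}}
\;=\; \underbrace{\hat\beta_1}_{\text{unexplained}}
\;+\; \underbrace{\hat\beta_2'\big(\overline{W}_{female} - \overline{W}_{male}\big)}_{\text{explained by the controls}},
\end{align}

where the bars denote group averages in the sample.
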
+ +```{r} +cat("The marginal gap:",mean(data_female$lwage)-mean(data_male$lwage)) + +cat("The unexplained difference: ",control.est) + +XX0 = model.matrix(~(exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we), data = data[data$sex==0,]) +y0 = data[data$sex==0,]$lwage +XX1 = model.matrix(~(exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we), data = data[data$sex==1,]) +y1 = data[data$sex==1,]$lwage +mu1 = colMeans(XX1) +mu0 = colMeans(XX0) +betarest = summary(control.fit)$coef[3:246,1] # the coefficients excluding intercept and "sex" +cat("The explained difference:",sum(betarest*(mu1[2:245]-mu0[2:245]))) + +cat("The sum of these differences:",control.est + sum(betarest*(mu1[2:245]-mu0[2:245]))) +``` + +We next consider a Oaxaca-Blinder decomposition that also incorporates an interaction term. + +```{r} +library(MASS) +beta0 = ginv(t(XX0) %*% XX0) %*% t(XX0) %*% y0 +beta1 = ginv(t(XX1) %*% XX1) %*% t(XX1) %*% y1 + +cat("The marginal gap:",mean(data_female$lwage)-mean(data_male$lwage)) +cat("The unexplained difference:",beta1[1]-beta0[1]) +cat("The difference explained by endowment:",sum(beta0[2:245]*(mu1[2:245]-mu0[2:245]))) +cat("The difference explained by coefficient:",sum((beta1[2:245]-beta0[2:245])*mu1[2:245])) +cat("The sum of these differences:",beta1[1]-beta0[1] + sum(beta0[2:245]*(mu1[2:245]-mu0[2:245])) + sum((beta1[2:245]-beta0[2:245])*mu1[2:245])) +``` + +Let's compare Huber-Eicker-White (HEW) standard errors to jackknife standard errors (which are more appropriate in moderate dimensional settings.) We can see that they're pretty close in this case. + +```{r} +cat("HEW s.e. : ", control.se) +cat("Jackknife s.e. : ", control.Jse) + +``` + +The estimated regression coefficient $\beta_1\approx-0.0696$ measures how our linear prediction of wage changes if we set the "sex" variable $D$ from 0 to 1, holding the controls $W$ fixed. +We can call this the *predictive effect* (PE), as it measures the impact of a variable on the prediction we make. Overall, we see that the unconditional wage gap of size $4$\% for women increases to about $7$\% after controlling for worker characteristics. + + + +Next, we use the Frisch-Waugh-Lovell (FWL) theorem from lecture, partialling-out the linear effect of the controls via ols. + +```{r} +# Partialling-out using ols + +# models +flex.y <- lwage ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we) # model for Y +flex.d <- sex ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we) # model for D + +# partialling-out the linear effect of W from Y +t.Y <- lm(flex.y, data=data)$res +# partialling-out the linear effect of W from D +t.D <- lm(flex.d, data=data)$res + +# regression of Y on D after partialling-out the effect of W +partial.fit <- lm(t.Y~t.D-1) +partial.est <- summary(partial.fit)$coef[1] + +cat("Coefficient for D via partialling-out", partial.est) + +# standard error +HCV.coefs <- vcovHC(partial.fit, type = 'HC') +partial.se <- sqrt(diag(HCV.coefs)) +# Note that jackknife standard errors depend on all the variables in the model and so are not appropriate for the partialed out regression (without adjustment) + + +# confidence interval +confint(partial.fit) +``` + +Again, the estimated coefficient measures the linear predictive effect (PE) of $D$ on $Y$ after taking out the linear effect of $W$ on both of these variables. This coefficient is numerically equivalent to the estimated coefficient from the ols regression with controls, confirming the FWL theorem. 
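
For completeness, the Frisch-Waugh-Lovell statement behind this equivalence: letting $\tilde Y$ and $\tilde D$ be the residuals from regressing $Y$ and $D$ on $W$ (including a constant), the coefficient on $D$ in the full regression satisfies

\begin{align}
\hat\beta_1 \;=\; \frac{\sum_i \tilde D_i \tilde Y_i}{\sum_i \tilde D_i^2},
\end{align}

i.e. it can be recovered from the simple no-intercept regression of $\tilde Y$ on $\tilde D$, which is exactly what the code above runs.
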
+ + +We know that the partialling-out approach works well when the dimension of $W$ is low +in relation to the sample size $n$. When the dimension of $W$ is relatively high, we need to use variable selection +or penalization for regularization purposes. + +In the following, we illustrate the partialling-out approach using lasso instead of ols. + +```{r} +# Partialling-out using lasso + +library(hdm) + +# models +flex.y <- lwage ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we) # model for Y +flex.d <- sex ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we) # model for D + +# partialling-out the linear effect of W from Y +t.Y <- rlasso(flex.y, data=data)$res +# partialling-out the linear effect of W from D +t.D <- rlasso(flex.d, data=data)$res + +# regression of Y on D after partialling-out the effect of W +partial.lasso.fit <- lm(t.Y~t.D-1) +partial.lasso.est <- summary(partial.lasso.fit)$coef[1] + +cat("Coefficient for D via partialling-out using lasso", partial.lasso.est) + +# standard error +HCV.coefs <- vcovHC(partial.lasso.fit, type = 'HC') +partial.lasso.se <- sqrt(diag(HCV.coefs)) +``` + +Using lasso for partialling-out here provides similar results as using ols. + + +Next, we summarize the results. + +```{r} +table<- matrix(0, 4, 2) +table[1,1]<- nocontrol.est +table[1,2]<- nocontrol.se +table[2,1]<- control.est +table[2,2]<- control.se +table[3,1]<- partial.est +table[3,2]<- partial.se +table[4,1]<- partial.lasso.est +table[4,2]<- partial.lasso.se +colnames(table)<- c("Estimate","Std. Error") +rownames(table)<- c("Without controls", "full reg", "partial reg", "partial reg via lasso") +tab<- xtable(table, digits=c(3, 3, 4)) +tab +``` + +```{r, results = FALSE} +print(tab, type="html") +``` + + + + + + + + + +
Estimate Std. Error
Without controls -0.038 0.0159
full reg -0.070 0.0150
partial reg -0.070 0.0150
partial reg via lasso -0.072 0.0154
+ + + +It it worth noticing that controlling for worker characteristics increases the wage gap from less than 4\% to 7\%. The controls we used in our analysis include 5 educational attainment indicators (less than high school graduates, high school graduates, some college, college graduate, and advanced degree), 4 region indicators (midwest, south, west, and northeast); a quartic term (first, second, third, and fourth power) in experience and 22 occupation and 23 industry indicators. + +Keep in mind that the predictive effect (PE) does not only measures discrimination (causal effect of being female), it also may reflect +selection effects of unobserved differences in covariates between men and women in our sample. + + + +Next we try an "extra" flexible model, where we take interactions of all controls, giving us about 800 non-redundant controls. + +```{r} +# extra flexible model + +extraflex <- lwage ~ sex + (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+occ2+ind2+mw+so+we)^2 + +control.fit <- lm(extraflex, data=data) +#summary(control.fit) +control.est <- summary(control.fit)$coef[2,1] + +cat("Number of Extra-Flex Controls", summary(control.fit)$df[1]-1, "\n") + +cat("Coefficient for OLS with extra flex controls", control.est) + +HCV.coefs <- vcovHC(control.fit, type = 'HC'); +control.se <- sqrt(diag(HCV.coefs))[2] # Estimated std errors + +JCV.coefs <- vcovHC(control.fit, type = 'HC3'); # Jackknife +control.Jse <- sqrt(diag(JCV.coefs))[2] # Estimated std errors + +cat("HEW s.e. : ", control.se) +cat("Jackknife s.e. : ", control.Jse) + +# From Cattaneo, Jannson, and Newey (2018), we expect jackknife s.e.'s to be +# conservative. +``` + +Interestingly, jackknife standard errors are undefined in this case. Due to the high dimensional control vector, we know that conventional heteroskedasticity robust standard errors will also be severely biased. That is, the approximation obtained under $p/n$ being small is clearly breaking down here. We might then like to implement Cattaneo, Jannson, and Newey (2018) (CJN) which is supposed to work in the $p/n \rightarrow c < 1$ regime. However, computing CJN requires inversion of a matrix which is computationally singular in this example (which is related to why the jackknife s.e. are undefined.) + +```{r, eval = FALSE, echo = TRUE} +# Try to make a brute force implementation of Cattaneo, Jannson, Newey (2018). +# This is slow and doesn't actually add anything as the matrix needed to +# construct CJN is (at least) numerically singular. Don't run this block. + +# models +extraflex.y <- lwage ~ (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+occ2+ind2+mw+so+we)^2 # model for Y +extraflex.d <- sex ~ (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+occ2+ind2+mw+so+we)^2 # model for D + +# CJN requires that M.*M is invertible where M = I-W(W'W)^{-1}W' and .* is the Hadamard product + +regexflex.y = lm(extraflex.y, data = data, x = TRUE) # Regression of outcome on controls +W = tmp$x # Design matrix +Wli = W[,!is.na(regexflex.y$coefficients)] # Linearly independent columns of W +np = dim(Wli) +M = diag(np[1])-Wli%*%solve(t(Wli)%*%Wli)%*%t(Wli) # Matrix M (after removing redundant columns) +scM = 1 - min(diag(M)) # This number needs to be smaller than 1/2 for CJN theoretical results +scM # Just stop here + +#MM = M^2 # Hadamard product M.*M + +#library(Matrix) +#rankMatrix(MM) # find the (numeric) rank of MM which ends up being less than its dimension. Tried to actually move forward and use other methods to invert/assess invertibility, but none worked. 
Not going to use a generalized inverse as this goes against the idea of the theory. + +# Not going to be able to compute CJN +``` + +We can also try to use Lasso to partial out the control variables. We'll justify this later. + +```{r} +library(hdm) + +# models +extraflex.y <- lwage ~ (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+occ2+ind2+mw+so+we)^2 # model for Y +extraflex.d <- sex ~ (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+occ2+ind2+mw+so+we)^2 # model for D + +# partialling-out the linear effect of W from Y +t.Y <- rlasso(extraflex.y, data=data)$res +# partialling-out the linear effect of W from D +t.D <- rlasso(extraflex.d, data=data)$res + +# regression of Y on D after partialling-out the effect of W +partial.lasso.fit <- lm(t.Y~t.D-1) +partial.lasso.est <- summary(partial.lasso.fit)$coef[1] + +cat("Coefficient for D via partialling-out using lasso", partial.lasso.est) + +# standard error +HCV.coefs <- vcovHC(partial.lasso.fit, type = 'HC3') +partial.lasso.se <- sqrt(diag(HCV.coefs)) +``` + +```{r} +table<- matrix(0, 2, 2) +table[1,1]<- control.est +table[1,2]<- control.se +table[2,1]<- partial.lasso.est +table[2,2]<- partial.lasso.se +colnames(table)<- c("Estimate","Std. Error") +rownames(table)<- c("full reg","partial reg via lasso") +tab<- xtable(table, digits=c(3, 3, 4)) +tab + +print(tab, type="latex") +``` + +In this case $p/n$ = 20%, that is $p/n$ is no longer small and we start seeing the differences between unregularized partialling out and regularized partialling out with lasso (double lasso). The results based on double lasso have rigorous guarantees in this non-small $p/n$ regime under approximate sparsity. The results based on OLS still +have guarantees in $p/n< 1$ regime under assumptions laid out in Cattaneo, Newey, and Jansson (2018), without approximate sparsity, although other regularity conditions are needed. These additional regularity conditions appear to potentially be violated in our "very flexible" specification. + + + + + diff --git a/PM1/Old/ols-and-lasso-for-wage-prediction.irnb.Rmd b/PM1/Old/ols-and-lasso-for-wage-prediction.irnb.Rmd new file mode 100644 index 00000000..342b5c8f --- /dev/null +++ b/PM1/Old/ols-and-lasso-for-wage-prediction.irnb.Rmd @@ -0,0 +1,440 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + +This notebook contains an example for teaching. + + +## Introduction + + +An important question in labor economics is what determines the wage of workers. This is a causal question, but we can begin to investigate it from a predictive perspective. + +In the following wage example, $Y$ is the hourly wage of a worker and $X$ is a vector of worker's characteristics, e.g., education, experience, gender. Two main questions here are + +* How can we use job-relevant characteristics, such as education and experience, to best predict wages? + +* What is the difference in predicted wages between men and women with the same job-relevant characteristics? + +In this lab, we focus on the prediction question first. + + +## Data + + + +The data set we consider is from the 2015 March Supplement of the U.S. Current Population Survey. We select white non-hispanic individuals, aged 25 to 64 years, and working more than 35 hours per week for at least 50 weeks of the year. 
We exclude self-employed workers; individuals living in group quarters; individuals in the military, agricultural or private household sectors; individuals with inconsistent reports on earnings and employment status; individuals with allocated or missing information in any of the variables used in the analysis; and individuals with hourly wage below $3$. + +The variable of interest $Y$ is the hourly wage rate constructed as the ratio of the annual earnings to the total number of hours worked, which is constructed in turn as the product of number of weeks worked and the usual number of hours worked per week. In our analysis, we also focus on single (never married) workers. The final sample is of size $n=5150$. + + +## Data analysis + + +We start by loading the data set. + +```{r} +load("../input/wage2015-inference/wage2015_subsample_inference.Rdata") +#load("../Data/wage2015_subsample_inference.Rdata") # To run locally on Hansen's PC +dim(data) +``` + +Let's have a look at the structure of the data. + +```{r} +str(data) +``` + +We construct the output variable $Y$ and the matrix $Z$ which includes the characteristics of workers that are given in the data. + +```{r} +# construct matrices for estimation from the data +Y <- log(data$wage) +n <- length(Y) +Z <- data[-which(colnames(data) %in% c("wage","lwage"))] +p <- dim(Z)[2] + +cat("Number of observations:", n, '\n') +cat( "Number of raw regressors:", p) +``` + +For the outcome variable *wage* and a subset of the raw regressors, we calculate the empirical mean to get familiar with the data. + +```{r} +# generate a table of means of variables +library(xtable) +Z_subset <- data[which(colnames(data) %in% c("lwage","sex","shs","hsg","scl","clg","ad","mw","so","we","ne","exp1"))] +table <- matrix(0, 12, 1) +table[1:12,1] <- as.numeric(lapply(Z_subset,mean)) +rownames(table) <- c("Log Wage","Sex","Some High School","High School Graduate","Some College","College Graduate", "Advanced Degree","Midwest","South","West","Northeast","Experience") +colnames(table) <- c("Sample mean") +tab<- xtable(table, digits = 2) +tab +``` + +E.g., the share of female workers in our sample is ~44% ($sex=1$ if female). + + +Alternatively, using the xtable package, we can also print the table in LaTeX. + +```{r} +print(tab, type="latex") # type="latex" for printing table in LaTeX +``` + +## Prediction Question + + +Now, we will construct a prediction rule for hourly wage $Y$, which depends linearly on job-relevant characteristics $X$: + +\begin{equation}\label{decompose} +Y = \beta'X+ \epsilon. +\end{equation} + + +Our goals are + +* Predict wages using various characteristics of workers. + +* Assess the predictive performance of a given model using the (adjusted) sample MSE, the (adjusted) sample $R^2$ and the out-of-sample MSE and $R^2$. + + +We employ two different specifications for prediction: + + +1. Basic Model: $X$ consists of a set of raw regressors (e.g. gender, experience, education indicators, occupation and industry indicators and regional indicators). + + +2. Flexible Model: $X$ consists of all raw regressors from the basic model plus a dictionary of transformations (e.g., ${exp}^2$ and ${exp}^3$) and additional two-way interactions of a polynomial in experience with other regressors. An example of a regressor created through a two-way interaction is *experience* times the indicator of having a *college degree*. + +3. ``Extra Flexible'' Model: $X$ consists of two way interactions of all raw variables, giving us about 1000 controls. 
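
The three specifications differ only in how the design matrix is expanded from the raw regressors via R's formula operators. A small illustration with hypothetical placeholder variables (the names `x1`, `x2`, `z1`, `z2` and the `toy` data frame are made up for this sketch):

```{r}
# hypothetical toy data, used only to display how the formula operators expand
toy <- data.frame(x1 = 1, x2 = 2, z1 = 3, z2 = 4)

# a*b expands to main effects plus their interaction: a + b + a:b
colnames(model.matrix(~ (x1 + x2) * (z1 + z2), data = toy))

# (a + b + ...)^2 expands to all main effects plus all two-way interactions
colnames(model.matrix(~ (x1 + x2 + z1 + z2)^2, data = toy))
```
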
+ +Using more flexible models enables us to approximate the real relationship by a more complex regression model and therefore has the potential to reduce the bias relative to a more simple specification that cannot capture a complex relationship. That is, flexible models increase the range of potential shapes that can be accommodated by the estimated regression function. With sufficient data, flexible models often deliver higher prediction accuracy than simpler models but are harder to interpret. In small data sets, simpler models often perform relatively well. + + +Now, let us our three candidate models to our data by running ordinary least squares (ols): + +```{r} +# 1. basic model +basic <- lwage~ (sex + exp1 + shs + hsg+ scl + clg + mw + so + we +occ2+ind2) +regbasic <- lm(basic, data=data) # perform ols using the defined model +regbasic # estimated coefficients +cat( "Number of regressors in the basic model:",length(regbasic$coef), '\n') # number of regressors in the Basic Model + +``` + +##### Note that the basic model consists of $51$ regressors. + +```{r} +# 2. flexible model +flex <- lwage ~ sex + shs+hsg+scl+clg+mw+so+we+occ2+ind2 + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we) +regflex <- lm(flex, data=data) +regflex # estimated coefficients +cat( "Number of regressors in the flexible model:",length(regflex$coef), "\n") # number of regressors in the Flexible Model + +``` + +##### Note that the flexible model consists of $246$ regressors. + + +```{r} +# 3. extra flexible model +extraflex <- lwage ~ sex + (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+occ2+ind2+mw+so+we)^2 +regextra <- lm(extraflex, data=data) +cat( "Number of regressors in the extra flexible model:",sum(!is.na(regextra$coefficients)), "\n") # number of regressors in the extra flexible Model + +``` + +##### Note that the extra flexible model consists of $780$ non-redundant regressors. + + + +#### Re-estimating the flexible and extra-flexible model using Lasso +We re-estimate the flexible model using Lasso (the least absolute shrinkage and selection operator) rather than ols. Lasso is a penalized regression method that can be used to reduce the complexity of a regression model when the ratio $p/n$ is not small. We will introduce this approach formally later in the course, but for now, we try it out here as a black-box method. For now, we use a simple default plug-in rule for choosing the penalization for Lasso. 
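
As a rough sketch of that plug-in rule (the exact implementation in `hdm::rlasso`, including its data-dependent penalty loadings, is documented in the package; the constants below are stated here as assumptions about the defaults), the penalty level takes the form

\begin{align}
\lambda \;=\; 2\,c\,\sqrt{n}\,\Phi^{-1}\!\Big(1-\frac{\gamma}{2p}\Big), \qquad c \approx 1.1, \quad \gamma \approx 0.1/\log(n),
\end{align}

combined with variable-specific loadings that account for the scale of the regressors and the noise level.
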
+ +```{r, results = FALSE} +# Flexible model using Lasso +library(hdm) +lassoreg<- rlasso(flex, data=data, post=FALSE) # Post= FALSE gives lasso +sumlasso<- summary(lassoreg) + +lassoexflex <- rlasso(extraflex, data = data, post=FALSE) # Post= FALSE gives lasso +sumlassoflex <- summary(lassoexflex) + +``` + +#### Evaluating the predictive performance of the models in-sample +Now, we can evaluate the performance of our models based on in-sample measures of fit -- the (adjusted) $R^2_{sample}$ and the (adjusted) $MSE_{sample}$: + +```{r} +# Assess predictive performance +sumbasic <- summary(regbasic) +sumflex <- summary(regflex) +sumextra <- summary(regextra) + +# R-squared and adjusted R-squared +R2.1 <- sumbasic$r.squared +cat("R-squared for the basic model: ", R2.1, "\n") +R2.adj1 <- sumbasic$adj.r.squared +cat("adjusted R-squared for the basic model: ", R2.adj1, "\n") + +R2.2 <- sumflex$r.squared +cat("R-squared for the flexible model: ", R2.2, "\n") +R2.adj2 <- sumflex$adj.r.squared +cat("adjusted R-squared for the flexible model: ", R2.adj2, "\n") + +R2.3 <- sumextra$r.squared +cat("R-squared for the extra flexible model: ", R2.3, "\n") +R2.adj3 <- sumextra$adj.r.squared +cat("adjusted R-squared for the extra flexible model: ", R2.adj3, "\n") + +R2.L <- sumlasso$r.squared +cat("R-squared for lasso with the flexible model: ", R2.L, "\n") +R2.adjL <- sumlasso$adj.r.squared +cat("adjusted R-squared for lasso with the flexible model: ", R2.adjL, "\n") + +R2.L2 <- sumlassoflex$r.squared +cat("R-squared for lasso with the very flexible model: ", R2.L2, "\n") +R2.adjL2 <- sumlassoflex$adj.r.squared +cat("adjusted R-squared for lasso with the flexible model: ", R2.adjL2, "\n") + + +# MSE and adjusted MSE +MSE1 <- mean(sumbasic$res^2) +cat("MSE for the basic model: ", MSE1, "\n") +p1 <- sumbasic$df[1] # number of regressors +MSE.adj1 <- (n/(n-p1))*MSE1 +cat("adjusted MSE for the basic model: ", MSE.adj1, "\n") + +MSE2 <-mean(sumflex$res^2) +cat("MSE for the flexible model: ", MSE2, "\n") +p2 <- sumflex$df[1] +MSE.adj2 <- (n/(n-p2))*MSE2 +cat("adjusted MSE for the flexible model: ", MSE.adj2, "\n") + +MSE3 <-mean(sumextra$res^2) +cat("MSE for the extra flexible model: ", MSE3, "\n") +p3 <- sumextra$df[1] +MSE.adj3 <- (n/(n-p3))*MSE3 +cat("adjusted MSE for the extra flexible model: ", MSE.adj3, "\n") + + +MSEL <-mean(sumlasso$res^2) +cat("MSE for the lasso with the flexible model: ", MSEL, "\n") +pL <- sum(sumlasso$coef != 0) +MSE.adjL <- (n/(n-pL))*MSEL +cat("adjusted MSE for the lasso with the flexible model: ", MSE.adjL, "\n") + +MSEL2 <-mean(sumlassoflex$res^2) +cat("MSE for the lasso with very flexible model: ", MSEL2, "\n") +pL2 <- sum(sumlassoflex$coef != 0) +MSE.adjL2 <- (n/(n-pL2))*MSEL2 +cat("adjusted MSE for the lasso with very flexible model: ", MSE.adjL2, "\n") + +``` + +```{r} +# Output the table +library(xtable) +table <- matrix(0, 5, 5) +table[1,1:5] <- c(p1,R2.1,MSE1,R2.adj1,MSE.adj1) +table[2,1:5] <- c(p2,R2.2,MSE2,R2.adj2,MSE.adj2) +table[3,1:5] <- c(p3,R2.3,MSE3,R2.adj3,MSE.adj3) +table[4,1:5] <- c(pL,R2.L,MSEL,R2.adjL,MSE.adjL) +table[5,1:5] <- c(pL2,R2.L2,MSEL2,R2.adjL2,MSE.adjL2) +colnames(table)<- c("p","$R^2_{sample}$","$MSE_{sample}$","$R^2_{adjusted}$", "$MSE_{adjusted}$") +rownames(table)<- c("basic","flexible","very flexible","flexible-Lasso","very flexible-Lasso") +tab<- xtable(table, digits =c(0,0,2,2,2,2)) +print(tab,type="latex") +tab +``` + +Considering the measures above, the very flexible model estimated by OLS seems to perform better than the other approaches. 
Note, however, that the adjusted and regular measures are very different for this specification because $p/n$ is not small in this case. We also see that the differences between the usual and adjusted measures of fit increase as $p$ increases -- as predicted by theory. Finally, Lasso produces relatively stable results in both regimes that is comparable, though seems to be mildly worse in terms of predictive performance, than the OLS prediction rules. + +Let's now look at **data splitting** which provides a general procedure to assess predictive performance regardless of the ratio $p/n$. We illustrate the approach in the following. + + +## Data Splitting + +- Randomly split the data into one training sample and one testing sample. Here we just use a simple method (stratified splitting is a more sophisticated version of splitting that we might consider). +- Use the training sample to estimate the parameters of the different predictive models. +- Use the testing sample for evaluation. Predict the $\mathtt{wage}$ of every observation in the testing sample based on the estimated parameters in the training sample. +- Calculate the Mean Squared Prediction Error $MSE_{test}$ based on the testing sample for the candidate prediction models. + +```{r} +# splitting the data +set.seed(1) # to make the results replicable (we will generate random numbers) +random <- sample(1:n, floor(n*4/5)) +# draw (4/5)*n random numbers from 1 to n without replacing them +train <- data[random,] # training sample +test <- data[-random,] # testing sample +nV <- nrow(train) + +``` + +```{r} +# basic model +# estimating the parameters in the training sample +regbasic <- lm(basic, data=train) + +# calculating the out-of-sample MSE +trainregbasic <- predict(regbasic, newdata=test) +y.test <- log(test$wage) +MSE.test1 <- sum((y.test-trainregbasic)^2)/length(y.test) +R2.test1<- 1- MSE.test1/(sum((y.test-mean(train$lwage))^2)/length(y.test)) + +cat("Test MSE for the basic model: ", MSE.test1, " ") + +cat("Test R2 for the basic model: ", R2.test1) + +# in-sample MSE and R^2 +sumbasicV <- summary(regbasic) + +R2V.1 <- sumbasicV$r.squared +cat("Training R-squared for the basic model: ", R2V.1, "\n") +R2V.adj1 <- sumbasicV$adj.r.squared +cat("Training adjusted R-squared for the basic model: ", R2V.adj1, "\n") + +MSE1V <- mean(sumbasicV$res^2) +cat("Training MSE for the basic model: ", MSE1V, "\n") +p1V <- sumbasicV$df[1] # number of regressors +MSEV.adj1 <- (nV/(nV-p1V))*MSE1V +cat("Training adjusted MSE for the basic model: ", MSEV.adj1, "\n") + +``` + +In the basic model, the $MSE_{test}$ is relatively close to the $MSE_{sample}$. 
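
For reference, the out-of-sample measures computed in the preceding and following blocks are

\begin{align}
MSE_{test} \;=\; \frac{1}{n_{test}}\sum_{i \in test}\big(\log Y_i - \hat f(X_i)\big)^2, \qquad
R^2_{test} \;=\; 1 - \frac{MSE_{test}}{\frac{1}{n_{test}}\sum_{i \in test}\big(\log Y_i - \overline{\log Y}_{train}\big)^2},
\end{align}

where $\hat f$ is estimated on the training sample only and $\overline{\log Y}_{train}$ is the training-sample mean of log wages, matching the code.
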
+ +```{r} +# flexible model +# estimating the parameters +options(warn=-1) # ignore warnings +regflex <- lm(flex, data=train) + +# calculating the out-of-sample MSE +trainregflex<- predict(regflex, newdata=test) +y.test <- log(test$wage) +MSE.test2 <- sum((y.test-trainregflex)^2)/length(y.test) +R2.test2<- 1- MSE.test2/(sum((y.test-mean(train$lwage))^2)/length(y.test)) + +cat("Test MSE for the flexible model: ", MSE.test2, " ") + +cat("Test R2 for the flexible model: ", R2.test2) + +# in-sample MSE and R^2 +sumflexV <- summary(regflex) + +R2V.2 <- sumflexV$r.squared +cat("Training R-squared for the flexible model: ", R2V.2, "\n") +R2V.adj2 <- sumflexV$adj.r.squared +cat("Training adjusted R-squared for the flexible model: ", R2V.adj2, "\n") + +MSE2V <-mean(sumflexV$res^2) +cat("Training MSE for the flexible model: ", MSE2V, "\n") +p2V <- sumflexV$df[1] +MSEV.adj2 <- (nV/(nV-p2V))*MSE2V +cat("Training adjusted MSE for the flexible model: ", MSEV.adj2, "\n") + +``` + +In the flexible model, the discrepancy between the $MSE_{test}$ and the $MSE_{sample}$ is modest. + +```{r} +# very flexible model +# estimating the parameters +options(warn=-1) # ignore warnings +regextra <- lm(extraflex, data=train) + +# calculating the out-of-sample MSE +trainregextra<- predict(regextra, newdata=test) +y.test <- log(test$wage) +MSE.test3 <- sum((y.test-trainregextra)^2)/length(y.test) +R2.test3<- 1- MSE.test3/(sum((y.test-mean(train$lwage))^2)/length(y.test)) + +cat("Test MSE for the very flexible model: ", MSE.test3, " ") + +cat("Test R2 for the very flexible model: ", R2.test3) + +# in-sample MSE and R^2 +sumextraV <- summary(regextra) + +R2V.3 <- sumextraV$r.squared +cat("Training R-squared for the extra flexible model: ", R2V.3, "\n") +R2V.adj3 <- sumextraV$adj.r.squared +cat("Training adjusted R-squared for the extra flexible model: ", R2V.adj3, "\n") + +MSE3V <-mean(sumextraV$res^2) +cat("Training MSE for the extra flexible model: ", MSE3V, "\n") +p3V <- sumextraV$df[1] +MSEV.adj3 <- (nV/(nV-p3V))*MSE3V +cat("Training adjusted MSE for the extra flexible model: ", MSEV.adj3, "\n") + +``` + +In the very flexible model, the discrepancy between the $MSE_{test}$ and the $MSE_{sample}$ is large because $p/n$ is not small. + +It is worth noticing that the $MSE_{test}$ varies across different data splits. Hence, it is a good idea to average the out-of-sample MSE over different data splits to get valid results. + +Nevertheless, we observe that, based on the out-of-sample $MSE$, the basic model using ols regression performs **about as well (or slightly better)** than the flexible model and both perform much better than the very flexible model. + +Next, let us use lasso regression in the flexible and very flexible models instead of ols regression. The out-of-sample $MSE$ on the test sample can be computed for any black-box prediction method, so we also compare the performance of lasso regression in these models to our previous ols regressions. 
+ +```{r} +# flexible model using lasso +library(hdm) # a library for high-dimensional metrics +reglasso <- rlasso(flex, data=train, post=FALSE) # estimating the parameters +lassoexflex <- rlasso(extraflex, data = data, post=FALSE) # Post= FALSE gives lasso + +# calculating the out-of-sample MSE +trainreglasso<- predict(reglasso, newdata=test) +MSE.lasso <- sum((y.test-trainreglasso)^2)/length(y.test) +R2.lasso<- 1- MSE.lasso/(sum((y.test-mean(train$lwage))^2)/length(y.test)) + +cat("Test MSE for the lasso on flexible model: ", MSE.lasso, " ") + +cat("Test R2 for the lasso flexible model: ", R2.lasso) + +trainlassoexflex<- predict(lassoexflex, newdata=test) +MSE.lassoexflex <- sum((y.test-trainlassoexflex)^2)/length(y.test) +R2.lassoexflex <- 1- MSE.lassoexflex/(sum((y.test-mean(train$lwage))^2)/length(y.test)) + +cat("Test MSE for the lasso on the very flexible model: ", MSE.lassoexflex, " ") + +cat("Test R2 for the lasso on the very flexible model: ", R2.lassoexflex) + +``` + +Finally, let us summarize the results: + +```{r} +# Output the comparison table +table2 <- matrix(0, 5,2) +table2[1,1] <- MSE.test1 +table2[2,1] <- MSE.test2 +table2[3,1] <- MSE.test3 +table2[4,1] <- MSE.lasso +table2[5,1] <- MSE.lassoexflex +table2[1,2] <- R2.test1 +table2[2,2] <- R2.test2 +table2[3,2] <- R2.test3 +table2[4,2] <- R2.lasso +table2[5,2] <- R2.lassoexflex + +rownames(table2)<- rownames(table)<- c("basic","flexible","very flexible","flexible-Lasso","very flexible-Lasso") +colnames(table2)<- c("$MSE_{test}$", "$R^2_{test}$") +tab2 <- xtable(table2, digits =3) +tab2 +``` + +```{r} +print(tab2,type="latex") +``` diff --git a/PM1/Old/r-notebook-linear-model-overfitting.irnb.Rmd b/PM1/Old/r-notebook-linear-model-overfitting.irnb.Rmd new file mode 100644 index 00000000..f925cba0 --- /dev/null +++ b/PM1/Old/r-notebook-linear-model-overfitting.irnb.Rmd @@ -0,0 +1,79 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + + +# Simple Exercise on Overfitting + + + +First set p=n + +```{r} + +set.seed(123) +n = 1000 + +p = n +X<- matrix(rnorm(n*p), n, p) +Y<- rnorm(n) + +print("p/n is") +print(p/n) +print("R2 is") +print(summary(lm(Y~X))$r.squared) +print("Adjusted R2 is") +print(summary(lm(Y~X))$adj.r.squared) + +``` + +Second, set p=n/2. 
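
As a reminder of why the raw and adjusted measures diverge in this exercise: with an intercept and $p$ non-redundant regressors, `summary(lm(...))` reports

\begin{align}
R^2_{adjusted} \;=\; 1-(1-R^2)\,\frac{n-1}{n-p-1},
\end{align}

so the correction factor $(n-1)/(n-p-1)$ grows without bound as $p$ approaches $n$, even though the in-sample $R^2$ is mechanically inflated by fitting noise.
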
+ +```{r} + +set.seed(123) +n = 1000 + +p = n/2 +X<- matrix(rnorm(n*p), n, p) +Y<- rnorm(n) + +print("p/n is") +print(p/n) +print("R2 is") +print(summary(lm(Y~X))$r.squared) +print("Adjusted R2 is") +print(summary(lm(Y~X))$adj.r.squared) + +``` + +Third, set p/n =.05 + +```{r} + +set.seed(123) +n = 1000 + +p = .05*n +X<- matrix(rnorm(n*p), n, p) +Y<- rnorm(n) + +print("p/n is") +print(p/n) +print("R2 is") +print(summary(lm(Y~X))$r.squared) +print("Adjusted R2 is") +print(summary(lm(Y~X))$adj.r.squared) + + +``` From 72abcaaf650327430dabf5ec37bce8d155b3ac0c Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 05:48:20 -0700 Subject: [PATCH 237/261] old --- PM2/Old/heterogenous-wage-effects.irnb.Rmd | 76 +++++ ...experiment-on-orthogonal-learning.irnb.Rmd | 113 +++++++ .../r-notebook-linear-penalized-regs.irnb.Rmd | 291 ++++++++++++++++++ 3 files changed, 480 insertions(+) create mode 100644 PM2/Old/heterogenous-wage-effects.irnb.Rmd create mode 100644 PM2/Old/r-notebook-experiment-on-orthogonal-learning.irnb.Rmd create mode 100644 PM2/Old/r-notebook-linear-penalized-regs.irnb.Rmd diff --git a/PM2/Old/heterogenous-wage-effects.irnb.Rmd b/PM2/Old/heterogenous-wage-effects.irnb.Rmd new file mode 100644 index 00000000..936a40e1 --- /dev/null +++ b/PM2/Old/heterogenous-wage-effects.irnb.Rmd @@ -0,0 +1,76 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + + +# Application: Heterogeneous Effect of Gender on Wage Using Double Lasso + + We use US census data from the year 2012 to analyse the effect of gender and interaction effects of other variables with gender on wage jointly. The dependent variable is the logarithm of the wage, the target variable is *female* (in combination with other variables). All other variables denote some other socio-economic characteristics, e.g. marital status, education, and experience. For a detailed description of the variables we refer to the help page. + + + +This analysis allows a closer look how discrimination according to gender is related to other socio-economic variables. + + + + +```{r} +library(hdm) +data(cps2012) +str(cps2012) +``` + +```{r} +# create the model matrix for the covariates +X <- model.matrix(~-1 + female + female:(widowed + divorced + separated + nevermarried + +hsd08 + hsd911 + hsg + cg + ad + mw + so + we + exp1 + exp2 + exp3) + +(widowed + +divorced + separated + nevermarried + hsd08 + hsd911 + hsg + cg + ad + mw + so + +we + exp1 + exp2 + exp3)^2, data = cps2012) +X <- X[, which(apply(X, 2, var) != 0)] # exclude all constant variables +dim(X) + +# target variables, index.gender specifices coefficients we are interested in +index.gender <- grep("female", colnames(X)) +y <- cps2012$lnw +``` + +The parameter estimates for the target parameters, i.e. all coefficients related to gender (i.e. by interaction with other variables) are calculated and summarized by the following commands: + + + +```{r _uuid="051d70d956493feee0c6d64651c6a088724dca2a", _execution_state="idle"} +effects.female <- rlassoEffects(x = X, y = y, index = index.gender) +summary(effects.female) +library(stargazer) +stargazer(summary(effects.female)[1]) +``` + +Now, we estimate and plot confident intervals, first "pointwise" and then the joint confidence intervals. 
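
Here, a pointwise interval for coefficient $j$ is $\hat\theta_j \pm z_{1-\alpha/2}\,\widehat{se}_j$, whereas the joint (simultaneous) intervals requested further below with `joint = TRUE` use a larger critical value,

\begin{align}
\hat\theta_j \pm c_{1-\alpha}\,\widehat{se}_j, \qquad c_{1-\alpha} \approx (1-\alpha)\text{-quantile of } \max_j \frac{|\hat\theta_j-\theta_j|}{\widehat{se}_j},
\end{align}

typically approximated by a multiplier bootstrap, so that all intervals cover their coefficients simultaneously with probability $1-\alpha$.
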
+ +```{r} +joint.CI <- confint(effects.female, level = 0.95) +joint.CI +plot(effects.female, level=0.95) # plot of the effects +stargazer(joint.CI) +``` + +Finally, we compare the pointwise confidence intervals to joint confidence intervals. + +```{r} +joint.CI <- confint(effects.female, level = 0.95, joint = TRUE) +joint.CI +plot(effects.female, joint=TRUE, level=0.95) # plot of the effects +stargazer(joint.CI) + +# the plot output does not look great +``` diff --git a/PM2/Old/r-notebook-experiment-on-orthogonal-learning.irnb.Rmd b/PM2/Old/r-notebook-experiment-on-orthogonal-learning.irnb.Rmd new file mode 100644 index 00000000..0ae82ac4 --- /dev/null +++ b/PM2/Old/r-notebook-experiment-on-orthogonal-learning.irnb.Rmd @@ -0,0 +1,113 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + +```{r _uuid="051d70d956493feee0c6d64651c6a088724dca2a", _execution_state="idle"} +# Simulation Design + +library(hdm) + +set.seed(1) +B= 1000 # trials +Naive = rep(0, B) +Orthogonal = rep(0, B) + +for (i in 1:B){ + +n=100 +p= 100 +beta = 1/(1:p)^2 +gamma =1/(1:p)^2 + +X=matrix(rnorm(n*p), n, p) + + +D= X%*%gamma + rnorm(n)/4 + +Y = D+ X%*%beta + rnorm(n) + +# single selection method + +SX.IDs = which(rlasso(Y~ D+X)$coef[-c(1,2)] !=0) #select covariates by Lasso + + +if (sum(SX.IDs)==0) {Naive[i] = lm(Y~ D)$coef[2]} + +if (sum(SX.IDs)>0) {Naive[i] = lm(Y~ D + X[,SX.IDs])$coef[2]} + + + +#partialling out + +resY = rlasso(Y~ X, Post=F)$res +resD = rlasso(D~ X, Post=F)$res +Orthogonal[i]= lm(resY ~ resD)$coef[2] + +} + +``` + +```{r} +hist(Orthogonal-1,col=4, freq=F, xlim= c(-2, 2), xlab= "Orhtogonal -True ", main="Orthogonal") +hist(Naive-1, col=2, freq=F, xlim= c(-2,2), xlab= "Naive- True", main = "Naive") + +``` + +```{r} +library(hdm) + +set.seed(1) +B= 1000 # trials +Naive = rep(0, B) +Orthogonal = rep(0, B) + +for (i in 1:B){ + +n=100 +p= 100 +beta = 1/(1:p)^2 +gamma =1/(1:p)^2 + +X=matrix(rnorm(n*p), n, p) + + +D= X%*%gamma + rnorm(n)/4 + +Y = D+ X%*%beta + rnorm(n) + +# single selection method + +SX.IDs = which(rlasso(Y~ D+X)$coef[-c(1,2)] !=0) #select covariates by Lasso + + +if (sum(SX.IDs)==0) {Naive[i] = lm(Y~ D)$coef[2]} + +if (sum(SX.IDs)>0) {Naive[i] = lm(Y~ D + X[,SX.IDs])$coef[2]} + + + +#partialling out + +resY = rlasso(Y~ X, Post=T)$res +resD = rlasso(D~ X, Post=T)$res +Orthogonal[i]= lm(resY ~ resD)$coef[2] + +} + +``` + +```{r} +hist(Orthogonal-1,col=4, freq=F, xlim= c(-2, 2), xlab= "Orhtogonal -True ", main="Orthogonal") +hist(Naive-1, col=2, freq=F, xlim= c(-2,2), xlab= "Naive- True", main = "Naive") + +``` diff --git a/PM2/Old/r-notebook-linear-penalized-regs.irnb.Rmd b/PM2/Old/r-notebook-linear-penalized-regs.irnb.Rmd new file mode 100644 index 00000000..5f4c4d56 --- /dev/null +++ b/PM2/Old/r-notebook-linear-penalized-regs.irnb.Rmd @@ -0,0 +1,291 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + + + + +This notebook contains an example for teaching. 
+ + + + +# Penalized Linear Regressions: A Simulation Experiment + + +## Data Generating Process: Approximately Sparse + +```{r} +set.seed(1) + +n = 100; +p = 400; + +Z= runif(n)-1/2; +W = matrix(runif(n*p)-1/2, n, p); + + + +beta = 1/seq(1:p)^2; # approximately sparse beta +#beta = rnorm(p)*.2 # dense beta +gX = exp(4*Z)+ W%*%beta; # leading term nonlinear +X = cbind(Z, Z^2, Z^3, W ); # polynomials in Zs will be approximating exp(4*Z) + + +Y = gX + rnorm(n); #generate Y + + +plot(gX,Y, xlab="g(X)", ylab="Y") #plot V vs g(X) + +print( c("theoretical R2:", var(gX)/var(Y))) + +var(gX)/var(Y); #theoretical R-square in the simulation example + + + + +``` + +We use package Glmnet to carry out predictions using cross-validated lasso, ridge, and elastic net + +```{r} + +library(glmnet) +fit.lasso.cv <- cv.glmnet(X, Y, family="gaussian", alpha=1) # family gaussian means that we'll be using square loss +fit.ridge <- cv.glmnet(X, Y, family="gaussian", alpha=0) # family gaussian means that we'll be using square loss +fit.elnet <- cv.glmnet(X, Y, family="gaussian", alpha=.5) # family gaussian means that we'll be using square loss + +yhat.lasso.cv <- predict(fit.lasso.cv, newx = X) # predictions +yhat.ridge <- predict(fit.ridge, newx = X) +yhat.elnet <- predict(fit.elnet, newx = X) + +MSE.lasso.cv <- summary(lm((gX-yhat.lasso.cv)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) +MSE.ridge <- summary(lm((gX-yhat.ridge)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) +MSE.elnet <- summary(lm((gX-yhat.elnet)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) + +``` + +Here we compute the lasso and ols post lasso using plug-in choices for penalty levels, using package hdm + +```{r} +library(hdm) +fit.rlasso <- rlasso(Y~X, post=FALSE) # lasso with plug-in penalty level +fit.rlasso.post <- rlasso(Y~X, post=TRUE) # post-lasso with plug-in penalty level + +yhat.rlasso <- predict(fit.rlasso) #predict g(X) for values of X +yhat.rlasso.post <- predict(fit.rlasso.post) #predict g(X) for values of X + +MSE.lasso <- summary(lm((gX-yhat.rlasso)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) +MSE.lasso.post <- summary(lm((gX-yhat.rlasso.post)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) + + +``` + +Next we code up lava, which alternates the fitting of lasso and ridge + +```{r} +library(glmnet) + +lava.predict<- function(X,Y, iter=5){ + +g1 = predict(rlasso(X, Y, post=F)) #lasso step fits "sparse part" +m1 = predict(glmnet(X, as.vector(Y-g1), family="gaussian", alpha=0, lambda =20),newx=X ) #ridge step fits the "dense" part + + +i=1 +while(i<= iter) { +g1 = predict(rlasso(X, Y, post=F)) #lasso step fits "sparse part" +m1 = predict(glmnet(X, as.vector(Y-g1), family="gaussian", alpha=0, lambda =20),newx=X ); #ridge step fits the "dense" part +i = i+1 } + +return(g1+m1); + } + + +yhat.lava = lava.predict(X,Y) +MSE.lava <- summary(lm((gX-yhat.lava)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) + + +MSE.lava +``` + +```{r} +library(xtable) +table<- matrix(0, 6, 2) +table[1,1:2] <- MSE.lasso.cv +table[2,1:2] <- MSE.ridge +table[3,1:2] <- MSE.elnet +table[4,1:2] <- MSE.lasso +table[5,1:2] <- MSE.lasso.post +table[6,1:2] <- MSE.lava + +colnames(table)<- c("MSA", "S.E. 
for MSA") +rownames(table)<- c("Cross-Validated Lasso", "Cross-Validated ridge","Cross-Validated elnet", + "Lasso","Post-Lasso","Lava") +tab <- xtable(table, digits =3) +print(tab,type="latex") # set type="latex" for printing table in LaTeX +tab + + +``` + +```{r} + +plot(gX, gX, pch=19, cex=1, ylab="predicted value", xlab="true g(X)") + +points(gX, yhat.rlasso, col=2, pch=18, cex = 1.5 ) +points(gX, yhat.rlasso.post, col=3, pch=17, cex = 1.2 ) +points( gX, yhat.lasso.cv,col=4, pch=19, cex = 1.2 ) + + +legend("bottomright", + legend = c("rLasso", "Post-rLasso", "CV Lasso"), + col = c(2,3,4), + pch = c(18,17, 19), + bty = "n", + pt.cex = 1.3, + cex = 1.2, + text.col = "black", + horiz = F , + inset = c(0.1, 0.1)) + + +``` + +## Data Generating Process: Approximately Sparse + Small Dense Part + +```{r} +set.seed(1) + +n = 100; +p = 400; + +Z= runif(n)-1/2; +W = matrix(runif(n*p)-1/2, n, p); + + +beta = rnorm(p)*.2 # dense beta +gX = exp(4*Z)+ W%*%beta; # leading term nonlinear +X = cbind(Z, Z^2, Z^3, W ); # polynomials in Zs will be approximating exp(4*Z) + + +Y = gX + rnorm(n); #generate Y + + +plot(gX,Y, xlab="g(X)", ylab="Y") #plot V vs g(X) + +print( c("theoretical R2:", var(gX)/var(Y))) + +var(gX)/var(Y); #theoretical R-square in the simulation example + + + +``` + +```{r} + +library(glmnet) +fit.lasso.cv <- cv.glmnet(X, Y, family="gaussian", alpha=1) # family gaussian means that we'll be using square loss +fit.ridge <- cv.glmnet(X, Y, family="gaussian", alpha=0) # family gaussian means that we'll be using square loss +fit.elnet <- cv.glmnet(X, Y, family="gaussian", alpha=.5) # family gaussian means that we'll be using square loss + +yhat.lasso.cv <- predict(fit.lasso.cv, newx = X) # predictions +yhat.ridge <- predict(fit.ridge, newx = X) +yhat.elnet <- predict(fit.elnet, newx = X) + +MSE.lasso.cv <- summary(lm((gX-yhat.lasso.cv)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) +MSE.ridge <- summary(lm((gX-yhat.ridge)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) +MSE.elnet <- summary(lm((gX-yhat.elnet)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) + +``` + +```{r} +library(hdm) +fit.rlasso <- rlasso(Y~X, post=FALSE) # lasso with plug-in penalty level +fit.rlasso.post <- rlasso(Y~X, post=TRUE) # post-lasso with plug-in penalty level + +yhat.rlasso <- predict(fit.rlasso) #predict g(X) for values of X +yhat.rlasso.post <- predict(fit.rlasso.post) #predict g(X) for values of X + +MSE.lasso <- summary(lm((gX-yhat.rlasso)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) +MSE.lasso.post <- summary(lm((gX-yhat.rlasso.post)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) + + +``` + +```{r} +library(glmnet) + +lava.predict<- function(X,Y, iter=5){ + +g1 = predict(rlasso(X, Y, post=F)) #lasso step fits "sparse part" +m1 = predict(glmnet(X, as.vector(Y-g1), family="gaussian", alpha=0, lambda =20),newx=X ) #ridge step fits the "dense" part + + +i=1 +while(i<= iter) { +g1 = predict(rlasso(X, Y, post=F)) #lasso step fits "sparse part" +m1 = predict(glmnet(X, as.vector(Y-g1), family="gaussian", alpha=0, lambda =20),newx=X ); #ridge step fits the "dense" part +i = i+1 } + +return(g1+m1); + } + + +yhat.lava = lava.predict(X,Y) +MSE.lava <- summary(lm((gX-yhat.lava)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) + + +MSE.lava +``` + +```{r} +library(xtable) +table<- matrix(0, 6, 2) +table[1,1:2] <- MSE.lasso.cv 
+table[2,1:2] <- MSE.ridge +table[3,1:2] <- MSE.elnet +table[4,1:2] <- MSE.lasso +table[5,1:2] <- MSE.lasso.post +table[6,1:2] <- MSE.lava + +colnames(table)<- c("MSA", "S.E. for MSA") +rownames(table)<- c("Cross-Validated Lasso", "Cross-Validated ridge","Cross-Validated elnet", + "Lasso","Post-Lasso","Lava") +tab <- xtable(table, digits =3) +print(tab,type="latex") # set type="latex" for printing table in LaTeX +tab + +``` + +```{r} + +plot(gX, gX, pch=19, cex=1, ylab="predicted value", xlab="true g(X)") + +points(gX, yhat.rlasso, col=2, pch=18, cex = 1.5 ) +points(gX, yhat.elnet, col=3, pch=17, cex = 1.2 ) +points(gX, yhat.lava, col=4, pch=19, cex = 1.2 ) + + +legend("bottomright", + legend = c("rLasso", "Elnet", "Lava"), + col = c(2,3,4), + pch = c(18,17, 19), + bty = "n", + pt.cex = 1.3, + cex = 1.2, + text.col = "black", + horiz = F , + inset = c(0.1, 0.1)) + +``` From edc2401ec065f579a53ee72a26ec1e576e4719e2 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 05:49:03 -0700 Subject: [PATCH 238/261] Create automl-for-wage-prediction.irnb.Rmd --- PM3/automl-for-wage-prediction.irnb.Rmd | 126 ++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 PM3/automl-for-wage-prediction.irnb.Rmd diff --git a/PM3/automl-for-wage-prediction.irnb.Rmd b/PM3/automl-for-wage-prediction.irnb.Rmd new file mode 100644 index 00000000..1d9e01c7 --- /dev/null +++ b/PM3/automl-for-wage-prediction.irnb.Rmd @@ -0,0 +1,126 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + +This notebook contains an example for teaching. + + + +# Automatic Machine Learning with H2O AutoML + + +We illustrate how to predict an outcome variable Y in a high-dimensional setting, using the AutoML package *H2O* that covers the complete pipeline from the raw dataset to the deployable machine learning model. In last few years, AutoML or automated machine learning has become widely popular among data science community. Again, re-analyse the wage prediction problem using data from the U.S. March Supplement of the Current Population Survey (CPS) in 2015. + + +We can use AutoML as a benchmark and compare it to the methods that we used in the previous notebook where we applied one machine learning method after the other. + +```{r} +# load the H2O package +library(h2o) +``` + +```{r} +# load the data set +load("../input/wage2015-inference/wage2015_subsample_inference.Rdata") + +# split the data +set.seed(1234) +training <- sample(nrow(data), nrow(data)*(3/4), replace=FALSE) + +train <- data[training,] +test <- data[-training,] +``` + +```{r} +# start h2o cluster +h2o.init() +``` + +```{r} +# convert data as h2o type +train_h = as.h2o(train) +test_h = as.h2o(test) + +# have a look at the data +h2o.describe(train_h) +``` + +```{r} +# define the variables +y = 'lwage' +x = setdiff(names(data), c('wage','occ2', 'ind2')) + +# run AutoML for 10 base models and a maximal runtime of 100 seconds +aml = h2o.automl(x=x,y = y, + training_frame = train_h, + leaderboard_frame = test_h, + max_models = 10, + seed = 1, + max_runtime_secs = 100 + ) +# AutoML Leaderboard +lb = aml@leaderboard +print(lb, n = nrow(lb)) +``` + +We see that two Stacked Ensembles are at the top of the leaderboard. Stacked Ensembles often outperform a single model. 
The out-of-sample (test) MSE of the leading model is given by + +```{r} +aml@leaderboard$mse[1] +``` + +The in-sample performance can be evaluated by + +```{r} +aml@leader +``` + +This is in line with our previous results. To understand how the ensemble works, let's take a peek inside the Stacked Ensemble "All Models" model. The "All Models" ensemble is an ensemble of all of the individual models in the AutoML run. This is often the top performing model on the leaderboard. + +```{r} +model_ids <- as.data.frame(aml@leaderboard$model_id)[,1] +# Get the "All Models" Stacked Ensemble model +se <- h2o.getModel(grep("StackedEnsemble_AllModels", model_ids, value = TRUE)[1]) +# Get the Stacked Ensemble metalearner model +metalearner <- se@model$metalearner_model +h2o.varimp(metalearner) +``` + +The table above gives us the variable importance of the metalearner in the ensemble. The AutoML Stacked Ensembles use the default metalearner algorithm (GLM with non-negative weights), so the variable importance of the metalearner is actually the standardized coefficient magnitudes of the GLM. + + +```{r} +h2o.varimp_plot(metalearner) +``` + +## Generating Predictions Using Leader Model + +We can also generate predictions on a test sample using the leader model object. + +```{r} +pred <- as.matrix(h2o.predict(aml@leader,test_h)) # make prediction using x data from the test sample +head(pred) +``` + +This allows us to estimate the out-of-sample (test) MSE and the standard error as well. + +```{r} +y_test <- as.matrix(test_h$lwage) +summary(lm((y_test-pred)^2~1))$coef[1:2] +``` + +We observe both a lower MSE and a lower standard error compared to our previous results (see [here](https://www.kaggle.com/janniskueck/pm3-notebook-newdata)). + +```{r} +h2o.shutdown(prompt = F) +``` From 774266cc8931cb4b81afb3d8ff80f55058f0d623 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 05:49:44 -0700 Subject: [PATCH 239/261] old --- ...ural-networks-for-wage-prediction.irnb.Rmd | 139 ++++++++++++ ...so-for-the-convergence-hypothesis.irnb.Rmd | 178 +++++++++++++++ ...-analysis-of-401-k-example-w-dags.irnb.Rmd | 213 ++++++++++++++++++ 3 files changed, 530 insertions(+) create mode 100644 PM4/Old/deep-neural-networks-for-wage-prediction.irnb.Rmd create mode 100644 PM4/Old/double-lasso-for-the-convergence-hypothesis.irnb.Rmd create mode 100644 PM4/Old/identification-analysis-of-401-k-example-w-dags.irnb.Rmd diff --git a/PM4/Old/deep-neural-networks-for-wage-prediction.irnb.Rmd b/PM4/Old/deep-neural-networks-for-wage-prediction.irnb.Rmd new file mode 100644 index 00000000..54c64d1b --- /dev/null +++ b/PM4/Old/deep-neural-networks-for-wage-prediction.irnb.Rmd @@ -0,0 +1,139 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + + + + +This notebook contains an example for teaching. + + + + +# Deep Neural Networks for Wage Prediction + + +So far we have considered many machine learning methods such as Lasso and Random Forests for building a predictive model. In this lab, we extend our toolbox by returning to our wage prediction problem and showing how a neural network can be used for prediction. + + +## Data preparation + + +Again, we consider data from the U.S. March Supplement of the Current Population Survey (CPS) in 2015. 
+ +```{r} +load("../input/wage2015-inference/wage2015_subsample_inference.Rdata") +Z <- subset(data,select=-c(lwage,wage)) # regressors +``` + +First, we split the data first and normalize it. + +```{r} +# split the data into training and testing sets +set.seed(1234) +training <- sample(nrow(data), nrow(data)*(3/4), replace=FALSE) + +data_train <- data[training,1:16] +data_test <- data[-training,1:16] + +# data_train <- data[training,] +# data_test <- data[-training,] +# X_basic <- "sex + exp1 + exp2+ shs + hsg+ scl + clg + mw + so + we + occ2+ ind2" +# formula_basic <- as.formula(paste("lwage", "~", X_basic)) +# model_X_basic_train <- model.matrix(formula_basic,data_train)[,-1] +# model_X_basic_test <- model.matrix(formula_basic,data_test)[,-1] +# data_train <- as.data.frame(cbind(data_train$lwage,model_X_basic_train)) +# data_test <- as.data.frame(cbind(data_test$lwage,model_X_basic_test)) +# colnames(data_train)[1]<-'lwage' +# colnames(data_test)[1]<-'lwage' +``` + +```{r} +# normalize the data +mean <- apply(data_train, 2, mean) +std <- apply(data_train, 2, sd) +data_train <- scale(data_train, center = mean, scale = std) +data_test <- scale(data_test, center = mean, scale = std) +data_train <- as.data.frame(data_train) +data_test <- as.data.frame(data_test) +``` + +Then, we construct the inputs for our network. + +```{r} +X_basic <- "sex + exp1 + shs + hsg+ scl + clg + mw + so + we" +formula_basic <- as.formula(paste("lwage", "~", X_basic)) +model_X_basic_train <- model.matrix(formula_basic,data_train) +model_X_basic_test <- model.matrix(formula_basic,data_test) + +Y_train <- data_train$lwage +Y_test <- data_test$lwage +``` + +## Neural Networks + + +First, we need to determine the structure of our network. We are using the R package *keras* to build a simple sequential neural network with three dense layers and the ReLU activation function. + +```{r} +library(keras) + +build_model <- function() { + model <- keras_model_sequential() %>% + layer_dense(units = 20, activation = "relu", # ReLU activation function + input_shape = dim(model_X_basic_train)[2])%>% + layer_dense(units = 10, activation = "relu") %>% + layer_dense(units = 1) + + model %>% compile( + optimizer = optimizer_adam(lr = 0.005), # Adam optimizer + loss = "mse", + metrics = c("mae") + ) +} +``` + +Let us have a look at the structure of our network in detail. + +```{r} +model <- build_model() +summary(model) +``` + +We have $441$ trainable parameters in total. + + +Now, let us train the network. Note that this takes substantial computation time. To speed up the computation time, we use GPU as an accelerator. The extent of computational time improvements varies based on a number of factors, including model architecture, batch-size, input pipeline complexity, etc. + +```{r} +# training the network +num_epochs <- 1000 +model %>% fit(model_X_basic_train, Y_train, + epochs = num_epochs, batch_size = 100, verbose = 0) +``` + +After training the neural network, we can evaluate the performance of our model on the test sample. 
+ +```{r} +# evaluating performance +model %>% evaluate(model_X_basic_test, Y_test, verbose = 0) +``` + +```{r} +# calculating the performance measures +pred.nn <- model %>% predict(model_X_basic_test) +MSE.nn = summary(lm((Y_test-pred.nn)^2~1))$coef[1:2] +R2.nn <- 1-MSE.nn[1]/var(Y_test) +# printing R^2 +cat("R^2 of the neural network:",R2.nn) +``` diff --git a/PM4/Old/double-lasso-for-the-convergence-hypothesis.irnb.Rmd b/PM4/Old/double-lasso-for-the-convergence-hypothesis.irnb.Rmd new file mode 100644 index 00000000..9629895f --- /dev/null +++ b/PM4/Old/double-lasso-for-the-convergence-hypothesis.irnb.Rmd @@ -0,0 +1,178 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + +This notebook contains an example for teaching. + + +# Double Lasso for Testing the Convergence Hypothesis + + +## Introduction + + +We provide an additional empirical example of partialling-out with Lasso to estimate the regression coefficient $\beta_1$ in the high-dimensional linear regression model: + $$ + Y = \beta_1 D + \beta_2'W + \epsilon. + $$ + +Specifically, we are interested in how the rates at which economies of different countries grow ($Y$) are related to the initial wealth levels in each country ($D$) controlling for country's institutional, educational, and other similar characteristics ($W$). + +The relationship is captured by $\beta_1$, the *speed of convergence/divergence*, which measures the speed at which poor countries catch up $(\beta_1< 0)$ or fall behind $(\beta_1> 0)$ rich countries, after controlling for $W$. Our inference question here is: do poor countries grow faster than rich countries, controlling for educational and other characteristics? In other words, is the speed of convergence negative: $ \beta_1 <0?$ This is the Convergence Hypothesis predicted by the Solow Growth Model. This is a structural economic model. Under some strong assumptions that we won't state here, the predictive exercise we are doing here can be given a causal interpretation. + + + +The outcome $Y$ is the realized annual growth rate of a country's wealth (Gross Domestic Product per capita). The target regressor ($D$) is the initial level of the country's wealth. The target parameter $\beta_1$ is the speed of convergence, which measures the speed at which poor countries catch up with rich countries. The controls ($W$) include measures of education levels, quality of institutions, trade openness, and political stability in the country. + + +## Data analysis + + + +We consider the data set GrowthData which is included in the package *hdm*. First, let us load the data set to get familiar with the data. + +```{r} +library(hdm) # package of ``high dimensional models (hdm)" estimators +growth <- GrowthData +attach(growth) +names(growth) +``` + +We determine the dimensions of our data set. + +```{r} +dim(growth) +``` + +The sample contains $90$ countries and $63$ controls. Thus $p \approx 60$, $n=90$ and $p/n$ is not small. We expect the least squares method to provide a poor estimate of $\beta_1$. We expect the method based on partialling-out with Lasso to provide a high quality estimate of $\beta_1$. + + +To check this hypothesis, we analyze the relationship between the country's growth rate $Y$ and the country's other characteristics by running a linear regression in the first step. 
+ +```{r} +reg.ols <- lm(Outcome~.-1,data=growth) +``` + +We determine the regression coefficient $\beta_1$ of the target regressor *gdpsh465* (initial wealth level, $D$), its 95% confidence interval and the standard error. + +```{r} +est_ols <- summary(reg.ols)$coef["gdpsh465",1] +# output: estimated regression coefficient corresponding to the target regressor + +std_ols <- summary(reg.ols)$coef["gdpsh465",2] +# output: std. error + +ci_ols <- confint(reg.ols)[2,] +# output: 95% confidence interval + +results_ols <- as.data.frame(cbind(est_ols,std_ols,ci_ols[1],ci_ols[2])) +colnames(results_ols) <-c("estimator","standard error", "lower bound CI", "upper bound CI") +rownames(results_ols) <-c("OLS") +``` + +```{r} +library(xtable) +table <- matrix(0, 1, 4) +table[1,1:4] <- c(est_ols,std_ols,ci_ols[1],ci_ols[2]) +colnames(table) <-c("estimator","standard error", "lower bound CI", "upper bound CI") +rownames(table) <-c("OLS") +tab<- xtable(table, digits = 3) +print(tab,type="html") # set type="latex" for printing table in LaTeX +``` + + + + + + +
|     | estimator | standard error | lower bound CI | upper bound CI |
|-----|-----------|----------------|----------------|----------------|
| OLS | -0.009    | 0.030          | -0.071         | 0.052          |
+ + +As expected, least squares provides a rather noisy estimate of the speed of convergence, and does not allow us to answer the question about the convergence hypothesis as the confidence interval includes zero. + + +In contrast, we can use the partialling-out approach based on lasso regression ("Double Lasso"). + +```{r} +Y <- growth[, 1, drop = F] # output variable +W <- as.matrix(growth)[, -c(1, 2,3)] # controls +D <- growth[, 3, drop = F] # target regressor +r.Y <- rlasso(x=W,y=Y)$res # creates the "residual" output variable +r.D <- rlasso(x=W,y=D)$res # creates the "residual" target regressor +partial.lasso <- lm(r.Y ~ r.D) +est_lasso <- partial.lasso$coef[2] +std_lasso <- summary(partial.lasso)$coef[2,2] +ci_lasso <- confint(partial.lasso)[2,] + +library(xtable) +table <- matrix(0, 1, 4) +table[1,1:4] <- c(est_lasso,std_lasso,ci_lasso[1],ci_lasso[2]) +colnames(table) <-c("estimator","standard error", "lower bound CI", "upper bound CI") +rownames(table) <-c("Double Lasso") +tab<- xtable(table, digits = 3) +print(tab,type="html") # set type="latex" for printing table in LaTeX +``` + + + + + + +
|              | estimator | standard error | lower bound CI | upper bound CI |
|--------------|-----------|----------------|----------------|----------------|
| Double Lasso | -0.050    | 0.014          | -0.078         | -0.022         |
+ + + +Lasso provides a more precise estimate (lower standard error). The Lasso based +point estimate is about $5\%$ and the $95\%$ confidence interval for the +(annual) rate of convergence is $7.8\%$ to $2.2\%$. This empirical +evidence does support the convergence hypothesis. + + +Note: Alternatively, one could also use the *rlassoEffect* funtion from the *hdm* package that directly applies the partialling-out approach. + +```{r} +lasso.effect = rlassoEffect(x = W, y = Y, d = D, method = "partialling out") +lasso.effect +``` + +## Summary + + + +Finally, let us have a look at the results. + +```{r} +library(xtable) +table <- matrix(0, 2, 4) +table[1,1:4] <- c(est_ols,std_ols,ci_ols[1],ci_ols[2]) +table[2,1:4] <- c(est_lasso,std_lasso,ci_lasso[1],ci_lasso[2]) +colnames(table) <-c("estimator","standard error", "lower bound CI", "upper bound CI") +rownames(table) <-c("OLS","Double Lasso") +tab<- xtable(table, digits = 3) +print(tab,type="html") # set type="latex" for printing table in LaTeX +``` + +The least square method provides a rather noisy estimate of the speed of convergence. We cannot answer the question of whether poor countries grow faster than rich countries. The least square method does not work when the ratio $p/n$ is large. + +In sharp contrast, partialling-out via Lasso provides a more precise estimate. The Lasso-based point estimate is $-5\%$ and the $95\%$ confidence interval for the (annual) rate of convergence $[-7.8\%,-2.2\%]$ only includes negative numbers. This empirical evidence does support the convergence hypothesis. + + + + + + + + + +
|              | estimator | standard error | lower bound CI | upper bound CI |
|--------------|-----------|----------------|----------------|----------------|
| OLS          | -0.009    | 0.030          | -0.071         | 0.052          |
| Double Lasso | -0.050    | 0.014          | -0.078         | -0.022         |
+ diff --git a/PM4/Old/identification-analysis-of-401-k-example-w-dags.irnb.Rmd b/PM4/Old/identification-analysis-of-401-k-example-w-dags.irnb.Rmd new file mode 100644 index 00000000..44af9751 --- /dev/null +++ b/PM4/Old/identification-analysis-of-401-k-example-w-dags.irnb.Rmd @@ -0,0 +1,213 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + +# Using Dagitty in the Analysis of Impact of 401(k) on Net Financial Wealth + + +```{r _uuid="051d70d956493feee0c6d64651c6a088724dca2a", _execution_state="idle"} +#install and load package +install.packages("dagitty") +install.packages("ggdag") +library(dagitty) +library(ggdag) + +``` + +# Graphs for 401(K) Analsyis + + + +# Here we have + # * $Y$ -- net financial assets; + # * $X$ -- worker characteristics (income, family size, other retirement plans; see lecture notes for details); + # * $F$ -- latent (unobserved) firm characteristics + # * $D$ -- 401(K) eligibility, deterimined by $F$ and $X$ + + +# State one graph (where F determines X) and plot it + + +```{r} +#generate a DAGs and plot them + +G1 = dagitty('dag{ +Y [outcome,pos="4, 0"] +D [exposure,pos="0, 0"] +X [confounder, pos="2,-2"] +F [uobserved, pos="0, -1"] +D -> Y +X -> D +F -> X +F -> D +X -> Y}') + + +ggdag(G1)+ theme_dag() +``` + +# List minimal adjustment sets to identify causal effecs $D \to Y$ + + + +```{r} +adjustmentSets( G1, "D", "Y",effect="total" ) +``` + +# What is the underlying principle? + +Here condition on X blocks backdoor paths from Y to D (Pearl). Dagitty correctly finds X (and does many more correct decisions, when we consider more elaborate structures. Why do we want to consider more elaborate structures? The very empirical problem requires us to do so! + + +# Another Graph (wherere $X$ determine $F$): + +```{r} +#generate a couple of DAGs and plot them + +G2 = dagitty('dag{ +Y [outcome,pos="4, 0"] +D [exposure,pos="0, 0"] +X [confounder, pos="2,-2"] +F [uobserved, pos="0, -1"] +D -> Y +X -> D +X -> F +F -> D +X -> Y}') + + +ggdag(G2)+ theme_dag() +``` + +```{r} +adjustmentSets( G2, "D", "Y", effect="total" ) + +``` + +# One more graph (encompassing previous ones), where (F, X) are jointly determined by latent factors $A$. We can allow in fact the whole triple (D, F, X) to be jointly determined by latent factors $A$. + + +This is much more realistic graph to consider. + +```{r} +G3 = dagitty('dag{ +Y [outcome,pos="4, 0"] +D [exposure,pos="0, 0"] +X [confounder, pos="2,-2"] +F [unobserved, pos="0, -1"] +A [unobserved, pos="-1, -1"] +D -> Y +X -> D +F -> D +A -> F +A -> X +A -> D +X -> Y}') + +adjustmentSets( G3, "D", "Y", effect="total" ) + +ggdag(G3)+ theme_dag() +``` + +# Threat to Idenitification: What if $F$ also directly affects $Y$? (Note that there are no valid adjustment sets in this case) + +```{r} +G4 = dagitty('dag{ +Y [outcome,pos="4, 0"] +D [exposure,pos="0, 0"] +X [confounder, pos="2,-2"] +F [unobserved, pos="0, -1"] +A [unobserved, pos="-1, -1"] +D -> Y +X -> D +F -> D +A -> F +A -> X +A -> D +F -> Y +X -> Y}') + + +ggdag(G4)+ theme_dag() +``` + +```{r} +adjustmentSets( G4, "D", "Y",effect="total" ) + +``` + +Note that no output means that there is no valid adustment set (among observed variables) + + +# How can F affect Y directly? Is it reasonable? + + +# Introduce Match Amount $M$ (very important mediator, why mediator?). $M$ is not observed. 
Luckily adjusting for $X$ still works if there is no $F \to M$ arrow. + +```{r} +G5 = dagitty('dag{ +Y [outcome,pos="4, 0"] +D [exposure,pos="0, 0"] +X [confounder, pos="2,-2"] +F [unobserved, pos="0, -1"] +A [unobserved, pos="-1, -1"] +M [unobserved, pos="2, -.5"] +D -> Y +X -> D +F -> D +A -> F +A -> X +A -> D +D -> M +M -> Y +X -> M +X -> Y}') + +print( adjustmentSets( G5, "D", "Y",effect="total" ) ) + +ggdag(G5)+ theme_dag() +``` + +# If there is $F \to M$ arrow, then adjusting for $X$ is not sufficient. + +```{r} +G6 = dagitty('dag{ +Y [outcome,pos="4, 0"] +D [exposure,pos="0, 0"] +X [confounder, pos="2,-2"] +F [unobserved, pos="0, -1"] +A [unobserved, pos="-1, -1"] +M [uobserved, pos="2, -.5"] +D -> Y +X -> D +F -> D +A -> F +A -> X +D -> M +F -> M +A -> D +M -> Y +X -> M +X -> Y}') + +print( adjustmentSets( G6, "D", "Y" ),effect="total" ) + +ggdag(G6)+ theme_dag() +``` + + # Question: + +Given the analysis above, do you find the adjustment for workers' characteristics a credible strategy to identify the causal (total effect) of 401 (k) elligibility on net financial wealth? + + * If yes, click an "upvote" button at the top + * If no, please click an "upvote" button at the top From 3bdbc74fed5f058c72967c616a6b418fed68da75 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 05:53:00 -0700 Subject: [PATCH 240/261] old --- ...or-partially-linear-iv-model-in-r.irnb.Rmd | 264 +++++++ ...r-ate-and-late-of-401-k-on-wealth.irnb.Rmd | 677 ++++++++++++++++++ .../dml-inference-for-gun-ownership.irnb.Rmd | 377 ++++++++++ ...erence-using-nn-for-gun-ownership.irnb.Rmd | 172 +++++ T/deprecated/r-weak-iv-experiments.irnb.Rmd | 92 +++ 5 files changed, 1582 insertions(+) create mode 100644 T/deprecated/debiased-ml-for-partially-linear-iv-model-in-r.irnb.Rmd create mode 100644 T/deprecated/dml-for-ate-and-late-of-401-k-on-wealth.irnb.Rmd create mode 100644 T/deprecated/dml-inference-for-gun-ownership.irnb.Rmd create mode 100644 T/deprecated/dml-inference-using-nn-for-gun-ownership.irnb.Rmd create mode 100644 T/deprecated/r-weak-iv-experiments.irnb.Rmd diff --git a/T/deprecated/debiased-ml-for-partially-linear-iv-model-in-r.irnb.Rmd b/T/deprecated/debiased-ml-for-partially-linear-iv-model-in-r.irnb.Rmd new file mode 100644 index 00000000..7d506973 --- /dev/null +++ b/T/deprecated/debiased-ml-for-partially-linear-iv-model-in-r.irnb.Rmd @@ -0,0 +1,264 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + + +# Double/Debiased ML for Partially Linear IV Model + +References: + +https://arxiv.org/abs/1608.00060 + + +https://www.amazon.com/Business-Data-Science-Combining-Accelerate/dp/1260452778 + +The code is based on the book. + + + + + +# Partially Linear IV Model + +We consider the partially linear structural equation model: +\begin{eqnarray} + & Y - D\theta_0 = g_0(X) + \zeta, & E[\zeta \mid Z,X]= 0,\\ + & Z = m_0(X) + V, & E[V \mid X] = 0. +\end{eqnarray} + + +Note that this model is not a regression model unless $Z=D$. The model is a canonical +model in causal inference, going back to P. Wright's work on IV methods for estimaing demand/supply equations, with the modern difference being that $g_0$ and $m_0$ are nonlinear, potentially complicated functions of high-dimensional $X$. 
+ + +The idea of this model is that there is a structural or causal relation between $Y$ and $D$, captured by $\theta_0$, and $g_0(X) + \zeta$ is the stochastic error, partly explained by covariates $X$. $V$ and $\zeta$ are stochastic errors that are not explained by $X$. Since $Y$ and $D$ are jointly determined, we need an external factor, commonly referred to as an instrument, $Z$ to create exogenous variation +in $D$. Note that $Z$ should affect $D$. The $X$ here serve again as confounding factors, so we can think of variation in $Z$ as being exogenous only conditional on $X$. + + +The causal DAG this model corresponds to is given by: +$$ +Z \to D, X \to (Y, Z, D), L \to (Y,D), +$$ +where $L$ is the latent confounder affecting both $Y$ and $D$, but not $Z$. + + + +--- + +# Example + +A simple contextual example is from biostatistics, where $Y$ is a health outcome and $D$ is indicator of smoking. Thus, $\theta_0$ is captures the effect of smoking on health. Health outcome $Y$ and smoking behavior $D$ are treated as being jointly determined. $X$ represents patient characteristics, and $Z$ could be a doctor's advice not to smoke (or another behavioral treatment) that may affect the outcome $Y$ only through shifting the behavior $D$, conditional on characteristics $X$. + +---- + + + +# PLIVM in Residualized Form + +The PLIV model above can be rewritten in the following residualized form: +$$ + \tilde Y = \tilde D \theta_0 + \zeta, \quad E[\zeta \mid V,X]= 0, +$$ +where +$$ + \tilde Y = (Y- \ell_0(X)), \quad \ell_0(X) = E[Y \mid X] \\ + \tilde D = (D - r_0(X)), \quad r_0(X) = E[D \mid X] \\ + \tilde Z = (Z- m_0(X)), \quad m_0(X) = E[Z \mid X]. +$$ + The tilded variables above represent original variables after taking out or "partialling out" + the effect of $X$. Note that $\theta_0$ is identified from this equation if $V$ + and $U$ have non-zero correlation, which automatically means that $U$ and $V$ + must have non-zero variation. + + + +----- + +# DML for PLIV Model + +Given identification, DML proceeds as follows + +Compute the estimates $\hat \ell_0$, $\hat r_0$, and $\hat m_0$ , which amounts +to solving the three problems of predicting $Y$, $D$, and $Z$ using +$X$, using any generic ML method, giving us estimated residuals +$$ +\tilde Y = Y - \hat \ell_0(X), \\ \tilde D= D - \hat r_0(X), \\ \tilde Z = Z- \hat m_0(X). +$$ +The estimates should be of a cross-validated form, as detailed in the algorithm below. + +Estimate $\theta_0$ by the the intstrumental +variable regression of $\tilde Y$ on $\tilde D$ using $\tilde Z$ as an instrument. +Use the conventional inference for the IV regression estimator, ignoring +the estimation error in these residuals. + +The reason we work with this residualized form is that it eliminates the bias +arising when solving the prediction problem in stage 1. The role of cross-validation +is to avoid another source of bias due to potential overfitting. + +The estimator is adaptive, +in the sense that the first stage estimation errors do not affect the second +stage errors. 
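In this just-identified case the final-stage estimator has a simple closed form: it is the ratio of the sample products of the residualized instrument with the residualized outcome and treatment,

$$
\hat \theta_0 = \frac{\frac{1}{n}\sum_{i=1}^n \tilde Z_i \tilde Y_i}{\frac{1}{n}\sum_{i=1}^n \tilde Z_i \tilde D_i},
$$

and the conventional standard errors from the IV regression of $\tilde Y$ on $\tilde D$ with instrument $\tilde Z$ remain valid, because the orthogonality of the residualized moment makes the first-stage estimation error negligible.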
+ + + +```{r _kg_hide-output=TRUE} +install.packages("hdm") +install.packages("AER") +install.packages("randomForest") +``` + +```{r} + +library(AER) #applied econometrics library +library(randomForest) #random Forest library +library(hdm) #high-dimensional econometrics library +library(glmnet) #glm net + + +# DML for PLIVM + +DML2.for.PLIVM <- function(x, d, z, y, dreg, yreg, zreg, nfold=2) { + # this implements DML2 algorithm, where there moments are estimated via DML, before constructing + # the pooled estimate of theta randomly split data into folds + nobs <- nrow(x) + foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] + I <- split(1:nobs, foldid) + # create residualized objects to fill + ytil <- dtil <- ztil<- rep(NA, nobs) + # obtain cross-fitted residuals + cat("fold: ") + for(b in 1:length(I)){ + dfit <- dreg(x[-I[[b]],], d[-I[[b]]]) #take a fold out + zfit <- zreg(x[-I[[b]],], z[-I[[b]]]) #take a fold out + yfit <- yreg(x[-I[[b]],], y[-I[[b]]]) # take a folot out + dhat <- predict(dfit, x[I[[b]],], type="response") #predict the fold out + zhat <- predict(zfit, x[I[[b]],], type="response") #predict the fold out + yhat <- predict(yfit, x[I[[b]],], type="response") #predict the fold out + dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual + ztil[I[[b]]] <- (z[I[[b]]] - zhat) #record residual + ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial + cat(b," ") + } + ivfit= tsls(y=ytil,d=dtil, x=NULL, z=ztil, intercept=FALSE) + coef.est <- ivfit$coef #extract coefficient + se <- ivfit$se #record standard error + cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) + return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil, ztil=ztil) ) +} + + +``` + + +----- + +# Emprical Example: Acemoglu, Jonsohn, Robinson (AER). + + +* Y is log GDP; +* D is a measure of Protection from Expropriation, a proxy for +quality of insitutions; +* Z is the log of Settler's mortality; +* W are geographical variables (latitude, latitude squared, continent dummies as well as interactions) + + + + +```{r} + +data(AJR); + +y = AJR$GDP; +d = AJR$Exprop; +z = AJR$logMort +xraw= model.matrix(~ Latitude+ Africa+Asia + Namer + Samer, data=AJR) +x = model.matrix(~ -1 + (Latitude + Latitude2 + Africa + + Asia + Namer + Samer)^2, data=AJR) +dim(x) + +# DML with Random Forest +cat(sprintf("\n DML with Random Forest \n")) + +dreg <- function(x,d){ randomForest(x, d) } #ML method=Forest +yreg <- function(x,y){ randomForest(x, y) } #ML method=Forest +zreg<- function(x,z){ randomForest(x, z)} #ML method=Forest + +set.seed(1) +DML2.RF = DML2.for.PLIVM(xraw, d, z, y, dreg, yreg, zreg, nfold=20) + +# DML with PostLasso +cat(sprintf("\n DML with Post-Lasso \n")) + +dreg <- function(x,d){ rlasso(x, d) } #ML method=lasso +yreg <- function(x,y){ rlasso(x, y) } #ML method=lasso +zreg<- function(x,z){ rlasso(x, z)} #ML method=lasso + +set.seed(1) +DML2.lasso = DML2.for.PLIVM(x, d, z, y, dreg, yreg, zreg, nfold=20) + + +# Compare Forest vs Lasso + +comp.tab= matrix(NA, 3, 2) +comp.tab[1,] = c( sqrt(mean((DML2.RF$ytil)^2)), sqrt(mean((DML2.lasso$ytil)^2)) ) +comp.tab[2,] = c( sqrt(mean((DML2.RF$dtil)^2)), sqrt(mean((DML2.lasso$dtil)^2)) ) +comp.tab[3,] = c( sqrt(mean((DML2.RF$ztil)^2)), sqrt(mean((DML2.lasso$ztil)^2)) ) +rownames(comp.tab) = c("RMSE for Y:", "RMSE for D:", "RMSE for Z:") +colnames(comp.tab) = c("RF", "LASSO") +print(comp.tab, digits=3) +``` + +# Examine if we have weak instruments + +```{r} +install.packages("lfe") +library(lfe) +summary(felm(DML2.lasso$dtil~DML2.lasso$ztil), robust=T) 
+summary(felm(DML2.RF$dtil~DML2.RF$ztil), robust=T) +``` + +# We do have weak instruments, because t-stats in regression $\tilde D \sim \tilde Z$ are less than 4 in absolute value + + +So let's carry out DML inference combined with Anderson-Rubin Idea + +```{r} +# DML-AR (DML with Anderson-Rubin) + +DML.AR.PLIV<- function(rY, rD, rZ, grid, alpha=.05){ + n = length(rY) + Cstat = rep(0, length(grid)) + for (i in 1:length(grid)) { + Cstat[i] <- n* (mean( (rY - grid[i]*rD)*rZ) )^2/var ( (rY - grid[i]*rD) * rZ ) + }; + LB<- min(grid[ Cstat < qchisq(1-alpha,1)]); + UB <- max(grid[ Cstat < qchisq(1-alpha,1)]); + plot(range(grid),range(c( Cstat)) , type="n",xlab="Effect of institutions", ylab="Statistic", main=" "); + lines(grid, Cstat, lty = 1, col = 1); + abline(h=qchisq(1-alpha,1), lty = 3, col = 4); + abline(v=LB, lty = 3, col = 2); + abline(v=UB, lty = 3, col = 2); + return(list(UB=UB, LB=LB)) + } + +``` + +```{r} +DML.AR.PLIV(rY = DML2.lasso$ytil, rD= DML2.lasso$dtil, rZ= DML2.lasso$ztil, + grid = seq(-2, 2, by =.01)) + + +DML.AR.PLIV(rY = DML2.RF$ytil, rD= DML2.RF$dtil, rZ= DML2.RF$ztil, + grid = seq(-2, 2, by =.01)) + +``` diff --git a/T/deprecated/dml-for-ate-and-late-of-401-k-on-wealth.irnb.Rmd b/T/deprecated/dml-for-ate-and-late-of-401-k-on-wealth.irnb.Rmd new file mode 100644 index 00000000..f5daee27 --- /dev/null +++ b/T/deprecated/dml-for-ate-and-late-of-401-k-on-wealth.irnb.Rmd @@ -0,0 +1,677 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + +# Inference on Predictive and Causal Effects in High-Dimensional Nonlinear Models + + +## Impact of 401(k) on Financial Wealth + +As a practical illustration of the methods developed in this lecture, we consider estimation of the effect of 401(k) eligibility and participation +on accumulated assets. 401(k) plans are pension accounts sponsored by employers. The key problem in determining the effect of participation in 401(k) plans on accumulated assets is saver heterogeneity coupled with the fact that the decision to enroll in a 401(k) is non-random. It is generally recognized that some people have a higher preference for saving than others. It also seems likely that those individuals with high unobserved preference for saving would be most likely to choose to participate in tax-advantaged retirement savings plans and would tend to have otherwise high amounts of accumulated assets. The presence of unobserved savings preferences with these properties then implies that conventional estimates that do not account for saver heterogeneity and endogeneity of participation will be biased upward, tending to overstate the savings effects of 401(k) participation. + +One can argue that eligibility for enrolling in a 401(k) plan in this data can be taken as exogenous after conditioning on a few observables of which the most important for their argument is income. The basic idea is that, at least around the time 401(k)’s initially became available, people were unlikely to be basing their employment decisions on whether an employer offered a 401(k) but would instead focus on income and other aspects of the job. 
+ + +### Data + +The data set can be loaded from the `hdm` package for R by typing + + + +```{r} +library(hdm) +library(ggplot2) +data(pension) +data <- pension +dim(data) +``` + +See the "Details" section on the description of the data set, which can be accessed by + + +```{r} +help(pension) +``` + +The data consist of 9,915 observations at the household level drawn from the 1991 Survey of Income and Program Participation (SIPP). All the variables are referred to 1990. We use net financial assets (*net\_tfa*) as the outcome variable, $Y$, in our analysis. The net financial assets are computed as the sum of IRA balances, 401(k) balances, checking accounts, saving bonds, other interest-earning accounts, other interest-earning assets, stocks, and mutual funds less non mortgage debts. + + +Among the $9915$ individuals, $3682$ are eligible to participate in the program. The variable *e401* indicates eligibility and *p401* indicates participation, respectively. + +```{r} +hist_e401 = ggplot(data, aes(x = e401, fill = factor(e401))) + geom_bar() +hist_e401 +``` + +Eligibility is highly associated with financial wealth: + +```{r} +dens_net_tfa = ggplot(data, aes(x = net_tfa, color = factor(e401), fill = factor(e401)) ) + + geom_density() + xlim(c(-20000, 150000)) + + facet_wrap(.~e401) + +dens_net_tfa +``` + +The unconditional APE of e401 is about $19559$: + +```{r} +e1 <- data[data$e401==1,] +e0 <- data[data$e401==0,] +round(mean(e1$net_tfa)-mean(e0$net_tfa),0) +``` + +Among the $3682$ individuals that are eligible, $2594$ decided to participate in the program. The unconditional APE of p401 is about $27372$: + +```{r} +p1 <- data[data$p401==1,] +p0 <- data[data$p401==0,] +round(mean(p1$net_tfa)-mean(p0$net_tfa),0) +``` + +As discussed, these estimates are biased since they do not account for saver heterogeneity and endogeneity of participation. + + +## Double ML package + + +We are interested in valid estimators of the average treatment effect of `e401` and `p401` on `net_tfa`. To get those estimators, we use the `DoubleML` package that internally builds on mlr3. You find additional information on the package on the package website https://docs.doubleml.org/ and the R documentation page https://docs.doubleml.org/r/stable/. + +```{r} +# installing Double ML +remotes::install_github("DoubleML/doubleml-for-r",quiet=TRUE) + + +# loading the packages +library(DoubleML) +library(mlr3learners) +library(mlr3) +library(data.table) +library(randomForest) + +``` + +As mentioned, in the tutorial we use the meta package `mlr3` to generate predictions with machine learning methods. A comprehensive introduction and description of the `mlr3` package is provided in the [mlr3book](https://mlr3book.mlr-org.com/). A list of all learners that you can use in `mlr3` can be found [here](https://mlr3extralearners.mlr-org.com/articles/learners/list_learners.html). The entry in the columns *mlr3 Package* and *Packages* indicate which packages must be installed/loaded in your R session. + + +## Estimating the ATE of 401(k) Eligibility on Net Financial Assets + + +We first look at the treatment effect of e401 on net total financial assets. 
We give estimates of the ATE and ATT that corresponds to the linear model + +\begin{equation*} +Y = D \alpha + f(X)'\beta+ \epsilon, +\end{equation*} + +where $f(X)$ includes indicators of marital status, two-earner status, defined benefit pension status, IRA participation status, and home ownership status, and orthogonal polynomials of degrees 2, 4, 6 and 8 in family size, education, age and income, respectively. The dimensions of $f(X)$ is 25. + +In the first step, we report estimates of the average treatment effect (ATE) of 401(k) eligibility on net financial assets both in the partially linear regression (PLR) model and in the interactive regression model (IRM) allowing for heterogeneous treatment effects. + + +```{r} +# Constructing the data (as DoubleMLData) +formula_flex = "net_tfa ~ e401 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + marr + twoearn + db + pira + hown" +model_flex = as.data.table(model.frame(formula_flex, pension)) +x_cols = colnames(model_flex)[-c(1,2)] +data_ml = DoubleMLData$new(model_flex, y_col = "net_tfa", d_cols = "e401", x_cols=x_cols) + + +p <- dim(model_flex)[2]-2 +p + +# complex model with two-way interactions +#data_interactions = fetch_401k(polynomial_features = TRUE, instrument = FALSE) + +``` + +## Partially Linear Regression Models (PLR) + + +We start using lasso to estimate the function $g_0$ and $m_0$ in the following PLR model: + + +\begin{eqnarray} + & Y = D\theta_0 + g_0(X) + \zeta, & E[\zeta \mid D,X]= 0,\\ + & D = m_0(X) + V, & E[V \mid X] = 0. +\end{eqnarray} + +```{r} +# Estimating the PLR +lgr::get_logger("mlr3")$set_threshold("warn") +set.seed(123) +lasso <- lrn("regr.cv_glmnet",nfolds = 5, s = "lambda.min") +lasso_class <- lrn("classif.cv_glmnet", nfolds = 5, s = "lambda.min") + +dml_plr <- DoubleMLPLR$new(data_ml, ml_g = lasso, ml_m = lasso_class, n_folds=3) +dml_plr$fit(store_predictions=TRUE) +dml_plr$summary() +lasso_plr <- dml_plr$coef +lasso_std_plr <- dml_plr$se +``` + +Let us check the predictive performance of this model. + +```{r} +dml_plr$params_names() +g_hat <- as.matrix(dml_plr$predictions$ml_g) # predictions of g_o +m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o +``` + +```{r} +# cross-fitted RMSE: outcome +y <- as.matrix(pension$net_tfa) # true observations +theta <- as.numeric(dml_plr$coef) # estimated regression coefficient +d <- as.matrix(pension$e401) +predictions_y <- as.matrix(d*theta)+g_hat # predictions for y +lasso_y_rmse <- sqrt(mean((y-predictions_y)^2)) +lasso_y_rmse +``` + +```{r} +# cross-fitted RMSE: treatment +d <- as.matrix(pension$e401) +lasso_d_rmse <- sqrt(mean((d-m_hat)^2)) +lasso_d_rmse + +# cross-fitted ce: treatment +mean(ifelse(m_hat > 0.5, 1, 0) != d) +``` + +Then, we repeat this procedure for various machine learning methods. + +```{r} +# Random Forest +lgr::get_logger("mlr3")$set_threshold("warn") +randomForest <- lrn("regr.ranger") +randomForest_class <- lrn("classif.ranger") + +dml_plr <- DoubleMLPLR$new(data_ml, ml_g = randomForest, ml_m = randomForest_class, n_folds=3) +dml_plr$fit(store_predictions=TRUE) # set store_predictions=TRUE to evaluate the model +dml_plr$summary() +forest_plr <- dml_plr$coef +forest_std_plr <- dml_plr$se +``` + +We can compare the accuracy of this model to the model that has been estimated with lasso. 
+ +```{r} +# Evaluation predictions +g_hat <- as.matrix(dml_plr$predictions$ml_g) # predictions of g_o +m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o +theta <- as.numeric(dml_plr$coef) # estimated regression coefficient +predictions_y <- as.matrix(d*theta)+g_hat # predictions for y +forest_y_rmse <- sqrt(mean((y-predictions_y)^2)) +forest_y_rmse + +# cross-fitted RMSE: treatment +forest_d_rmse <- sqrt(mean((d-m_hat)^2)) +forest_d_rmse + +# cross-fitted ce: treatment +mean(ifelse(m_hat > 0.5, 1, 0) != d) +``` + +```{r} +# Trees +lgr::get_logger("mlr3")$set_threshold("warn") + +trees <- lrn("regr.rpart") +trees_class <- lrn("classif.rpart") + +dml_plr <- DoubleMLPLR$new(data_ml, ml_g = trees, ml_m = trees_class, n_folds=3) +dml_plr$fit(store_predictions=TRUE) +dml_plr$summary() +tree_plr <- dml_plr$coef +tree_std_plr <- dml_plr$se + +# Evaluation predictions +g_hat <- as.matrix(dml_plr$predictions$ml_g) # predictions of g_o +m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o +theta <- as.numeric(dml_plr$coef) # estimated regression coefficient +predictions_y <- as.matrix(d*theta)+g_hat # predictions for y +tree_y_rmse <- sqrt(mean((y-predictions_y)^2)) +tree_y_rmse + +# cross-fitted RMSE: treatment +tree_d_rmse <- sqrt(mean((d-m_hat)^2)) +tree_d_rmse + +# cross-fitted ce: treatment +mean(ifelse(m_hat > 0.5, 1, 0) != d) +``` + +```{r} +# Boosting +lgr::get_logger("mlr3")$set_threshold("warn") +boost<- lrn("regr.xgboost",objective="reg:squarederror") +boost_class <- lrn("classif.xgboost",objective = "binary:logistic",eval_metric ="logloss") + +dml_plr <- DoubleMLPLR$new(data_ml, ml_g = boost, ml_m = boost_class, n_folds=3) +dml_plr$fit(store_predictions=TRUE) +dml_plr$summary() +boost_plr <- dml_plr$coef +boost_std_plr <- dml_plr$se + +# Evaluation predictions +g_hat <- as.matrix(dml_plr$predictions$ml_g) # predictions of g_o +m_hat <- as.matrix(dml_plr$predictions$ml_m) # predictions of m_o +theta <- as.numeric(dml_plr$coef) # estimated regression coefficient +predictions_y <- as.matrix(d*theta)+g_hat # predictions for y +boost_y_rmse <- sqrt(mean((y-predictions_y)^2)) +boost_y_rmse + +# cross-fitted RMSE: treatment +boost_d_rmse <- sqrt(mean((d-m_hat)^2)) +boost_d_rmse + +# cross-fitted ce: treatment +mean(ifelse(m_hat > 0.5, 1, 0) != d) +``` + +Let's sum up the results: + +```{r} +library(xtable) +table <- matrix(0, 4, 4) +table[1,1:4] <- c(lasso_plr,forest_plr,tree_plr,boost_plr) +table[2,1:4] <- c(lasso_std_plr,forest_std_plr,tree_std_plr,boost_std_plr) +table[3,1:4] <- c(lasso_y_rmse,forest_y_rmse,tree_y_rmse,boost_y_rmse) +table[4,1:4] <- c(lasso_d_rmse,forest_d_rmse,tree_d_rmse,boost_d_rmse) +rownames(table) <- c("Estimate","Std.Error","RMSE Y","RMSE D") +colnames(table) <- c("Lasso","Random Forest","Trees","Boosting") +tab<- xtable(table, digits = 2) +tab +``` + +The best model with lowest RMSE in both equation is the PLR model estimated via lasso. It gives the following estimate: + +```{r} +lasso_plr +``` + +## Interactive Regression Model (IRM) + + +Next, we consider estimation of average treatment effects when treatment effects are fully heterogeneous: + + + \begin{eqnarray}\label{eq: HetPL1} + & Y = g_0(D, X) + U, & \quad E[U \mid X, D]= 0,\\ + & D = m_0(X) + V, & \quad E[V\mid X] = 0. +\end{eqnarray} + + +To reduce the disproportionate impact of extreme propensity score weights in the interactive model +we trim the propensity scores which are close to the bounds. 
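As a point of reference (this summarizes the standard construction of the estimator rather than anything specific to this dataset), the ATE in the interactive model is obtained by averaging the doubly robust (AIPW) score built from the cross-fitted nuisance estimates $\hat g(d,X) \approx E[Y \mid D=d, X]$ and $\hat m(X) \approx P(D=1 \mid X)$:

$$
\hat\theta = \frac{1}{n}\sum_{i=1}^n \left[ \hat g(1,X_i) - \hat g(0,X_i) + \frac{D_i\,(Y_i - \hat g(1,X_i))}{\hat m(X_i)} - \frac{(1-D_i)\,(Y_i - \hat g(0,X_i))}{1-\hat m(X_i)} \right].
$$

Because $\hat m(X_i)$ appears in the denominators, propensity scores very close to $0$ or $1$ would let individual score terms explode, which is why the trimming threshold is applied in the code below.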
+ +```{r} +set.seed(123) +lgr::get_logger("mlr3")$set_threshold("warn") +dml_irm = DoubleMLIRM$new(data_ml, ml_g = lasso, + ml_m = lasso_class, + trimming_threshold = 0.01, n_folds=3) +dml_irm$fit(store_predictions=TRUE) +dml_irm$summary() +lasso_irm <- dml_irm$coef +lasso_std_irm <- dml_irm$se + + +# predictions +dml_irm$params_names() +g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) +g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) +g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0 +m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o + +``` + +```{r} +# cross-fitted RMSE: outcome +y <- as.matrix(pension$net_tfa) # true observations +d <- as.matrix(pension$e401) +lasso_y_irm <- sqrt(mean((y-g_hat)^2)) +lasso_y_irm + +# cross-fitted RMSE: treatment +lasso_d_irm <- sqrt(mean((d-m_hat)^2)) +lasso_d_irm + +# cross-fitted ce: treatment +mean(ifelse(m_hat > 0.5, 1, 0) != d) +``` + +```{r} +##### forest ##### + +dml_irm = DoubleMLIRM$new(data_ml, ml_g = randomForest, + ml_m = randomForest_class, + trimming_threshold = 0.01, n_folds=3) +dml_irm$fit(store_predictions=TRUE) +dml_irm$summary() +forest_irm <- dml_irm$coef +forest_std_irm <- dml_plr$se + +# predictions +g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) +g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) +g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0 +m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o + +# cross-fitted RMSE: outcome +y <- as.matrix(pension$net_tfa) # true observations +d <- as.matrix(pension$e401) +forest_y_irm <- sqrt(mean((y-g_hat)^2)) +forest_y_irm + +# cross-fitted RMSE: treatment +forest_d_irm <- sqrt(mean((d-m_hat)^2)) +forest_d_irm + +# cross-fitted ce: treatment +mean(ifelse(m_hat > 0.5, 1, 0) != d) + +##### trees ##### + +dml_irm <- DoubleMLIRM$new(data_ml, ml_g = trees, ml_m = trees_class, + trimming_threshold = 0.01, n_folds=3) +dml_irm$fit(store_predictions=TRUE) +dml_irm$summary() +tree_irm <- dml_irm$coef +tree_std_irm <- dml_irm$se + +# predictions +g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) +g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) +g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0 +m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o + +# cross-fitted RMSE: outcome +y <- as.matrix(pension$net_tfa) # true observations +d <- as.matrix(pension$e401) +tree_y_irm <- sqrt(mean((y-g_hat)^2)) +tree_y_irm + +# cross-fitted RMSE: treatment +tree_d_irm <- sqrt(mean((d-m_hat)^2)) +tree_d_irm + +# cross-fitted ce: treatment +mean(ifelse(m_hat > 0.5, 1, 0) != d) + + +##### boosting ##### + +dml_irm <- DoubleMLIRM$new(data_ml, ml_g = boost, ml_m = boost_class, + trimming_threshold = 0.01, n_folds=3) +dml_irm$fit(store_predictions=TRUE) +dml_irm$summary() +boost_irm <- dml_irm$coef +boost_std_irm <- dml_irm$se + +# predictions +g0_hat <- as.matrix(dml_irm$predictions$ml_g0) # predictions of g_0(D=0, X) +g1_hat <- as.matrix(dml_irm$predictions$ml_g1) # predictions of g_0(D=1, X) +g_hat <- d*g1_hat+(1-d)*g0_hat # predictions of g_0 +m_hat <- as.matrix(dml_irm$predictions$ml_m) # predictions of m_o + +# cross-fitted RMSE: outcome +y <- as.matrix(pension$net_tfa) # true observations +d <- as.matrix(pension$e401) +boost_y_irm <- sqrt(mean((y-g_hat)^2)) +boost_y_irm + +# cross-fitted RMSE: treatment +boost_d_irm <- sqrt(mean((d-m_hat)^2)) +boost_d_irm + +# cross-fitted ce: treatment +mean(ifelse(m_hat 
> 0.5, 1, 0) != d) +``` + +```{r} +library(xtable) +table <- matrix(0, 4, 4) +table[1,1:4] <- c(lasso_irm,forest_irm,tree_irm,boost_irm) +table[2,1:4] <- c(lasso_std_irm,forest_std_irm,tree_std_irm,boost_std_irm) +table[3,1:4] <- c(lasso_y_irm,forest_y_irm,tree_y_irm,boost_y_irm) +table[4,1:4] <- c(lasso_d_irm,forest_d_irm,tree_d_irm,boost_d_irm) +rownames(table) <- c("Estimate","Std.Error","RMSE Y","RMSE D") +colnames(table) <- c("Lasso","Random Forest","Trees","Boosting") +tab<- xtable(table, digits = 2) +tab +``` + +Here, Random Forest gives the best prediction rule for $g_0$ and Lasso the best prediction rule for $m_0$, respectively. Let us fit the IRM model using the best ML method for each equation to get a final estimate for the treatment effect of eligibility. + +```{r} +set.seed(123) +lgr::get_logger("mlr3")$set_threshold("warn") +dml_irm = DoubleMLIRM$new(data_ml, ml_g = randomForest, + ml_m = lasso_class, + trimming_threshold = 0.01, n_folds=3) +dml_irm$fit(store_predictions=TRUE) +dml_irm$summary() +best_irm <- dml_irm$coef +best_std_irm <- dml_irm$se +``` + +These estimates that flexibly account for confounding are +substantially attenuated relative to the baseline estimate (*19559*) that does not account for confounding. They suggest much smaller causal effects of 401(k) eligiblity on financial asset holdings. + + +## Local Average Treatment Effects of 401(k) Participation on Net Financial Assets + + +## Interactive IV Model (IIVM) + + +Now, we consider estimation of local average treatment effects (LATE) of participation with the binary instrument `e401`. As before, $Y$ denotes the outcome `net_tfa`, and $X$ is the vector of covariates. Here the structural equation model is: + +\begin{eqnarray} +& Y = g_0(Z,X) + U, &\quad E[U\mid Z,X] = 0,\\ +& D = r_0(Z,X) + V, &\quad E[V\mid Z, X] = 0,\\ +& Z = m_0(X) + \zeta, &\quad E[\zeta \mid X] = 0. 
+\end{eqnarray} + +```{r} +# Constructing the data (as DoubleMLData) +formula_flex2 = "net_tfa ~ p401+ e401 + poly(age, 6, raw=TRUE) + poly(inc, 8, raw=TRUE) + poly(educ, 4, raw=TRUE) + poly(fsize, 2, raw=TRUE) + marr + twoearn + db + pira + hown" +model_flex2 = as.data.table(model.frame(formula_flex2, data)) +x_cols = colnames(model_flex2)[-c(1,2,3)] +data_IV = DoubleMLData$new(model_flex2, y_col = "net_tfa", d_cols = "p401", z_cols ="e401",x_cols=x_cols) +``` + +```{r} +set.seed(123) +lgr::get_logger("mlr3")$set_threshold("warn") +dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = lasso, + ml_m = lasso_class, ml_r = lasso_class,n_folds=3, subgroups = list(always_takers = FALSE, + never_takers = TRUE)) +dml_MLIIVM$fit(store_predictions=TRUE) +dml_MLIIVM$summary() +lasso_MLIIVM <- dml_MLIIVM$coef +lasso_std_MLIIVM <- dml_MLIIVM$se +``` + +The confidence interval for the local average treatment effect of participation is given by + +```{r} +dml_MLIIVM$confint(level = 0.95) +``` + +Here we can also check the accuracy of the model: + +```{r} +# variables +y <- as.matrix(pension$net_tfa) # true observations +d <- as.matrix(pension$p401) +z <- as.matrix(pension$e401) + +# predictions +dml_MLIIVM$params_names() +g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(z=0, X) +g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(z=1, X) +g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0 +r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(z=0, X) +r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(z=1, X) +r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0 +m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o +``` + +```{r} +# cross-fitted RMSE: outcome +lasso_y_MLIIVM <- sqrt(mean((y-g_hat)^2)) +lasso_y_MLIIVM + +# cross-fitted RMSE: treatment +lasso_d_MLIIVM <- sqrt(mean((d-r_hat)^2)) +lasso_d_MLIIVM + +# cross-fitted RMSE: instrument +lasso_z_MLIIVM <- sqrt(mean((z-m_hat)^2)) +lasso_z_MLIIVM + +``` + +Again, we repeat the procedure for the other machine learning methods: + +```{r} +### random forest ### + +set.seed(123) +lgr::get_logger("mlr3")$set_threshold("warn") +dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = randomForest, + ml_m = randomForest_class, ml_r = randomForest_class,n_folds=3, subgroups = list(always_takers = FALSE, + never_takers = TRUE)) +dml_MLIIVM$fit(store_predictions=TRUE) +dml_MLIIVM$summary() +forest_MLIIVM <- dml_MLIIVM$coef +forest_std_MLIIVM <- dml_MLIIVM$se + +# predictions +g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(Z=0, X) +g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(Z=1, X) +g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0 +r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(Z=0, X) +r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(Z=1, X) +r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0 +m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o + +# cross-fitted RMSE: outcome +forest_y_MLIIVM <- sqrt(mean((y-g_hat)^2)) +forest_y_MLIIVM + +# cross-fitted RMSE: treatment +forest_d_MLIIVM <- sqrt(mean((d-r_hat)^2)) +forest_d_MLIIVM + +# cross-fitted RMSE: instrument +forest_z_MLIIVM <- sqrt(mean((z-m_hat)^2)) +forest_z_MLIIVM + +### trees ### + +dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = trees, + ml_m = trees_class, ml_r = trees_class,n_folds=3, subgroups = list(always_takers = FALSE, + never_takers = TRUE)) +dml_MLIIVM$fit(store_predictions=TRUE) 
+dml_MLIIVM$summary() +tree_MLIIVM <- dml_MLIIVM$coef +tree_std_MLIIVM <- dml_MLIIVM$se + +# predictions +g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(Z=0, X) +g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(Z=1, X) +g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0 +r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(Z=0, X) +r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(Z=1, X) +r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0 +m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o + +# cross-fitted RMSE: outcome +tree_y_MLIIVM <- sqrt(mean((y-g_hat)^2)) +tree_y_MLIIVM + +# cross-fitted RMSE: treatment +tree_d_MLIIVM <- sqrt(mean((d-r_hat)^2)) +tree_d_MLIIVM + +# cross-fitted RMSE: instrument +tree_z_MLIIVM <- sqrt(mean((z-m_hat)^2)) +tree_z_MLIIVM + + +### boosting ### +dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = boost, + ml_m = boost_class, ml_r = boost_class,n_folds=3, subgroups = list(always_takers = FALSE, + never_takers = TRUE)) +dml_MLIIVM$fit(store_predictions=TRUE) +dml_MLIIVM$summary() +boost_MLIIVM <- dml_MLIIVM$coef +boost_std_MLIIVM <- dml_MLIIVM$se + +# predictions +g0_hat <- as.matrix(dml_MLIIVM$predictions$ml_g0) # predictions of g_0(Z=0, X) +g1_hat <- as.matrix(dml_MLIIVM$predictions$ml_g1) # predictions of g_0(Z=1, X) +g_hat <- z*g1_hat+(1-z)*g0_hat # predictions of g_0 +r0_hat <- as.matrix(dml_MLIIVM$predictions$ml_r0) # predictions of r_0(Z=0, X) +r1_hat <- as.matrix(dml_MLIIVM$predictions$ml_r1) # predictions of r_0(Z=1, X) +r_hat <- z*r1_hat+(1-z)*r0_hat # predictions of r_0 +m_hat <- as.matrix(dml_MLIIVM$predictions$ml_m) # predictions of m_o + +# cross-fitted RMSE: outcome +boost_y_MLIIVM <- sqrt(mean((y-g_hat)^2)) +boost_y_MLIIVM + +# cross-fitted RMSE: treatment +boost_d_MLIIVM <- sqrt(mean((d-r_hat)^2)) +boost_d_MLIIVM + +# cross-fitted RMSE: instrument +boost_z_MLIIVM <- sqrt(mean((z-m_hat)^2)) +boost_z_MLIIVM +``` + +```{r} +library(xtable) +table <- matrix(0, 5, 4) +table[1,1:4] <- c(lasso_MLIIVM,forest_MLIIVM,tree_MLIIVM,boost_MLIIVM) +table[2,1:4] <- c(lasso_std_MLIIVM,forest_std_MLIIVM,tree_std_MLIIVM,boost_std_MLIIVM) +table[3,1:4] <- c(lasso_y_MLIIVM,forest_y_MLIIVM,tree_y_MLIIVM,boost_y_MLIIVM) +table[4,1:4] <- c(lasso_d_MLIIVM,forest_d_MLIIVM,tree_d_MLIIVM,boost_d_MLIIVM) +table[5,1:4] <- c(lasso_z_MLIIVM,forest_z_MLIIVM,tree_z_MLIIVM,boost_z_MLIIVM) +rownames(table) <- c("Estimate","Std.Error","RMSE Y","RMSE D","RMSE Z") +colnames(table) <- c("Lasso","Random Forest","Trees","Boosting") +tab<- xtable(table, digits = 2) +tab +``` + +We report results based on four ML methods for estimating the nuisance functions used in +forming the orthogonal estimating equations. We find again that the estimates of the treatment effect are stable across ML methods. The estimates are highly significant, hence we would reject the hypothesis +that the effect of 401(k) participation has no effect on financial health. 
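For interpretation, recall that under the IIVM assumptions (conditional exogeneity of the instrument given $X$ and monotonicity of participation in eligibility), the estimand is the local average treatment effect for compliers, identified by the ratio of the covariate-adjusted reduced form to the covariate-adjusted first stage,

$$
\theta_0 = \frac{E[g_0(1,X) - g_0(0,X)]}{E[r_0(1,X) - r_0(0,X)]},
$$

so the estimates above measure the effect of 401(k) participation on net financial assets for those households that participate if and only if they are eligible.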
+ + +We might rerun the model using the best ML method for each equation to get a final estimate for the treatment effect of participation: + +```{r} +set.seed(123) +lgr::get_logger("mlr3")$set_threshold("warn") +dml_MLIIVM = DoubleMLIIVM$new(data_IV, ml_g = randomForest, + ml_m = lasso_class, ml_r = lasso_class,n_folds=3, subgroups = list(always_takers = FALSE, + never_takers = TRUE)) +dml_MLIIVM$fit(store_predictions=TRUE) +dml_MLIIVM$summary() +best_MLIIVM <- dml_MLIIVM$coef +best_std_MLIIVM <- dml_MLIIVM$se +``` diff --git a/T/deprecated/dml-inference-for-gun-ownership.irnb.Rmd b/T/deprecated/dml-inference-for-gun-ownership.irnb.Rmd new file mode 100644 index 00000000..67145e54 --- /dev/null +++ b/T/deprecated/dml-inference-for-gun-ownership.irnb.Rmd @@ -0,0 +1,377 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + +This notebook contains an example for teaching. + + +# A Case Study: The Effect of Gun Ownership on Gun-Homicide Rates + + +We consider the problem of estimating the effect of gun +ownership on the homicide rate. For this purpose, we estimate the following partially +linear model + +$$ + Y_{j,t} = \beta D_{j,(t-1)} + g(Z_{j,t}) + \epsilon_{j,t}. +$$ + + +## Data + + +$Y_{j,t}$ is the log homicide rate in county $j$ at time $t$, $D_{j, t-1}$ is the log fraction of suicides committed with a firearm in county $j$ at time $t-1$, which we use as a proxy for gun ownership, and $Z_{j,t}$ is a set of demographic and economic characteristics of county $j$ at time $t$. The parameter $\beta$ is the effect of gun ownership on homicide rates, controlling for county-level demographic and economic characteristics. + +The sample covers 195 large United States counties between the years 1980 through 1999, giving us 3900 observations. + +```{r} +data <- read.csv("../input/gun-example/gun_clean.csv") +dim(data)[1] +``` + +### Preprocessing + + +To account for heterogeneity across counties and time trends in all variables, we remove from them county-specific and time-specific effects in the following preprocessing. 
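Concretely, each variable $V_{j,t}$ used below (the outcome, the treatment proxy, and every control) is replaced by the residual from a regression on a full set of county and year dummies,

$$
\tilde V_{j,t} = V_{j,t} - \hat\gamma_j - \hat\delta_t,
$$

so that the subsequent analysis relies only on within-county, within-year variation.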
+ +```{r} +##################### Find Variable Names from Dataset ###################### + +varlist <- function (df=NULL,type=c("numeric","factor","character"), pattern="", exclude=NULL) { + vars <- character(0) + if (any(type %in% "numeric")) { + vars <- c(vars,names(df)[sapply(df,is.numeric)]) + } + if (any(type %in% "factor")) { + vars <- c(vars,names(df)[sapply(df,is.factor)]) + } + if (any(type %in% "character")) { + vars <- c(vars,names(df)[sapply(df,is.character)]) + } + vars[(!vars %in% exclude) & grepl(vars,pattern=pattern)] +} + +############################# Create Variables ############################## + +# dummy variables for year and county fixed effects +fixed <- grep("X_Jfips", names(data), value=TRUE, fixed=TRUE) +year <- varlist(data, pattern="X_Tyear") + +# census control variables +census <- NULL +census_var <- c("^AGE", "^BN", "^BP", "^BZ", "^ED", "^EL","^HI", "^HS", "^INC", "^LF", "^LN", "^PI", "^PO", "^PP", "^PV", "^SPR", "^VS") + +for(i in 1:length(census_var)){ + census <- append(census, varlist(data, pattern=census_var[i])) +} + +################################ Variables ################################## +# treatment variable +d <- "logfssl" + +# outcome variable +y <- "logghomr" + +# other control variables +X1 <- c("logrobr", "logburg", "burg_missing", "robrate_missing") +X2 <- c("newblack", "newfhh", "newmove", "newdens", "newmal") + +######################## Partial out Fixed Effects ########################## + +# new dataset for partialled-out variables +rdata <- as.data.frame(data$CountyCode) +colnames(rdata) <- "CountyCode" + +# variables to partial out +varlist <- c(y, d,X1, X2, census) + +# partial out year and county fixed effect from variables in varlist +for(i in 1:length(varlist)){ + form <- as.formula(paste(varlist[i], "~", paste(paste(year,collapse="+"), paste(fixed,collapse="+"), sep="+"))) + rdata[, varlist[i]] <- lm(form, data)$residuals +} +``` + +Now, we can construct the treatment variable, the outcome variable and the matrix $Z$ that includes the control variables. + +```{r} +# treatment variable +D <- rdata[which(colnames(rdata) == d)] + +# outcome variable +Y <- rdata[which(colnames(rdata) == y)] + +# construct matrix Z +Z <- rdata[which(colnames(rdata) %in% c(X1,X2,census))] +dim(Z) +``` + +We have 195 control variables in total. The control variables $Z_{j,t}$ are from the U.S. Census Bureau and contain demographic and economic characteristics of the counties such as the age distribution, the income distribution, crime rates, federal spending, home ownership rates, house prices, educational attainment, voting paterns, employment statistics, and migration rates. + +```{r} +clu <- rdata[which(colnames(rdata) == "CountyCode")] # for clustering the standard errors +data <- data.frame(cbind(Y, D, Z,as.matrix(clu))) +``` + +```{r} +library(lfe) # linear group fixed effects package +``` + +## The effect of gun ownership + + +### OLS + + +After preprocessing the data, as a baseline model, we first look at simple regression of $Y_{j,t}$ on $D_{j,t-1}$ without controls. + +```{r} +# baseline_formula <- as.formula(paste(y, "~", d )) +# baseline.ols <- lm(baseline_formula,data=rdata) + +baseline.ols <- felm(logghomr ~ logfssl |0|0| CountyCode,data=data) # ols with clustered standard errors +est_baseline <- summary(baseline.ols)$coef[2,] +confint(baseline.ols)[2,] +est_baseline +``` + +The point estimate is $0.282$ with the confidence interval ranging from 0.155 to 0.41. 
This
+suggests that increases in gun ownership rates are related to gun homicide rates: if gun ownership increases by 1% relative
+to a trend, then the predicted gun homicide rate goes up by 0.28%, without controlling for counties' characteristics.
+
+Since our goal is to estimate the effect of gun ownership after controlling for a rich set of county characteristics, we next include the controls. First, we estimate the model by OLS and then by an array of modern regression methods using the double machine learning approach.
+
+```{r}
+control_formula <- as.formula(paste("logghomr", "~", paste("logfssl",paste(colnames(Z),collapse="+"),
+                                                           sep="+"),"|0|0| CountyCode"))
+control.ols <- felm(control_formula,data=data) # fixed effects lm function
+est_ols <- summary(control.ols)$coef[2,]
+confint(control.ols)[2,]
+est_ols
+```
+
+After controlling for a rich set of characteristics, the point estimate of gun ownership reduces to $0.19$.
+
+
+# DML algorithm
+
+Here we perform inference on the predictive coefficient $\beta$ in our partially linear statistical model,
+
+$$
+Y = D\beta + g(Z) + \epsilon, \quad E (\epsilon | D, Z) = 0,
+$$
+
+using the **double machine learning** approach.
+
+For $\tilde Y = Y- E(Y|Z)$ and $\tilde D= D- E(D|Z)$, we can write
+$$
+\tilde Y = \beta \tilde D + \epsilon, \quad E (\epsilon |\tilde D) =0.
+$$
+
+Using cross-fitting, we employ modern regression methods
+to build estimators $\hat \ell(Z)$ and $\hat m(Z)$ of $\ell(Z):=E(Y|Z)$ and $m(Z):=E(D|Z)$ to obtain the estimates of the residualized quantities:
+
+$$
+\tilde Y_i = Y_i - \hat \ell (Z_i), \quad \tilde D_i = D_i - \hat m(Z_i), \quad \text{ for each } i = 1,\dots,n.
+$$
+
+Finally, using ordinary least squares of $\tilde Y_i$ on $\tilde D_i$, we obtain the
+estimate of $\beta$.
+
+
+The following algorithm consumes $Y, D, Z$, and a machine learning method for learning the residuals $\tilde Y$ and $\tilde D$, where the residuals are obtained by cross-validation (cross-fitting). Then, it prints the estimated coefficient $\beta$ and the corresponding standard error from the final OLS regression.
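+
+Concretely, because the final step is a simple regression of $\tilde Y_i$ on $\tilde D_i$ (with an intercept), the resulting estimate has the familiar closed form
+
+$$
+\hat \beta = \frac{\sum_{i=1}^n (\tilde D_i - \bar{\tilde D})(\tilde Y_i - \bar{\tilde Y})}{\sum_{i=1}^n (\tilde D_i - \bar{\tilde D})^2},
+$$
+
+and, in the implementation below, its standard error is clustered at the county level.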
+ +```{r} +DML2.for.PLM <- function(z, d, y, dreg, yreg, nfold=2, clu) { + nobs <- nrow(z) # number of observations + foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] # define folds indices + I <- split(1:nobs, foldid) # split observation indices into folds + ytil <- dtil <- rep(NA, nobs) + cat("fold: ") + for(b in 1:length(I)){ + dfit <- dreg(z[-I[[b]],], d[-I[[b]]]) # take a fold out + yfit <- yreg(z[-I[[b]],], y[-I[[b]]]) # take a fold out + dhat <- predict(dfit, z[I[[b]],], type="response") # predict the left-out fold + yhat <- predict(yfit, z[I[[b]],], type="response") # predict the left-out fold + dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold + ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold + cat(b," ") + } + #rfit <- lm(ytil ~ dtil) # estimate the main parameter by regressing one residual on the other + data <- data.frame(cbind(ytil, dtil, as.matrix(clu))) + rfit <- felm(ytil ~ dtil|0|0|CountyCode,data=data) + coef.est <- coef(rfit)[2] # extract coefficient + #HC <- vcovHC(rfit) + se <- summary(rfit,robust=T)$coefficients[2,2] # record robust standard error by county + cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) # print output + return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil, rfit=rfit) ) # save output and residuals +} +``` + +Now, we apply the Double Machine Learning (DML) approach with different machine learning methods. First, we load the relevant libraries. + +```{r} +library(hdm) +library(glmnet) +library(sandwich) +library(randomForest) +``` + +Let us, construct the input matrices. + +```{r} +y <- as.matrix(Y) +d <- as.matrix(D) +z <- as.matrix(Z) +clu <- rdata[which(colnames(rdata) == "CountyCode")] +head(data.frame(cbind(y,d,as.matrix(clu)))) +``` + +In the following, we apply the DML approach with the different versions of lasso. 
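+
+One practical detail about the cross-validated learners used below (our note, not from the original text): when a `cv.glmnet` fit is passed through the cross-fitting loop above, `predict` is called without an `s` argument and therefore uses the `lambda.1se` rule by default. If the CV-minimizing penalty is preferred, the learner can wrap the prediction explicitly; the following is an illustrative sketch compatible with the `dreg`/`yreg` interface.
+
+```{r}
+# Illustrative sketch: a lasso learner whose predictions use lambda.min instead of lambda.1se
+cv_lasso_min <- function(z, y) {
+  fit <- cv.glmnet(z, y, family = "gaussian", alpha = 1)
+  structure(list(fit = fit), class = "cv_lasso_min")
+}
+predict.cv_lasso_min <- function(object, newx, ...) {
+  predict(object$fit, newx = newx, s = "lambda.min")
+}
+# usage sketch: DML2.for.PLM(z, d, y, dreg = cv_lasso_min, yreg = cv_lasso_min, nfold = 10, clu)
+```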
+ + + +## Lasso + +```{r} +# DML with Lasso: +set.seed(123) +dreg <- function(z,d){ rlasso(z,d, post=FALSE) } # ML method= lasso from hdm +yreg <- function(z,y){ rlasso(z,y, post=FALSE) } # ML method = lasso from hdm +DML2.lasso = DML2.for.PLM(z, d, y, dreg, yreg, nfold=10,clu) +``` + +```{r} +# DML with Post-Lasso: +dreg <- function(z,d){ rlasso(z,d, post=T) } # ML method= lasso from hdm +yreg <- function(z,y){ rlasso(z,y, post=T) } # ML method = lasso from hdm +DML2.post = DML2.for.PLM(z, d, y, dreg, yreg, nfold=10, clu) +``` + +```{r} +# DML with cross-validated Lasso: +dreg <- function(z,d){ cv.glmnet(z,d,family="gaussian", alpha=1) } # ML method = lasso from glmnet +yreg <- function(z,y){ cv.glmnet(z,y,family="gaussian", alpha=1) } # ML method = lasso from glmnet +DML2.lasso.cv = DML2.for.PLM(z, d, y, dreg, yreg, nfold=10, clu) + +dreg <- function(z,d){ cv.glmnet(z,d,family="gaussian", alpha=0.5) } # ML method = elastic net from glmnet +yreg <- function(z,y){ cv.glmnet(z,y,family="gaussian", alpha=0.5) } # ML method = elastic net from glmnet +DML2.elnet = DML2.for.PLM(z, d, y, dreg, yreg, nfold=10, clu) + +dreg <- function(z,d){ cv.glmnet(z,d,family="gaussian", alpha=0) } # ML method = ridge from glmnet +yreg <- function(z,y){ cv.glmnet(z,y,family="gaussian", alpha=0) } # ML method = ridge from glmnet +DML2.ridge = DML2.for.PLM(z, d, y, dreg, yreg, nfold=10, clu) +``` + +Here we also compute DML with OLS used as the ML method + +```{r} +dreg <- function(z,d){ glmnet(z,d,family="gaussian", lambda=0) } # ML method = ols from glmnet +yreg <- function(z,y){ glmnet(z,y,family="gaussian", lambda=0) } # ML method = ols from glmnet +DML2.ols = DML2.for.PLM(z, d, y, dreg, yreg, nfold=10, clu) +``` + +Next, we also apply Random Forest for comparison purposes. + + +### Random Forest + + +```{r} +# DML with Random Forest: +dreg <- function(z,d){ randomForest(z, d) } # ML method = random forest +yreg <- function(z,y){ randomForest(z, y) } # ML method = random forest +set.seed(1) +DML2.RF = DML2.for.PLM(z, d, y, dreg, yreg, nfold=2, clu) # set folds to 2 to limit computation time +``` + +We conclude that the gun ownership rates are related to gun homicide rates - if gun ownership increases by 1% relative +to a trend then the predicted gun homicide rate goes up by about 0.20% controlling for counties' characteristics. + + +Finally, let's see which method is best. We compute RMSE for predicting D and Y, and see which +of the methods works better. + + +```{r} +mods<- list(DML2.ols, DML2.lasso, DML2.post, DML2.lasso.cv, DML2.ridge, DML2.elnet, DML2.RF) + +RMSE.mdl<- function(mdl) { +RMSEY <- sqrt(mean(mdl$ytil)^2) +RMSED <- sqrt(mean(mdl$dtil)^2) +return( list(RMSEY=RMSEY, RMSED=RMSED)) +} + +#RMSE.mdl(DML2.lasso) +#DML2.lasso$ytil + +Res<- lapply(mods, RMSE.mdl) + +prRes.Y<- c( Res[[1]]$RMSEY,Res[[2]]$RMSEY, Res[[3]]$RMSEY, Res[[4]]$RMSEY, Res[[5]]$RMSEY, Res[[6]]$RMSEY, Res[[7]]$RMSEY) +prRes.D<- c( Res[[1]]$RMSED,Res[[2]]$RMSED, Res[[3]]$RMSED, Res[[4]]$RMSED, Res[[5]]$RMSED, Res[[6]]$RMSED, Res[[7]]$RMSED) + +prRes<- rbind(prRes.Y, prRes.D); +rownames(prRes)<- c("RMSE D", "RMSE Y"); +colnames(prRes)<- c("OLS", "Lasso", "Post-Lasso", "CV Lasso", "CV Ridge", "CV Elnet", "RF") +print(prRes,digit=6) +``` + +It looks like the best method for predicting D is Lasso, and the best method for predicting Y is CV Ridge. 
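+
+As a small programmatic variant (our addition), the same comparison can be automated by computing the root mean squared cross-fitted residual for each fit and picking the minimizing learner for each equation:
+
+```{r}
+# Illustrative sketch: pick the learners with the smallest RMSE of the cross-fitted residuals
+learner_names <- c("OLS", "Lasso", "Post-Lasso", "CV Lasso", "CV Ridge", "CV Elnet", "RF")
+rmse_y <- sapply(mods, function(m) sqrt(mean(m$ytil^2)))
+rmse_d <- sapply(mods, function(m) sqrt(mean(m$dtil^2)))
+cat("best learner for Y:", learner_names[which.min(rmse_y)], "\n")
+cat("best learner for D:", learner_names[which.min(rmse_d)], "\n")
+```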
+ + +```{r} +dreg <- function(z,d){ rlasso(z,d, post=T) } # ML method = lasso from hdm +yreg <- function(z,y){ cv.glmnet(z,y,family="gaussian", alpha=0) } # ML method = ridge from glmnet +DML2.best= DML2.for.PLM(z, d, y, dreg, yreg, nfold=10, clu) +``` + +Let's organize the results in a table. + +```{r} +library(xtable) + +table <- matrix(0,9,2) +table[1,1] <- as.numeric(est_baseline[1]) +table[2,1] <- as.numeric(est_ols[1]) +table[3,1] <- as.numeric(DML2.lasso$coef.est) +table[4,1] <- as.numeric(DML2.post$coef.est) +table[5,1] <-as.numeric(DML2.lasso.cv$coef.est) +table[6,1] <-as.numeric(DML2.elnet$coef.est) +table[7,1] <-as.numeric(DML2.ridge$coef.est) +table[8,1] <-as.numeric(DML2.RF$coef.est) +table[9,1] <-as.numeric(DML2.best$coef.est) +table[1,2] <- as.numeric(est_baseline[2]) +table[2,2] <- as.numeric(est_ols[2]) +table[3,2] <- as.numeric(DML2.lasso$se) +table[4,2] <- as.numeric(DML2.post$se) +table[5,2] <-as.numeric(DML2.lasso.cv$se) +table[6,2] <-as.numeric(DML2.elnet$se) +table[7,2] <-as.numeric(DML2.ridge$se) +table[8,2] <-as.numeric(DML2.RF$se) +table[9,2] <-as.numeric(DML2.best$se) + +# print results +colnames(table) <- c("Estimate","Standard Error") +rownames(table) <- c("Baseline OLS", "Least Squares with controls", "Lasso", "Post-Lasso", "CV Lasso","CV Elnet", "CV Ridge", "Random Forest", + "Best") +table +``` + +```{r} +print(table, digit=3) +``` + +```{r} +tab<- xtable(table, digits=3) +print(tab, type="latex") +``` diff --git a/T/deprecated/dml-inference-using-nn-for-gun-ownership.irnb.Rmd b/T/deprecated/dml-inference-using-nn-for-gun-ownership.irnb.Rmd new file mode 100644 index 00000000..621999bf --- /dev/null +++ b/T/deprecated/dml-inference-using-nn-for-gun-ownership.irnb.Rmd @@ -0,0 +1,172 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + + +This notebook contains an example for teaching. + + +# The Effect of Gun Ownership on Gun-Homicide Rates using DML for neural nets + + +In this lab, we estimate the effect of gun ownership on the homicide rate using a neural network. + +```{r} +library(keras) +library(lfe) +``` + +First, we need to load and preprocess the data. 
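+
+One practical prerequisite (our note, not part of the original notebook): the R `keras` package needs a working TensorFlow backend. If one is not already configured, a one-time setup along the following lines is typically required; the exact steps depend on the local environment.
+
+```{r}
+# One-time setup sketch; skip if TensorFlow is already configured for keras
+# install.packages("keras")
+# keras::install_keras()  # installs TensorFlow into an isolated Python environment
+```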
+ +```{r} +# read in dataset +data <- read.csv("../input/gun-example/gun_clean.csv") + + +################## Find Variable Names from the Dataset ################### + +varlist <- function (df=NULL,type=c("numeric","factor","character"), pattern="", exclude=NULL) { + vars <- character(0) + if (any(type %in% "numeric")) { + vars <- c(vars,names(df)[sapply(df,is.numeric)]) + } + if (any(type %in% "factor")) { + vars <- c(vars,names(df)[sapply(df,is.factor)]) + } + if (any(type %in% "character")) { + vars <- c(vars,names(df)[sapply(df,is.character)]) + } + vars[(!vars %in% exclude) & grepl(vars,pattern=pattern)] +} + +########################### Create Variables ############################## + +# dummy variables for year and county fixed effects +fixed <- grep("X_Jfips", names(data), value=TRUE, fixed=TRUE) +year <- varlist(data, pattern="X_Tyear") + +# census control variables +census <- NULL +census_var <- c("^AGE", "^BN", "^BP", "^BZ", "^ED", "^EL","^HI", "^HS", "^INC", "^LF", "^LN", "^PI", "^PO", "^PP", "^PV", "^SPR", "^VS") + +for(i in 1:length(census_var)){ + census <- append(census, varlist(data, pattern=census_var[i])) +} + +############################### Variables ################################# + +# treatment variable +d <- "logfssl" + +# outcome variable +y <- "logghomr" + +# other control variables +X1 <- c("logrobr", "logburg", "burg_missing", "robrate_missing") +X2 <- c("newblack", "newfhh", "newmove", "newdens", "newmal") + +###################### Partial-out Fixed Effects ######################### + +# new dataset for partialled-out variables +rdata <- as.data.frame(data$CountyCode) +colnames(rdata) <- "CountyCode" + +# variables to partial-out +varlist <- c(y, d,X1, X2, census) + +# partial out year and county fixed effects from variables in varlist +for(i in 1:length(varlist)){ + form <- as.formula(paste(varlist[i], "~", paste(paste(year,collapse="+"), paste(fixed,collapse="+"), sep="+"))) + rdata[, varlist[i]] <- lm(form, data)$residuals +} +``` + +# DML for neural nets + + + +The following algorithm consumes $Y$,$D$ and $Z$, and learns the residuals $\tilde{Y}$ and $\tilde{D}$ via a neural network, where the residuals are obtained by cross-validation (cross-fitting). Then, it prints the estimated coefficient $\beta$ and the clustered standard error from the final OLS regression. 
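+
+For orientation (our summary, not from the original text), the model builder inside the next chunk stacks two hidden ReLU layers of 16 units on top of a linear output unit and minimizes mean squared error with the rmsprop optimizer. A standalone sketch of the same architecture, with the number of input columns passed explicitly, looks like this:
+
+```{r}
+# Illustrative sketch of the network used below (assumes keras is loaded and configured)
+build_model_sketch <- function(input_dim) {
+  model <- keras_model_sequential() %>%
+    layer_dense(units = 16, activation = "relu", input_shape = input_dim) %>%
+    layer_dense(units = 16, activation = "relu") %>%
+    layer_dense(units = 1)
+  model %>% compile(optimizer = "rmsprop", loss = "mse", metrics = c("mae"))
+  model
+}
+# e.g., build_model_sketch(ncol(z_nn)) once the input matrix z_nn has been constructed
+```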
+ +```{r} +DML2.for.NN <- function(z, d, y, nfold=2, clu, num_epochs, batch_size) { + nobs <- nrow(z) # number of observations + foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] # define folds indices + I <- split(1:nobs, foldid) # split observation indices into folds + ytil <- dtil <- rep(NA, nobs) + cat("fold: ") + for(b in 1:length(I)){ + # normalize the data + mean <- apply(z[-I[[b]],], 2, mean) + std <- apply(z[-I[[b]],], 2, sd) + z[-I[[b]],] <- scale(z[-I[[b]],], center = mean, scale = std) + z[I[[b]],] <- scale(z[I[[b]],], center = mean, scale = std) + # building the model with 3 layers, the ReLU activation function, mse loss and rmsprop optimizer + build_model <- function(){ + model <- keras_model_sequential() %>% + layer_dense(units = 16, activation = "relu", + input_shape = dim(z[-I[[b]],][2]))%>% + layer_dense(units = 16, activation = "relu") %>% + layer_dense(units = 1) + + model %>% compile( + optimizer = "rmsprop", + loss = "mse", + metrics = c("mae") + ) + } + model.Y <- build_model() + model.D <- build_model() + # fitting the model + model.D %>% fit(z[-I[[b]],], d[-I[[b]]], + epochs = num_epochs, batch_size = batch_size, verbose = 0) + model.Y %>% fit(z[-I[[b]],], y[-I[[b]]], + epochs = num_epochs, batch_size = batch_size, verbose = 0) + dhat <- model.D %>% predict(z[I[[b]],]) + yhat <- model.Y %>% predict(z[I[[b]],]) + dtil[I[[b]]] <- (d[I[[b]]] - dhat) # record residual for the left-out fold + ytil[I[[b]]] <- (y[I[[b]]] - yhat) # record residial for the left-out fold + cat(b," ") + } + #rfit <- lm(ytil ~ dtil) # estimate the main parameter by regressing one residual on the other + data <- data.frame(cbind(ytil, dtil, as.matrix(clu))) + rfit <- felm(ytil ~ dtil|0|0|CountyCode,data=data) + coef.est <- coef(rfit)[2] # extract the coefficient + #HC <- vcovHC(rfit) + se <- summary(rfit,robust=T)$coefficients[2,2] # record robust standard error by county + cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se)) # print the output + return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil, rfit=rfit) ) # save the output and residuals +} +``` + +# Estimating the effect with DML for neural nets + +```{r} +# treatment variable +D <- rdata[which(colnames(rdata) == d)] +# outcome variable +Y <- rdata[which(colnames(rdata) == y)] +# construct matrix Z +Z <- rdata[which(colnames(rdata) %in% c(X1,X2,census))] + +# inputs +y_nn <- as.matrix(Y) +d_nn <- as.matrix(D) +z_nn <- as.matrix(Z) +clu <- rdata[which(colnames(rdata) == "CountyCode")] +``` + +```{r} +# DML with a NN +set.seed(123) +DML2.nn = DML2.for.NN(z_nn, d_nn, y_nn, nfold=2, clu, 100, 10) +``` diff --git a/T/deprecated/r-weak-iv-experiments.irnb.Rmd b/T/deprecated/r-weak-iv-experiments.irnb.Rmd new file mode 100644 index 00000000..68758644 --- /dev/null +++ b/T/deprecated/r-weak-iv-experiments.irnb.Rmd @@ -0,0 +1,92 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + +# A Simple Example of Properties of IV estimator when Instruments are Weak + + +Simulation Design + +```{r} +# Simulation Design + +library(hdm) +set.seed(1) +B= 10000 # trials +IVEst = rep(0, B) +n=100 +beta = .25 # .2 weak IV +#beta = 1 # 1 strong IV + + +U = rnorm(n) +Z = rnorm(n) #generate instrument +D = beta*Z + U #generate endogenougs variable +Y = D+ U # the true causal effect is 1 + + +summary(lm(D~Z)) # first stage is very weak here + +summary(tsls(x=NULL, d=D, y=Y, 
z=Z)) # + +``` + +Note that the instrument is weak here (contolled by $\beta$) -- the t-stat is less than 4. + + +# Run 1000 trials to evaluate distribution of the IV estimator + +```{r} +# Simulation Design + +set.seed(1) +B= 10000 # trials +IVEst = rep(0, B) + +for(i in 1:B){ +U = rnorm(n) +Z = rnorm(n) #generate instrument +D = beta*Z + U #generate endogenougs variable +Y = D+ U # the true causal effect is 1 +IVEst[i] = coef(tsls(x=NULL, d=D, y=Y, z=Z))[1,1] +} + + +``` + +# Plot the Actual Distribution against the Normal Approximation (based on Strong Instrument Assumption) + +```{r} +plot(density(IVEst-1, n=1000, from=-5, to=5),col=4, xlim= c(-5, 5), + xlab= "IV Estimator -True Effect", main="Actual Distribution vs Gaussian") + +val=seq(-5, 5, by=.05) +var = (1/beta^2)*(1/100) # theoretical variance of IV +sd = sqrt(var) +lines(val, dnorm(val, sd=sd), col=2, lty=2) + +rejection.frequency = sum(( abs(IVEst-1)/sd > 1.96))/B + +cat(c("Rejection Frequency is ", rejection.frequency, " while we expect it to be .05")) + +``` + +# Some Help Functions + +```{r} +help(tsls) +``` + +```{r} +help(density) +``` From 6066677a75621b960061305651f13c9463c6f368 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 06:07:12 -0700 Subject: [PATCH 241/261] Create dml-for-conditional-average-treatment-effect.irnb.Rmd --- ...ditional-average-treatment-effect.irnb.Rmd | 634 ++++++++++++++++++ 1 file changed, 634 insertions(+) create mode 100644 T/deprecated/dml-for-conditional-average-treatment-effect.irnb.Rmd diff --git a/T/deprecated/dml-for-conditional-average-treatment-effect.irnb.Rmd b/T/deprecated/dml-for-conditional-average-treatment-effect.irnb.Rmd new file mode 100644 index 00000000..744c0296 --- /dev/null +++ b/T/deprecated/dml-for-conditional-average-treatment-effect.irnb.Rmd @@ -0,0 +1,634 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + +# DML for CATE + +This is a simple demonstration of Debiased Machine Learning estimator for the Conditional Average Treatment Effect. +Goal is to estimate the effect of 401(k) eligibility on net financial assets for each value of income. +The method is based on the following paper. + +* Title: Debiased Machine Learning of Conditional Average Treatment Effect and Other Causal Functions + +* Authors: Semenova, Vira and Chernozhukov, Victor. + +* Arxiv version: https://arxiv.org/pdf/1702.06240.pdf + +* Published version with replication code: https://academic.oup.com/ectj/advance-article/doi/10.1093/ectj/utaa027/5899048 + + + +Background + +The target function is Conditional Average Treatment Effect, defined as + +$$ g(x)=E [ Y(1) - Y(0) |X=x], $$ + +where $Y(1)$ and $Y(0)$ are potential outcomes in treated and control group. In our case, $Y(1)$ is the potential Net Financial Assets if a subject is eligible for 401(k), and $Y(0)$ is the potential Net Financial Assets if a subject is ineligible. $X$ is a covariate of interest, in this case, income. +$ g(x)$ shows expected effect of eligibility on NET TFA for a subject whose income level is $x$. + + + +If eligibility indicator is independent of $Y(1), Y(0)$, given pre-401-k assignment characteristics $Z$, the function can expressed in terms of observed data (as opposed to hypothetical, or potential outcomes). 
Observed data consists of realized NET TFA $Y = D Y(1) + (1-D) Y(0)$, eligibility indicator $D$, and covariates $Z$ which includes $X$, income. The expression for $g(x)$ is + +$$ g(x) = E [ Y (\eta_0) \mid X=x], $$ +where the transformed outcome variable is + +$$Y (\eta) = \dfrac{D}{s(Z)} \left( Y - \mu(1,Z) \right) - \dfrac{1-D}{1-s(Z)} \left( Y - \mu(0,Z) \right) + \mu(1,Z) - \mu(0,Z),$$ + +the probability of eligibility is + +$$s_0(z) = Pr (D=1 \mid Z=z),$$ + +the expected net financial asset given $D =d \in \{1,0\}$ and $Z=z$ is + +$$ \mu(d,z) = E[ Y \mid Z=z, D=d]. $$ + +Our goal is to estimate $g(x)$. + + +In step 1, we estimate the unknown functions $s_0(z), \mu(1,z), \mu(0,z)$ and plug them into $Y (\eta)$. + + +In step 2, we approximate the function $g(x)$ by a linear combination of basis functions: + +$$ g(x) = p(x)' \beta_0, $$ + + +where $p(x)$ is a vector of polynomials or splines and + +$$ \beta_0 = (E p(X) p(X))^{-1} E p(X) Y (\eta_0) $$ + +is the best linear predictor. We report + +$$ +\widehat{g}(x) = p(x)' \widehat{\beta}, +$$ + +where $\widehat{\beta}$ is the ordinary least squares estimate of $\beta_0$ defined on the random sample $(X_i, D_i, Y_i)_{i=1}^N$ + +$$ + \widehat{\beta} :=\left( \dfrac{1}{N} \sum_{i=1}^N p(X_i) p(X_i)' \right)^{-1} \dfrac{1}{N} \sum_{i=1}^N p(X_i)Y_i(\widehat{\eta}) +$$ + + + + + + + + + +```{r _uuid="051d70d956493feee0c6d64651c6a088724dca2a", _execution_state="idle"} +## load packages +rm(list=ls()) +library(foreign) +library(quantreg) +library(splines) +library(lattice) +#library(mnormt); +library(Hmisc) +library(fda); +library(hdm) +library(randomForest) +library(ranger) +library(sandwich) +``` + +```{r} +## 401k dataset +data(pension) +pension$net_tfa<-pension$net_tfa/10000 +## covariate of interest -- log income -- +pension$inc = log(pension$inc) +#pension$inc[is.na(pension$inc)]<-0 +pension<-pension[!is.na(pension$inc) & pension$inc!=-Inf & pension$inc !=Inf,] + + +## outcome variable -- total net financial assets +Y=pension$net_tfa +## binary treatment -- indicator of 401(k) eligibility +D=pension$e401 + + +X=pension$inc +## target parameter is CATE = E[ Y(1) - Y(0) | X] + + +## raw covariates so that Y(1) and Y(0) are independent of D given Z +Z = pension[,c("age","inc","fsize","educ","male","db","marr","twoearn","pira","hown","hval","hequity","hmort", + "nohs","hs","smcol")] + + +y_name <- "net_tfa"; +d_name <- "e401"; +form_z <- "(poly(age, 6) + poly(inc, 8) + poly(educ, 4) + poly(fsize,2) + as.factor(marr) + as.factor(twoearn) + as.factor(db) + as.factor(pira) + as.factor(hown))^2"; + + + +cat(sprintf("\n sample size is %g \n", length(Y) )) +cat(sprintf("\n num raw covariates z is %g \n", dim(Z)[2] )) +``` + +In Step 1, we estimate three functions: + +1. probability of treatment assignment $s_0(z)$ + +2.-3. regression functions $\mu_0(1,z)$ and $\mu_0(0,z)$. + +We use the cross-fitting procedure with $K=2$ holds. For definition of cross-fitting with $K$ folds, check the sample splitting in ```DML2.for.PLM``` function defined in https://www.kaggle.com/victorchernozhukov/debiased-ml-for-partially-linear-model-in-r + +For each function, we try random forest. 
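+
+For later reference (our addition): once the three nuisance estimates $\hat\mu(1,z)$, $\hat\mu(0,z)$ and $\hat s(z)$ are in hand, the transformed outcome $Y(\hat\eta)$ from the display above is what gets regressed on the basis functions. The notebook constructs it inline further below; written as a reusable helper it would look like the following sketch, where trimming the propensity score away from zero mirrors the `min_cutoff` argument used later.
+
+```{r}
+# Illustrative helper: the orthogonal (doubly robust) signal Y(eta-hat)
+orthogonal_signal <- function(Y, D, mu1.hat, mu0.hat, s.hat, min_cutoff = 0.01) {
+  s.hat <- pmax(s.hat, min_cutoff) # guard against very small propensity scores
+  D / s.hat * (Y - mu1.hat) -
+    (1 - D) / (1 - s.hat) * (Y - mu0.hat) +
+    mu1.hat - mu0.hat
+}
+```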
+ + + +First Stage: estimate $\mu_0(1,z)$ and $\mu_0(0,z)$ and $s_0(z)$ by lasso + +```{r} + +first_stage_lasso<-function(data,d_name,y_name, form_z, seed=1) { + + # Sample size + N<-dim(data)[1] + # Estimated regression function in control group + mu0.hat<-rep(1,N) + # Estimated regression function in treated group + mu1.hat<-rep(1,N) + # Propensity score + s.hat<-rep(1,N) + seed=1 + ## define sample splitting + set.seed(seed) + inds.train=sample(1:N,floor(N/2)) + inds.eval=setdiff(1:N,inds.train) + + print ("Estimate treatment probability, first half") + ## conditional probability of 401 k eligibility (i.e., propensity score) based on random forest + fitted.lasso.pscore<-rlassologit(as.formula(paste0(d_name,"~",form_z )),data=data[inds.train,]) + + s.hat[inds.eval]<-predict(fitted.lasso.pscore,data[inds.eval,],type="response") + print ("Estimate treatment probability, second half") + fitted.lasso.pscore<-rlassologit(as.formula(paste0(d_name,"~",form_z )),data=data[inds.eval,]) + s.hat[inds.train]<-predict( fitted.lasso.pscore,data[inds.train,],type="response") + + + + + + data1<-data + data1[,d_name]<-1 + + data0<-data + data0[,d_name]<-0 + + print ("Estimate expectation function, first half") + fitted.lasso.mu<-rlasso(as.formula(paste0(y_name,"~",d_name,"+(",form_z,")" )),data=data[inds.train,]) + mu1.hat[inds.eval]<-predict( fitted.lasso.mu,data1[inds.eval,]) + mu0.hat[inds.eval]<-predict( fitted.lasso.mu,data0[inds.eval,]) + + print ("Estimate expectation function, second half") + fitted.lasso.mu<-rlasso(as.formula(paste0(y_name,"~",d_name,"+(",form_z,")" )),data=data[inds.eval,]) + mu1.hat[inds.train]<-predict( fitted.lasso.mu,data1[inds.train,]) + mu0.hat[inds.train]<-predict( fitted.lasso.mu,data0[inds.train,]) + + return (list(mu1.hat=mu1.hat, + mu0.hat=mu0.hat, + s.hat=s.hat)) + +} +``` + +First Stage: estimate $\mu_0(1,z)$ and $\mu_0(0,z)$ and $s_0(z)$ by random forest + +```{r} +first_stage_rf<-function(Y,D,Z,seed=1) { + + # Sample size + N<-length(D) + # Estimated regression function in control group + mu0.hat<-rep(1,N) + # Estimated regression function in treated group + mu1.hat<-rep(1,N) + # Propensity score + s.hat<-rep(1,N) + + + ## define sample splitting + set.seed(seed) + inds.train=sample(1:N,floor(N/2)) + inds.eval=setdiff(1:N,inds.train) + + print ("Estimate treatment probability, first half") + ## conditional probability of 401 k eligibility (i.e., propensity score) based on random forest + D.f<-as.factor(as.character(D)) + fitted.rf.pscore<-randomForest(Z,D.f,subset=inds.train) + s.hat[inds.eval]<-predict(fitted.rf.pscore,Z[inds.eval,],type="prob")[,2] + print ("Estimate treatment probability, second half") + fitted.rf<-randomForest(Z,D.f,subset=inds.eval) + s.hat[inds.train]<-predict(fitted.rf.pscore,Z[inds.train,],type="prob")[,2] + + ## conditional expected net financial assets (i.e., regression function) based on random forest + + covariates<-cbind(Z,D) + + covariates1<-cbind(Z,D=rep(1,N)) + covariates0<-cbind(Z,D=rep(0,N)) + + print ("Estimate expectation function, first half") + fitted.rf.mu<-randomForest(cbind(Z,D),Y,subset=inds.train) + mu1.hat[inds.eval]<-predict( fitted.rf.mu,covariates1[inds.eval,]) + mu0.hat[inds.eval]<-predict( fitted.rf.mu,covariates0[inds.eval,]) + + print ("Estimate expectation function, second half") + fitted.rf.mu<-randomForest(cbind(Z,D),Y,subset=inds.eval) + mu1.hat[inds.train]<-predict( fitted.rf.mu,covariates1[inds.train,]) + mu0.hat[inds.train]<-predict( fitted.rf.mu,covariates0[inds.train,]) + + return (list(mu1.hat=mu1.hat, 
+ mu0.hat=mu0.hat, + s.hat=s.hat)) + +} +``` + + +In Step 2, we approximate $Y(\eta_0)$ by a vector of basis functions. There are two use cases: +**** +2.A. Group Average Treatment Effect, described above + + +2.B. Average Treatment Effect conditional on income value. There are three smoothing options: + +1. splines offered in ```least_squares_splines``` + +2. orthogonal polynomials with the highest degree chosen by cross-validation ```least_squares_series``` + +3. standard polynomials with the highest degree input by user ```least_squares_series_old``` + + +The default option is option 3. + + +2.A. The simplest use case of Conditional Average Treatment Effect is GATE, or Group Average Treatment Effect. Partition the support of income as + +$$ - \infty = \ell_0 < \ell_1 < \ell_2 \dots \ell_K = \infty $$ + +define intervals $I_k = [ \ell_{k-1}, \ell_{k})$. Let $X$ be income covariate. For $X$, define a group indicator + +$$ G_k(X) = 1[X \in I_k], $$ + +and the vector of basis functions + +$$ p(X) = (G_1(X), G_2(X), \dots, G_K(X)) $$ + +Then, the Best Linear Predictor $\beta_0$ vector shows the average treatment effect for each group. + +```{r} +## estimate first stage functions by random forest +## may take a while +fs.hat.rf = first_stage_rf(Y,D,Z) +``` + +```{r} +X=pension$inc +fs.hat<-fs.hat.rf +min_cutoff=0.01 +# regression function +mu1.hat<-fs.hat[["mu1.hat"]] +mu0.hat<-fs.hat[["mu0.hat"]] +# propensity score +s.hat<-fs.hat[["s.hat"]] +s.hat<-sapply(s.hat,max,min_cutoff) +### Construct Orthogonal Signal + + +RobustSignal<-(Y - mu1.hat)*D/s.hat - (Y - mu0.hat)*(1-D)/(1-s.hat) + mu1.hat - mu0.hat +``` + +```{r} +qtmax <- function(C, S=10000, alpha) + {; + p <- nrow(C); + tmaxs <- apply(abs(matrix(rnorm(p*S), nrow = p, ncol = S)), 2, max); + return(quantile(tmaxs, 1-alpha)); + }; + +# This function computes the square root of a symmetric matrix using the spectral decomposition; + + +group_average_treatment_effect<-function(X,Y,max_grid=5,alpha=0.05, B=10000) { + + grid<-quantile(X,probs=c((0:max_grid)/max_grid)) + X.raw<-matrix(NA, nrow=length(Y),ncol=length(grid)-1) + + for (k in 2:((length(grid)))) { + X.raw[,k-1]<-sapply(X, function (x) ifelse (x>=grid[k-1] & x=grid[k-1] & x<=grid[k],1,0) ) + + ols.fit<- lm(Y~X.raw-1) + coefs <- coef(ols.fit) + vars <- names(coefs) + HCV.coefs <- vcovHC(ols.fit, type = 'HC') + coefs.se <- sqrt(diag(HCV.coefs)) # White std errors + ## this is an identity matrix + ## qtmax is simplified + C.coefs <- (diag(1/sqrt(diag(HCV.coefs)))) %*% HCV.coefs %*% (diag(1/sqrt(diag(HCV.coefs)))); + + + tes <- coefs + tes.se <- coefs.se + tes.cor <- C.coefs + crit.val <- qtmax(tes.cor,B,alpha); + + tes.ucb <- tes + crit.val * tes.se; + tes.lcb <- tes - crit.val * tes.se; + + tes.uci <- tes + qnorm(1-alpha/2) * tes.se; + tes.lci <- tes + qnorm(alpha/2) * tes.se; + + + return(list(beta.hat=coefs, ghat.lower.point=tes.lci, ghat.upper.point=tes.uci, + ghat.lower=tes.lcb, ghat.upper= tes.ucb, crit.val=crit.val )) +} +``` + +```{r} +res<-group_average_treatment_effect(X=X,Y=RobustSignal) +``` + +```{r} +## this code is taken from L1 14.382 taught at MIT +## author: Mert Demirer +options(repr.plot.width=10, repr.plot.height=8) + +tes<-res$beta.hat +tes.lci<-res$ghat.lower.point +tes.uci<-res$ghat.upper.point + +tes.lcb<-res$ghat.lower +tes.ucb<-res$ghat.upper +tes.lev<-c('0%-20%', '20%-40%','40%-60%','60%-80%','80%-100%') + +plot( c(1,5), las = 2, xlim =c(0.6, 5.4), ylim = c(.05, 2.09), type="n",xlab="Income group", + ylab="Average Effect on NET TFA (per 10 K)", main="Group 
Average Treatment Effects on NET TFA", xaxt="n"); +axis(1, at=1:5, labels=tes.lev); +for (i in 1:5) +{; + rect(i-0.2, tes.lci[i], i+0.2, tes.uci[i], col = NA, border = "red", lwd = 3); + rect(i-0.2, tes.lcb[i], i+0.2, tes.ucb[i], col = NA, border = 4, lwd = 3 ); + segments(i-0.2, tes[i], i+0.2, tes[i], lwd = 5 ); +}; +abline(h=0); + +legend(2.5, 2.0, c('Regression Estimate', '95% Simultaneous Confidence Interval', '95% Pointwise Confidence Interval'), col = c(1,4,2), lwd = c(4,3,3), horiz = F, bty = 'n', cex=0.8); + +dev.off() +``` + +```{r} +least_squares_splines<-function(X,Y,max_knot=9,norder,nderiv,...) { + ## Create technical regressors + cv.bsp<-rep(0,max_knot-1) + for (knot in 2:max_knot) { + breaks<- quantile(X, c(0:knot)/knot) + formula.bsp <- Y ~ bsplineS(X, breaks =breaks, norder = norder, nderiv = nderiv)[ ,-1] + fit <- lm(formula.bsp); + cv.bsp[knot-1] <- sum( (fit$res / (1 - hatvalues(fit)) )^2); + } + ## Number of knots chosen by cross-validation + cv_knot<-which.min(cv.bsp)+1 + breaks<- quantile(X, c(0:cv_knot)/cv_knot) + formula.bsp <- Y ~ bsplineS(X, breaks =breaks, norder = norder, nderiv = 0)[ ,-1] + fit <- lm(formula.bsp); + + + return(list(cv_knot=cv_knot,fit=fit)) +} + + +least_squares_series<-function(X, Y,max_degree,...) { + + cv.pol<-rep(0,max_degree) + for (degree in 1:max_degree) { + formula.pol <- Y ~ poly(X, degree) + fit <- lm(formula.pol ); + cv.pol[degree] <- sum( (fit$res / (1 - hatvalues(fit)) )^2); + } + ## Number of knots chosen by cross-validation + cv_degree<-which.min(cv.pol) + ## Estimate coefficients + formula.pol <- Y ~ poly(X, cv_degree) + fit <- lm(formula.pol); + + + return(list(fit=fit,cv_degree=cv_degree)) +} +``` + +```{r} +msqrt <- function(C) + {; + C.eig <- eigen(C); + return(C.eig$vectors %*% diag(sqrt(C.eig$values)) %*% solve(C.eig$vectors)); + }; + + +tboot<-function(regressors_grid, Omega.hat ,alpha, B=10000) { + + + numerator_grid<-regressors_grid%*%msqrt( Omega.hat) + denominator_grid<-sqrt(diag(regressors_grid%*% Omega.hat%*%t(regressors_grid))) + + norm_numerator_grid<-numerator_grid + for (k in 1:dim(numerator_grid)[1]) { + norm_numerator_grid[k,]<-numerator_grid[k,]/denominator_grid[k] + } + + tmaxs <- apply(abs( norm_numerator_grid%*% matrix(rnorm(dim(numerator_grid)[2]*B), nrow = dim(numerator_grid)[2], ncol = B)), 2, max) + return(quantile(tmaxs, 1-alpha)) + +} +``` + +```{r} +second_stage<-function(fs.hat,Y,D,X,max_degree=3,norder=4,nderiv=0,ss_method="poly",min_cutoff=0.01,alpha=0.05,eps=0.1,...) 
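+# (added note) second_stage: trims the estimated propensity score at min_cutoff, builds the
+# orthogonal signal Y(eta-hat) from the first-stage fits, approximates x -> E[Y(eta-hat) | X = x]
+# by OLS on splines or orthogonal polynomials (chosen by ss_method), and returns the fit with
+# pointwise and uniform confidence bands evaluated on X_grid.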
{ + + X_grid = seq(min(X),max(X),eps) + mu1.hat<-fs.hat[["mu1.hat"]] + mu0.hat<-fs.hat[["mu0.hat"]] + # propensity score + s.hat<-fs.hat[["s.hat"]] + s.hat<-sapply(s.hat,max,min_cutoff) + ### Construct Orthogonal Signal + + RobustSignal<-(Y - mu1.hat)*D/s.hat - (Y - mu0.hat)*(1-D)/(1-s.hat) + mu1.hat - mu0.hat + + + + # Estimate the target function using least squares series + if (ss_method == "ortho_poly") { + res<-least_squares_series(X=X,Y=RobustSignal,eps=0.1,max_degree=max_degree) + fit<-res$fit + cv_degree<-res$cv_degree + regressors_grid<-cbind( rep(1,length(X_grid)), poly(X_grid,cv_degree)) + + } + if (ss_method == "splines") { + + res<-least_squares_splines(X=X,Y=RobustSignal,eps=0.1,norder=norder,nderiv=nderiv) + fit<-res$fit + cv_knot<-res$cv_knot + breaks<- quantile(X, c(0:cv_knot)/cv_knot) + regressors_grid<-cbind( rep(1,length(X_grid)), bsplineS(X_grid, breaks =breaks, norder = norder, nderiv = nderiv)[ ,-1]) + degree=cv_knot + + + } + + + g.hat<-regressors_grid%*%coef(fit) + + + HCV.coefs <- vcovHC(fit, type = 'HC') + #Omega.hat<-white_vcov(regressors,Y,b.hat=coef(fit)) + standard_error<-sqrt(diag(regressors_grid%*% HCV.coefs%*%t(regressors_grid))) + ### Lower Pointwise CI + ghat.lower.point<-g.hat+qnorm(alpha/2)*standard_error + ### Upper Pointwise CI + ghat.upper.point<-g.hat+qnorm(1-alpha/2)*standard_error + + max_tstat<-tboot(regressors_grid=regressors_grid, Omega.hat=HCV.coefs,alpha=alpha) + + + ## Lower Uniform CI + ghat.lower<-g.hat-max_tstat*standard_error + ## Upper Uniform CI + ghat.upper<-g.hat+max_tstat*standard_error + return(list(ghat.lower=ghat.lower,g.hat=g.hat, ghat.upper=ghat.upper,fit=fit,ghat.lower.point=ghat.lower.point, + ghat.upper.point=ghat.upper.point,X_grid=X_grid)) + + + +} +``` + +```{r} +make_plot<-function(res,lowy,highy,degree,ss_method = "series",uniform=TRUE,...) 
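+# (added note) make_plot: plots the estimated CATE curve against income (exp of the log-income
+# grid), with pointwise bands and, when uniform = TRUE, uniform bands as well.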
{ + + + title=paste0("Effect of 401(k) on Net TFA, ", ss_method) + X_grid=res$X_grid + len = length(X_grid) + + + if (uniform) { + group <-c(rep("UCI",len), rep("PCI",len), rep("Estimate",len),rep("PCIL",len),rep("UCIL",len)) + group_type<- c(rep("CI",len), rep("CI",len), rep("Estimate",len),rep("CI",len),rep("CI",len)) + group_ci_type<-c(rep("Uniform",len), rep("Point",len), rep("Uniform",len),rep("Point",len),rep("Uniform",len)) + + df<-data.frame(income=rep(X_grid,5), outcome = c(res$ghat.lower,res$ghat.lower.point,res$g.hat,res$ghat.upper.point,res$ghat.upper),group=group, group_col = group_type,group_line =group_ci_type ) + p<-ggplot(data=df)+ + aes(x=exp(income),y=outcome,colour=group )+ + theme_bw()+ + xlab("Income")+ + ylab("Net TFA, (thousand dollars)")+ + scale_colour_manual(values=c("black","blue","blue","blue","blue"))+ + theme(plot.title = element_text(hjust = 0.5),text=element_text(size=20, family="serif"))+ + theme(legend.title=element_blank())+ + theme(legend.position="none")+ + ylim(low=lowy,high=highy)+ + geom_line(aes(linetype = group_line),size=1.5)+ + scale_linetype_manual(values=c("dashed","solid"))+ + ggtitle(title) + } + + if (!uniform) { + group <-c( rep("PCI",len), rep("Estimate",len),rep("PCIL",len)) + group_type<- c(rep("CI",len), rep("Estimate",len),rep("CI",len)) + group_ci_type<-c(rep("Point",len), rep("Uniform",len),rep("Point",len)) + + df<-data.frame(income=rep(X_grid,3), outcome = c(res$ghat.lower.point,res$g.hat,res$ghat.upper.point),group=group, group_col = group_type,group_line =group_ci_type ) + + p<-ggplot(data=df)+ + aes(x=exp(income),y=outcome,colour=group )+ + theme_bw()+ + xlab("Income")+ + ylab("Net TFA, (thousand dollars)")+ + scale_colour_manual(values=c("black","blue","blue","blue","blue"))+ + theme(plot.title = element_text(hjust = 0.5),text=element_text(size=20, family="serif"))+ + theme(legend.title=element_blank())+ + theme(legend.position="none")+ + ylim(low=lowy,high=highy)+ + geom_line(aes(linetype = group_line),size=1.5)+ + scale_linetype_manual(values=c("dashed","solid"))+ + ggtitle(title) + + } + + + + return(p) +} +``` + +```{r} +res_ortho_rf_splines=second_stage(fs.hat=fs.hat.rf,X=X,D=D,Y=Y,ss_method="splines",max_degree=3) +``` + +```{r} +res_ortho_rf_ortho_poly=second_stage(fs.hat=fs.hat.rf,X=X,D=D,Y=Y,ss_method="ortho_poly",max_degree=3) +``` + +#plot findings: + +-- black solid line shows estimated function $p(x)' \widehat{\beta}$ + +-- blue dashed lines show pointwise confidence bands for this function + +```{r} +p<-make_plot(res_ortho_rf_ortho_poly,ss_method="ortho_poly",uniform=FALSE, lowy=-10,highy=20) +options(repr.plot.width=15, repr.plot.height=10) +print(p) +``` + +plot findings: + +-- black solid line shows estimated function $p(x)' \widehat{\beta}$ + +-- blue dashed lines show pointwise confidence bands for this function. I.e., for each fixed point $x_0$, i.e., $x_0=1$, they cover $p(x_0)'\beta_0$ with probability 0.95 + +-- blue solid lines show uniform confidence bands for this function. 
I.e., they cover the whole function $x \rightarrow p(x)'\beta_0$ with probability 0.95 on some compact range + +```{r} +p<-make_plot(res_ortho_rf_ortho_poly,ss_method="ortho polynomials",uniform=TRUE,lowy=-10,highy=25) +options(repr.plot.width=15, repr.plot.height=10) +print(p) +``` + +```{r} +p<-make_plot(res_ortho_rf_splines,ss_method="splines",uniform=FALSE, lowy=-15,highy=10) +options(repr.plot.width=15, repr.plot.height=10) +print(p) +``` + +```{r} +p<-make_plot(res_ortho_rf_splines,ss_method="splines",uniform=TRUE,lowy=-20,highy=20) +options(repr.plot.width=15, repr.plot.height=10) +print(p) +``` + +```{r} + +``` From 54135249bca3426acaea2093d212ddfea966a07f Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 06:09:26 -0700 Subject: [PATCH 242/261] Create ml-for-wage-prediction.irnb.Rmd --- PM2/Old/ml-for-wage-prediction.irnb.Rmd | 291 ++++++++++++++++++++++++ 1 file changed, 291 insertions(+) create mode 100644 PM2/Old/ml-for-wage-prediction.irnb.Rmd diff --git a/PM2/Old/ml-for-wage-prediction.irnb.Rmd b/PM2/Old/ml-for-wage-prediction.irnb.Rmd new file mode 100644 index 00000000..71d8ef8e --- /dev/null +++ b/PM2/Old/ml-for-wage-prediction.irnb.Rmd @@ -0,0 +1,291 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: R + language: R + name: ir +--- + + + + +This notebook contains an example for teaching. + + + + +# Penalized Linear Regressions: A Simulation Experiment + + +## Data Generating Process: Approximately Sparse + +```{r} +set.seed(1) + +n = 100; +p = 400; + +Z= runif(n)-1/2; +W = matrix(runif(n*p)-1/2, n, p); + + + +beta = 1/seq(1:p)^2; # approximately sparse beta +#beta = rnorm(p)*.2 # dense beta +gX = exp(4*Z)+ W%*%beta; # leading term nonlinear +X = cbind(Z, Z^2, Z^3, W ); # polynomials in Zs will be approximating exp(4*Z) + + +Y = gX + rnorm(n); #generate Y + + +plot(gX,Y, xlab="g(X)", ylab="Y") #plot V vs g(X) + +print( c("theoretical R2:", var(gX)/var(Y))) + +var(gX)/var(Y); #theoretical R-square in the simulation example + + + + +``` + +We use package Glmnet to carry out predictions using cross-validated lasso, ridge, and elastic net + +```{r} + +library(glmnet) +fit.lasso.cv <- cv.glmnet(X, Y, family="gaussian", alpha=1) # family gaussian means that we'll be using square loss +fit.ridge <- cv.glmnet(X, Y, family="gaussian", alpha=0) # family gaussian means that we'll be using square loss +fit.elnet <- cv.glmnet(X, Y, family="gaussian", alpha=.5) # family gaussian means that we'll be using square loss + +yhat.lasso.cv <- predict(fit.lasso.cv, newx = X) # predictions +yhat.ridge <- predict(fit.ridge, newx = X) +yhat.elnet <- predict(fit.elnet, newx = X) + +MSE.lasso.cv <- summary(lm((gX-yhat.lasso.cv)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) +MSE.ridge <- summary(lm((gX-yhat.ridge)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) +MSE.elnet <- summary(lm((gX-yhat.elnet)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) + +``` + +Here we compute the lasso and ols post lasso using plug-in choices for penalty levels, using package hdm + +```{r} +library(hdm) +fit.rlasso <- rlasso(Y~X, post=FALSE) # lasso with plug-in penalty level +fit.rlasso.post <- rlasso(Y~X, post=TRUE) # post-lasso with plug-in penalty level + +yhat.rlasso <- predict(fit.rlasso) #predict g(X) for values of X +yhat.rlasso.post <- predict(fit.rlasso.post) #predict g(X) for 
values of X + +MSE.lasso <- summary(lm((gX-yhat.rlasso)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) +MSE.lasso.post <- summary(lm((gX-yhat.rlasso.post)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) + + +``` + +Next we code up lava, which alternates the fitting of lasso and ridge + +```{r} +library(glmnet) + +lava.predict<- function(X,Y, iter=5){ + +g1 = predict(rlasso(X, Y, post=F)) #lasso step fits "sparse part" +m1 = predict(glmnet(X, as.vector(Y-g1), family="gaussian", alpha=0, lambda =20),newx=X ) #ridge step fits the "dense" part + + +i=1 +while(i<= iter) { +g1 = predict(rlasso(X, as.vector(Y-m1), post=F)) #lasso step fits "sparse part" +m1 = predict(glmnet(X, as.vector(Y-g1), family="gaussian", alpha=0, lambda =20),newx=X ); #ridge step fits the "dense" part +i = i+1 } + +return(g1+m1); + } + + +yhat.lava = lava.predict(X,Y) +MSE.lava <- summary(lm((gX-yhat.lava)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) + + +MSE.lava +``` + +```{r} +library(xtable) +table<- matrix(0, 6, 2) +table[1,1:2] <- MSE.lasso.cv +table[2,1:2] <- MSE.ridge +table[3,1:2] <- MSE.elnet +table[4,1:2] <- MSE.lasso +table[5,1:2] <- MSE.lasso.post +table[6,1:2] <- MSE.lava + +colnames(table)<- c("MSA", "S.E. for MSA") +rownames(table)<- c("Cross-Validated Lasso", "Cross-Validated ridge","Cross-Validated elnet", + "Lasso","Post-Lasso","Lava") +tab <- xtable(table, digits =3) +print(tab,type="latex") # set type="latex" for printing table in LaTeX +tab + + +``` + +```{r} + +plot(gX, gX, pch=19, cex=1, ylab="predicted value", xlab="true g(X)") + +points(gX, yhat.rlasso, col=2, pch=18, cex = 1.5 ) +points(gX, yhat.rlasso.post, col=3, pch=17, cex = 1.2 ) +points( gX, yhat.lasso.cv,col=4, pch=19, cex = 1.2 ) + + +legend("bottomright", + legend = c("rLasso", "Post-rLasso", "CV Lasso"), + col = c(2,3,4), + pch = c(18,17, 19), + bty = "n", + pt.cex = 1.3, + cex = 1.2, + text.col = "black", + horiz = F , + inset = c(0.1, 0.1)) + + +``` + +## Data Generating Process: Approximately Sparse + Small Dense Part + +```{r} +set.seed(1) + +n = 100; +p = 400; + +Z= runif(n)-1/2; +W = matrix(runif(n*p)-1/2, n, p); + + +beta = rnorm(p)*.2 # dense beta +gX = exp(4*Z)+ W%*%beta; # leading term nonlinear +X = cbind(Z, Z^2, Z^3, W ); # polynomials in Zs will be approximating exp(4*Z) + + +Y = gX + rnorm(n); #generate Y + + +plot(gX,Y, xlab="g(X)", ylab="Y") #plot V vs g(X) + +print( c("theoretical R2:", var(gX)/var(Y))) + +var(gX)/var(Y); #theoretical R-square in the simulation example + + + +``` + +```{r} + +library(glmnet) +fit.lasso.cv <- cv.glmnet(X, Y, family="gaussian", alpha=1) # family gaussian means that we'll be using square loss +fit.ridge <- cv.glmnet(X, Y, family="gaussian", alpha=0) # family gaussian means that we'll be using square loss +fit.elnet <- cv.glmnet(X, Y, family="gaussian", alpha=.5) # family gaussian means that we'll be using square loss + +yhat.lasso.cv <- predict(fit.lasso.cv, newx = X) # predictions +yhat.ridge <- predict(fit.ridge, newx = X) +yhat.elnet <- predict(fit.elnet, newx = X) + +MSE.lasso.cv <- summary(lm((gX-yhat.lasso.cv)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) +MSE.ridge <- summary(lm((gX-yhat.ridge)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) +MSE.elnet <- summary(lm((gX-yhat.elnet)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) + +``` + +```{r} +library(hdm) +fit.rlasso <- rlasso(Y~X, 
post=FALSE) # lasso with plug-in penalty level +fit.rlasso.post <- rlasso(Y~X, post=TRUE) # post-lasso with plug-in penalty level + +yhat.rlasso <- predict(fit.rlasso) #predict g(X) for values of X +yhat.rlasso.post <- predict(fit.rlasso.post) #predict g(X) for values of X + +MSE.lasso <- summary(lm((gX-yhat.rlasso)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) +MSE.lasso.post <- summary(lm((gX-yhat.rlasso.post)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) + + +``` + +```{r} +library(glmnet) + +lava.predict<- function(X,Y, iter=5){ + +g1 = predict(rlasso(X, Y, post=F)) #lasso step fits "sparse part" +m1 = predict(glmnet(X, as.vector(Y-g1), family="gaussian", alpha=0, lambda =20),newx=X ) #ridge step fits the "dense" part + + +i=1 +while(i<= iter) { +g1 = predict(rlasso(X, as.vector(Y-m1), post=F)) #lasso step fits "sparse part" +m1 = predict(glmnet(X, as.vector(Y-g1), family="gaussian", alpha=0, lambda =20),newx=X ); #ridge step fits the "dense" part +i = i+1 } + +return(g1+m1); + } + + +yhat.lava = lava.predict(X,Y) +MSE.lava <- summary(lm((gX-yhat.lava)^2~1))$coef[1:2] # report MSE and standard error for MSE for approximating g(X) + + +MSE.lava +``` + +```{r} +library(xtable) +table<- matrix(0, 6, 2) +table[1,1:2] <- MSE.lasso.cv +table[2,1:2] <- MSE.ridge +table[3,1:2] <- MSE.elnet +table[4,1:2] <- MSE.lasso +table[5,1:2] <- MSE.lasso.post +table[6,1:2] <- MSE.lava + +colnames(table)<- c("MSA", "S.E. for MSA") +rownames(table)<- c("Cross-Validated Lasso", "Cross-Validated ridge","Cross-Validated elnet", + "Lasso","Post-Lasso","Lava") +tab <- xtable(table, digits =3) +print(tab,type="latex") # set type="latex" for printing table in LaTeX +tab + +``` + +```{r} + +plot(gX, gX, pch=19, cex=1, ylab="predicted value", xlab="true g(X)") + +points(gX, yhat.rlasso, col=2, pch=18, cex = 1.5 ) +points(gX, yhat.elnet, col=3, pch=17, cex = 1.2 ) +points(gX, yhat.lava, col=4, pch=19, cex = 1.2 ) + + +legend("bottomright", + legend = c("rLasso", "Elnet", "Lava"), + col = c(2,3,4), + pch = c(18,17, 19), + bty = "n", + pt.cex = 1.3, + cex = 1.2, + text.col = "black", + horiz = F , + inset = c(0.1, 0.1)) + +``` From 4223a52fe4f301b581026b1bd05e8213412f7bf6 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 06:19:45 -0700 Subject: [PATCH 243/261] old --- PM3/{ => Old}/automl-for-wage-prediction.irnb.Rmd | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename PM3/{ => Old}/automl-for-wage-prediction.irnb.Rmd (100%) diff --git a/PM3/automl-for-wage-prediction.irnb.Rmd b/PM3/Old/automl-for-wage-prediction.irnb.Rmd similarity index 100% rename from PM3/automl-for-wage-prediction.irnb.Rmd rename to PM3/Old/automl-for-wage-prediction.irnb.Rmd From d2fa69bb8b4f2a442238c748080253f103993755 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Mon, 22 Jul 2024 06:36:09 -0700 Subject: [PATCH 244/261] pattern matching only to highlevel dir --- .github/workflows/check-R-notebooks.yml | 10 +++++----- .github/workflows/check-python-notebooks.yml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/check-R-notebooks.yml b/.github/workflows/check-R-notebooks.yml index bf5b9cc6..c2acb1f7 100644 --- a/.github/workflows/check-R-notebooks.yml +++ b/.github/workflows/check-R-notebooks.yml @@ -30,28 +30,28 @@ jobs: # git fetch origin ${{ github.event.pull_request.base.ref }} ${{ github.event.pull_request.head.ref }} # git diff --name-only origin/${{ github.event.pull_request.base.ref }}...origin/${{ 
github.event.pull_request.head.ref }} > changed_files.txt git diff --name-only -r HEAD^1 HEAD > changed_files.txt - if grep -q -E '^${{ matrix.directory }}/.*\.Rmd$' changed_files.txt; then + if grep -q -E '^${{ matrix.directory }}/[^/]+\.Rmd$' changed_files.txt; then echo "Changing directly the .Rmd files is prohibited. You should only be changing the .irnb files" echo "The .Rmd files will be automatically generated and updated when the PR is merged in the main branch" echo "It seems that you changed directly the following files:" - grep -E '^${{ matrix.directory }}/.*\.Rmd$' changed_files.txt + grep -E '^${{ matrix.directory }}/[^/]+\.Rmd$' changed_files.txt exit 1 fi - grep -E '^${{ matrix.directory }}/.*\.irnb$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt + grep -E '^${{ matrix.directory }}/[^/]+\.irnb$|^${{ matrix.directory }}/.*\.R|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - name: Find changed notebooks in Push if: github.event_name == 'push' id: find_notebooks_push run: | git diff --name-only ${{ github.event.before }} ${{ github.event.after }} > changed_files.txt - grep -E '^${{ matrix.directory }}/.*\.irnb$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt + grep -E '^${{ matrix.directory }}/[^/]+\.irnb$|^${{ matrix.directory }}/.*\.R|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_files.txt > changed_notebooks.txt || echo "No notebooks changed" > changed_notebooks.txt - name: Check if any notebooks changed in PR or Push if: (github.event_name == 'push') || (github.event_name == 'pull_request') id: check_notebooks run: | cat changed_notebooks.txt - if grep -q -E '^${{ matrix.directory }}/.*\.irnb$|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_notebooks.txt; then + if grep -q -E '^${{ matrix.directory }}/[^/]+\.irnb$|^${{ matrix.directory }}/.*\.R|\.github/workflows/check-and-transform-R-notebooks.yml$' changed_notebooks.txt; then echo "notebooks_changed=true" >> $GITHUB_ENV else echo "notebooks_changed=false" >> $GITHUB_ENV diff --git a/.github/workflows/check-python-notebooks.yml b/.github/workflows/check-python-notebooks.yml index 6e63419c..555ebfb7 100644 --- a/.github/workflows/check-python-notebooks.yml +++ b/.github/workflows/check-python-notebooks.yml @@ -48,7 +48,7 @@ jobs: id: check_notebooks run: | cat changed_notebooks.txt - if grep -q -E '^${{ matrix.directory }}/.*\.ipynb$|^${{ matrix.directory }}/.*\.py$|\.github/workflows/python-notebooks.yml$' changed_notebooks.txt; then + if grep -q -E '^${{ matrix.directory }}/[^/]+\.ipynb$|^${{ matrix.directory }}/.*\.py$|\.github/workflows/python-notebooks.yml$' changed_notebooks.txt; then echo "notebooks_changed=true" >> $GITHUB_ENV else echo "notebooks_changed=false" >> $GITHUB_ENV From bb9544a68e316f7b7be2959b98ab58c131c12b3d Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 23 Jul 2024 04:03:01 -0700 Subject: [PATCH 245/261] Update check-python-notebooks.yml --- .github/workflows/check-python-notebooks.yml | 48 ++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/.github/workflows/check-python-notebooks.yml b/.github/workflows/check-python-notebooks.yml index 555ebfb7..cb02f2c1 100644 --- a/.github/workflows/check-python-notebooks.yml +++ 
b/.github/workflows/check-python-notebooks.yml @@ -170,11 +170,59 @@ jobs: with: name: execution-logs-${{ matrix.directory }}-${{ matrix.os }} path: logs + + - name: Send failure mail + if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')) && (env.flake8_errors != '0' || env.script_errors != '0'))" + uses: dawidd6/action-send-mail@v3 + with: + # Required mail server address if not connection_url: + server_address: smtp.gmail.com + # Server port, default 25: + server_port: 465 + # Optional whether this connection use TLS (default is true if server_port is 465) + secure: true + # Optional (recommended) mail server username: + username: ${{secrets.MAIL_USERNAME}} + # Optional (recommended) mail server password: + password: ${{secrets.MAIL_PASSWORD}} + # Required mail subject: + subject: URGENT Github Actions Python job for directory ${{matrix.directory}} failed! + # Required recipients' addresses: + to: bsyrganis@gmail.com,vsyrgk@stanford.edu + # Required sender full name (address can be skipped): + from: GA-MetricsML-Notebooks + # Optional plain body: + body: Python notebook tests of directory ${{matrix.directory}} in Git repo ${{github.repository}} failed!! + ignore_cert: true - name: Check for errors if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')) && (env.flake8_errors != '0' || env.script_errors != '0'))" run: exit 1 + - name: Send success mail + if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" + uses: dawidd6/action-send-mail@v3 + with: + # Required mail server address if not connection_url: + server_address: smtp.gmail.com + # Server port, default 25: + server_port: 465 + # Optional whether this connection use TLS (default is true if server_port is 465) + secure: true + # Optional (recommended) mail server username: + username: ${{secrets.MAIL_USERNAME}} + # Optional (recommended) mail server password: + password: ${{secrets.MAIL_PASSWORD}} + # Required mail subject: + subject: URGENT Github Actions Python job for directory ${{matrix.directory}} failed! + # Required recipients' addresses: + to: bsyrganis@gmail.com,vsyrgk@stanford.edu + # Required sender full name (address can be skipped): + from: GA-MetricsML-Notebooks + # Optional plain body: + body: Python notebook tests of directory ${{matrix.directory}} in Git repo ${{github.repository}} failed!! + ignore_cert: true + # - name: Check out the branch for pull request # if: "(github.event_name == 'pull_request') && (env.notebooks_changed == 'true') && (! 
(matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))" # run: | From 7aae3c3e6750b7cb32c721d6bf8c07b912fc1cb5 Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 23 Jul 2024 04:39:47 -0700 Subject: [PATCH 246/261] Email sending after tests --- .github/workflows/check-R-notebooks.yml | 63 +++++++++++++++++++- .github/workflows/check-python-notebooks.yml | 6 +- 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/.github/workflows/check-R-notebooks.yml b/.github/workflows/check-R-notebooks.yml index c2acb1f7..b0d62efa 100644 --- a/.github/workflows/check-R-notebooks.yml +++ b/.github/workflows/check-R-notebooks.yml @@ -166,6 +166,7 @@ jobs: sink(type = "message") close(log_con) if (length(errors) > 0) { + writeLines("errors_found=true", "errors_check.txt") for (error in errors) { cat("Error found in file:", error$gitrfile, "\n") cat("at line::", error$location, "\n") @@ -174,7 +175,9 @@ jobs: cat(paste(error$traceback, collapse = "\n")) print("\n") } - quit(status = 1, save = "no") # Exit with an error status if errors are found + # quit(status = 1, save = "no") # Exit with an error status if errors are found + } else { + writeLines("errors_found=false", "errors_check.txt") } ' 2>/dev/null env: @@ -206,6 +209,64 @@ jobs: run: | rm -rf r_scripts rm ${{ matrix.directory }}_r_scripts.zip + + - name: Publish result of error check + if: env.notebooks_changed == 'true' + run: | + cat errors_check.txt >> $GITHUB_ENV + rm errors_check.txt + + - name: Send success mail + if: "(env.notebooks_changed == 'true') && (env.errors_found == 'true')" + uses: dawidd6/action-send-mail@v3 + with: + # Required mail server address if not connection_url: + server_address: smtp.gmail.com + # Server port, default 25: + server_port: 465 + # Optional whether this connection use TLS (default is true if server_port is 465) + secure: true + # Optional (recommended) mail server username: + username: ${{secrets.MAIL_USERNAME}} + # Optional (recommended) mail server password: + password: ${{secrets.MAIL_PASSWORD}} + # Required mail subject: + subject: FAILURE Github Actions R job for directory ${{matrix.directory}} failed! + # Required recipients' addresses: + to: bsyrganis@gmail.com,vsyrgk@stanford.edu + # Required sender full name (address can be skipped): + from: GA-MetricsML-Notebooks + # Optional plain body: + body: R notebook tests of directory ${{matrix.directory}} in Git repo ${{github.repository}} failed. + ignore_cert: true + + - name: fail if errors + if: "(env.notebooks_changed == 'true') && (env.errors_found == 'true')" + run: exit 1 + + - name: Send success mail + if: env.notebooks_changed == 'true' + uses: dawidd6/action-send-mail@v3 + with: + # Required mail server address if not connection_url: + server_address: smtp.gmail.com + # Server port, default 25: + server_port: 465 + # Optional whether this connection use TLS (default is true if server_port is 465) + secure: true + # Optional (recommended) mail server username: + username: ${{secrets.MAIL_USERNAME}} + # Optional (recommended) mail server password: + password: ${{secrets.MAIL_PASSWORD}} + # Required mail subject: + subject: SUCCESS Github Actions R job for directory ${{matrix.directory}} succeeded! + # Required recipients' addresses: + to: bsyrganis@gmail.com,vsyrgk@stanford.edu + # Required sender full name (address can be skipped): + from: GA-MetricsML-Notebooks + # Optional plain body: + body: R notebook tests of directory ${{matrix.directory}} in Git repo ${{github.repository}} succedded. 
+          ignore_cert: true
       # - name: Check out the branch for pull request
       #   if: "(github.event_name == 'pull_request') && (env.notebooks_changed == 'true')"
diff --git a/.github/workflows/check-python-notebooks.yml b/.github/workflows/check-python-notebooks.yml
index cb02f2c1..1a545951 100644
--- a/.github/workflows/check-python-notebooks.yml
+++ b/.github/workflows/check-python-notebooks.yml
@@ -186,7 +186,7 @@ jobs:
           # Optional (recommended) mail server password:
           password: ${{secrets.MAIL_PASSWORD}}
           # Required mail subject:
-          subject: URGENT Github Actions Python job for directory ${{matrix.directory}} failed!
+          subject: FAILURE Github Actions Python job for directory ${{matrix.directory}} failed!
           # Required recipients' addresses:
           to: bsyrganis@gmail.com,vsyrgk@stanford.edu
           # Required sender full name (address can be skipped):
@@ -214,13 +214,13 @@ jobs:
           # Optional (recommended) mail server password:
           password: ${{secrets.MAIL_PASSWORD}}
           # Required mail subject:
-          subject: URGENT Github Actions Python job for directory ${{matrix.directory}} failed!
+          subject: SUCCESS Github Actions Python job for directory ${{matrix.directory}} succeeded!
           # Required recipients' addresses:
           to: bsyrganis@gmail.com,vsyrgk@stanford.edu
           # Required sender full name (address can be skipped):
           from: GA-MetricsML-Notebooks
           # Optional plain body:
-          body: Python notebook tests of directory ${{matrix.directory}} in Git repo ${{github.repository}} failed!!
+          body: Python notebook tests of directory ${{matrix.directory}} in Git repo ${{github.repository}} succedded.
           ignore_cert: true
 
       # - name: Check out the branch for pull request

From f7cdd4266e776d1f65ac41b96c8827a964219a7d Mon Sep 17 00:00:00 2001
From: vsyrgkanis
Date: Tue, 23 Jul 2024 04:42:43 -0700
Subject: [PATCH 247/261] updated title

---
 .github/workflows/check-R-notebooks.yml      | 4 ++--
 .github/workflows/check-python-notebooks.yml | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/check-R-notebooks.yml b/.github/workflows/check-R-notebooks.yml
index b0d62efa..b5b73cf1 100644
--- a/.github/workflows/check-R-notebooks.yml
+++ b/.github/workflows/check-R-notebooks.yml
@@ -231,7 +231,7 @@ jobs:
           # Optional (recommended) mail server password:
           password: ${{secrets.MAIL_PASSWORD}}
           # Required mail subject:
-          subject: FAILURE Github Actions R job for directory ${{matrix.directory}} failed!
+          subject: FAILURE R (${{matrix.directory}}) test Github Action job failed!
           # Required recipients' addresses:
           to: bsyrganis@gmail.com,vsyrgk@stanford.edu
           # Required sender full name (address can be skipped):
           from: GA-MetricsML-Notebooks
           # Optional plain body:
@@ -259,7 +259,7 @@ jobs:
           # Optional (recommended) mail server password:
           password: ${{secrets.MAIL_PASSWORD}}
           # Required mail subject:
-          subject: SUCCESS Github Actions R job for directory ${{matrix.directory}} succeeded!
+          subject: SUCCESS R (${{matrix.directory}}) test Github Action job succeeded!
           # Required recipients' addresses:
           to: bsyrganis@gmail.com,vsyrgk@stanford.edu
           # Required sender full name (address can be skipped):
diff --git a/.github/workflows/check-python-notebooks.yml b/.github/workflows/check-python-notebooks.yml
index 1a545951..8bfda577 100644
--- a/.github/workflows/check-python-notebooks.yml
+++ b/.github/workflows/check-python-notebooks.yml
@@ -186,7 +186,7 @@ jobs:
           # Optional (recommended) mail server password:
           password: ${{secrets.MAIL_PASSWORD}}
           # Required mail subject:
-          subject: FAILURE Github Actions Python job for directory ${{matrix.directory}} failed!
+          subject: FAILURE Python (${{matrix.directory}}) test Github Action job failed!
           # Required recipients' addresses:
           to: bsyrganis@gmail.com,vsyrgk@stanford.edu
           # Required sender full name (address can be skipped):
@@ -214,7 +214,7 @@ jobs:
           # Optional (recommended) mail server password:
           password: ${{secrets.MAIL_PASSWORD}}
           # Required mail subject:
-          subject: SUCCESS Github Actions Python job for directory ${{matrix.directory}} succeeded!
+          subject: SUCCESS Python (${{matrix.directory}}) test Github Action job succeeded!
           # Required recipients' addresses:
           to: bsyrganis@gmail.com,vsyrgk@stanford.edu
           # Required sender full name (address can be skipped):

From 47d31d4b83e6dc7d29a85c7ab11197caf71d6f79 Mon Sep 17 00:00:00 2001
From: vsyrgkanis
Date: Tue, 23 Jul 2024 04:53:15 -0700
Subject: [PATCH 248/261] testing failures

---
 CM1/python-rct-penn-precision-adj.ipynb | 4 +++-
 CM1/r-rct-penn-precision-adj.irnb       | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/CM1/python-rct-penn-precision-adj.ipynb b/CM1/python-rct-penn-precision-adj.ipynb
index 87394e8c..1f4f60e1 100644
--- a/CM1/python-rct-penn-precision-adj.ipynb
+++ b/CM1/python-rct-penn-precision-adj.ipynb
@@ -29,7 +29,9 @@
     "import numpy as np\n",
     "import patsy\n",
     "import statsmodels.formula.api as smf\n",
-    "import statsmodels.api as sm"
+    "import statsmodels.api as sm\n",
+    "\n",
+    "asdfa"
    ]
   },
   {
diff --git a/CM1/r-rct-penn-precision-adj.irnb b/CM1/r-rct-penn-precision-adj.irnb
index 6e4ac12c..08ee178d 100644
--- a/CM1/r-rct-penn-precision-adj.irnb
+++ b/CM1/r-rct-penn-precision-adj.irnb
@@ -69,7 +69,7 @@
     "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/penn_jae.dat\"\n",
     "Penn <- as.data.frame(read.table(file, header = TRUE))\n",
     "\n",
-    "n <- dim(Penn)[1]\n",
+    "n <- dim(Penn)[1] adsfasdf\n",
     "p_1 <- dim(Penn)[2]\n",
     "Penn <- subset(Penn, tg == 4 | tg == 0)\n",
     "attach(Penn)"

From 8cc642a49121b043b4211dff155c6162cec6e0f3 Mon Sep 17 00:00:00 2001
From: vsyrgkanis
Date: Tue, 23 Jul 2024 05:11:31 -0700
Subject: [PATCH 249/261] testing error reporting

---
 .github/workflows/check-R-notebooks.yml | 23 +++++++++++++++--------
 CM1/python-rct-penn-precision-adj.ipynb |  4 +---
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/check-R-notebooks.yml b/.github/workflows/check-R-notebooks.yml
index b5b73cf1..94e30feb 100644
--- a/.github/workflows/check-R-notebooks.yml
+++ b/.github/workflows/check-R-notebooks.yml
@@ -132,13 +132,20 @@ jobs:
             if (length(lints) > 0) {
               cat("Warnings found during linting:\n")
               print(lints)
-              stop("Linting failed with warnings")
+              # stop("Linting failed with warnings")
+              writeLines("linting_errors_found=false", "linting_errors_check.txt")
             }
           })
           '
+
+      - name: Publish result of linting error check
+        if: env.notebooks_changed == 'true'
+        run: |
+          cat linting_errors_check.txt >> $GITHUB_ENV
+          rm linting_errors_check.txt
 
       - name: Execute R scripts and log output
-        if: env.notebooks_changed == 'true'
+        if: "((env.notebooks_changed == 'true') && (linting_errors_found == 'false'))"
         id: execute
         run: |
           log_file="${{ matrix.directory }}_r_script_execution.log"
@@ -184,7 +191,7 @@ jobs:
           GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Upload execution log
-        if: env.notebooks_changed == 'true'
+        if: "((env.notebooks_changed == 'true') && (linting_errors_found == 'false'))"
         uses: actions/upload-artifact@v2
         with:
           name: ${{ matrix.directory }}-r-script-execution-log
@@ -211,13 +218,13 @@ jobs:
           rm ${{ matrix.directory }}_r_scripts.zip
 
       - name: Publish result of error check
-        if: env.notebooks_changed == 'true'
+        if: "((env.notebooks_changed == 'true') && (linting_errors_found == 'false'))"
         run: |
           cat errors_check.txt >> $GITHUB_ENV
           rm errors_check.txt
 
-      - name: Send success mail
-        if: "(env.notebooks_changed == 'true') && (env.errors_found == 'true')"
+      - name: Send failure mail
+        if: "(env.notebooks_changed == 'true') && ((env.linting_errors_found == 'true') || (env.errors_found == 'true'))"
         uses: dawidd6/action-send-mail@v3
         with:
           # Required mail server address if not connection_url:
@@ -233,7 +240,7 @@ jobs:
           # Required mail subject:
           subject: FAILURE R (${{matrix.directory}}) test Github Action job failed!
           # Required recipients' addresses:
-          to: bsyrganis@gmail.com,vsyrgk@stanford.edu
+          to: bsyrganis@gmail.com
           # Required sender full name (address can be skipped):
           from: GA-MetricsML-Notebooks
           # Optional plain body:
@@ -241,7 +248,7 @@ jobs:
           # Optional plain body:
           body: R notebook tests of directory ${{matrix.directory}} in Git repo ${{github.repository}} failed.
ignore_cert: true - + - name: fail if errors if: "(env.notebooks_changed == 'true') && ((env.linting_errors_found == 'true') || (env.errors_found == 'true'))" run: exit 1 From a1ec9f041b9ce57d0637b8039e2eeca54dc5967c Mon Sep 17 00:00:00 2001 From: vsyrgkanis Date: Tue, 23 Jul 2024 05:21:21 -0700 Subject: [PATCH 251/261] update R workflow --- .github/workflows/check-R-notebooks.yml | 6 +++--- .github/workflows/check-python-notebooks.yml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/check-R-notebooks.yml b/.github/workflows/check-R-notebooks.yml index 64640d7a..8dc12146 100644 --- a/.github/workflows/check-R-notebooks.yml +++ b/.github/workflows/check-R-notebooks.yml @@ -230,7 +230,7 @@ jobs: rm errors_check.txt - name: Send failure mail - if: "(env.notebooks_changed == 'true') && ((env.linting_errors_found == 'true') || (env.errors_found == 'true'))" + if: "((env.notebooks_changed == 'true') && ((env.linting_errors_found == 'true') || (env.errors_found == 'true')))" uses: dawidd6/action-send-mail@v3 with: # Required mail server address if not connection_url: @@ -254,7 +254,7 @@ jobs: ignore_cert: true - name: fail if errors - if: "(env.notebooks_changed == 'true') && ((env.linting_errors_found == 'true') || (env.errors_found == 'true'))" + if: "((env.notebooks_changed == 'true') && ((env.linting_errors_found == 'true') || (env.errors_found == 'true')))" run: exit 1 - name: Send success mail @@ -274,7 +274,7 @@ jobs: # Required mail subject: subject: SUCCESS R (${{matrix.directory}}) test Github Action job succeeded! # Required recipients' addresses: - to: bsyrganis@gmail.com,vsyrgk@stanford.edu + to: bsyrganis@gmail.com # Required sender full name (address can be skipped): from: GA-MetricsML-Notebooks # Optional plain body: diff --git a/.github/workflows/check-python-notebooks.yml b/.github/workflows/check-python-notebooks.yml index 8bfda577..87008fab 100644 --- a/.github/workflows/check-python-notebooks.yml +++ b/.github/workflows/check-python-notebooks.yml @@ -188,7 +188,7 @@ jobs: # Required mail subject: subject: FAILURE Python (${{matrix.directory}}) test Github Action job failed! # Required recipients' addresses: - to: bsyrganis@gmail.com,vsyrgk@stanford.edu + to: bsyrganis@gmail.com # Required sender full name (address can be skipped): from: GA-MetricsML-Notebooks # Optional plain body: @@ -216,7 +216,7 @@ jobs: # Required mail subject: subject: SUCCESS Python (${{matrix.directory}}) test Github Action job succeeded! 
          # Required recipients' addresses:
-          to: bsyrganis@gmail.com,vsyrgk@stanford.edu
+          to: bsyrganis@gmail.com
           # Required sender full name (address can be skipped):
           from: GA-MetricsML-Notebooks
           # Optional plain body:

From 1bd344b2a6ec32129482558f5fb61cb80a885899 Mon Sep 17 00:00:00 2001
From: vsyrgkanis
Date: Tue, 23 Jul 2024 05:24:14 -0700
Subject: [PATCH 252/261] Update check-R-notebooks.yml

---
 .github/workflows/check-R-notebooks.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/check-R-notebooks.yml b/.github/workflows/check-R-notebooks.yml
index 8dc12146..361dc93d 100644
--- a/.github/workflows/check-R-notebooks.yml
+++ b/.github/workflows/check-R-notebooks.yml
@@ -143,7 +143,7 @@ jobs:
             writeLines("linting_errors_found=false", "linting_errors_check.txt")
           }
           '
-
+
       - name: Publish result of linting error check
         if: env.notebooks_changed == 'true'
         run: |
@@ -228,7 +228,7 @@ jobs:
         run: |
           cat errors_check.txt >> $GITHUB_ENV
           rm errors_check.txt
-
+
       - name: Send failure mail
         if: "((env.notebooks_changed == 'true') && ((env.linting_errors_found == 'true') || (env.errors_found == 'true')))"
         uses: dawidd6/action-send-mail@v3
         with:
           # Required mail server address if not connection_url:

From a7742b5668d9b19885bb2f64e9a692c7b9916956 Mon Sep 17 00:00:00 2001
From: vsyrgkanis
Date: Tue, 23 Jul 2024 05:26:16 -0700
Subject: [PATCH 253/261] Update check-R-notebooks.yml

---
 .github/workflows/check-R-notebooks.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/check-R-notebooks.yml b/.github/workflows/check-R-notebooks.yml
index 361dc93d..b722356a 100644
--- a/.github/workflows/check-R-notebooks.yml
+++ b/.github/workflows/check-R-notebooks.yml
@@ -151,7 +151,7 @@ jobs:
           rm linting_errors_check.txt
 
       - name: Execute R scripts and log output
-        if: "((env.notebooks_changed == 'true') && (linting_errors_found == 'false'))"
+        if: "((env.notebooks_changed == 'true') && (env.linting_errors_found == 'false'))"
         id: execute
         run: |
           log_file="${{ matrix.directory }}_r_script_execution.log"
@@ -197,7 +197,7 @@ jobs:
           GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Upload execution log
-        if: "((env.notebooks_changed == 'true') && (linting_errors_found == 'false'))"
+        if: "((env.notebooks_changed == 'true') && (env.linting_errors_found == 'false'))"
         uses: actions/upload-artifact@v2
         with:
           name: ${{ matrix.directory }}-r-script-execution-log
@@ -224,7 +224,7 @@ jobs:
           rm ${{ matrix.directory }}_r_scripts.zip
 
       - name: Publish result of error check
-        if: "((env.notebooks_changed == 'true') && (linting_errors_found == 'false'))"
+        if: "((env.notebooks_changed == 'true') && (env.linting_errors_found == 'false'))"
         run: |
           cat errors_check.txt >> $GITHUB_ENV
           rm errors_check.txt

From 8a7b4f69a81004f03ce7a9fa8e2f70097e9f6063 Mon Sep 17 00:00:00 2001
From: vsyrgkanis
Date: Tue, 23 Jul 2024 05:44:41 -0700
Subject: [PATCH 254/261] Update check-R-notebooks.yml

---
 .github/workflows/check-R-notebooks.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/check-R-notebooks.yml b/.github/workflows/check-R-notebooks.yml
index b722356a..0aecdbe7 100644
--- a/.github/workflows/check-R-notebooks.yml
+++ b/.github/workflows/check-R-notebooks.yml
@@ -137,7 +137,7 @@ jobs:
               linting_error <- TRUE
             }
           }
-          if linting_error {
+          if (linting_error) {
             writeLines("linting_errors_found=true", "linting_errors_check.txt")
           } else {
             writeLines("linting_errors_found=false", "linting_errors_check.txt")

From fd68b29e0516d0948fd72688579b4e5a907977ec Mon Sep 17 00:00:00 2001
From: vsyrgkanis
Date: Tue, 23 Jul 2024 05:59:27 -0700
Subject: [PATCH 255/261] Update r-rct-penn-precision-adj.irnb

---
 CM1/r-rct-penn-precision-adj.irnb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CM1/r-rct-penn-precision-adj.irnb b/CM1/r-rct-penn-precision-adj.irnb
index 08ee178d..6e4ac12c 100644
--- a/CM1/r-rct-penn-precision-adj.irnb
+++ b/CM1/r-rct-penn-precision-adj.irnb
@@ -69,7 +69,7 @@
     "file <- \"https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/penn_jae.dat\"\n",
     "Penn <- as.data.frame(read.table(file, header = TRUE))\n",
     "\n",
-    "n <- dim(Penn)[1] adsfasdf\n",
+    "n <- dim(Penn)[1]\n",
     "p_1 <- dim(Penn)[2]\n",
     "Penn <- subset(Penn, tg == 4 | tg == 0)\n",
     "attach(Penn)"

From 66d68980e99674e4d387004b5c06a957d7d60c9f Mon Sep 17 00:00:00 2001
From: vsyrgkanis
Date: Tue, 23 Jul 2024 06:02:45 -0700
Subject: [PATCH 256/261] sending emails only for scheduled triggers

---
 .github/workflows/check-R-notebooks.yml      | 4 ++--
 .github/workflows/check-python-notebooks.yml | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/check-R-notebooks.yml b/.github/workflows/check-R-notebooks.yml
index 0aecdbe7..c217919c 100644
--- a/.github/workflows/check-R-notebooks.yml
+++ b/.github/workflows/check-R-notebooks.yml
@@ -230,7 +230,7 @@ jobs:
           rm errors_check.txt
 
       - name: Send failure mail
-        if: "((env.notebooks_changed == 'true') && ((env.linting_errors_found == 'true') || (env.errors_found == 'true')))"
+        if: "((github.event_name == 'schedule') && (env.notebooks_changed == 'true') && ((env.linting_errors_found == 'true') || (env.errors_found == 'true')))"
         uses: dawidd6/action-send-mail@v3
         with:
           # Required mail server address if not connection_url:
@@ -258,7 +258,7 @@ jobs:
         run: exit 1
 
       - name: Send success mail
-        if: env.notebooks_changed == 'true'
+        if: ((github.event_name == 'schedule') && (env.notebooks_changed == 'true'))"
         uses: dawidd6/action-send-mail@v3
         with:
           # Required mail server address if not connection_url:
diff --git a/.github/workflows/check-python-notebooks.yml b/.github/workflows/check-python-notebooks.yml
index 87008fab..e72afa61 100644
--- a/.github/workflows/check-python-notebooks.yml
+++ b/.github/workflows/check-python-notebooks.yml
@@ -172,7 +172,7 @@ jobs:
           path: logs
 
       - name: Send failure mail
-        if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')) && (env.flake8_errors != '0' || env.script_errors != '0'))"
+        if: "((github.event_name == 'schedule') && (env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')) && (env.flake8_errors != '0' || env.script_errors != '0')))"
         uses: dawidd6/action-send-mail@v3
         with:
           # Required mail server address if not connection_url:
@@ -200,7 +200,7 @@ jobs:
         run: exit 1
 
       - name: Send success mail
-        if: "(env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest')))"
+        if: "((github.event_name == 'schedule') && (env.notebooks_changed == 'true') && (! (matrix.directory == 'CM3' && (matrix.os == 'windows-latest' || matrix.os == 'macos-latest'))))"
         uses: dawidd6/action-send-mail@v3
         with:
           # Required mail server address if not connection_url:

From 66f458f9385394c815c8b54d1463f129f19d7fc7 Mon Sep 17 00:00:00 2001
From: vsyrgkanis
Date: Tue, 23 Jul 2024 06:05:01 -0700
Subject: [PATCH 257/261] Update check-R-notebooks.yml

---
 .github/workflows/check-R-notebooks.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/check-R-notebooks.yml b/.github/workflows/check-R-notebooks.yml
index c217919c..07ea922e 100644
--- a/.github/workflows/check-R-notebooks.yml
+++ b/.github/workflows/check-R-notebooks.yml
@@ -258,7 +258,7 @@ jobs:
         run: exit 1
 
       - name: Send success mail
-        if: ((github.event_name == 'schedule') && (env.notebooks_changed == 'true'))"
+        if: "((github.event_name == 'schedule') && (env.notebooks_changed == 'true'))"
         uses: dawidd6/action-send-mail@v3
         with:
           # Required mail server address if not connection_url:

From b35567d270cb3c12e360bce33fe94f019d9a06f7 Mon Sep 17 00:00:00 2001
From: vsyrgkanis
Date: Mon, 29 Jul 2024 15:15:33 +0300
Subject: [PATCH 258/261] Update transform-notebooks.yml

---
 .github/workflows/transform-notebooks.yml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/transform-notebooks.yml b/.github/workflows/transform-notebooks.yml
index fc8560f1..b41f9120 100644
--- a/.github/workflows/transform-notebooks.yml
+++ b/.github/workflows/transform-notebooks.yml
@@ -4,6 +4,7 @@ on:
   push:
     branches:
       - main
+      - gen-Rmd
 
 concurrency:
   group: strip-transform-notebooks-${{ github.ref }}
@@ -83,8 +84,12 @@ jobs:
           # Check if directory exists
           if [ -d "$dir" ]; then
             echo "Processing directory: $dir"
+            git add "$dir"/*.ipynb "$dir"/*.irnb "$dir"/*.Rmd
+            git diff "$dir"/*.ipynb "$dir"/*.irnb "$dir"/*.Rmd
+            CHANGES=$(git status --porcelain)
-            if git diff --quiet "$dir"/*.ipynb "$dir"/*.irnb "$dir"/*.Rmd; then
+            if [ -z "$CHANGES" ]; then
+            # if git diff --quiet "$dir"/*.ipynb "$dir"/*.irnb "$dir"/*.Rmd; then
               echo "No changes were found"
             else
               echo "Found changes in directory: $dir"
               echo "changed=true" >> $GITHUB_OUTPUT
               break 1
             fi
           else
             echo "Directory $dir does not exist."
           fi

From 1db7dd4ff899c24c389d27b0820010328ea0db63 Mon Sep 17 00:00:00 2001
From: vsyrgkanis
Date: Tue, 30 Jul 2024 02:56:54 -0700
Subject: [PATCH 259/261] Update transform-notebooks.yml

---
 .github/workflows/transform-notebooks.yml | 36 ++++++++++-------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/transform-notebooks.yml b/.github/workflows/transform-notebooks.yml
index b41f9120..0ca8d597 100644
--- a/.github/workflows/transform-notebooks.yml
+++ b/.github/workflows/transform-notebooks.yml
@@ -40,6 +40,7 @@ jobs:
       - name: Strip outputs from .ipynb files
         run: |
+          git pull
           dirs=(PM1 PM2 PM3 PM4 PM5 CM1 CM2 CM3 AC1 AC2 T)
           for dir in "${dirs[@]}"; do
             # Check if directory exists
@@ -64,12 +65,17 @@ jobs:
             fi
           done
           echo "Converting .irnb to .Rmd to update the .Rmd version"
+          # first we delete all Rmd files and regenerate. This will make sure
+          # that if a .irnb file is deleted then the corresponding .Rmd file
+          # will also be removed by this script.
+          git rm "$dir"/*.Rmd
           R -e "
             files <- list.files(path = '$dir', pattern = '\\\\.irnb$', full.names = TRUE, recursive = FALSE)
             lapply(files, function(input) {
               rmarkdown::convert_ipynb(input)
             })
           "
+          git add "$dir"/*.Rmd
         else
           echo "Directory $dir does not exist."
         fi
@@ -79,27 +85,15 @@ jobs:
         id: verify_diff
         run: |
           git pull
-          dirs=(PM1 PM2 PM3 PM4 PM5 CM1 CM2 CM3 AC1 AC2 T)
-          for dir in "${dirs[@]}"; do
-            # Check if directory exists
-            if [ -d "$dir" ]; then
-              echo "Processing directory: $dir"
-              git add "$dir"/*.ipynb "$dir"/*.irnb "$dir"/*.Rmd
-              git diff "$dir"/*.ipynb "$dir"/*.irnb "$dir"/*.Rmd
-              CHANGES=$(git status --porcelain)
-
-              if [ -z "$CHANGES" ]; then
-              # if git diff --quiet "$dir"/*.ipynb "$dir"/*.irnb "$dir"/*.Rmd; then
-                echo "No changes were found"
-              else
-                echo "Found changes in directory: $dir"
-                echo "changed=true" >> $GITHUB_OUTPUT
-                break 1
-              fi
-            else
-              echo "Directory $dir does not exist."
-            fi
-          done
+          git status --porcelain
+          CHANGES=$(git status --porcelain)
+          if [ -z "$CHANGES" ]; then
+            echo "No changes were found"
+          else
+            echo "Found changes"
+            echo "$CHANGES"
+            echo "changed=true" >> $GITHUB_OUTPUT
+          fi
 
       - name: Commit and push stripped .ipynb files
         if: steps.verify_diff.outputs.changed == 'true'

From 406b77f0f0722056356be528305e4359a0cb35f2 Mon Sep 17 00:00:00 2001
From: OliverSchacht <65898638+OliverSchacht@users.noreply.github.com>
Date: Wed, 31 Jul 2024 22:20:02 +0200
Subject: [PATCH 260/261] update to keras3 for compatibility with newst R (see
 https://github.com/rstudio/keras3/issues/1428)

---
 PM3/r_functional_approximation_by_nn_and_rf.irnb | 8 ++++----
 PM3/r_ml_wage_prediction.irnb                    | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/PM3/r_functional_approximation_by_nn_and_rf.irnb b/PM3/r_functional_approximation_by_nn_and_rf.irnb
index ef445b1b..e1d2c879 100644
--- a/PM3/r_functional_approximation_by_nn_and_rf.irnb
+++ b/PM3/r_functional_approximation_by_nn_and_rf.irnb
@@ -39,7 +39,7 @@
     "install.packages(\"randomForest\")\n",
     "install.packages(\"rpart\")\n",
     "install.packages(\"gbm\")\n",
-    "install.packages(\"keras\")"
+    "install.packages(\"keras3\")"
    ]
   },
   {
@@ -56,7 +56,7 @@
     "library(randomForest)\n",
     "library(rpart)\n",
     "library(gbm)\n",
-    "library(keras)"
+    "library(keras3)"
    ]
   },
   {
@@ -356,7 +356,7 @@
     "  layer_dense(units = 1)\n",
     "\n",
     "  model %>% compile(\n",
-    "    optimizer = optimizer_adam(lr = 0.01),\n",
+    "    optimizer = optimizer_adam(learning_rate = 0.01),\n",
     "    loss = \"mse\",\n",
     "    metrics = c(\"mae\"),\n",
     "  )\n",
@@ -475,7 +475,7 @@
     "\n",
     "# Compile the model\n",
     "model %>% compile(\n",
-    "  optimizer = optimizer_adam(lr = 0.01),\n",
+    "  optimizer = optimizer_adam(learning_rate = 0.01),\n",
     "  loss = \"mse\",\n",
     "  metrics = c(\"mae\"),\n",
     ")\n",
diff --git a/PM3/r_ml_wage_prediction.irnb b/PM3/r_ml_wage_prediction.irnb
index 33ed7b2f..8f35f5d0 100644
--- a/PM3/r_ml_wage_prediction.irnb
+++ b/PM3/r_ml_wage_prediction.irnb
@@ -57,7 +57,7 @@
     "install.packages(\"nnet\")\n",
     "install.packages(\"gbm\")\n",
     "install.packages(\"rpart.plot\")\n",
-    "install.packages(\"keras\")"
+    "install.packages(\"keras3\")"
    ]
   },
   {
@@ -78,7 +78,7 @@
     "library(nnet)\n",
     "library(gbm)\n",
     "library(rpart.plot)\n",
-    "library(keras)"
+    "library(keras3)"
    ]
   },
   {
@@ -1281,7 +1281,7 @@
     "\n",
     "# Compile the model\n",
     "model %>% compile(\n",
-    "  optimizer = optimizer_adam(lr = 0.01),\n",
+    "  optimizer = optimizer_adam(learning_rate = 0.01),\n",
     "  loss = \"mse\",\n",
     "  metrics = c(\"mae\"),\n",
     ")\n",

From 3a3a4425d6026e3b3545b7272eeebf582b139dcc Mon Sep 17 00:00:00 2001
From: OliverSchacht <65898638+OliverSchacht@users.noreply.github.com>
Date: Thu, 1 Aug 2024 14:30:41 +0200
Subject: [PATCH 261/261] revert last commit

---
 PM3/r_functional_approximation_by_nn_and_rf.irnb | 8 ++++----
 PM3/r_ml_wage_prediction.irnb                    | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/PM3/r_functional_approximation_by_nn_and_rf.irnb b/PM3/r_functional_approximation_by_nn_and_rf.irnb
index e1d2c879..ef445b1b 100644
--- a/PM3/r_functional_approximation_by_nn_and_rf.irnb
+++ b/PM3/r_functional_approximation_by_nn_and_rf.irnb
@@ -39,7 +39,7 @@
     "install.packages(\"randomForest\")\n",
     "install.packages(\"rpart\")\n",
     "install.packages(\"gbm\")\n",
-    "install.packages(\"keras3\")"
+    "install.packages(\"keras\")"
    ]
   },
   {
@@ -56,7 +56,7 @@
     "library(randomForest)\n",
     "library(rpart)\n",
     "library(gbm)\n",
-    "library(keras3)"
+    "library(keras)"
    ]
   },
   {
@@ -356,7 +356,7 @@
     "  layer_dense(units = 1)\n",
     "\n",
     "  model %>% compile(\n",
-    "    optimizer = optimizer_adam(learning_rate = 0.01),\n",
+    "    optimizer = optimizer_adam(lr = 0.01),\n",
     "    loss = \"mse\",\n",
     "    metrics = c(\"mae\"),\n",
     "  )\n",
@@ -475,7 +475,7 @@
     "\n",
     "# Compile the model\n",
     "model %>% compile(\n",
-    "  optimizer = optimizer_adam(learning_rate = 0.01),\n",
+    "  optimizer = optimizer_adam(lr = 0.01),\n",
     "  loss = \"mse\",\n",
     "  metrics = c(\"mae\"),\n",
     ")\n",
diff --git a/PM3/r_ml_wage_prediction.irnb b/PM3/r_ml_wage_prediction.irnb
index 8f35f5d0..33ed7b2f 100644
--- a/PM3/r_ml_wage_prediction.irnb
+++ b/PM3/r_ml_wage_prediction.irnb
@@ -57,7 +57,7 @@
     "install.packages(\"nnet\")\n",
     "install.packages(\"gbm\")\n",
     "install.packages(\"rpart.plot\")\n",
-    "install.packages(\"keras3\")"
+    "install.packages(\"keras\")"
    ]
   },
   {
@@ -78,7 +78,7 @@
     "library(nnet)\n",
     "library(gbm)\n",
     "library(rpart.plot)\n",
-    "library(keras3)"
+    "library(keras)"
    ]
   },
   {
@@ -1281,7 +1281,7 @@
     "\n",
     "# Compile the model\n",
     "model %>% compile(\n",
-    "  optimizer = optimizer_adam(learning_rate = 0.01),\n",
+    "  optimizer = optimizer_adam(lr = 0.01),\n",
     "  loss = \"mse\",\n",
     "  metrics = c(\"mae\"),\n",
     ")\n",