# week5_assi3_sol2.r

# -*- coding: utf-8 -*-
# """Week5 Assi3 Sol2.ipynb
# 
# Automatically generated by Colaboratory.
# 
# Original file is located at
#     https://colab.research.google.com/drive/1JKvVPp26EF71QCN4Ihq4ihjp6d2wpZpY
# """


###########################################################################
## Week-6, Homework-3, Sol-2
## Sreya Dhar 
## Created: Oct 02, 2020
## Edited: Oct 14, 2020
###########################################################################

## Remove every (non-hidden) object from the current environment so the script
## starts from an empty workspace. Attached packages and options are untouched.
rm(list = ls()) ## clearing working environment

## installing all the libraries in R kernel

# install.packages("corrplot")
# install.packages("forecast")
# install.packages("zoo")
# install.packages("rsample")
# install.packages("leaps")
# install.packages("car")
# install.packages("caret")
# install.packages("ROCR")
# install.packages("PerformanceAnalytics")
# install.packages("funModeling")
# install.packages("hrbrthemes")
# install.packages("ggthemes")
# install.packages("GGally")
# install.packages("data.table")
# install.packages("leaps")
## importing the libraries in R kernel

library(ggplot2)
library(dplyr)
library(tidyverse)
library(tidyr)
library(corrplot)
library(repr)
library(reshape2)
library(forecast)
library(zoo)
library(rsample)
library(gplots)
library(ROCR)
library(class)
library(readr)
library(leaps)
library(car)
library(PerformanceAnalytics)
library(funModeling)
library(caret)
library(MASS)
library(Hmisc)
library(hrbrthemes)
library(ggthemes)
library(data.table)
library(GGally)
library(leaps)

## Creating the simulated dataset:
##   Y = X %*% beta + eps, with 20 predictors of which 8 have a true
##   coefficient of exactly zero.
set.seed(100) ## seeding the sampling

n_obs <- 1000 # number of simulated observations (rows)
n_pred <- 20  # number of predictors (columns)

## Predictors: n_pred independent N(10, 5^2) columns
df_1 <- data.frame(replicate(n_pred, rnorm(n = n_obs, mean = 10, sd = 5)))
df_mat <- data.matrix(df_1)

## True coefficients; the listed positions are forced to zero
beta <- rnorm(n_pred)
zero_idx <- c(2, 4, 7, 9, 12, 15, 18, 20)
beta[zero_idx] <- 0

## Noise term: one draw per observation. Drawing only 20 values (as before)
## made R silently recycle the same 20 errors across all rows.
eps <- rnorm(n_obs, mean = 2, sd = 1)

Y <- df_mat %*% beta + eps
df <- cbind(df_1, Y) # response ends up as the last column, named "Y"

mean(Y)
sd(Y)

head(df)

## Correlation chart over all columns of the simulated data
options(
  repr.plot.width = 10,
  repr.plot.height = 10,
  repr.plot.res = 200
)
chart.Correlation(df, histogram = TRUE, pch = 15)

## Histograms on a 5 x 5 panel grid; hist() on a data frame relies on a
## data.frame method supplied by one of the attached packages
## (presumably Hmisc -- confirm).
options(
  repr.plot.width = 12,
  repr.plot.height = 10,
  repr.plot.res = 200
)
par(mfrow = c(5, 5))
hist(df, col = "grey")

## Shape and first rows of the data
dim(df)

head(df)

## Train/test partition: 80% of the rows go to training ('rsample')
df_split1 <- initial_split(df, prop = 0.8)
df_train <- training(df_split1)
df_test <- testing(df_split1)

## Separate predictors (X) from the response (Y); the response is column 21
resp_idx <- 21
X_train <- df_train[, -resp_idx]
Y_train <- df_train[, resp_idx]
X_test <- df_test[, -resp_idx]
Y_test <- df_test[, resp_idx]

head(df_test)

## library() stops with an error when a package is missing, whereas require()
## only returns FALSE and lets the script fail later with a confusing message.
library(leaps)
library(ggplot2)
library(dplyr)
library(ggthemes)

################################### Exhaustive Subsets selection, nbest=1 ##################################

## Best-subset search over all predictors of Y in the training set.
data_exh <- regsubsets(
  Y ~ .,
  data = df_train,
  nbest = 1,         # keep only the single best model for each size
  nvmax = 20,        # consider models with up to 20 predictors
  force.in = NULL,   # no predictor is forced into every model
  force.out = NULL,  # no predictor is excluded up front
  really.big = TRUE,
  method = "exhaustive"
)
exh_sum <- summary(data_exh)
names(exh_sum)

## Which predictors enter the best model of each size
as.data.frame(exh_sum$outmat)

## R^2 of the best model of each size
exh_sum$rsq

## Coefficients of the full (20-predictor) model
coef(data_exh, 20)

# Plot of R^2 against the number of variables in the best model of each size

options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 200)
exh_r2 <- as.data.frame(exh_sum$rsq)
names(exh_r2) <- "R2"

model_sizes <- seq_len(nrow(exh_r2))
best_r2_size <- which.max(exh_r2[, "R2"])

plot(
  x = model_sizes, y = exh_r2[, "R2"],
  xlab = "Number of Variables", ylab = "R^2", type = "l"
)
points(x = model_sizes, y = exh_r2[, "R2"], col = "red", cex = 1, pch = 20)

# Dotted cross-hair at the maximum. abline() only understands a/b/h/v;
# the former x=, y= and type= arguments were ignored with warnings.
abline(
  v = best_r2_size, h = exh_r2[best_r2_size, "R2"],
  col = "blue", lty = 3
)

options(repr.plot.width = 6, repr.plot.height = 6, repr.plot.res = 200)

## Plot Cp, BIC, RSS, Adjusted R2 for the exhaustive model (nbest=1)

## One entry per panel: the criterion values, the axis label, and the function
## that locates the best model size (minimum for Cp/BIC/RSS, maximum for
## adjusted R^2).
criteria <- list(
  list(values = exh_sum$cp, label = "Mallow's Cp", pick = which.min),
  list(values = exh_sum$bic, label = "BIC", pick = which.min),
  list(values = exh_sum$rss, label = "RSS", pick = which.min),
  list(values = exh_sum$adjr2, label = "Adjusted R^2", pick = which.max)
)

par(mfrow = c(2, 2))
for (crit in criteria) {
  crit_values <- crit[["values"]]
  best_size <- crit[["pick"]](crit_values)

  plot(
    crit_values,
    xlab = "Number of Variables", ylab = crit[["label"]], type = "l"
  )
  points(
    x = seq_along(crit_values), y = crit_values,
    col = "red", cex = 1, pch = 20
  )
  ## Dotted cross-hair at the optimum. abline() only understands a/b/h/v;
  ## the former x=, y= and type= arguments were ignored with warnings.
  abline(v = best_size, h = crit_values[best_size], col = "blue", lty = 3)
}

# Model size (number of variables) preferred by each selection criterion:
# adjusted R^2 is maximised; Cp, BIC and RSS are minimised.

size_by_adjr2 <- which.max(exh_sum$adjr2)
size_by_cp <- which.min(exh_sum$cp)
size_by_bic <- which.min(exh_sum$bic)
size_by_rss <- which.min(exh_sum$rss)

data.frame(
  Adj.R2 = size_by_adjr2,
  CP = size_by_cp,
  BIC = size_by_bic,
  RSS = size_by_rss
)

options(repr.plot.width = 8, repr.plot.height = 8, repr.plot.res = 200)
par(mfrow = c(2, 2))

# One regsubsets plot per scale; names are the `scale` values, entries the titles
panel_titles <- c(
  r2 = "R^2",
  adjr2 = "Adjusted R^2",
  Cp = "Cp",
  bic = "BIC"
)
for (scale_name in names(panel_titles)) {
  plot(data_exh, scale = scale_name, main = panel_titles[[scale_name]])
}

# Selection pattern of the 12- and 14-variable models
exh_sum$outmat[12, ]
exh_sum$outmat[14, ]

# Coefficients of the 12-variable and of the full 20-variable model
data.frame(coef(data_exh, 12))

data.frame(coef(data_exh, 20))

# Variables entering the best models, shown side by side for two statistics
options(repr.plot.width = 10, repr.plot.height = 5, repr.plot.res = 200)
par(mfrow = c(1, 2))

## Left panel: adjusted R^2 (models with at least 5 variables)
res_adjr <- subsets(
  data_exh,
  statistic = "adjr2",
  legend = FALSE,
  min.size = 5,
  main = "Adjusted R^2"
)

## Right panel: Mallow's Cp, with a dashed reference line (intercept 1, slope 1)
res_mcp <- subsets(
  data_exh,
  statistic = "cp",
  legend = FALSE,
  min.size = 5,
  main = "Mallow Cp"
)
abline(a = 1, b = 1, lty = 2)

## The returned object serves as the legend for the plots above
res_adjr

## Mean squared prediction error on the train and test sets for the best
## model of every size (1 to 20 predictors)

## Design matrices (with intercept column) matching the coefficient names
new_test_ex <- model.matrix(Y ~ ., data = df_test)
new_train_ex <- model.matrix(Y ~ ., data = df_train)

## MSE of the best `size`-variable model on the given design matrix/response
subset_mse_ex <- function(size, design, response) {
  coefs <- coef(data_exh, id = size)
  fitted_vals <- design[, names(coefs)] %*% coefs
  mean((response - fitted_vals)^2)
}

subset_sizes_ex <- seq_len(20)

test_error_ex <- vapply(
  subset_sizes_ex, subset_mse_ex, numeric(1),
  design = new_test_ex, response = df_test$Y
)
train_error_ex <- vapply(
  subset_sizes_ex, subset_mse_ex, numeric(1),
  design = new_train_ex, response = df_train$Y
)

## Train vs. test MSE by subset size.
## Colour convention used consistently for curve, marker line and legend:
##   test set = red, train set = blue.
options(repr.plot.width = 7, repr.plot.height = 7, repr.plot.res = 200)

best_test_size <- which.min(test_error_ex)
best_train_size <- which.min(train_error_ex)

plot(
  test_error_ex,
  col = "red", type = "b",
  xlab = "subset size", ylab = "MSE from exhaustive model (nbest=1)",
  ## derive the upper limit from the data instead of a hard-coded constant so
  ## that neither curve can be clipped
  ylim = c(0, max(test_error_ex, train_error_ex))
)
## Vertical dashed line at the subset size with the smallest test MSE.
## (abline() has no y= or type= argument; they only produced warnings.)
abline(v = best_test_size, col = "red", lty = 2, lwd = 2)

lines(train_error_ex, col = "blue", type = "b")
## Vertical dashed line at the subset size with the smallest train MSE
abline(v = best_train_size, col = "blue", lty = 2, lwd = 2)

## Legend colours now match the curves (they were swapped before)
legend(
  "topright",
  inset = .02, c("Test Set", "Train Set"),
  lty = c(1, 1), lwd = c(2.5, 2.5), col = c("red", "blue")
)

print(best_test_size)
print(best_train_size)

coef(data_exh, id = 12)

col_head <- colnames(X_train)
coeff_errors <- rep(NA, 20)
coeff_no <- rep(NA, 20)
res_err_coeff <- rep(NA, 20)

## Euclidean distance between the true coefficient vector and the estimate of
## one subset model. Predictors that are not in the model count as an estimate
## of zero. The intercept is ignored because `beta` has no intercept entry.
##   true_beta: true coefficients, in the order of `pred_names`
##   est_coef:  named coefficient vector as returned by coef() on the fit
##   pred_names: predictor names
coef_distance <- function(true_beta, est_coef, pred_names) {
  est_full <- setNames(numeric(length(pred_names)), pred_names)
  in_model <- intersect(names(est_coef), pred_names)
  ## align by name rather than by position so the pairing cannot drift
  est_full[in_model] <- est_coef[in_model]
  sqrt(sum((true_beta - est_full)^2))
}

for (i in seq_along(res_err_coeff)) {
  coeffi <- coef(data_exh, id = i)
  coeff_no[i] <- length(coeffi) - 1 # number of predictors (minus intercept)
  ## The previous formula used sum(beta[excluded])^2, i.e. it squared the SUM
  ## of the excluded true coefficients instead of summing their squares.
  res_err_coeff[i] <- coef_distance(beta, coeffi, col_head)
}


## Table and plot of the coefficient error against model size
options(repr.plot.width = 4, repr.plot.height = 3, repr.plot.res = 200)
err_tab <- data.table(coeff_no, res_err_coeff, keep.rownames = TRUE)

ggplot(err_tab, aes(coeff_no, res_err_coeff)) +
  geom_point(col = "red") +
  geom_line() +
  theme_bw() +
  ggtitle("Residuals between true and predicted coeffs.") +
  xlab("No. of Coefficients") +
  ylab("Residual error of coefficients")

err_tab

## Row of the table with the smallest coefficient error
which.min(err_tab$res_err_coeff)

## True coefficients next to their predictor names
data.frame(col_head, beta)

## end ###