-
Notifications
You must be signed in to change notification settings - Fork 1
/
regression_R.r
104 lines (67 loc) · 2.41 KB
/
regression_R.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# install.packages("corrplot")
# install.packages("caret")
# install.packages("randomForest")
# install.packages("ridge")
#
# library(dplyr)
# library(ggplot2)
# library(gridExtra)
# library(tidyverse)
# library(grid)
# library(corrplot)
# library(caret)
# library(randomForest)
# library(ridge)
#' Fit and report regression models for movie user ratings
#'
#' Reads the label-encoded movie data, keeps movies with more than 5000
#' votes, drops rows with a repeated title, and splits the remaining rows
#' 80/20 into training and test sets. A k-nearest-neighbour regression
#' (k = 12) is fitted on the training set and its R2, RMSE and MAE on the test
#' set are printed, in that order. A linear model of `User.Rating` is also
#' fitted on the full filtered data.
#'
#' `knnreg()`, `R2()`, `RMSE()` and `MAE()` are not defined in this file and
#' must be available on the search path when this function is called
#' (presumably from the caret package, whose `library()` call is commented
#' out at the top of this file -- confirm against the calling script).
#'
#' The function calls `set.seed(123)`, so it resets the caller's random
#' number generator state.
#'
#' @param data_path Path of the CSV file to read. Defaults to the location
#'   that was previously hard-coded, so existing `regression()` calls behave
#'   as before.
#' @return Invisibly, the hard-coded linear-model p-value (`2.2e-16`), which
#'   is what the original implementation returned as the value of its last
#'   assignment.
regression <- function(data_path = "Data_Files/Label-Encoded-data.csv") {
  # Fixed seed so the train/test split is reproducible between runs.
  set.seed(123)

  # Read the dataset.
  data <- read.csv(data_path, stringsAsFactors = FALSE, row.names = NULL)
  # Remove the index column.
  data <- subset(data, select = -c(X))
  # Keep only movies with more than 5000 votes.
  data <- subset(data, Votes > 5000)
  # Keep the first row for each title.
  data <- data[!duplicated(data$Title), ]
  # The title is no longer needed once duplicates are gone.
  data <- subset(data, select = -c(Title))
  # Convert to a data frame, if not already one.
  data <- data.frame(data)

  # Pearson correlation matrix over complete rows. The result is not used
  # further in this function; the column removal below is described by the
  # original author as being based on it.
  correlation_matrix <- cor(data, method = "pearson", use = "complete.obs")

  # Remove the columns without a significant correlation with User.Rating.
  data <- subset(
    data,
    select = -c(
      Director, Genre, Sub.Genre.1, Sub.Genre.2,
      Actor..1, Actor..2, Actor..3, Actor..4
    )
  )

  n_rows <- nrow(data)
  if (n_rows < 2L) {
    stop(
      "Need at least 2 rows after filtering to build a train/test split, got ",
      n_rows, ".",
      call. = FALSE
    )
  }

  # 80/20 split. floor() makes the truncation of the fractional sample size
  # explicit; seq_len() avoids the c(1, 0) trap of 1:nrow(data).
  train_index <- sample.int(n_rows, size = floor(0.8 * n_rows))
  test_index <- setdiff(seq_len(n_rows), train_index)

  # Select predictors by name rather than by position, so the target can
  # never end up among the features if the column order of the CSV changes.
  target <- "User.Rating"
  feature_cols <- setdiff(names(data), target)
  X_train <- data[train_index, feature_cols, drop = FALSE]
  y_train <- data[train_index, target]
  X_test <- data[test_index, feature_cols, drop = FALSE]
  y_test <- data[test_index, target]

  ### K Nearest Neighbor Regression ###
  fit_knn <- knnreg(X_train, y_train, k = 12)
  y_pred_knn <- predict(fit_knn, X_test)

  # R2 score on the test set.
  r2_knn <- R2(y_pred_knn, y_test)
  print(r2_knn)
  # RMSE on the test set. Original author's note: an RMSE of 0.78 against a
  # mean rating of about 6.5 is roughly a 12% relative error.
  rmse_knn <- RMSE(y_pred_knn, y_test)
  print(rmse_knn)
  # Mean absolute error on the test set.
  mae_knn <- MAE(y_pred_knn, y_test)
  print(mae_knn)

  ### Simple Linear Modelling ###
  # Fitted on the full filtered data, not only on the training rows.
  linearMod <- lm(
    User.Rating ~ Duration.Min.. + Ranking + Release.Year + Votes,
    data = data
  )
  # Hard-coded figures that the original author labels as the calculated
  # adjusted R2 and p-value of the linear model. They are NOT derived from
  # linearMod here.
  # TODO(review): compute them from summary(linearMod) instead.
  adjusted_r2_lm <- 0.4483
  p_value_lm <- 2.2e-16

  ### Random Forest Regression ###
  # Create the model
  # model_rfr <- randomForest(User.Rating ~ ., data = data, mtry = 4, importance = TRUE, na.action = na.omit)
  #
  # #Store variance explained by model
  # var_explained<-71.21

  # Same value the original returned (invisibly) from its final assignment.
  invisible(p_value_lm)
}