-
Notifications
You must be signed in to change notification settings - Fork 0
/
Housing Prices Prediction.R
239 lines (203 loc) · 8.31 KB
/
Housing Prices Prediction.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
##### Part 1: Data Preprocessing #####
### Step 1: Importing the dataset
# Optionally point the session at the folder that holds the CSV first:
# setwd()
# Load the raw records from the CSV file in the working directory
df <- read.csv("Housing_Prices.csv")
### Step 2: Remove Irrelevant Columns
# Drop, in one call, the columns the rest of the script never uses:
#   INDICATOR, MEASURE, FREQUENCY - treated as not important
#   Flag.Codes                    - the "Flag Codes" column, which is empty
# SUBJECT was also considered for removal but is kept, because the script
# later separates the NOMINAL rows from the RENT rows with it:
# df <- subset(df, select = -c(SUBJECT))
df <- subset(df, select = -c(INDICATOR, MEASURE, FREQUENCY, Flag.Codes))
### Step 3: Combine Rows that have similar attributes
library(tidyverse)
library(tibble)
# Build one wide table per SUBJECT value (df1 = NOMINAL, df2 = RENT):
# one row per LOCATION, one column per TIME period, cells holding Value.
#
# Rows are filtered and columns are picked by name in a single subset() call.
# The earlier `x[1:nrow(x), c(1, 3, 4)]` form relied on column positions and,
# because 1:0 is c(1, 0), would have produced a spurious NA row for an empty
# subset.
#
# NOTE(review): spread() is superseded by tidyr::pivot_wider(); it is kept
# here because pivot_wider() may order rows/columns differently, which would
# change the later seeded train/test split - confirm before migrating.
df1 <- subset(df, SUBJECT == "NOMINAL", select = c(LOCATION, TIME, Value)) %>%
  spread(key = TIME, value = Value)
df2 <- subset(df, SUBJECT == "RENT", select = c(LOCATION, TIME, Value)) %>%
  spread(key = TIME, value = Value)
# Re-attach the SUBJECT label as the second column of each wide table
df1 <- add_column(df1, SUBJECT = "NOMINAL", .after = "LOCATION")
df2 <- add_column(df2, SUBJECT = "RENT", .after = "LOCATION")
### Step 4: Handling the missing data
# Mean imputation: within every value column (column 3 onwards) each NA is
# replaced by the mean of that column's non-missing entries.

# Return `values` with its NA entries replaced by the mean of the rest.
impute_with_column_mean <- function(values) {
  gaps <- is.na(values)
  if (any(gaps)) {
    values[gaps] <- mean(values, na.rm = TRUE)
  }
  values
}

# Columns 1 and 2 are skipped; only columns 3..ncol are imputed
nominal_value_cols <- 3:ncol(df1)
df1[nominal_value_cols] <- lapply(df1[nominal_value_cols], impute_with_column_mean)

rent_value_cols <- 3:ncol(df2)
df2[rent_value_cols] <- lapply(df2[rent_value_cols], impute_with_column_mean)
# NOMINAL has no 2022-Q4 figures, so give it an all-NA 2022-Q4 column
# (placed right after 2022-Q3) before stacking it with RENT
df1 <- add_column(df1, "2022-Q4" = NA, .after = "2022-Q3")
# Stack the NOMINAL rows on top of the RENT rows
df <- rbind(df1, df2)
# Positions of the missing cells, then how many there are
which(is.na(df))
sum(is.na(df))
# 2022-Q4 is empty for NOMINAL, so remove that column from all three tables
df <- df[names(df) != "2022-Q4"]
df1 <- df1[names(df1) != "2022-Q4"]
df2 <- df2[names(df2) != "2022-Q4"]
# Repeat the missing-value check on the combined table after the removal
which(is.na(df))
sum(is.na(df))
# Copy df into df3 because df is used in EDA while df3 is used in Feature Scaling
df3 <- df
### Step 5: Encoding categorical data
# To see the structure of the dataframe (before)
str(df3)
# Encode LOCATION as a factor labelled 1..k, where k is the number of
# DISTINCT locations, numbered in order of first appearance.
# The levels must be unique: passing the raw column (which repeats a location
# once per SUBJECT) created nrow(df3) levels, most of them never used.
location_levels <- unique(df3$LOCATION)
df3$LOCATION <- factor(
  df3$LOCATION,
  levels = location_levels,
  labels = seq_along(location_levels)
)
# Encode SUBJECT the same way (labels 1..number of distinct subjects).
# factor() already returns a factor, so no extra as.factor() is needed.
subject_levels <- unique(df3$SUBJECT)
df3$SUBJECT <- factor(
  df3$SUBJECT,
  levels = subject_levels,
  labels = seq_along(subject_levels)
)
### Step 6: Splitting the dataset into the training and test set
library(caTools) # provides sample.split()
set.seed(123) # fixed seed so the random split is repeatable
# One logical flag per row of df3, drawn on SUBJECT with SplitRatio = 0.8:
# TRUE sends the row to the training set, FALSE to the test set.
split <- sample.split(df3$SUBJECT, SplitRatio = 0.8)
# Materialise the two partitions from the flags
training_set <- df3[split, ]
test_set <- df3[!split, ]
### Step 7: Feature scaling
# Standardise every value column (everything except LOCATION and SUBJECT).
# Columns are selected by name instead of the hard-coded positions 3:10.
feature_cols <- setdiff(names(training_set), c("LOCATION", "SUBJECT"))
# Fit the centring/scaling parameters on the training set only ...
train_scaled <- scale(training_set[, feature_cols, drop = FALSE])
training_set[, feature_cols] <- train_scaled
# ... and reuse those same parameters for the test set. Scaling the test set
# with its own mean/sd would put the two sets on different scales.
test_set[, feature_cols] <- scale(
  test_set[, feature_cols, drop = FALSE],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)
# To see the structure of the dataframe (after)
str(df3)
# View the first few rows of the dataframe
head(df3)
head(training_set)
head(test_set)
# Print the summary of the dataframe
summary(df3)
summary(training_set)
summary(test_set)
################################################################################
##### Part 2: Exploratory Data Analysis(EDA) #####
### Visualising the data:
ylabel <- "Value"
xlabel <- "Locations"

# Build a horizontal bar chart of the 2020-Q4 value per location.
#   data  : wide table with a LOCATION column and a `2020-Q4` column
#   title : plot title (centred, bold)
# Bars are ordered and shaded by their 2020-Q4 value; a blue reference line
# marks the mean 2020-Q4 value of `data`.
# The plot-level `fill = x` mapping was dropped: no variable `x` exists and
# the bar layer supplies its own fill mapping.
# NOTE(review): ggplot2 >= 3.4 prefers `linewidth` over `size` for lines;
# `size` is kept so older ggplot2 versions behave the same - confirm version.
plot_2020q4_bars <- function(data, title) {
  ggplot(data = data, aes(x = reorder(LOCATION, `2020-Q4`), y = `2020-Q4`)) +
    geom_col(aes(fill = `2020-Q4`)) +
    coord_flip() +
    theme_grey() +
    scale_fill_gradient(name = "Value Level") +
    ggtitle(title) +
    ggeasy::easy_center_title() +
    theme(plot.title = element_text(face = "bold")) +
    labs(y = ylabel, x = xlabel) +
    geom_hline(yintercept = mean(data[["2020-Q4"]]), size = 1, color = "blue")
}

## a. Barplot for Nominal Housing Prices:
titlelabel <- "Nominal"
plot1 <- plot_2020q4_bars(df1, titlelabel)
## a. Barplot for Rent Housing Prices:
titlelabel <- "Rent"
plot2 <- plot_2020q4_bars(df2, titlelabel)
# To combine both plots on the same diagram:
library(gridExtra)
library(ggpubr)
grid.arrange(
  plot1, plot2,
  nrow = 1,
  top = text_grob('Barplot for the Value of Nominal and Rent Housing Prices by Locations in 2020-Q4', face = "bold"),
  bottom = ''
)
## b. Separated Boxplot:
# Long format: one row per LOCATION / SUBJECT / period, with the period label
# in MONTHS and the number in `value`. Built once and reused by both boxplots.
# Every column other than LOCATION and SUBJECT is treated as a period column,
# instead of the hard-coded positions 3:10.
df4 <- df %>%
  pivot_longer(cols = -c(LOCATION, SUBJECT), names_to = "MONTHS")
# Change the number in unique(df4$SUBJECT)[] to toggle between Nominal and Rent. Nominal is [1]; Rent is [2]
subject <- unique(df4$SUBJECT)[2]
titlelabel <- paste('Boxplot for the Housing Prices Value by Months for', subject)
# The x axis shows the period columns (e.g. 2020-Q4), not locations, so it
# gets its own label rather than the global `xlabel`.
period_label <- "Quarter"
# The points/boxes are mapped to `color`, so the Dark2 palette has to be
# applied with scale_color_brewer(); scale_fill_brewer() had no effect.
ggplot(
  data = dplyr::filter(df4, SUBJECT == subject),
  aes(x = MONTHS, y = value, color = MONTHS)
) +
  geom_boxplot() +
  scale_color_brewer(palette = "Dark2") +
  geom_jitter(shape = 16, position = position_jitter(0.2)) +
  ggtitle(titlelabel) +
  ggeasy::easy_center_title() +
  theme(plot.title = element_text(face = "bold")) +
  labs(y = ylabel, x = period_label)
## b. Combined Boxplot:
# Same chart for every SUBJECT, one panel per SUBJECT side by side
titlelabel <- 'Boxplot for the Value of Housing Prices by Locations'
ggplot(data = df4, aes(x = MONTHS, y = value, color = MONTHS)) +
  geom_boxplot() +
  scale_color_brewer(palette = "Dark2") +
  geom_jitter(shape = 16, position = position_jitter(0.2)) +
  ggtitle(titlelabel) +
  ggeasy::easy_center_title() +
  theme(plot.title = element_text(face = "bold")) +
  labs(y = ylabel, x = period_label) +
  facet_wrap(~SUBJECT, nrow = 1)
## c. Correlation Plot:
library("Hmisc")
library(corrplot)
# Show all corrplots together on a 2 x 2 grid. The previous graphics settings
# are saved so they can be restored once the correlation plots are drawn;
# otherwise the grid layout would carry over to every later plot.
old_par <- par(mfrow = c(2, 2))

# Draw one upper-triangle corrplot, variables ordered by hierarchical
# clustering, with the shared label/margin settings used for all three plots.
draw_corrplot <- function(corr_matrix, plot_title) {
  invisible(corrplot(
    corr_matrix,
    type = "upper", order = "hclust",
    tl.col = "black", tl.srt = 45, mar = c(0, 0, 2, 0),
    title = plot_title
  ))
}

# Nominal and Rent together.
# -(1:2) drops the first two columns (LOCATION and SUBJECT).
res <- cor(df[, -(1:2)])
res
res2 <- rcorr(as.matrix(df[, -(1:2)]))
res2
draw_corrplot(res, "Correlation Plot for the Value of Nominal and Rent")

# Nominal only
res <- cor(df1[, -(1:2)])
res
res2 <- rcorr(as.matrix(df1[, -(1:2)]))
res2
draw_corrplot(res, "Correlation Plot for the Value of Nominal Only")

# Rent only
res <- cor(df2[, -(1:2)])
res
res2 <- rcorr(as.matrix(df2[, -(1:2)]))
res2
draw_corrplot(res, "Correlation Plot for the Value of Rent Only")

# Restore the graphics settings that were active before this section
par(old_par)
#############################################################
##### Part 3: Clustering #####
# Elbow heuristic: compute the Within Sum of Squares (WSS) for a range of
# cluster counts to pick a reasonably good value of k
# training_set = training_set[, 3:11]
# Only SUBJECT is removed before clustering.
# NOTE(review): LOCATION is still a column of training_set at this point, so
# it is part of what kmeans() receives - confirm that this is intended.
training_set <- training_set[setdiff(names(training_set), "SUBJECT")]
training_set
# WSS for k = 1..15, each fit restarted 25 times
wss <- vapply(
  1:15,
  function(n_clusters) {
    fit <- kmeans(training_set, centers = n_clusters, nstart = 25)
    sum(fit$withinss)
  },
  numeric(1)
)
# Base R plot of each WSS against its number of centroids (1 through 15)
plot(1:15, wss, type="b", xlab="Number of Clusters", ylab="Within Sum of
Squares")
# Fit the final k-means model with 10 clusters and inspect its fit statistics
km <- kmeans(training_set, 10, nstart = 50)
km$withinss
km$tot.withinss
km$betweenss
# Number of data points assigned to each cluster
table(km$cluster)
# Scatter plots, coloured by cluster, of columns 7, 8 and 9
# (2022-Q1, 2022-Q2 and 2022-Q3) against column 1 (Location)
for (quarter_col in 7:9) {
  plot(training_set[c(quarter_col, 1)], col = km$cluster)
}