-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathML.R
193 lines (181 loc) · 8.6 KB
/
ML.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# Load the workshop dataset from its CSV file into a data frame.
small_moecules <- read.csv(
  file = "C:/Users/amara/OneDrive - Macquarie University/CRF/R-workshop_july2023/small_molecules_dataset.csv"
)
# Report how many rows and columns were read.
dim(small_moecules)
# Open the raw table in the interactive data viewer.
View(small_moecules)
# Step 1: Missing Value Filter
# Linear positions (within the logical NA matrix) of every missing cell.
which(is.na(small_moecules))
# Total number of missing cells in the table.
sum(is.na(small_moecules))
# Option A: drop every row that contains at least one missing value.
small_moecules_new <- na.omit(small_moecules)
dim(small_moecules_new) # author's observation: all rows deleted
# Option B (used from here on): keep all rows and instead drop every
# column (feature) that contains at least one missing value.
small_moecules_MVF <- small_moecules[, !vapply(small_moecules, anyNA, logical(1))]
# Dimensions after the missing value filter.
dim(small_moecules_MVF)
# Step 2: Removing the low-variance data
library(caret)
# Column positions that caret::nearZeroVar() flags as zero or near-zero
# variance in the missing-value-filtered table.
nzv <- nearZeroVar(small_moecules_MVF)
nzv
# Drop the flagged columns. The empty case must be guarded: when nothing is
# flagged, `nzv` is integer(0), `-nzv` is also integer(0), and
# `df[, integer(0)]` would select NO columns instead of all of them.
# drop = FALSE keeps the result a data frame even if one column survives.
if (length(nzv) > 0) {
  small_moecules_MVF_LVF <- small_moecules_MVF[, -nzv, drop = FALSE]
} else {
  small_moecules_MVF_LVF <- small_moecules_MVF
}
# Inspect the result: viewer, dimensions, and column types.
View(small_moecules_MVF_LVF)
dim(small_moecules_MVF_LVF)
str(small_moecules_MVF_LVF)
# Step 3: removing the highly correlated data
# Keep only the numeric columns; cor() below needs numeric input.
small_moecules_MVF_LVF_num <- small_moecules_MVF_LVF[
  vapply(small_moecules_MVF_LVF, is.numeric, logical(1))
]
View(small_moecules_MVF_LVF_num)
# Pairwise correlation matrix between all numeric features.
small_moecules_MVF_LVF_num_cor <- cor(small_moecules_MVF_LVF_num)
small_moecules_MVF_LVF_num_cor
# Visualize the correlations among the first five features.
library(corrplot)
corrplot(small_moecules_MVF_LVF_num_cor[1:5, 1:5], method = "number")
# Zero out the upper triangle and the diagonal so that every feature pair
# is represented exactly once (in the lower triangle).
cor_matrix_rm <- small_moecules_MVF_LVF_num_cor
cor_matrix_rm[upper.tri(cor_matrix_rm, diag = TRUE)] <- 0
cor_matrix_rm
# Visualize the reduced matrix for the same five features.
corrplot(cor_matrix_rm[1:5, 1:5], method = "number")
# Keep a feature only if none of its remaining (lower-triangle) correlations
# exceeds 0.70, i.e. drop it when it correlates above 0.70 with any feature
# that comes after it in the table.
# NOTE(review): only positive correlations are tested; strongly negative
# pairs (e.g. -0.9) are kept. Confirm whether abs() was intended.
small_moecules_MVF_LVF_num_remcor <- small_moecules_MVF_LVF_num[
  ,
  apply(cor_matrix_rm, 2, function(col_cors) all(col_cors <= 0.70))
]
# Inspect the table after removing highly correlated variables.
dim(small_moecules_MVF_LVF_num_remcor)
View(small_moecules_MVF_LVF_num_remcor)
# Attach the classification labels to the filtered feature table.
# The labels are read from the second column of the original dataset.
class <- small_moecules[[2]]
class
# The label column comes first, followed by the retained numeric features.
filtered_data <- cbind(class, small_moecules_MVF_LVF_num_remcor)
View(filtered_data)
# Step 4: splitting into training and test data
library(caTools)
set.seed(100)
# sample.split() must receive the vector of class labels, not the whole data
# frame. Given a data frame it measures length() as the number of COLUMNS, so
# the returned mask has one flag per column; indexing rows with it recycles
# that mask and the split is neither stratified by class nor row-based.
split_data <- sample.split(Y = filtered_data$class, SplitRatio = 0.7)
traindata <- filtered_data[split_data, ]
testdata <- filtered_data[!split_data, ]
View(testdata)
dim(traindata)
dim(testdata)
# Drop any rows that still contain missing values in either partition.
traindata_filter <- na.omit(traindata)
testdata_filter <- na.omit(testdata)
View(traindata_filter)
# Class counts in the training partition, to reveal class imbalance.
table(traindata_filter$class)
# Bar chart of the same class counts.
barplot(table(traindata_filter$class), col = rainbow(2))
# Step 5: dealing with the imbalanced dataset through oversampling
library(ROSE)
# Target size for a balanced two-class set: twice the majority-class count.
# The script previously hard-coded N = 374, which is only correct when the
# majority class has exactly 187 training rows; deriving it from the data
# keeps the oversampling valid for any split.
balanced_n <- 2 * max(table(traindata_filter$class))
oversampled_data <- ovun.sample(
  class ~ .,
  data = traindata_filter,
  method = "over",
  N = balanced_n
)$data
table(oversampled_data$class)
View(oversampled_data)
# For the later modelling steps (RFE, random forest classification) the
# response variable should be a factor.
str(oversampled_data$class)
oversampled_data$class <- as.factor(oversampled_data$class)
str(oversampled_data$class)
# Baseline model: random forest on every feature that survived the filter
# steps (missing values, low variance, high correlation).
library(randomForest)
set.seed(123)
rf_all <- randomForest(
  class ~ .,
  data = oversampled_data
)
rf_all
# Predict the class of each held-out test row with the baseline model.
test_predict <- predict(rf_all, newdata = testdata_filter)
# confusionMatrix() compares factors, so convert the test labels first.
testdata_filter$class <- as.factor(testdata_filter$class)
confusionMatrix(data = test_predict, reference = testdata_filter$class)
# Step 6(a): feature selection through automatic recursive feature elimination
library(mlbench)
library(caret)
library(randomForest)
# RFE settings: random-forest helper functions, 10-fold cross-validation.
control <- rfeControl(functions = rfFuncs, method = "cv", number = 10)
# Select predictors by name (everything except the response) instead of the
# hard-coded positions 2:102, which silently depend on how many columns the
# earlier filter steps happened to keep.
predictor_cols <- setdiff(names(oversampled_data), "class")
# Run RFE, evaluating every subset size from 2 up to the full predictor set.
results <- rfe(
  x = oversampled_data[, predictor_cols, drop = FALSE],
  y = oversampled_data$class,
  sizes = seq_along(predictor_cols)[-1],
  rfeControl = control
)
# Print the summary together with the top 5 variables.
print(results, top = 5)
# Plot the resampled performance against the number of variables.
plot(results, type = "o")
# List the variables retained by RFE.
predictors(results)
# Random forest restricted to five RFE-selected features.
# The feature names are hard-coded from an earlier RFE run.
rf_rfe <- randomForest(
  class ~ JGI6 + FilterItLogS + ZMIC5 + mZagreb2 + GATS3are,
  data = oversampled_data
)
rf_rfe
# Predict the held-out test rows with the five-feature model.
test_predict <- predict(rf_rfe, newdata = testdata_filter)
# Compare the predictions against the true test labels.
confusionMatrix(data = test_predict, reference = testdata_filter$class)
# Exercise: try taking 10 features
# Step 6(b): selecting the features with the Boruta package
library(Boruta)
set.seed(1)
# maxRuns caps the number of Boruta iterations (increase it gradually if
# needed); doTrace = 2 prints progress while the search runs.
boruta <- Boruta(
  class ~ .,
  data = oversampled_data,
  doTrace = 2,
  maxRuns = 50
)
print(boruta)
# Importance plot; las = 2 and the small axis font keep the many feature
# labels readable.
plot(boruta, las = 2, cex.axis = 0.5)
# Importance history across runs: use it to compare features against the
# shadow-variable (blue) lines and spot ones that might not be important.
plotImpHistory(boruta)
# Feature-wise details of the Boruta result.
attStats(boruta)
# Step 7: building the random forest model with default parameters,
# using the features that Boruta confirmed.
# getConfirmedFormula() builds the model formula from the Boruta result.
boruta_features <- getConfirmedFormula(boruta)
boruta_features
# Fit on the computed formula directly. The script used to paste a
# hand-copied version of this formula (92 terms), which silently goes stale
# whenever the data, the seed, or maxRuns changes.
rf_boruta <- randomForest(boruta_features, data = oversampled_data)
rf_boruta
# Evaluate the Boruta-based model on the held-out test rows.
p <- predict(rf_boruta, testdata_filter)
confusionMatrix(p, testdata_filter$class)
# Step 6(c): selecting features by random-forest variable importance
# Resampling scheme: 10-fold cross-validation, repeated 3 times.
control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)
# Train a random forest through caret using that resampling scheme.
model <- train(
  class ~ .,
  data = oversampled_data,
  method = "rf",
  trControl = control
)
# Scaled variable importance of the trained model.
importance <- varImp(model, scale = TRUE)
# Show and plot the 20 most important variables.
print(importance, top = 20)
plot(importance, top = 20)
# Building the RF model using the variables suggested by variable importance.
# JGI5 was listed twice in this formula; a formula keeps each term only once,
# so the model always had nine distinct predictors. The duplicate is removed
# here to make that explicit.
# NOTE(review): a tenth feature was probably intended in place of the second
# JGI5 - check the output of print(importance, top = 20) and add it.
rf_vi <- randomForest(
  class ~ FilterItLogS + mZagreb2 + JGI6 +
    ATSC7d + ATSC7are + AMID_C + JGI7 + JGI5 +
    ATSC6are,
  data = oversampled_data
)
rf_vi
# Evaluate the model on the held-out test rows.
p <- predict(rf_vi, testdata_filter)
confusionMatrix(p, testdata_filter$class)
# Hyperparameter tuning for the random forest
# Search for a good mtry (number of features tried at each split).
# Predictors are selected by name rather than by the hard-coded positions
# 2:102, which depend on how many columns the earlier filters kept.
t <- tuneRF(
  x = oversampled_data[, setdiff(names(oversampled_data), "class"), drop = FALSE],
  y = oversampled_data$class,
  stepFactor = 0.5, # factor by which mtry is changed at each step
  plot = TRUE,      # plot the OOB error against mtry
  ntreeTry = 500,   # trees grown for each candidate mtry
  trace = TRUE,     # print the progress of the search
  improve = 0.05    # minimum relative OOB improvement to keep searching
)
# Random forest on all features with the tuned settings.
# `method = "cv"` and `number = 10` were removed from both calls below: they
# are caret resampling settings, not randomForest() arguments, so they were
# silently swallowed by `...` and no cross-validation was ever performed.
rf2 <- randomForest(
  class ~ .,
  data = oversampled_data,
  ntree = 500,
  mtry = 10,
  importance = TRUE
)
# Error rate against the number of trees, used to choose ntree.
plot(rf2)
# Final model on the variable-importance features.
# JGI5 was listed twice in the original formula; a formula keeps each term
# once, so there are nine distinct predictors. mtry cannot exceed the number
# of predictors: the previous mtry = 10 made randomForest() warn
# ("invalid mtry") and reset it to 9, which is now stated explicitly.
# NOTE(review): if a tenth feature was intended instead of the duplicate
# JGI5, add it to the formula and raise mtry accordingly.
rf_final <- randomForest(
  class ~ FilterItLogS + mZagreb2 + JGI6 +
    ATSC7d + ATSC7are + AMID_C + JGI7 + JGI5 +
    ATSC6are,
  data = oversampled_data,
  ntree = 140,
  mtry = 9,
  importance = TRUE
)
rf_final
# Evaluate the final model on the held-out test rows.
p <- predict(rf_final, testdata_filter)
confusionMatrix(p, testdata_filter$class)
# If you need further help, please contact me.