-
Notifications
You must be signed in to change notification settings - Fork 0
/
fraud detection.R
159 lines (159 loc) · 6.52 KB
/
fraud detection.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# ---- Import and inspect the data set ----
# The CSV location can be overridden through the CC_DATA_PATH environment
# variable; when that variable is unset or empty, the original hard-coded
# location is used, so existing setups keep working unchanged.
data_path <- Sys.getenv("CC_DATA_PATH", unset = "")
if (!nzchar(data_path)) {
  data_path <- 'C:\\Users\\Vineeta\\Desktop\\vlad-credit-card-fraud-detection\\CC.csv'
}
# Fail early with an explicit message instead of a generic connection error.
if (!file.exists(data_path)) {
  stop("Credit card data file not found: ", data_path, call. = FALSE)
}
credit_card <- read.csv(data_path)
# Show the structure (columns and their types) of the loaded data.
str(credit_card)
# Store the class label as a factor with the two levels 0 and 1
# (labelled "legit" and "fraud" respectively by the pie chart in this script).
credit_card$Class <- factor(credit_card$Class, levels = c(0, 1))
# Per-column summary of the data set.
summary(credit_card)
# Total number of missing values across all columns.
sum(is.na(credit_card))
# ---- Fraud vs. legit transactions in the data set ----
# Absolute number of transactions in each class.
table(credit_card$Class)
# The same distribution expressed as proportions.
prop.table(table(credit_card$Class))
# Pie chart of the class shares. Each slice label is built as
# "<name> <percentage>%", with the percentage rounded to two decimals.
labels <- paste0(
  paste(
    c("legit", "fraud"),
    round(100 * prop.table(table(credit_card$Class)), 2)
  ),
  "%"
)
pie(
  table(credit_card$Class),
  labels,
  col = c("pink", "white"),
  main = "Pie chart of fraud and legit credit card transactions"
)
# ---- Baseline without a model ----
# Label every transaction as legitimate (class 0) and see how that trivial
# rule scores. The factor carries both levels so it lines up with the
# reference labels even though only 0 ever occurs.
predictions <- factor(rep.int(0, nrow(credit_card)), levels = c(0, 1))
#install.packages('caret')
library(caret)
# Confusion matrix of the all-legit baseline against the true labels.
# positive = "1" makes fraud the positive class; without it caret uses the
# first factor level (0 = legit), so sensitivity/precision would describe
# legit transactions rather than the fraud cases this script is about.
confusionMatrix(
  data = predictions,
  reference = credit_card$Class,
  positive = "1"
)
# ---- Building the model ----
library(dplyr)
set.seed(1)
# Class counts of the full data set.
table(credit_card$Class)
library(ggplot2)
# Scatter plot of V1 against V2 for the full data set, coloured by class
# (first level in blue, second level in red).
ggplot(credit_card, aes(V1, V2, colour = Class)) +
  geom_point() +
  theme_bw() +
  scale_colour_manual(values = c("dodgerblue2", "red"))
# ---- Training and test sets for fraud detection ----
#install.packages('caTools')
library(caTools)
set.seed(123)
# Logical mask over the rows of credit_card, built from the class labels
# with a split ratio of 0.80: TRUE rows go to training, FALSE rows to test.
data_sample <- sample.split(credit_card$Class, SplitRatio = 0.80)
train_data <- subset(credit_card, data_sample)
test_data <- subset(credit_card, !data_sample)
# Rows and columns of each partition.
dim(train_data)
dim(test_data)
# ---- Balancing the data set: random over-sampling ----
# Fraud cases are duplicated until the requested total size is reached.
table(train_data$Class)
# Count the legit rows from the training data instead of hard-coding the
# figure, so the block stays correct if the data or the split changes.
n_legit <- sum(train_data$Class == "0", na.rm = TRUE)
# Desired share of legit rows after over-sampling.
new_frac_legit <- 0.50
# Total size at which the (unchanged) legit rows make up that share.
new_n_total <- n_legit / new_frac_legit
#install.packages('ROSE')
library(ROSE)
oversampling_result <- ovun.sample(
  Class ~ .,
  data = train_data,
  method = "over",
  N = new_n_total,
  seed = 2019
)
oversampled_credit <- oversampling_result$data
# Class counts after over-sampling.
table(oversampled_credit$Class)
# V1 vs. V2 coloured by class; points are jittered horizontally so that
# duplicated fraud rows do not sit exactly on top of each other.
ggplot(data = oversampled_credit, aes(x = V1, y = V2, col = Class)) +
  geom_point(position = position_jitter(width = 0.2)) +
  theme_bw() +
  scale_color_manual(values = c("dodgerblue2", "red"))
# ---- Random under-sampling (reducing the number of legitimate cases) ----
table(train_data$Class)
# Count the fraud rows from the training data instead of hard-coding the
# figure, so the block stays correct if the data or the split changes.
n_fraud <- sum(train_data$Class == "1", na.rm = TRUE)
# Desired share of fraud rows after under-sampling.
new_frac_fraud <- 0.50
# Total size at which the (unchanged) fraud rows make up that share.
new_n_total <- n_fraud / new_frac_fraud
undersampling_result <- ovun.sample(
  Class ~ .,
  data = train_data,
  method = "under",
  N = new_n_total,
  seed = 2019
)
undersampled_credit <- undersampling_result$data
# Class counts after under-sampling.
table(undersampled_credit$Class)
# V1 vs. V2 of the under-sampled data, coloured by class.
ggplot(data = undersampled_credit, aes(x = V1, y = V2, col = Class)) +
  geom_point() +
  theme_bw() +
  scale_color_manual(values = c("dodgerblue2", "red"))
# ---- Random under- and over-sampling combined ----
# (duplicated fraud rows overlap each other, hence the jitter in the plot)
# Keep the total size equal to the training set and ask for a 50% fraud share.
n_new <- nrow(train_data)
fraction_fraud_new <- 0.50
sampling_result <- ovun.sample(
  Class ~ .,
  data = train_data,
  method = "both",
  N = n_new,
  p = fraction_fraud_new,
  seed = 2019
)
sampled_credit <- sampling_result[["data"]]
# Class counts and class proportions of the resampled data.
table(sampled_credit$Class)
prop.table(table(sampled_credit$Class))
# V1 vs. V2 coloured by class, jittered horizontally.
ggplot(sampled_credit, aes(V1, V2, colour = Class)) +
  geom_point(position = position_jitter(width = 0.2)) +
  theme_bw() +
  scale_colour_manual(values = c("dodgerblue2", "red"))
# ---- SMOTE (synthetic minority over-sampling technique) ----
# Adds synthetic fraud points instead of duplicating existing ones.
#install.packages('smotefamily')
library(smotefamily)
table(train_data$Class)
# Class counts taken from the training data rather than hard-coded figures.
n0 <- sum(train_data$Class == "0", na.rm = TRUE)  # legit rows
n1 <- sum(train_data$Class == "1", na.rm = TRUE)  # fraud rows
# The duplication factor below divides by n1, so fail clearly if it is zero.
if (n1 == 0) {
  stop("train_data contains no fraud rows; SMOTE cannot be applied",
       call. = FALSE)
}
# Target share of legit rows after SMOTE.
r0 <- 0.6
# Number of synthetic rows to generate per original fraud row so that
# n1 * (1 + ntimes) fraud rows against n0 legit rows yields the share r0.
ntimes <- ((1 - r0) / r0) * (n0 / n1) - 1
# Features passed to SMOTE: every column except the first one and the label.
# The label is selected by name instead of by position (the original used
# column 32).
# NOTE(review): presumably the first column is a row index written into the
# CSV - confirm against the data file.
feature_cols <- setdiff(names(train_data)[-1], "Class")
smote_output <- SMOTE(
  X = train_data[, feature_cols, drop = FALSE],
  target = train_data$Class,
  K = 5,
  dup_size = ntimes
)
credit_smote <- smote_output$data
# smotefamily appends the target as the last column of the result; rename it
# to "Class" so it matches the label name used by the rest of the script.
colnames(credit_smote)[ncol(credit_smote)] <- "Class"
# Class counts and class proportions after SMOTE.
table(credit_smote$Class)
prop.table(table(credit_smote$Class))
# V1 vs. V2 of the SMOTE-balanced data, coloured by class.
ggplot(data = credit_smote, aes(x = V1, y = V2, col = Class)) +
  geom_point() +
  theme_bw() +
  scale_color_manual(values = c("dodgerblue2", "red"))
# ---- Training the model on the SMOTE-balanced data ----
# Fit a decision tree that predicts whether a transaction is fraud or legit.
#install.packages('rpart')
#install.packages('rpart.plot')
library(rpart)
library(rpart.plot)
CART_model <- rpart(Class ~ ., credit_smote)
# Plot the fitted tree.
rpart.plot(CART_model, extra = 0, type = 5, tweak = 1.2)
# Predict the class of every held-out test transaction.
predicted_val <- predict(CART_model, test_data, type = 'class')
# Confusion matrix on the test data. positive = "1" makes fraud the positive
# class; caret otherwise uses the first factor level (0 = legit), which would
# report sensitivity/precision for legit transactions instead of fraud.
library(caret)
confusionMatrix(
  data = predicted_val,
  reference = test_data$Class,
  positive = "1"
)
# Predict on the whole credit_card data set (first column dropped).
# Note: this is a prediction step, not training, and credit_card includes the
# rows the SMOTE data was built from, so these figures are not a clean
# out-of-sample estimate.
predicted_val <- predict(CART_model, credit_card[-1], type = 'class')
confusionMatrix(
  data = predicted_val,
  reference = credit_card$Class,
  positive = "1"
)
# ---- Training on the train_data data set ----
# Fit the decision tree on the original (unbalanced) training rows, with the
# first column dropped.
CART_model <- rpart(Class ~ ., train_data[, -1])
# Plot the fitted tree.
rpart.plot(CART_model, extra = 0, type = 5, tweak = 1.2)
# Predict the class of every held-out test transaction.
predicted_val <- predict(CART_model, test_data[, -1], type = 'class')
library(caret)
# positive = "1" makes fraud the positive class; caret otherwise uses the
# first factor level (0 = legit), which would report sensitivity/precision
# for legit transactions instead of fraud.
confusionMatrix(
  data = predicted_val,
  reference = test_data$Class,
  positive = "1"
)
# Predict on the whole credit_card data set. It contains the training rows,
# so these figures are not a clean out-of-sample estimate.
predicted_val <- predict(CART_model, credit_card[-1], type = 'class')
confusionMatrix(
  data = predicted_val,
  reference = credit_card$Class,
  positive = "1"
)
# ---- Without SMOTE ----
# Decision tree fitted on the unbalanced training data (first column dropped).
# NOTE(review): this repeats the rpart fit on train_data[, -1] done in the
# preceding "training on train_data" section - confirm whether both are needed.
CART_model <- rpart(Class ~ ., train_data[, -1])
# Plot the fitted tree.
rpart.plot(CART_model, extra = 0, type = 5, tweak = 1.2)
# Predict the class of every held-out test transaction.
predicted_val <- predict(CART_model, test_data[, -1], type = 'class')
library(caret)
# positive = "1" makes fraud the positive class; caret otherwise uses the
# first factor level (0 = legit), which would report sensitivity/precision
# for legit transactions instead of fraud.
confusionMatrix(
  data = predicted_val,
  reference = test_data$Class,
  positive = "1"
)
# Predict on the whole data set. It contains the training rows, so these
# figures are not a clean out-of-sample estimate.
predicted_val <- predict(CART_model, credit_card[-1], type = 'class')
confusionMatrix(
  data = predicted_val,
  reference = credit_card$Class,
  positive = "1"
)
#----------------------END---------------------------------------------------------------------------------------------