forked from aws/amazon-sagemaker-examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
breast_cancer_modeling.r
47 lines (40 loc) · 1.38 KB
/
breast_cancer_modeling.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
library(caret)
library(mlbench)
# Load the BreastCancer dataset (made available by mlbench) and show a
# summary of its columns.
data(BreastCancer)
summary(BreastCancer)
df <- BreastCancer
# Convert columns 2 through 10 to numeric. Each column goes through
# as.character() first so that, for factor columns, the printed labels are
# parsed rather than the internal integer codes.
feature_cols <- 2:10
df[feature_cols] <- lapply(
  df[feature_cols],
  function(column) as.numeric(as.character(column))
)
# Split the data into train and test sets, then fit the preprocessing on the
# training rows only.
#
# createDataPartition() draws a random sample, so the RNG is seeded first.
# Without this the 80/20 split -- and therefore the held-out test set and
# every downstream result -- would change from run to run.
set.seed(825)
trainIndex <- createDataPartition(
  df$Class,
  p = 0.8,
  list = FALSE,
  times = 1
)
df_train <- df[trainIndex, ]
df_test <- df[-trainIndex, ]
# Centering, scaling and median imputation parameters are estimated from
# df_train alone, so no information from df_test leaks into them.
# NOTE(review): preProcess is expected to skip non-numeric columns (the
# identifier in column 1 and Class) -- confirm against the caret docs.
preProcValues <- preProcess(
  df_train,
  method = c("center", "scale", "medianImpute")
)
df_train_transformed <- predict(preProcValues, df_train)
# Fit a gradient boosting model ("gbm") on the preprocessed training data.
#
# Resampling setup: 10-fold cross-validation, repeated 10 times. Class
# probabilities are requested and twoClassSummary is used as the
# performance summary for each resample.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)
# Seed the RNG immediately before training.
set.seed(825)
# Only columns 2:11 of the transformed training frame are passed in, so
# column 1 is not offered to the model as a predictor. Candidate models are
# compared on the "ROC" metric. `verbose = FALSE` is not a train() option;
# it is forwarded to gbm().
gbmFit <- train(
  Class ~ .,
  data = df_train_transformed[, 2:11],
  method = "gbm",
  trControl = fitControl,
  verbose = FALSE,
  metric = "ROC"
)
gbmFit
# Persist the fitted preprocessor, the trained model and the held-out test
# rows as RDS files in the current working directory. Only columns 1:10 of
# df_test are written, i.e. column 11 is left out of the saved test data,
# and the rows are saved untransformed (preProcValues has not been applied
# to df_test).
artifacts <- list(
  "./preProcessor.rds" = preProcValues,
  "./gbm_model.rds" = gbmFit,
  "./breast_cancer_test_data.rds" = df_test[, 1:10]
)
for (artifact_path in names(artifacts)) {
  saveRDS(artifacts[[artifact_path]], file = artifact_path)
}