-
Notifications
You must be signed in to change notification settings - Fork 27
/
014_Basic preprocessing.R
94 lines (74 loc) · 3.7 KB
/
014_Basic preprocessing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# Pre-processing predictor variables (most useful for model-based approaches)
# Warning: pre-processing is meant for continuous variables and does not work well for factor variables
library(caret)
library(kernlab)
# Load the spam dataset (provided by kernlab)
data(spam)
# Sample 75% of the row indices, balanced on the outcome `type`
inTrain <- createDataPartition(spam$type, p = 0.75, list = FALSE)
# Rows not selected are held out for testing; selected rows form the training set
testing <- spam[-inTrain, ]
training <- spam[inTrain, ]
# capitalAve: average length of the runs of consecutive capital letters
# found in an email
hist(training[["capitalAve"]], xlab = "ave. capital run length", main = "")
# Most emails contain only short runs of capital letters
mean(training[["capitalAve"]])
# ... but the spread is large: highly variable!!
sd(training[["capitalAve"]])
# The dataset needs pre-processing so that the prediction algorithm is not
# thrown off by the highly variable nature of the predictors.
# Standardizing - training set
trainCapAve <- training$capitalAve
trainCapAveS <- (trainCapAve - mean(trainCapAve)) / sd(trainCapAve)
# After standardizing, the training mean is 0 and the standard deviation is 1
mean(trainCapAveS)
sd(trainCapAveS)
# Standardizing - test set
# The test set must be transformed with the mean and sd estimated on the
# TRAINING set: those parameters are part of the fitted pipeline, and
# re-estimating them on the test data would put the two sets on different
# scales and leak test-set information into the evaluation.
testCapAve <- testing$capitalAve
testCapAveS <- (testCapAve - mean(trainCapAve)) / sd(trainCapAve)
# Because training parameters are used, the results below are only close to
# (not exactly) 0 and 1
mean(testCapAveS)
sd(testCapAveS)
# caret's preProcess() can do the pre-processing for us.
# Select every predictor by name (all columns except the outcome `type`,
# rather than relying on it being column 58) and ask for all of them to be
# centered and scaled.
predictorCols <- setdiff(names(training), "type")
preObj <- preProcess(training[, predictorCols], method = c("center", "scale"))
trainCapAveS <- predict(preObj, training[, predictorCols])$capitalAve
mean(trainCapAveS)
sd(trainCapAveS)
# Then apply the SAME pre-processing object (fitted on the training set) to
# the testing set
testCapAveS <- predict(preObj, testing[, predictorCols])$capitalAve
mean(testCapAveS)
sd(testCapAveS)
# The pre-processing options can also be handed straight to train(), which
# then applies them as part of fitting the model
set.seed(32343)
modelFit <- train(
  type ~ .,
  data = training,
  method = "glm",
  preProcess = c("center", "scale")
)
# Print the fitted model
modelFit
# Other transformations can be used to solve problems that centering and
# scaling cannot solve.
# Standardizing - Box-Cox transforms take continuous data and try to make it
# look like normal data, with parameters estimated by maximum likelihood.
# Predictors are selected by name (everything except the outcome `type`).
preObj <- preProcess(training[, names(training) != "type"], method = "BoxCox")
trainCapAveS <- predict(preObj, training[, names(training) != "type"])$capitalAve
# Show histogram and Q-Q plot side by side; par() returns the previous
# settings so the layout can be restored afterwards
oldPar <- par(mfrow = c(1, 2))
hist(trainCapAveS)
# In the Q-Q plot the points near -2 and 2 depart from the line: the
# transformed variable is still not perfectly normal
qqnorm(trainCapAveS)
par(oldPar)
# Standardizing - imputing data using KNN-Impute
# Missing values in the dataset are not handled well by prediction models
set.seed(13343)
# Copy capitalAve and set a randomly selected ~5% of the copy to NA
training$capAve <- training$capitalAve
selectNA <- rbinom(nrow(training), size = 1, prob = 0.05) == 1
training$capAve[selectNA] <- NA
# Impute and standardize
# KNN-Impute looks at the K (e.g. 10) nearest rows to a row with a missing
# value, averages their values and imputes the result at that position
# Install RANN only when it is missing, instead of reinstalling on every run
if (!requireNamespace("RANN", quietly = TRUE)) {
  install.packages("RANN")
}
library("RANN")
# Predictors are selected by name: everything except the outcome `type`
# (this keeps the new capAve column in the data passed to preProcess)
preObj <- preProcess(training[, names(training) != "type"], method = "knnImpute")
capAve <- predict(preObj, training[, names(training) != "type"])$capAve
# Standardize the true values so they are comparable with the imputed,
# standardized capAve
capAveTruth <- training$capitalAve
capAveTruth <- (capAveTruth - mean(capAveTruth)) / sd(capAveTruth)
# Compare the actual values (before they were removed) with the imputed
# values: the imputation works well if the differences are close to zero
# An overall comparison
quantile(capAve - capAveTruth)
# Compare only the concerned values (i.e. the ones selected for imputation)
quantile((capAve - capAveTruth)[selectNA])
# Compare all but the values selected for imputation;
# values from both distributions are even closer (i.e. smaller difference)
quantile((capAve - capAveTruth)[!selectNA])
# Further information
# Preprocessing with Caret:
# - http://caret.r-forge.r-project.org/preprocess.html