-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPCA_Madsen.R
104 lines (78 loc) · 4.24 KB
/
PCA_Madsen.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
###############
# Principal Component Analysis [PCA] pipeline
###############
# Importing the 'iris' data set
library(datasets)
data(iris) #load
head(iris) #inspect
str(iris) #note 'Species' factor variable is column 5
# PCA cannot handle NA values: a real-world dataset will require this cleaning
df1 <- iris[complete.cases(iris$Sepal.Length),]
df2 <- df1[complete.cases(df1$Sepal.Width),]
df3 <- df2[complete.cases(df2$Petal.Length),]
df4 <- df3[complete.cases(df3$Petal.Width),]
# Apply principal component analysis (PCA)
PCA.model <- prcomp(df4[, 1:4], scale = TRUE, center = TRUE, retx = TRUE)
summary(PCA.model) #PCA result [variance explained]
plot(PCA.model, type = "l") #SCREE PLOT [discard PCs with Eigenvalue < 1.0]
PCA.model$sdev^2 #EIGENVALUES [calculate numerical Eigenvalues for each PC]
biplot(PCA.model, scale = 0) #BIPLOT [crude plot of the captured dataset variance]
PCA.model$rotation #LOADINGS [description of variable rotations/contribution for each PC]
df.pca <- cbind(df4, PCA.model$x[,1:4]) #NEW DATAFRAME WITH PC SCORES [for all 4 PCs]
# ggbiplot graphical representation of PCA
library(devtools)
install_github("vqv/ggbiplot", force = TRUE)
library(ggplot2)
library(ggbiplot)
# Basic ggbiplot model - notice horizontal variable vectors will determine PC1 score
ggbiplot(PCA.model, choices = c(1,2), obs.scale = 1, var.scale = 1, groups = df.pca$Species)
# Customized ggbiplot model
ggbiplot(PCA.model, choices = c(1,2), obs.scale = 1, var.scale = 1, groups = df.pca$Species,
varname.size = 5.3, varname.adjust = 0.5, ellipse = TRUE, ellipse.prob = 0.82, alpha = 1) +
theme_bw() + scale_color_manual(values = c("red", "green", "darkblue"), labels = c("setosa", "versicolor", "virginica")) +
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank()) + labs(title="") +
guides(col = guide_legend(title = "Species")) + theme(legend.position = "top") +
theme(text = element_text(size=12)) + theme(legend.text = element_text(size=12)) +
theme(aspect.ratio=0.8) + theme(legend.margin=margin(t=0, r=0, b=0, l=0, unit="cm"))
# MANUALLY CALCULATE THE PC SCORES:
# APPLY CENTER=TRUE & SCALE=TRUE TO NEW DATA
str(iris)
a <- data.frame(iris$Sepal.Length, iris$Sepal.Width, iris$Petal.Length, iris$Petal.Width, iris$Species)
require(graphics)
c.fun<-function(a, center, scale) {
return((a-center)/scale ) }
#Apply centering and scaling to transform "NewData" according to previous PCAmodel
TransformedNewData <- apply(a[, 1:4], MARGIN=1, FUN=c.fun, PCA.model$center, PCA.model$scale)
NewDataPCscores <- t(PCA.model$rotation) %*% TransformedNewData
NewDataPCscores2 <- data.frame(t(NewDataPCscores))
head(NewDataPCscores2)
df <- cbind(df.pca, NewDataPCscores2)
head(df) # confirm that PC-scores [from PCA] are identical to the manually calculated PC-scores
# Self-made biplot!
plot.default(df$PC1, df$PC2)
# Write dataframe containing PC scores
write.table(df.pca, "PCA_results.csv",
na = " ",
row.names = FALSE,
col.names = TRUE,
append = FALSE,
sep = ",",
dec = ".")
# ROC analyses to evaluate PCA clustering: ability to distinguish "versicolor" vs "virginica"
library(pROC)
df.pca$SpeciesVar <- as.factor(ifelse(df.pca$Species == "versicolor",1, ifelse(df.pca$Species == "virginica",2,NA)))
class <- ifelse(c$SpeciesVar == "1", "2", "NA")
score <- c$PC1 #try all PCs and petal/sepal width/length in sequence
PROC1 <- roc(class,score)
plot(PROC1)
auc(PROC1)
coords(PROC1, "best", transpose=TRUE, ret=c("ppv", "npv", "accuracy", "threshold", "tp", "fp", "tn", "fn"))
# ppv = positive predictive value
# npv = negative predictive value
# tp = % true positive classifications
# fp = % false positive classifications
# tn = % true negative classifications
# fn = % false negative classifications
# accuracy = proportion of (TP+TN / TP+TN+FP+FN)
# threshold = THE VARIABLE VALUE CORRESPONDING TO THE OPTIMAL CUTOFF ON THE ROC CURVE (Youden index)
# ROC AUC score table: Petal.Length > Petal.Width > PC1 > Sepal.Length > PC2 > Sepal.Width > PC3 > PC4