forked from SISBID/Unsupervised-Learning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
2016_SISBID_DimensionReduction_Lab.R
149 lines (108 loc) · 4.23 KB
/
2016_SISBID_DimensionReduction_Lab.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#############################################################
#2016 SISBID Module 5 - Unsupervised Learning
#Genevera I. Allen & Yufeng Liu
#Dimension Reduction Lab
############################################################
############
#Data set - Digits Data.
#Either use all digits or choose 2-3 digits if computational speed is a problem. Looking at 3's, 8's and 5's is interesting. Note that NMF takes quite a while to run, so you may want to limit the digits considered for that problem.
############
############
#Problem 1 - PCA
############
#Problem 1a - Apply PCA to this data.
#Problem 1b - Do the first several PCs separate the different digits well? Why or why not?
#Problem 1c - Use the first several PCs and PC loadings to evaluate the major patterns in the digits data. Can you come up with a description of the pattern found by each of the first five PCs?
#Problem 1d - How many PCs are needed to explain 95% of the variance? You must decide how many PCs to retain. Which do you pick and why?
#############
#Problem 2 - NMF.
#############
#Problem 2a - Apply NMF to this data.
#Problem 2b - Which value of K did you use? Why? What happens when you slightly change your chosen K?
#Problem 2c - Interpret the archetypes found. Do any of them accurately reflect the different digits? Which ones?
#Problem 2d - Plot NMF basis scatterplots of the factors associated with differences between the digits from 2c. Do these scatterplots separate the different digits well? Why or why not?
#############
#Problem 3 - ICA.
#############
#Problem 3a - Apply ICA to this data set.
#Problem 3b - Which value of K did you use? Why? What happens when you slightly change your chosen K?
#Problem 3c - Interpret the independent image signals found. Do any of them accurately reflect the different digits? Which ones?
###############
#Problem 4 - Comparisons.
##############
#Problem 4a - Compare and contrast PCA, NMF, and ICA on this data set. Which one best separates the different digits? Which one reveals the most interesting patterns?
#Problem 4b - Overall, which method do you recommend for this data set and why?
################################################
##########
#Additional Data set - NCI Microarray data
#(If you have time - take a further look at this data set using various methods for dimension reduction. Also you may be interested in trying MDS to visualize this data.)
###########
###############################################################
###############################################################
#R scripts to help out with the Dimension Reduction Lab
#Don't peek at this if you want to practice coding on your own!!
##################################################################
# Code for the digits data - ALL digits.
# Load the course workspace into the current environment.
# NOTE(review): `digits` and `imagedigit()` are not defined in this script;
# presumably both are supplied by this .RData workspace - confirm.
load("UnsupL_SISBID_2016.RData")

# Visualize: draw the first 32 rows of `digits`, one image per panel,
# on a 4 x 8 grid.
par(mfrow = c(4, 8))
for (i in seq_len(32)) {
  imagedigit(digits[i, ])
}
######## Problem 1 - PCA
# PCA via the singular value decomposition, digits = U D V'.
# The data are deliberately NOT centered or scaled, so that the loadings
# keep their interpretation as images.
svdd <- svd(digits)
U <- svdd$u        # left singular vectors, one row per row of `digits`
V <- svdd$v        # PC loadings (right singular vectors)
D <- svdd$d        # singular values
Z <- digits %*% V  # PCs (scores); same as U with column k scaled by D[k]

# PC scatterplot of components i and j, one text label per observation.
# One colour per class label: handing the labels "0".."9" straight to `col`
# would draw "0" in the background colour (invisible) and wrap around the
# default 8-colour palette, so that "9" shares its colour with "1".
digit_class <- factor(rownames(digits))
class_palette <- c(
  "black", "red", "green3", "blue", "cyan3",
  "magenta", "orange", "gray50", "brown", "purple"
)
digit_col <- rep_len(class_palette, nlevels(digit_class))[
  as.integer(digit_class)
]
i <- 4
j <- 3
# Use a single full-size panel; otherwise a multi-panel layout left over
# from an earlier plot would shrink the scatterplot into one small cell.
par(mfrow = c(1, 1))
plot(U[, i], U[, j], type = "n")
text(U[, i], U[, j], labels = rownames(digits), col = digit_col, cex = 0.7)

# PC loadings: show the leading loadings as images (up to 15 of them).
par(mfrow = c(3, 5))
for (i in seq_len(min(15, ncol(V)))) {
  imagedigit(V[, i])
}

# Variance explained: each squared singular value as a share of the total,
# and its running sum. Vectorised over however many components svd() returned
# instead of assuming exactly 256.
denom <- sum(D^2)
varex <- D^2 / denom
cumvar <- cumsum(D^2) / denom

# Scree plot (left) and cumulative variance explained (right).
par(mfrow = c(1, 2))
plot(seq_along(D), varex,
  type = "l", lwd = 2,
  xlab = "PC", ylab = "% Variance Explained"
)
plot(seq_along(D), cumvar,
  type = "l", lwd = 2,
  xlab = "PC", ylab = "Cummulative Variance Explained"
)
######### Problem 2 - NMF
# Non-negative matrix factorisation on the 3's and 8's only.
# library() stops with an error if the package is missing, whereas require()
# only returns FALSE and lets the script fail later at the nmf() call.
library(NMF)

# Stack all rows labelled 3 on top of all rows labelled 8.
dat38 <- rbind(
  digits[which(rownames(digits) == 3), ],
  digits[which(rownames(digits) == 8), ]
)

K <- 20
# NOTE(review): the +1 shift presumably makes every entry non-negative, as
# NMF requires - this assumes the pixel values are >= -1; confirm.
nmffit <- nmf(dat38 + 1, rank = K) # note - this takes a while
W <- basis(nmffit) # basis matrix: one row per image, one column per factor
H <- coef(nmffit)  # coefficient matrix: one row per factor (archetype)

# Plot the K archetypes as images, five per row. The number of rows follows
# K so the grid still fits when K is changed (4 x 5 for K = 20).
par(mfrow = c(ceiling(K / 5), 5))
for (i in seq_len(K)) {
  imagedigit(H[i, ])
}

# Scatterplot of W for two interesting archetypes.
# NOTE(review): no seed is set before nmf(), so which archetypes are
# "interesting" presumably differs from run to run - pick i and j after
# inspecting the archetype images above.
i <- 11
j <- 15
par(mfrow = c(1, 1))
plot(W[, i], W[, j], type = "n")
text(W[, i], W[, j], labels = rownames(dat38), col = rownames(dat38), cex = 0.7)
##################
# Problem 3 - ICA
# fastICA() lives in the fastICA package, which nothing else in this script
# attaches; load it here so the call below can be found.
library(fastICA)

K <- 20
# The data are transposed so that pixels are the rows handed to fastICA();
# each column of the estimated source matrix S then has one value per pixel
# and can be drawn as an image.
icafit <- fastICA(t(digits), n.comp = K)

# Plot the K independent source signals as images, five per row. The number
# of rows follows K so the grid still fits when K is changed (4 x 5 for
# K = 20).
par(mfrow = c(ceiling(K / 5), 5))
for (i in seq_len(K)) {
  imagedigit(icafit$S[, i])
}
#################################################################