# kMeans.py
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# ######################################## PREAMBLE #################################################
# This script uses an unsupervised algorithm, K-means, to group points into K clusters. When the   #
# script is run in the interpreter, the output returns the centroid of each cluster and the        #
# percentage of points in each cluster for the training set. Change to kMeans(True) to run on the  #
# validation set.                                                                                   #
# ###################################################################################################
####################################################################
# QUESTION 2 (10,000 DATASET) # QUESTION 3 #
# K K1% K2% K3% K4% K5% # VALIDATION SET LOSS #
# 1 100 # 12,870.10 #
# 2 50.5 49.5 # 2,960.67 #
# 3 38.2 23.8 38.0 # 1,629.21 #
# 4 37.1 12.1 37.3 13.5 # 1,054.54 #
# 5 36.8 11.1 37.0 7.6 7.5 # 907.21 #
####################################################################
# 100 DIMENSION QUESTION #
# K TRAINING SET LOSS K MEANS TRAINING SET LOSS MoG #
# 5 215,509 1,091,210 #
# 10 215,268 834,024 #
# 15 215,361 834,038 #
# 20 212,945 486,583 #
# 30 211,200 484,477 #
####################################################################
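# Example usage (added for clarity; mirrors the call at the bottom of this file):
#   mU, inCluster, inClusterV = kMeans(True)    # 2/3 training, 1/3 validation split
#   mU, inCluster, _          = kMeans(False)   # train on the full dataset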
def loadData(valid=False):
    global k, epochs, trainData, validData, num_pts, dim, trainLoss, validLoss
    k = 3                                     # Set number of clusters
    epochs = 500                              # Set number of epochs. Set to 800 for 100D
    # trainData = np.load('data100D.npy')     # Comment either this or 2D
    trainData = np.load('data2D.npy')
    [num_pts, dim] = np.shape(trainData)
    trainLoss = np.full((epochs, 1), np.inf)  # Define loss vector to store values of training loss
    if valid:                                 # Split data into training and validation
        valid_batch = int(num_pts / 3.0)
        np.random.seed(45689)
        rnd_idx = np.arange(num_pts)
        np.random.shuffle(rnd_idx)
        validData = trainData[rnd_idx[:valid_batch]]
        trainData = trainData[rnd_idx[valid_batch:]]  # Re-define trainData if valid is True
        [num_pts, dim] = np.shape(trainData)
        validLoss = np.full((epochs, 1), np.inf)
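# Note on the input files (added for clarity): data2D.npy is expected to hold an (N, 2) array of
# points and data100D.npy an (N, 100) array; num_pts and dim are read from the loaded array, so
# switching datasets only requires swapping the np.load line above (and, per the comment there,
# raising epochs for the 100D data).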
def buildGraph():
    tf.reset_default_graph()  # Clear any previous junk
    tf.set_random_seed(45689)
    trainingInput = tf.placeholder(tf.float32, shape=(None, dim))  # Data placeholder
    centroid = tf.get_variable('mean', shape=(k, dim), initializer=tf.initializers.random_normal())  # Centroid (cluster mean) variable
    distanceSquared = distanceFunc(trainingInput, centroid)  # Squared Euclidean distance from each point to each centroid
    loss = tf.math.reduce_sum(tf.math.reduce_min(distanceSquared, 0))  # Choose the smallest distance for each point and sum them
    optimizer = tf.train.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.99, epsilon=1e-5).minimize(loss)  # Optimize
    return optimizer, loss, distanceSquared, centroid, trainingInput
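# For reference (added comment): the loss built above is the standard K-means objective
#     L(mu_1, ..., mu_K) = sum_n  min_k  || x_n - mu_k ||^2
# i.e. every point contributes the squared distance to its nearest centroid, and Adam descends on
# this sum directly instead of using the classic assign/update (Lloyd) iteration.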
def kMeans(valid=False):
    loadData(valid)  # Load the data
    optimizer, loss, distanceSquared, centroid, trainingInput = buildGraph()  # Build the graph
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for i in range(0, epochs):
            _, trainLoss[i], dist, mU = sess.run([optimizer, loss, distanceSquared, centroid], feed_dict={trainingInput: trainData})
            if valid:  # Also track validation loss and distances
                validLoss[i], distV = sess.run([loss, distanceSquared], feed_dict={trainingInput: validData})
        assign = np.argmin(dist, 0)                # Assign each point to its nearest centroid
        inCluster = np.mean(np.eye(k)[assign], 0)  # Fraction of training points in each cluster
        plotter(valid)                             # Plot the Loss vs Epochs graph
        scatter(trainData, assign, mU)             # Draw a 2D scatter plot. For dim = 2 only
        if valid:
            assignV = np.argmin(distV, 0)                # Use the validation distances to assign clusters
            inClusterV = np.mean(np.eye(k)[assignV], 0)  # Fraction of validation points in each cluster
            scatter(validData, assignV, mU)              # Draw a 2D scatter plot for the validation points. For dim = 2 only
            return mU, inCluster, inClusterV
        return mU, inCluster, None
def distanceFunc(X, mu):  # Returns distance squared
    expandPoints = tf.expand_dims(X, 0)    # (1, N, dim)
    expandCentroid = tf.expand_dims(mu, 1)  # (k, 1, dim)
    return tf.reduce_sum(tf.square(tf.subtract(expandPoints, expandCentroid)), 2)  # (k, N): entry [j, n] = ||x_n - mu_j||^2
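# Optional sanity check (added; not part of the original pipeline): a plain-NumPy version of
# distanceFunc and of the loss. The names npDistance/npLoss are illustrative only. Handy for
# verifying the TensorFlow graph on small arrays, e.g. npLoss(trainData, mU) after training.
def npDistance(X, mu):
    # X: (N, dim), mu: (k, dim) -> (k, N) matrix of squared Euclidean distances
    return np.sum((X[np.newaxis, :, :] - mu[:, np.newaxis, :]) ** 2, axis=2)
def npLoss(X, mu):
    # Sum over points of the squared distance to the nearest centroid
    return np.sum(np.min(npDistance(X, mu), axis=0))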
def plotter(valid=False):  # Plotting function
    plt.figure(1)
    plt.cla()
    plt.title("K = %i Loss vs Epoch" % k, fontsize=32)
    plt.ylabel("Loss", fontsize=30)
    plt.xlabel("Epoch", fontsize=30)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.xlim((0, epochs))
    plt.grid(which='both', axis='both')
    if not valid:
        plt.plot(trainLoss)
    else:  # Normalize per point to see how training and validation losses relate
        plt.plot(trainLoss / trainData.shape[0])
        plt.plot(validLoss / validData.shape[0])
def scatter(X, cluster, mU):
    if dim == 2:  # Would be pretty bizarre to show 100D on 2D
        plt.figure()
        plt.title("K = %i Scatter Plot" % k, fontsize=32)
        plt.xlabel("$x_1$", fontsize=30)
        plt.ylabel("$x_2$", fontsize=30)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        plt.scatter(X[:, 0], X[:, 1], c=cluster, s=1, cmap='viridis')
        plt.scatter(mU[:, 0], mU[:, 1], c='black', s=50, alpha=0.5)
mU, inCluster, inClusterV = kMeans(False) # Change to True if you want to run validation.
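# Added convenience: when the file is executed as a plain script (rather than in an interactive
# interpreter, as the preamble assumes), print the results and force the figures to display.
print("Centroids:\n", mU)
print("Fraction of training points per cluster:", inCluster)
plt.show()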