-
Notifications
You must be signed in to change notification settings - Fork 1
/
python_pseudo.py
61 lines (46 loc) · 1.96 KB
/
python_pseudo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Here is pseudo-python code which runs k-means on a dataset.
# It is a short algorithm made longer by verbose commenting.
# Function: K Means
# -------------
# K-Means is an algorithm that takes in a dataset and a constant
# k and returns k centroids (which define clusters of data in the
# dataset which are similar to one another).
def kmeans(dataSet, k):
# Initialize centroids randomly
numFeatures = dataSet.getNumFeatures()
centroids = getRandomCentroids(numFeatures, k)
# Initialize book keeping vars.
iterations = 0
oldCentroids = None
# Run the main k-means algorithm
while not shouldStop(oldCentroids, centroids, iterations):
# Save old centroids for convergence test. Book keeping.
oldCentroids = centroids
iterations += 1
# Assign labels to each datapoint based on centroids
labels = getLabels(dataSet, centroids)
# Assign centroids based on datapoint labels
centroids = getCentroids(dataSet, labels, k)
# We can get the labels too by calling getLabels(dataSet, centroids)
return centroids
# Function: Should Stop
# -------------
# Returns True or False if k-means is done. K-means terminates either
# because it has run a maximum number of iterations OR the centroids
# stop changing.
def shouldStop(oldCentroids, centroids, iterations):
if iterations > MAX_ITERATIONS: return True
return oldCentroids == centroids
# Function: Get Labels
# -------------
# Returns a label for each piece of data in the dataset.
def getLabels(dataSet, centroids):
# For each element in the dataset, chose the closest centroid.
# Make that centroid the element's label.
# Function: Get Centroids
# -------------
# Returns k random centroids, each of dimension n.
def getCentroids(dataSet, labels, k):
# Each centroid is the geometric mean of the points that
# have that centroid's label. Important: If a centroid is empty (no points have
# that centroid's label) you should randomly re-initialize it.