"""K-means clustering (Lloyd's algorithm) on NumPy arrays, with a plotting demo.

This is the post-change content of ``PYTHON/k_means_clustering.py`` from the
patch "Add K-Means Clustering Algorithm", written out as a regular module.
"""

import random
from typing import Optional

import numpy as np

try:
    import matplotlib.pyplot as plt
except ImportError:  # Plotting is optional; clustering itself only needs NumPy.
    plt = None


def distance(a, b):
    """Return the Euclidean distance between two 1-D points *a* and *b*."""
    diff = np.asarray(a) - np.asarray(b)
    return np.sqrt(np.dot(diff, diff))


class EnhancedKMeans:
    """K-means clustering with centroids initialised from random data points.

    Args:
        num_clusters: Number of clusters (and centroids) to fit.
        iterations: Maximum number of assign/update rounds.
        random_state: Optional seed for the centroid initialisation. When
            ``None`` (the default) the global ``random`` module state is used,
            so callers that rely on ``random.seed(...)`` keep working.

    Attributes:
        centroids: Array with one row per cluster after ``fit_predict`` has
            run, ``None`` before that.
    """

    def __init__(
        self,
        num_clusters: int = 3,
        iterations: int = 100,
        random_state: Optional[int] = None,
    ):
        self.num_clusters = num_clusters
        self.iterations = iterations
        self.random_state = random_state
        self.centroids = None

    def fit_predict(self, X):
        """Fit centroids to *X* and return the cluster index of every sample.

        Args:
            X: Array-like of samples, one sample per row.

        Returns:
            Integer array with one cluster index per row of *X*, consistent
            with the final value of ``self.centroids``.

        Raises:
            ValueError: If ``num_clusters`` is not between 1 and the number of
                samples in *X*.
        """
        X = np.asarray(X, dtype=float)
        n_samples = X.shape[0]
        if not 1 <= self.num_clusters <= n_samples:
            raise ValueError(
                f"num_clusters must be between 1 and the number of samples "
                f"({n_samples}), got {self.num_clusters}"
            )

        # A dedicated generator gives reproducible runs when a seed is given;
        # otherwise fall back to the module-level generator as before.
        sampler = (
            random if self.random_state is None
            else random.Random(self.random_state)
        )
        seed_rows = sampler.sample(range(n_samples), self.num_clusters)
        self.centroids = X[seed_rows]  # Fancy indexing copies the rows.

        for _ in range(self.iterations):
            labels = self.assign_clusters(X)
            previous = self.centroids
            # recalculate_centroids builds a fresh array, so `previous` is
            # not mutated by the update.
            self.centroids = self.recalculate_centroids(X, labels)
            if np.allclose(previous, self.centroids):
                break

        # Label against the *final* centroids. This also covers
        # iterations == 0, where the loop body never runs.
        return self.assign_clusters(X)

    def assign_clusters(self, X):
        """Return, for each sample in *X*, the index of its nearest centroid.

        Ties are resolved in favour of the lowest centroid index.
        """
        X = np.asarray(X, dtype=float)
        centroids = np.asarray(self.centroids, dtype=float)
        # Broadcast to one difference vector per (sample, centroid) pair.
        deltas = X[:, np.newaxis, ...] - centroids[np.newaxis, ...]
        # Squared distances order the same way as distances, so the square
        # root is unnecessary. Feature axes (if any) are summed away.
        feature_axes = tuple(range(2, deltas.ndim))
        squared = np.square(deltas).sum(axis=feature_axes)
        return squared.argmin(axis=1)

    def recalculate_centroids(self, X, cluster_assignments):
        """Return updated centroids: the mean of the samples in each cluster.

        A cluster with no assigned samples keeps its previous centroid, so the
        result always has exactly ``num_clusters`` rows and cluster indices
        stay stable between iterations.

        Raises:
            ValueError: If a cluster is empty and there is no previous
                centroid to fall back on.
        """
        X = np.asarray(X, dtype=float)
        cluster_assignments = np.asarray(cluster_assignments)

        new_centroids = []
        for cluster_idx in range(self.num_clusters):
            members = X[cluster_assignments == cluster_idx]
            if len(members) > 0:
                new_centroids.append(members.mean(axis=0))
            elif self.centroids is not None:
                new_centroids.append(
                    np.asarray(self.centroids[cluster_idx], dtype=float)
                )
            else:
                raise ValueError(
                    f"cluster {cluster_idx} is empty and has no previous "
                    f"centroid"
                )
        return np.array(new_centroids)

    def plot_clusters(self, X, cluster_assignments):
        """Scatter-plot 2-D samples coloured by cluster, with the centroids.

        Raises:
            ImportError: If matplotlib is not installed.
        """
        if plt is None:
            raise ImportError("matplotlib is required for plot_clusters()")

        X = np.asarray(X, dtype=float)
        cluster_assignments = np.asarray(cluster_assignments)

        plt.figure(figsize=(10, 8))
        for cluster_idx in range(self.num_clusters):
            cluster_points = X[cluster_assignments == cluster_idx]
            plt.scatter(
                cluster_points[:, 0],
                cluster_points[:, 1],
                label=f"Cluster {cluster_idx + 1}",
            )
        plt.scatter(
            self.centroids[:, 0],
            self.centroids[:, 1],
            s=150,
            c='red',
            marker='X',
            label='Centroids',
        )
        plt.legend()
        plt.show()


def main():
    """Cluster three synthetic 2-D Gaussian blobs and plot the result."""
    np.random.seed(42)
    data = np.vstack((
        np.random.randn(80, 2) + np.array([5, 5]),
        np.random.randn(80, 2) + np.array([-5, -5]),
        np.random.randn(80, 2) + np.array([5, -5]),
    ))

    # Seed the initialisation too, so the demo is fully reproducible.
    kmeans = EnhancedKMeans(num_clusters=3, random_state=42)
    cluster_labels = kmeans.fit_predict(data)
    kmeans.plot_clusters(data, cluster_labels)


if __name__ == "__main__":
    main()