diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7e99e36 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.pyc \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..bde763c --- /dev/null +++ b/README.md @@ -0,0 +1,50 @@ +# kmedoids +The Python implementation of [k-medoids](https://en.wikipedia.org/wiki/K-medoids). + +## Example +```python +from sklearn.metrics.pairwise import pairwise_distances +import numpy as np + +import kmedoids + +# 3 points in dataset +data = np.array([[1,1], + [2,2], + [10,10]]) + +# distance matrix +D = pairwise_distances(data, metric='euclidean') + +# split into 2 clusters +M, C = kmedoids.kMedoids(D, 2) + +print('medoids:') +for point_idx in M: + print( data[point_idx] ) + +print('') +print('clustering result:') +for label in C: + for point_idx in C[label]: + print('label {0}: {1}'.format(label, data[point_idx])) +``` + +Output: +``` +medoids: +[1 1] +[10 10] + +clustering result: +label 0: [1 1] +label 0: [2 2] +label 1: [10 10] +``` + +## License +This code is from: + +> Bauckhage C. Numpy/scipy Recipes for Data Science: k-Medoids Clustering[R]. Technical Report, University of Bonn, 2015. + +Please cite the article if the code is used in your research. 
import numpy as np
import random


def _assign_clusters(D, M, k):
    """Map each cluster label to the indices of the points nearest to its medoid."""
    # For every row of D, find which of the k medoid columns is closest.
    # Ties go to the lowest label because np.argmin returns the first minimum.
    nearest = np.argmin(D[:, M], axis=1)
    return {kappa: np.where(nearest == kappa)[0] for kappa in range(k)}


def kMedoids(D, k, tmax=100):
    """Cluster points into k groups with the k-medoids algorithm.

    Parameters
    ----------
    D : array-like, shape (n, n)
        Pairwise distance matrix; D[i, j] is the distance between
        points i and j.
    k : int
        Number of clusters, 1 <= k <= n.
    tmax : int, optional
        Maximum number of assign/update iterations (default 100).

    Returns
    -------
    M : numpy.ndarray
        Array of k indices into the data set, one medoid per cluster label.
    C : dict
        Maps each label 0..k-1 to an array of the indices of the points
        assigned to that cluster.

    Raises
    ------
    ValueError
        If D is not a square 2-D matrix or k is outside [1, n].

    Notes
    -----
    The initial medoids are drawn with np.random.choice, so results depend
    on NumPy's global random state; call np.random.seed beforehand for
    reproducible output.
    """
    D = np.asarray(D)
    if D.ndim != 2 or D.shape[0] != D.shape[1]:
        raise ValueError(
            'D must be a square distance matrix, got shape {0}'.format(D.shape))
    n = D.shape[1]
    if not 1 <= k <= n:
        raise ValueError(
            'k must satisfy 1 <= k <= {0}, got {1}'.format(n, k))

    # Draw the initial medoids WITHOUT replacement: duplicated medoids would
    # leave one of the duplicate clusters empty right from the first pass.
    M = np.sort(np.random.choice(n, k, replace=False))

    # Working copy that receives the updated medoids of each iteration.
    Mnew = np.copy(M)

    C = {}
    # range (not xrange) keeps the function usable on Python 3.
    for _ in range(tmax):
        # Assignment step: attach every point to its closest medoid.
        C = _assign_clusters(D, M, k)

        # Update step: the new medoid of a cluster is the member with the
        # smallest mean distance to the members of that cluster.
        for kappa in range(k):
            members = C[kappa]
            if members.size == 0:
                # Possible when several points coincide; keep the previous
                # medoid instead of calling argmin on an empty array.
                continue
            within = np.mean(D[np.ix_(members, members)], axis=1)
            Mnew[kappa] = members[np.argmin(within)]

        # NOTE: the original code called np.sort(Mnew) here and discarded the
        # result (np.sort returns a copy), so it had no effect; it is dropped
        # so that Mnew[kappa] stays the medoid of cluster kappa.

        # Converged when no medoid changed during this iteration.
        if np.array_equal(M, Mnew):
            break
        M = np.copy(Mnew)
    else:
        # tmax iterations elapsed without convergence: M was updated after
        # the last assignment, so refresh the memberships to match it.
        C = _assign_clusters(D, M, k)

    return M, C