initial commit

letiantian · Mar 8, 2016 · f9b60e8 · f9b60e8
commit f9b60e8
Show file tree

Hide file tree

Showing 4 changed files with 117 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/README.md b/README.md
@@ -0,0 +1,50 @@
+# kmedoids
+A Python implementation of the [k-medoids](https://en.wikipedia.org/wiki/K-medoids) clustering algorithm.
+
+## Example
+```python
+from sklearn.metrics.pairwise import pairwise_distances
+import numpy as np
+
+import kmedoids
+
+# toy dataset made of three 2-D points
+data = np.array([
+    [1, 1],
+    [2, 2],
+    [10, 10],
+])
+
+# pairwise Euclidean distances between all points
+D = pairwise_distances(data, metric='euclidean')
+
+# partition the points into 2 clusters
+M, C = kmedoids.kMedoids(D, 2)
+
+print('medoids:')
+for medoid in data[M]:
+    print(medoid)
+
+print('')
+print('clustering result:')
+for label, members in C.items():
+    for point_idx in members:
+        print('label {0}:　{1}'.format(label, data[point_idx]))
+```
+
+Output:
+```
+medoids:
+[1 1]
+[10 10]
+
+clustering result:
+label 0:　[1 1]
+label 0:　[2 2]
+label 1:　[10 10]
+```
+
+## License
+This code is taken from the following technical report:
+
+> Bauckhage C. Numpy/scipy Recipes for Data Science: k-Medoids Clustering[R]. Technical Report, University of Bonn, 2015.
+
+Please cite the report if you use this code in your research.
diff --git a/example.py b/example.py
@@ -0,0 +1,28 @@
+# coding: utf-8
+from sklearn.metrics.pairwise import pairwise_distances
+import numpy as np
+
+import kmedoids
+
+# toy dataset made of three 2-D points
+data = np.array([
+    [1, 1],
+    [2, 2],
+    [10, 10],
+])
+
+# pairwise Euclidean distances between all points
+D = pairwise_distances(data, metric='euclidean')
+
+# partition the points into 2 clusters
+M, C = kmedoids.kMedoids(D, 2)
+
+print('medoids:')
+for medoid in data[M]:
+    print(medoid)
+
+print('')
+print('clustering result:')
+for label, members in C.items():
+    for point_idx in members:
+        print('label {0}:　{1}'.format(label, data[point_idx]))
+
+
diff --git a/kmedoids.py b/kmedoids.py
@@ -0,0 +1,38 @@
+import numpy as np
+import random
+
+def kMedoids(D, k, tmax=100):
+    """Cluster points into k groups with the k-medoids algorithm.
+
+    Parameters
+    ----------
+    D : numpy.ndarray
+        Two-dimensional matrix of pairwise distances. The same point
+        indices are used to address its rows and its columns.
+    k : int
+        Number of clusters; must satisfy ``1 <= k <= D.shape[1]``.
+    tmax : int, optional
+        Maximum number of assign/update iterations (default 100).
+
+    Returns
+    -------
+    M : numpy.ndarray
+        Sorted array holding the indices of the k medoids.
+    C : dict
+        Maps each label ``0 .. k-1`` to the array of indices of the points
+        whose closest medoid is ``M[label]`` (ties go to the lowest label).
+        A cluster can be empty when several points coincide.
+
+    Raises
+    ------
+    ValueError
+        If k is smaller than 1 or larger than the number of points.
+    """
+    # determine dimensions of distance matrix D
+    _, n = D.shape
+
+    if not 1 <= k <= n:
+        raise ValueError(
+            'k must be between 1 and {0}, got {1}'.format(n, k))
+
+    # randomly initialize an array of k medoid indices; sample without
+    # replacement so that no index is picked twice (a duplicated medoid
+    # would leave one cluster empty from the start)
+    M = np.sort(np.random.choice(n, k, replace=False))
+
+    # create a copy of the array of medoid indices
+    Mnew = np.copy(M)
+
+    for _ in range(tmax):
+        # assign every point to its closest medoid
+        J = np.argmin(D[:, M], axis=1)
+
+        # update cluster medoids
+        for kappa in range(k):
+            members = np.where(J == kappa)[0]
+            if members.size == 0:
+                # nothing was assigned to this medoid (it coincides with
+                # another one): keep the previous medoid instead of
+                # calling argmin on an empty array
+                continue
+            # the new medoid is the member with the smallest mean
+            # distance to the other members of its cluster
+            costs = np.mean(D[np.ix_(members, members)], axis=1)
+            Mnew[kappa] = members[np.argmin(costs)]
+
+        # sort in place; np.sort() would return a copy and leave Mnew as is
+        Mnew.sort()
+
+        # check for convergence
+        if np.array_equal(M, Mnew):
+            break
+        M = np.copy(Mnew)
+
+    # final cluster memberships, always derived from the returned medoids
+    J = np.argmin(D[:, M], axis=1)
+    C = {kappa: np.where(J == kappa)[0] for kappa in range(k)}
+
+    # return results
+    return M, C