Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add PAM algorithm to K-Medoids #73

Merged
merged 33 commits into from
Nov 26, 2020
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
91891ae
Add pam algorithm
Nov 6, 2020
40e42e7
Merge remote-tracking branch 'upstream/master' into kmedoid_pam
Nov 7, 2020
4362f85
pam algorithm, not naive.
Nov 8, 2020
5a86b95
black reformat
Nov 8, 2020
6e6c90d
Fix mistake in code
Nov 8, 2020
2ed1803
optimization of the algorithm for speed, review from @kno10
Nov 10, 2020
8422c17
remove generator for couples
Nov 10, 2020
6eae84b
fix mistake
Nov 10, 2020
2a92978
Update pam review 2
Nov 11, 2020
ecce8c8
fix mistake
Nov 12, 2020
1cba61c
cython implementation
Nov 13, 2020
258c262
add test
Nov 13, 2020
9078628
disable openmp for windows and mac
Nov 13, 2020
482bc37
fix black
Nov 13, 2020
50d0eb3
fix setup.py for windows
Nov 13, 2020
7637891
remove test
Nov 13, 2020
eeaa2a3
change review
Nov 15, 2020
093f8b0
Merge branch 'master' into kmedoid_pam
TimotheeMathieu Nov 18, 2020
bd04827
fix black
Nov 18, 2020
e979579
Add build, remove parallel computing
Nov 21, 2020
b51d23b
Apply suggestions from code review
TimotheeMathieu Nov 21, 2020
e675bdb
apply suggested change & rename alternating to alternate.
Nov 21, 2020
4250681
fix test
Nov 21, 2020
8f2ada3
Merge remote-tracking branch 'upstream/master' into kmedoid_pam
Nov 21, 2020
552294b
make build default. Allow max_iter = 0 for build-only algo
Nov 21, 2020
b024b8e
Test for method and init
Nov 21, 2020
018c9c7
test on blobs example
Nov 21, 2020
2f6368f
fix typo
Nov 21, 2020
f1a33ad
fix difference long/long long windows vs linux
Nov 21, 2020
498d9b6
try another fix for windows/linux long difference
Nov 21, 2020
daa9879
test another fix cython long/int on different platforms
Nov 21, 2020
213bb2e
test all in int, cython kmedoid
Nov 22, 2020
9c15afa
explain test_kmedoid_results
Nov 26, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,13 @@
["sklearn_extra/utils/_cyfht.pyx"],
include_dirs=[np.get_include()],
),
Extension(
"sklearn_extra.cluster._k_medoids_swap",
["sklearn_extra/cluster/_k_medoids_swap.pyx"],
include_dirs=[np.get_include()],
extra_compile_args=["-fopenmp"],
extra_link_args=["-fopenmp"],
),
Extension(
"sklearn_extra.cluster._commonnn_inner",
["sklearn_extra/cluster/_commonnn_inner.pyx"],
Expand Down
83 changes: 81 additions & 2 deletions sklearn_extra/cluster/_k_medoids.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import warnings

import numpy as np
from itertools import product

from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
from sklearn.metrics.pairwise import (
Expand All @@ -20,6 +21,8 @@
from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import ConvergenceWarning

from ._k_medoids_swap import _compute_optimal_swap


class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
"""k-medoids clustering.
Expand All @@ -35,6 +38,9 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
metric : string, or callable, optional, default: 'euclidean'
What distance metric to use. See :func:metrics.pairwise_distances

method : {"alternating", "pam"}, default: "alternating"
Which algorithm to use.

init : {'random', 'heuristic', 'k-medoids++'}, optional, default: 'heuristic'
Specify medoid initialization method. 'random' selects n_clusters
elements from the dataset. 'heuristic' picks the n_clusters points
Expand All @@ -47,6 +53,9 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
max_iter : int, optional, default : 300
Specify the maximum number of iterations when fitting.

n_threads : int, optional, default : 1
Number of threads to be used for "pam" method.

random_state : int, RandomState instance or None, optional
Specify random state for the random number generator. Used to
initialise medoids when init='random'.
Expand Down Expand Up @@ -112,12 +121,15 @@ def __init__(
self,
n_clusters=8,
metric="euclidean",
method="alternating",
init="heuristic",
max_iter=300,
n_threads=1,
random_state=None,
):
self.n_clusters = n_clusters
self.metric = metric
self.method = method
self.init = init
self.max_iter = max_iter
self.random_state = random_state
Expand Down Expand Up @@ -183,15 +195,56 @@ def fit(self, X, y=None):
)
labels = None

if self.method == "pam":
# Compute the distance to the first and second closest points
# among medoids.
Djs, Ejs = np.sort(D[medoid_idxs], axis=0)[[0, 1]]

# Continue the algorithm as long as
# the medoids keep changing and the maximum number
# of iterations is not exceeded

for self.n_iter_ in range(0, self.max_iter):
old_medoid_idxs = np.copy(medoid_idxs)
labels = np.argmin(D[medoid_idxs, :], axis=0)

# Update medoids with the new cluster indices
self._update_medoid_idxs_in_place(D, labels, medoid_idxs)
if self.method == "alternating":
# Update medoids with the new cluster indices
self._update_medoid_idxs_in_place(D, labels, medoid_idxs)
elif self.method == "pam":
not_medoid_idxs = np.delete(np.arange(len(D)), medoid_idxs)

# transform generator to list
optimal_swap = _compute_optimal_swap(
D,
medoid_idxs,
not_medoid_idxs,
Djs,
Ejs,
self.n_clusters,
self.n_threads,
)
if optimal_swap is not None:
i, j = optimal_swap
medoid_idxs[medoid_idxs == i] = j

# update Djs and Ejs with new medoids
Djs, Ejs = np.sort(D[medoid_idxs], axis=0)[[0, 1]]
TimotheeMathieu marked this conversation as resolved.
Show resolved Hide resolved

elif self.method == "naive":
not_medoid_idxs = set(np.arange(len(D))) - set(medoid_idxs)
couples_idxs = product(medoid_idxs, list(not_medoid_idxs))
# transform generator to list
couples_idxs = [(i, j) for i, j in couples_idxs]
optimal_swap = self._compute_optimal_swap_naive(
D, medoid_idxs, couples_idxs
)
if optimal_swap is not None:
i, j = optimal_swap
medoid_idxs[medoid_idxs == i] = j
else:
raise ValueError("No such method implemented.")
TimotheeMathieu marked this conversation as resolved.
Show resolved Hide resolved

if np.all(old_medoid_idxs == medoid_idxs):
break
elif self.n_iter_ == self.max_iter - 1:
Expand Down Expand Up @@ -252,6 +305,32 @@ def _update_medoid_idxs_in_place(self, D, labels, medoid_idxs):
if min_cost < curr_cost:
medoid_idxs[k] = cluster_k_idxs[min_cost_idx]

def _compute_optimal_swap_naive(self, D, medoid_idxs, couples_idxs):
"""Compute cost for all the possible swaps dictated by couples_idxs"""

# Compute the cost for each swap
initial_cost = self._compute_cost(D, medoid_idxs)
costs = []
for i, j in couples_idxs:
med_idxs = medoid_idxs.copy()
med_idxs[med_idxs == i] = j
T = self._compute_cost(D, med_idxs)
costs.append(T)

if np.min(costs) < initial_cost:
optimal_swap = couples_idxs[np.argmin(costs)]
else:
optimal_swap = None
return optimal_swap

def _compute_cost(self, D, medoid_idxs):
""" Compute the cose for a given configuration of the medoids"""
return self._compute_inertia(D[:, medoid_idxs])

def _compute_cost(self, D, medoid_idxs):
""" Compute the cose for a given configuration of the medoids"""
return self._compute_inertia(D[:, medoid_idxs])

def transform(self, X):
"""Transforms X to cluster-distance space.

Expand Down
68 changes: 68 additions & 0 deletions sklearn_extra/cluster/_k_medoids_swap.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# cython: infer_types=True
# Fast swap step in PAM algorithm for k_medoid.
# Author: Timothée Mathieu
# License: 3-clause BSD

cimport cython
from cython.parallel import prange

@cython.wraparound(False) # turn off negative index wrapping for entire function
@cython.boundscheck(False) # turn of out of bound checks
def _compute_optimal_swap( double[:,:] D,
long[:] medoid_idxs,
long[:] not_medoid_idxs,
double[:] Djs,
double[:] Ejs,
int n_clusters,
int n_threads):
"""Compute best cost change for all the possible swaps"""

# Initialize best cost change and the associated swap couple.
cdef double best_cost_change = 0
cdef (int, int) best_couple = (1, 1)
cdef int sample_size = len(D)
cdef int i, j, h, id_i, id_h, id_j
cdef double T
cdef int not_medoid_shape = sample_size - n_clusters
cdef bint cluster_i_bool, not_cluster_i_bool, second_best_medoid, not_second_best_medoid
cdef double to_add

# Compute the change in cost for each swap.
for h in range(not_medoid_shape):
# id of the potential new medoid.
id_h = not_medoid_idxs[h]
for i in range(n_clusters):
# id of the medoid we want to replace.
id_i = medoid_idxs[i]
T = 0
# compute for all not-selected points the change in cost
for j in prange(not_medoid_shape, nogil=True, num_threads = n_threads):
to_add = 0
TimotheeMathieu marked this conversation as resolved.
Show resolved Hide resolved
id_j = not_medoid_idxs[j]
cluster_i_bool = D[id_i, id_j] == Djs[id_j]
not_cluster_i_bool =D[id_i, id_j] != Djs[id_j]
second_best_medoid = D[id_h, id_j] < Ejs[id_j]
not_second_best_medoid = D[id_h, id_j] >= Ejs[id_j]
if cluster_i_bool & second_best_medoid:
to_add = D[id_j, id_h] -Djs[id_j]
elif cluster_i_bool & not_second_best_medoid:
to_add = Ejs[id_j] - Djs[id_j]
elif not_cluster_i_bool & (D[id_j, id_h] < Djs[id_j]):
to_add = D[id_j, id_h] - Djs[id_j]
T += to_add
# same for i
second_best_medoid = D[id_h, id_i] < Ejs[id_i]
if second_best_medoid:
T += D[id_i, id_h]
else:
T += Ejs[id_i]

if T < best_cost_change:
best_cost_change = T
best_couple = (id_i, id_h)

# If one of the swap decrease the objective, return that swap.
if best_cost_change < 0:
return best_couple
TimotheeMathieu marked this conversation as resolved.
Show resolved Hide resolved
else:
return None