Skip to content

Commit

Permalink
simplify direction flipping for clustering
Browse files Browse the repository at this point in the history
  • Loading branch information
svirpioj committed Sep 20, 2023
1 parent 94cbaea commit 351315d
Showing 1 changed file with 27 additions and 31 deletions.
58 changes: 27 additions & 31 deletions opusfilter/autogen_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@

import pandas as pd
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn import preprocessing, random_projection
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
import numpy as np
Expand Down Expand Up @@ -35,12 +34,13 @@ def __init__(self, score_file, k=2):
filter_cls = getattr(filtermodule, first_part)
self.filters[name] = filter_cls
self.scaler = preprocessing.StandardScaler()
self.standard_data = self.scaler.fit_transform(self.df)
self.standard_data = self.scaler.fit_transform(self.df.mul(self.direction_vector))

logger.info('Training KMeans with %s clusters', self.k)
self.kmeans = KMeans(n_clusters=self.k, random_state=0, init='k-means++', n_init=1).fit(self.standard_data)
self.kmeans = KMeans(n_clusters=self.k, random_state=0, init='k-means++', n_init=1)
self.kmeans.fit(self.standard_data)
self.labels = self.kmeans.labels_
self.cluster_centers = self.scaler.inverse_transform(self.kmeans.cluster_centers_)
self.cluster_centers = self.scaler.inverse_transform(self.kmeans.cluster_centers_) * self.direction_vector
self._noisy_label = self._get_noisy_label()

@property
Expand All @@ -53,34 +53,29 @@ def clean_labels(self):
"""Cluster labels for clean data"""
return [idx for idx in range(self.k) if idx != self._noisy_label]

def _get_flipped_centers(self):
"""Get centers with values flipped when low score indicates clean data"""
dir_fixed_centers = []
for center in self.kmeans.cluster_centers_:
fixed_center = []
for i, name in enumerate(self.df.columns):
value = center[i].copy()
if self.filters[name].score_direction == CLEAN_LOW:
value *= -1
fixed_center.append(value)
dir_fixed_centers.append(fixed_center)
return dir_fixed_centers
@property
def direction_vector(self):
"""Direction vector for the features (1 for CLEAN_LOW, -1 for CLEAN_HIGH)"""
return np.array([1 if self.filters[name].score_direction == CLEAN_LOW else -1
for name in self.df.columns])

def _get_noisy_label(self):
"""Find label for the noisy cluster"""
means = np.mean(self._get_flipped_centers(), axis=1)
means = np.mean(self.kmeans.cluster_centers_, axis=1)

# Output some cluster information
nlabels = Counter(self.labels)
for i, (center, inv_center, mean) in enumerate(zip(self.kmeans.cluster_centers_, self.cluster_centers, means)):
logger.info('Cluster #%s - number of samples: %s', i, nlabels[i])
logger.info('Cluster #%s', i)
logger.info('* number of samples: %s', nlabels[i])
logger.info('* centroid (score, scaled value, original value):')
for j, val in enumerate(center):
logger.info('%s\t%s\t%s', self.df.columns[j], round(val, 2), round(inv_center[j], 2))
logger.info(' %s\t%s\t%s', self.df.columns[j].ljust(25), round(val, 2), round(inv_center[j], 2))
logger.info('Average center\t%s', np.round(mean, 2))

# Cluster center of the noisiest cluster based on average features
noisy_mean = np.min(means)
noisy_label = np.argmin(means)
noisy_mean = np.max(means)
noisy_label = np.argmax(means)
logger.info('Cluster center of the noisiest cluster (%s)', np.round(noisy_mean, 2))
logger.info('Noisy label: %s', noisy_label)
noisy_labels = np.where(self.labels == noisy_label)[0]
Expand Down Expand Up @@ -111,13 +106,14 @@ def get_rejects(self):
feature_importances = permutation_importance(clf, self.standard_data, self.labels)
importance_mean_mean = np.mean(feature_importances.importances_mean)
rej_coef = 0.1
logger.info('mean importance: %s', round(importance_mean_mean, 3))
logger.info('rejection coefficient: %s', rej_coef)
logger.info('* mean importance: %s', round(importance_mean_mean, 3))
logger.info('* rejection coefficient: %s', rej_coef)
logger.info('* decisions:')
rejects = []
for i, k in enumerate(self.df.columns):
for i, col in enumerate(self.df.columns):
importance = feature_importances['importances_mean'][i]
reject = importance < importance_mean_mean * rej_coef
logger.info('%s\t%s\t%s', k, round(importance, 3), 'reject' if reject else 'keep')
logger.info(' %s\t%s\t%s', col.ljust(25), round(importance, 3), 'reject' if reject else 'keep')
rejects.append(reject)
return rejects

Expand All @@ -131,18 +127,18 @@ def get_result_df(self):
def plot(self, plt):
"""Plot clustering and histograms"""
plt.figure(figsize=(10, 10))
pca = PCA(n_components=2)
data_t = pca.fit_transform(self.standard_data)
centroids = pca.transform(self.kmeans.cluster_centers_)
projection = random_projection.GaussianRandomProjection(n_components=2)
data_t = projection.fit_transform(self.standard_data)
centroids = projection.transform(self.kmeans.cluster_centers_)
for label_id in range(self.k):
points = np.where(self.labels == label_id)
plt.scatter(data_t[points, 0], data_t[points, 1],
c='orange' if label_id == self.noisy_label else 'blue',
label='noisy' if label_id == self.noisy_label else 'clean',
marker=',', s=1, alpha=0.3)
marker=',', s=1, alpha=0.1)
for label_id in range(self.k):
plt.scatter(centroids[label_id, 0], centroids[label_id, 1], s=100, alpha=1,
marker='+', c='darkorange' if label_id == self.noisy_label else 'darkblue',
marker='+', c='brown' if label_id == self.noisy_label else 'darkblue',
label='noisy centroid' if label_id == self.noisy_label else 'clean centroid')
plt.legend()
plt.title('Clusters')
Expand Down

0 comments on commit 351315d

Please sign in to comment.