diff --git a/opusfilter/autogen_cluster.py b/opusfilter/autogen_cluster.py index 4e59c63..b30f4f3 100644 --- a/opusfilter/autogen_cluster.py +++ b/opusfilter/autogen_cluster.py @@ -5,8 +5,7 @@ import pandas as pd from sklearn.cluster import KMeans -from sklearn import preprocessing -from sklearn.decomposition import PCA +from sklearn import preprocessing, random_projection from sklearn.ensemble import RandomForestClassifier from sklearn.inspection import permutation_importance import numpy as np @@ -35,12 +34,13 @@ def __init__(self, score_file, k=2): filter_cls = getattr(filtermodule, first_part) self.filters[name] = filter_cls self.scaler = preprocessing.StandardScaler() - self.standard_data = self.scaler.fit_transform(self.df) + self.standard_data = self.scaler.fit_transform(self.df.mul(self.direction_vector)) logger.info('Training KMeans with %s clusters', self.k) - self.kmeans = KMeans(n_clusters=self.k, random_state=0, init='k-means++', n_init=1).fit(self.standard_data) + self.kmeans = KMeans(n_clusters=self.k, random_state=0, init='k-means++', n_init=1) + self.kmeans.fit(self.standard_data) self.labels = self.kmeans.labels_ - self.cluster_centers = self.scaler.inverse_transform(self.kmeans.cluster_centers_) + self.cluster_centers = self.scaler.inverse_transform(self.kmeans.cluster_centers_) * self.direction_vector self._noisy_label = self._get_noisy_label() @property @@ -53,34 +53,29 @@ def clean_labels(self): """Cluster labels for clean data""" return [idx for idx in range(self.k) if idx != self._noisy_label] - def _get_flipped_centers(self): - """Get centers with values flipped when low score indicates clean data""" - dir_fixed_centers = [] - for center in self.kmeans.cluster_centers_: - fixed_center = [] - for i, name in enumerate(self.df.columns): - value = center[i].copy() - if self.filters[name].score_direction == CLEAN_LOW: - value *= -1 - fixed_center.append(value) - dir_fixed_centers.append(fixed_center) - return dir_fixed_centers + @property + def direction_vector(self): + """Direction vector for the features (1 for CLEAN_LOW, -1 for CLEAN_HIGH)""" + return np.array([1 if self.filters[name].score_direction == CLEAN_LOW else -1 + for name in self.df.columns]) def _get_noisy_label(self): """Find label for the noisy cluster""" - means = np.mean(self._get_flipped_centers(), axis=1) + means = np.mean(self.kmeans.cluster_centers_, axis=1) # Output some cluster information nlabels = Counter(self.labels) for i, (center, inv_center, mean) in enumerate(zip(self.kmeans.cluster_centers_, self.cluster_centers, means)): - logger.info('Cluster #%s - number of samples: %s', i, nlabels[i]) + logger.info('Cluster #%s', i) + logger.info('* number of samples: %s', nlabels[i]) + logger.info('* centroid (score, scaled value, original value):') for j, val in enumerate(center): - logger.info('%s\t%s\t%s', self.df.columns[j], round(val, 2), round(inv_center[j], 2)) + logger.info(' %s\t%s\t%s', self.df.columns[j].ljust(25), round(val, 2), round(inv_center[j], 2)) logger.info('Average center\t%s', np.round(mean, 2)) # Cluster center of the noisiest cluster based on average features - noisy_mean = np.min(means) - noisy_label = np.argmin(means) + noisy_mean = np.max(means) + noisy_label = np.argmax(means) logger.info('Cluster center of the noisiest cluster (%s)', np.round(noisy_mean, 2)) logger.info('Noisy label: %s', noisy_label) noisy_labels = np.where(self.labels == noisy_label)[0] @@ -111,13 +106,14 @@ def get_rejects(self): feature_importances = permutation_importance(clf, self.standard_data, self.labels) importance_mean_mean = np.mean(feature_importances.importances_mean) rej_coef = 0.1 - logger.info('mean importance: %s', round(importance_mean_mean, 3)) - logger.info('rejection coefficient: %s', rej_coef) + logger.info('* mean importance: %s', round(importance_mean_mean, 3)) + logger.info('* rejection coefficient: %s', rej_coef) + logger.info('* decisions:') rejects = [] - for i, k in enumerate(self.df.columns): + for i, col in enumerate(self.df.columns): importance = feature_importances['importances_mean'][i] reject = importance < importance_mean_mean * rej_coef - logger.info('%s\t%s\t%s', k, round(importance, 3), 'reject' if reject else 'keep') + logger.info(' %s\t%s\t%s', col.ljust(25), round(importance, 3), 'reject' if reject else 'keep') rejects.append(reject) return rejects @@ -131,18 +127,18 @@ def get_result_df(self): def plot(self, plt): """Plot clustering and histograms""" plt.figure(figsize=(10, 10)) - pca = PCA(n_components=2) - data_t = pca.fit_transform(self.standard_data) - centroids = pca.transform(self.kmeans.cluster_centers_) + projection = random_projection.GaussianRandomProjection(n_components=2) + data_t = projection.fit_transform(self.standard_data) + centroids = projection.transform(self.kmeans.cluster_centers_) for label_id in range(self.k): points = np.where(self.labels == label_id) plt.scatter(data_t[points, 0], data_t[points, 1], c='orange' if label_id == self.noisy_label else 'blue', label='noisy' if label_id == self.noisy_label else 'clean', - marker=',', s=1, alpha=0.3) + marker=',', s=1, alpha=0.1) for label_id in range(self.k): plt.scatter(centroids[label_id, 0], centroids[label_id, 1], s=100, alpha=1, - marker='+', c='darkorange' if label_id == self.noisy_label else 'darkblue', + marker='+', c='brown' if label_id == self.noisy_label else 'darkblue', label='noisy centroid' if label_id == self.noisy_label else 'clean centroid') plt.legend() plt.title('Clusters')