diff --git a/bin/opusfilter-autogen b/bin/opusfilter-autogen index 9124170..a478f99 100644 --- a/bin/opusfilter-autogen +++ b/bin/opusfilter-autogen @@ -43,9 +43,9 @@ parser.add_argument('--clusters', '-k', default=2, type=int, metavar='INT', parser.add_argument('--work-dir', default='work', help='Location of the source and target files for the generated configuration (default %(default)s)') parser.add_argument('--inter-dir', help='Save intermediate files in this directory (use a temporary directory if not given)') -parser.add_argument('--plot', action='store_true', - help=('Show a scatter plot of the clustering and histograms of feature data distributions; ' - 'only for the clustering method')) +parser.add_argument('--plot', metavar='PATH', default=None, type=str, + help=('Create histograms of feature data distributions and a scatter plot of the clustering; ' + 'give path to plot the PDF files to, or "-" for interactive plots; only for the clustering method')) parser.add_argument('--list-defaults', action='store_true', help='List default filters of the method to the output and quit') parser.add_argument('--add-filter', nargs=2, action='append', default=[], metavar=('CLASS', 'JSON'), help=('Instead of using default filters, add a filter of CLASS with JSON parameters object ' @@ -78,9 +78,12 @@ if args.list_defaults: filters = filtergen.set_filter_thresholds() -if args.method == 'clustering' and args.plot: - filtergen.scoredata.plot(plt) - plt.show() +if args.method == 'clustering' and args.plot is not None: + if args.plot == '-': + filtergen.scoredata.plot(plt) + plt.show() + else: + filtergen.scoredata.plot(plt, path=args.plot) generator = ConfigurationGenerator( files=[os.path.abspath(f) for f in args.files], langs=args.langs, workdir=args.work_dir) diff --git a/opusfilter/autogen_cluster.py b/opusfilter/autogen_cluster.py index b30f4f3..2faf19e 100644 --- a/opusfilter/autogen_cluster.py +++ b/opusfilter/autogen_cluster.py @@ -1,13 +1,17 @@ """Unsupervised threshold selection for filters""" from collections import Counter +import itertools import logging +import os import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin from sklearn.cluster import KMeans -from sklearn import preprocessing, random_projection +from sklearn import decomposition, preprocessing, random_projection from sklearn.ensemble import RandomForestClassifier from sklearn.inspection import permutation_importance +from sklearn.utils.validation import check_is_fitted import numpy as np from . import CLEAN_LOW @@ -18,6 +22,147 @@ logger = logging.getLogger(__name__) +class ArcProjection(BaseEstimator, TransformerMixin): + """Project data to two dimensions using evenly distributed unit vectors + + The assigment of unit vectors to the original vectors is optimized + so that L2 norm between the correlation coefficients of the + original features and dot product of the vectors is minimized. + + Parameters + ---------- + + arc : float, 'auto' or 'full', default='auto' + Length of the arc used for projection vectors. For 'auto', the + length is set as the arccosine of the minimum correlation of + the input features. For 'full', full circle (2 * pi) is used. + + Attributes + ---------- + n_components_ : int + Number of components, always 2. + + components_ : ndarray of shape (n_components, n_features) + Matrix used for the projection. + + """ + + def __init__(self, arc='auto'): + self.n_components_ = 2 + self.arc = arc + + def _best_permutation(self, current, n_features, eval_func): + """Greedy search for best permutation of projection matrix""" + permutation = np.arange(n_features) + while True: + logger.debug("current permutation: %s", permutation) + minpair, mincost = self._best_single_swap(current, eval_func) + logger.debug("best swap: %s (%s)", minpair, mincost) + if minpair is None: + return permutation, current, mincost + fwd, rev = list(minpair), list(reversed(minpair)) + current[fwd] = current[rev] + permutation[fwd] = permutation[rev] + + @staticmethod + def _best_single_swap(matrix, eval_func): + """Find best swap of matrix rows based in eval_func""" + mincost = eval_func(matrix) + minpair = None + for pair in itertools.combinations(range(matrix.shape[0]), 2): + fwd, rev = list(pair), list(reversed(pair)) + matrix[fwd] = matrix[rev] + cost = eval_func(matrix) + if cost < mincost: + mincost = cost + minpair = pair + matrix[fwd] = matrix[rev] + return minpair, mincost + + def _make_matrix(self, corrmat, n_features): + """Generate the random projection matrix. + + Parameters + ---------- + corrmat : ndarray of shape (n_features, n_features), + Correlation matrix for the original features + + n_features : int, + Dimensionality of the original source space. + + Returns + ------- + components : ndarray of shape (n_components, n_features) + The generated random matrix. + """ + def costf(matrix): + return ((corrmat - matrix @ matrix.T)**2).sum() + + if self.arc == 'full': + dist = 2 * np.pi / n_features + elif self.arc == 'auto': + dist = np.arccos(corrmat.min()) / (n_features - 1) + else: + dist = self.arc / (n_features - 1) + matrix = np.array([ + [np.cos(idx * dist), np.sin(idx * dist)] for idx in range(n_features) + ]) + _, best_m, best_cost = self._best_permutation(matrix.copy(), n_features, costf) + logger.debug("fcorr:\n%s", corrmat.round(3)) + logger.debug("vdist:\n%s", (best_m @ best_m.T).round(3)) + logger.debug("diff:\n%s", (corrmat - best_m @ best_m.T).round(3)) + logger.debug("cost: %s", best_cost) + return best_m.T + + def fit(self, X: np.array, y=None): + """Generate a projection matrix. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Training set: only the shape is used to find matrix dimensions. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + BaseRandomProjection class instance. + """ + X = self._validate_data( + X, accept_sparse=["csr", "csc"], dtype=[np.float64, np.float32] + ) + + n_features = X.shape[1] + corrmat = np.corrcoef(X.T) + self.components_ = self._make_matrix( + corrmat, n_features + ).astype(X.dtype, copy=False) + + return self + + def transform(self, X): + """Project the data by using matrix product with the random matrix. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The input data to project into a smaller dimensional space. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Projected array. + """ + check_is_fitted(self) + X = self._validate_data( + X, accept_sparse=["csr", "csc"], reset=False, dtype=[np.float64, np.float32] + ) + + return X @ self.components_.T + + class ScoreClusters: """Cluster segments by filter scores @@ -42,6 +187,8 @@ def __init__(self, score_file, k=2): self.labels = self.kmeans.labels_ self.cluster_centers = self.scaler.inverse_transform(self.kmeans.cluster_centers_) * self.direction_vector self._noisy_label = self._get_noisy_label() + self.rejects = None + self.thresholds = None @property def noisy_label(self): @@ -119,17 +266,41 @@ def get_rejects(self): def get_result_df(self): """Return dataframe containing the thresholds and reject booleans""" + self.rejects = self.get_rejects() + self.thresholds = self.get_thresholds() return pd.DataFrame.from_dict( - {'name': self.get_columns(), - 'threshold': self.get_thresholds(), - 'reject': self.get_rejects()}) + {'name': self.get_columns(), 'threshold': self.thresholds, 'reject': self.rejects}) - def plot(self, plt): + def plot(self, plt, path=None, apply_rejects=True, projection='arc'): """Plot clustering and histograms""" + if projection == 'pca': + proj = decomposition.PCA(n_components=2) + elif projection == 'random': + proj = random_projection.GaussianRandomProjection(n_components=2) + elif projection == 'arc': + proj = ArcProjection() + else: + raise ValueError(f"Unknown projection: {projection}") + col_index = {col: idx for idx, col in enumerate(self.get_columns())} + index_col = dict(enumerate(self.get_columns())) + if apply_rejects and self.rejects: + indices = [idx for idx, reject in enumerate(self.rejects) if not reject] + cols = [index_col[idx] for idx in indices] + index_col = dict(enumerate(cols)) + data_t = proj.fit_transform(self.standard_data[:, indices]) + centroids = proj.transform(self.kmeans.cluster_centers_[:, indices]) + else: + data_t = proj.fit_transform(self.standard_data) + centroids = proj.transform(self.kmeans.cluster_centers_) + plt.figure() + for idx in range(proj.components_.shape[1]): + plt.arrow(0, 0, proj.components_[0, idx], proj.components_[1, idx], head_width=0.01, fc='k') + plt.text(proj.components_[0, idx], proj.components_[1, idx], index_col[idx]) + plt.gca().set_aspect('equal', adjustable='box') + plt.title('Projection vectors for clustering') + if path is not None: + plt.savefig(os.path.join(path, 'projection.pdf')) plt.figure(figsize=(10, 10)) - projection = random_projection.GaussianRandomProjection(n_components=2) - data_t = projection.fit_transform(self.standard_data) - centroids = projection.transform(self.kmeans.cluster_centers_) for label_id in range(self.k): points = np.where(self.labels == label_id) plt.scatter(data_t[points, 0], data_t[points, 1], @@ -141,10 +312,28 @@ def plot(self, plt): marker='+', c='brown' if label_id == self.noisy_label else 'darkblue', label='noisy centroid' if label_id == self.noisy_label else 'clean centroid') plt.legend() - plt.title('Clusters') + plt.title('Clustering') + if path is not None: + plt.savefig(os.path.join(path, 'clustering.pdf')) noisy_samples = self.df.iloc[np.where(self.labels == self.noisy_label)] clean_samples = self.df.iloc[np.where(self.labels != self.noisy_label)] - noisy_samples.hist(bins=100, figsize=(10, 10)) + subplots_n = noisy_samples.hist(bins=100, figsize=(10, 10)) + fig_noisy = plt.gcf() plt.suptitle('Histograms for noisy samples') - clean_samples.hist(bins=100, figsize=(10, 10)) + subplots_c = clean_samples.hist(bins=100, figsize=(10, 10)) + fig_clean = plt.gcf() plt.suptitle('Histograms for clean samples') + if apply_rejects and self.rejects: + for axes in ([axes for sublist in subplots_c for axes in sublist] + + [axes for sublist in subplots_n for axes in sublist]): + title = axes.get_title() + if not title: + continue + idx = col_index[title] + if self.rejects[idx]: + axes.text(0.5, 1, 'REJECTED', horizontalalignment='center', + verticalalignment='top', transform=axes.transAxes, color='r') + axes.axvline(self.thresholds[idx], color='k', linestyle='--', alpha=0.5) + if path is not None: + fig_clean.savefig(os.path.join(path, 'histogram_clean.pdf')) + fig_noisy.savefig(os.path.join(path, 'histogram_noisy.pdf'))