diff --git a/bin/opusfilter-autogen b/bin/opusfilter-autogen
index 1315905..854a03a 100644
--- a/bin/opusfilter-autogen
+++ b/bin/opusfilter-autogen
@@ -33,10 +33,13 @@ parser.add_argument('--scripts', nargs='+', metavar='SCRIPT', help=(
     'If omitted, CharacterScoreFilter will not be used.'))
 parser.add_argument('--method', choices=['defaults', 'percentiles', 'clustering'], default='clustering',
                     help='Method for selecting filter thresholds (default: %(default)s)')
-parser.add_argument('--sample-size', default=100000, type=int,
+parser.add_argument('--sample-size', default=100000, type=int, metavar='INT',
                     help='Max number of sentence pairs used for data-based methods (default %(default)s)')
-parser.add_argument('--noisy-percentile', default=0.001, type=float,
+parser.add_argument('--noisy-percentile', default=0.001, type=float, metavar='FLOAT',
                     help='Proportion of the data considered to be noisy; only for percentiles method (default %(default)s)')
+parser.add_argument('--clusters', '-k', default=2, type=int, metavar='INT',
+                    help=('Number of clusters for the clustering method; try increasing if too much data is clustered '
+                          'as noisy (default %(default)s)'))
 parser.add_argument('--work-dir', default='work',
                     help='Location of the source and target files for the generated configuration (default %(default)s)')
 parser.add_argument('--inter-dir', help='Save intermediate files in this directory (use a temporary directory if not given)')
diff --git a/docs/automatic_configuration.md b/docs/automatic_configuration.md
index 9d65a1f..acad8f0 100644
--- a/docs/automatic_configuration.md
+++ b/docs/automatic_configuration.md
@@ -32,12 +32,15 @@ options:
   --method {defaults,percentiles,clustering}
                         Method for selecting filter thresholds (default:
                         clustering)
-  --sample-size SAMPLE_SIZE
-                        Max number of sentence pairs used for data-based
+  --sample-size INT     Max number of sentence pairs used for data-based
                         methods (default 100000)
-  --noisy-percentile NOISY_PERCENTILE
+  --noisy-percentile FLOAT
                         Proportion of the data considered to be noisy; only
                         for percentiles method (default 0.001)
+  --clusters INT, -k INT
+                        Number of clusters for the clustering method; try
+                        increasing if too much data is clustered as noisy
+                        (default 2)
   --work-dir WORK_DIR   Location of the source and target files for the
                         generated configuration (default work)
   --inter-dir INTER_DIR
@@ -83,9 +86,11 @@ First, we remove duplicates and empty sentences from the input corpus.
 Next, we take a subset (`--sample-size`, 100k sentence pairs by
 default) of the corpus and produce scores for each sentence pair in
 the subset with the previously mentioned filters. These scores are
-used as features for K-means clustering to classify the sentence pairs
+used as features for K-means clustering to group the sentence pairs
 into clean and noisy pairs. The values of the noisy cluster center are
 used as the filter threshold parameters in the generated config file.
+If it looks like too many samples are clustered as noisy, increasing
+the number of clusters (`--clusters`) may help.
 Figures from the clustering and score histograms are plotted given
 the `--plot` option.
 If you want also to save the intermediate files, make
diff --git a/opusfilter/autogen.py b/opusfilter/autogen.py
index 8b93b39..b718486 100644
--- a/opusfilter/autogen.py
+++ b/opusfilter/autogen.py
@@ -515,8 +515,9 @@ class ClusterFilters(DataBasedFiltersABC):
         ('LanguageIDFilter', {'id_method': 'cld2'}),
         'TerminalPunctuationFilter']
 
-    def __init__(self, files, max_length=150, **kwargs):
+    def __init__(self, files, k=2, max_length=150, **kwargs):
         super().__init__(files, max_length=150, **kwargs)
+        self.k = k
         self.label_file_path = os.path.join(self.inter_dir, 'labels.txt')
         self.scoredata = None
 
@@ -525,7 +526,7 @@ def set_filter_thresholds(self):
         score_file = get_score_file(
             self.files, [{name: params} for name, params in self.filters_to_add], self.inter_dir, self.sample_size,
             overwrite=self.overwrite, max_length=self.max_length)
-        self.scoredata = ScoreClusters(score_file)
+        self.scoredata = ScoreClusters(score_file, k=self.k)
         self._set_parameters(self.scoredata.get_result_df())
         if os.path.isfile(self.label_file_path) and not self.overwrite:
             logger.info('Label file "%s" exits, not overwriting', self.label_file_path)
diff --git a/opusfilter/autogen_cluster.py b/opusfilter/autogen_cluster.py
index 1e9f109..4e59c63 100644
--- a/opusfilter/autogen_cluster.py
+++ b/opusfilter/autogen_cluster.py
@@ -26,7 +26,8 @@ class ScoreClusters:
 
     """
 
-    def __init__(self, score_file, n=2):
+    def __init__(self, score_file, k=2):
+        self.k = k
         self.df = load_dataframe(score_file)
         self.filters = {}
         for name in self.df.columns:
@@ -36,8 +37,8 @@ def __init__(self, score_file, n=2):
 
         self.scaler = preprocessing.StandardScaler()
         self.standard_data = self.scaler.fit_transform(self.df)
-        logger.info('Training KMeans with %s clusters', n)
-        self.kmeans = KMeans(n_clusters=n, random_state=0, init='k-means++', n_init=1).fit(self.standard_data)
+        logger.info('Training KMeans with %s clusters', self.k)
+        self.kmeans = KMeans(n_clusters=self.k, random_state=0, init='k-means++', n_init=1).fit(self.standard_data)
         self.labels = self.kmeans.labels_
         self.cluster_centers = self.scaler.inverse_transform(self.kmeans.cluster_centers_)
         self._noisy_label = self._get_noisy_label()
@@ -48,9 +49,9 @@ def noisy_label(self):
         return self._noisy_label
 
     @property
-    def clean_label(self):
-        """Cluster label for clean data"""
-        return np.abs(self._noisy_label - 1)
+    def clean_labels(self):
+        """Cluster labels for clean data"""
+        return [idx for idx in range(self.k) if idx != self._noisy_label]
 
     def _get_flipped_centers(self):
         """Get centers with values flipped when low score indicates clean data"""
@@ -130,17 +131,23 @@ def get_result_df(self):
     def plot(self, plt):
         """Plot clustering and histograms"""
         plt.figure(figsize=(10, 10))
-        data_t = PCA(n_components=2).fit_transform(self.standard_data)
-        for label_id in [self.noisy_label, self.clean_label]:
+        pca = PCA(n_components=2)
+        data_t = pca.fit_transform(self.standard_data)
+        centroids = pca.transform(self.kmeans.cluster_centers_)
+        for label_id in range(self.k):
             points = np.where(self.labels == label_id)
             plt.scatter(data_t[points, 0], data_t[points, 1],
                         c='orange' if label_id == self.noisy_label else 'blue',
                         label='noisy' if label_id == self.noisy_label else 'clean',
-                        marker=',', s=1)
+                        marker=',', s=1, alpha=0.3)
+        for label_id in range(self.k):
+            plt.scatter(centroids[label_id, 0], centroids[label_id, 1], s=100, alpha=1,
+                        marker='+', c='darkorange' if label_id == self.noisy_label else 'darkblue',
+                        label='noisy centroid' if label_id == self.noisy_label else 'clean centroid')
         plt.legend()
         plt.title('Clusters')
         noisy_samples = self.df.iloc[np.where(self.labels == self.noisy_label)]
-        clean_samples = self.df.iloc[np.where(self.labels == self.clean_label)]
+        clean_samples = self.df.iloc[np.where(self.labels != self.noisy_label)]
         noisy_samples.hist(bins=100, figsize=(10, 10))
         plt.suptitle('Histograms for noisy samples')
         clean_samples.hist(bins=100, figsize=(10, 10))
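
For reviewers, a minimal standalone sketch of the k-cluster split this patch introduces. It is not a copy of `ScoreClusters`: the synthetic scores stand in for the real per-filter score columns, and the noisy-label heuristic below is a simplified stand-in for `_get_noisy_label()` (which works on flipped centers). The `k = 3` value mirrors a hypothetical `--clusters 3` invocation.

```python
# Sketch only: synthetic data and a simplified noisy-label heuristic.
import numpy as np
from sklearn import preprocessing
from sklearn.cluster import KMeans

k = 3  # e.g. opusfilter-autogen --method clustering --clusters 3
rng = np.random.default_rng(0)
scores = rng.normal(size=(1000, 4))  # placeholder for filter score columns

# Standardize and cluster, mirroring ScoreClusters.__init__
scaler = preprocessing.StandardScaler()
standard_data = scaler.fit_transform(scores)
kmeans = KMeans(n_clusters=k, random_state=0, init='k-means++', n_init=1).fit(standard_data)

# Simplified heuristic: call the cluster whose center has the lowest mean
# standardized score noisy; every other label counts as clean, matching
# the new list-valued clean_labels property.
noisy_label = int(np.argmin(kmeans.cluster_centers_.mean(axis=1)))
clean_labels = [idx for idx in range(k) if idx != noisy_label]

clean_mask = kmeans.labels_ != noisy_label
print(f'{clean_mask.sum()}/{len(scores)} samples clean; clean labels: {clean_labels}')
```

With `k > 2` the thresholds still come from a single noisy cluster center, while all remaining clusters count as clean; that is why the scalar `clean_label` property becomes the list-valued `clean_labels` and the histogram split uses `self.labels != self.noisy_label`.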