
Commit

add --clusters option to opusfilter-autogen
svirpioj committed Sep 6, 2023
1 parent bbbf6b0 commit 4ab1eb1
Showing 4 changed files with 34 additions and 18 deletions.
7 changes: 5 additions & 2 deletions bin/opusfilter-autogen
@@ -33,10 +33,13 @@ parser.add_argument('--scripts', nargs='+', metavar='SCRIPT', help=(
     'If omitted, CharacterScoreFilter will not be used.'))
 parser.add_argument('--method', choices=['defaults', 'percentiles', 'clustering'], default='clustering',
                     help='Method for selecting filter thresholds (default: %(default)s)')
-parser.add_argument('--sample-size', default=100000, type=int,
+parser.add_argument('--sample-size', default=100000, type=int, metavar='INT',
                     help='Max number of sentence pairs used for data-based methods (default %(default)s)')
-parser.add_argument('--noisy-percentile', default=0.001, type=float,
+parser.add_argument('--noisy-percentile', default=0.001, type=float, metavar='FLOAT',
                     help='Proportion of the data considered to be noisy; only for percentiles method (default %(default)s)')
+parser.add_argument('--clusters', '-k', default=2, type=int, metavar='INT',
+                    help=('Number of clusters for the clustering method; try increasing if too much data is clustered '
+                          'as noisy (default %(default)s)'))
 parser.add_argument('--work-dir', default='work',
                     help='Location of the source and target files for the generated configuration (default %(default)s)')
 parser.add_argument('--inter-dir', help='Save intermediate files in this directory (use a temporary directory if not given)')
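As an aside, here is a minimal, hypothetical sketch of how the new option behaves under argparse; the stripped-down parser below is illustrative only, as the real script defines many more options:

# Hypothetical, stripped-down parser for illustration; only the --clusters
# option from this commit is reproduced here.
import argparse

parser = argparse.ArgumentParser(prog='opusfilter-autogen')
parser.add_argument('--clusters', '-k', default=2, type=int, metavar='INT')

args = parser.parse_args(['--clusters', '3'])
assert args.clusters == 3
args = parser.parse_args(['-k', '4'])  # the short alias maps to the same dest
assert args.clusters == 4
args = parser.parse_args([])           # the default applies when omitted
assert args.clusters == 2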
13 changes: 9 additions & 4 deletions docs/automatic_configuration.md
@@ -32,12 +32,15 @@ options:
   --method {defaults,percentiles,clustering}
                         Method for selecting filter thresholds (default:
                         clustering)
-  --sample-size SAMPLE_SIZE
-                        Max number of sentence pairs used for data-based
+  --sample-size INT     Max number of sentence pairs used for data-based
                         methods (default 100000)
-  --noisy-percentile NOISY_PERCENTILE
+  --noisy-percentile FLOAT
                         Proportion of the data considered to be noisy; only
                         for percentiles method (default 0.001)
+  --clusters INT, -k INT
+                        Number of clusters for the clustering method; try
+                        increasing if too much data is clustered as noisy
+                        (default 2)
   --work-dir WORK_DIR   Location of the source and target files for the
                         generated configuration (default work)
   --inter-dir INTER_DIR
@@ -83,9 +86,11 @@ First, we remove duplicates and empty sentences from the input
 corpus. Next, we take a subset (`--sample-size`, 100k sentence pairs
 by default) of the corpus and produce scores for each sentence pair in
 the subset with the previously mentioned filters. These scores are
-used as features for K-means clustering to classify the sentence pairs
+used as features for K-means clustering to group the sentence pairs
 into clean and noisy pairs. The values of the noisy cluster center are
 used as the filter threshold parameters in the generated config file.
+If it looks like too many samples are clustered as noisy, increasing
+the number of clusters (`--clusters`) may help.
 
 Figures from the clustering and score histograms are plotted given the
 `--plot` option. If you also want to save the intermediate files, make
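To make the clustering step described in the documentation concrete, here is a small self-contained sketch of the idea, assuming synthetic scores and a simplified rule for picking the noisy cluster (the real implementation flips score directions per filter before deciding which cluster is noisy):

# Simplified sketch, not OpusFilter's actual code: standardize per-pair filter
# scores, cluster with K-means, and read candidate filter thresholds off the
# noisy cluster's center (in the original score units).
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
scores = rng.random((1000, 4))              # stand-in for per-pair filter scores

scaler = StandardScaler()
standardized = scaler.fit_transform(scores)

kmeans = KMeans(n_clusters=2, random_state=0, n_init=1).fit(standardized)
centers = scaler.inverse_transform(kmeans.cluster_centers_)

# Toy assumption: the cluster with the lower total score is the noisy one.
noisy = int(np.argmin(centers.sum(axis=1)))
thresholds = centers[noisy]                 # candidate filter thresholds
print(thresholds)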
5 changes: 3 additions & 2 deletions opusfilter/autogen.py
@@ -515,8 +515,9 @@ class ClusterFilters(DataBasedFiltersABC):
         ('LanguageIDFilter', {'id_method': 'cld2'}),
         'TerminalPunctuationFilter']
 
-    def __init__(self, files, max_length=150, **kwargs):
+    def __init__(self, files, k=2, max_length=150, **kwargs):
         super().__init__(files, max_length=max_length, **kwargs)
+        self.k = k
         self.label_file_path = os.path.join(self.inter_dir, 'labels.txt')
         self.scoredata = None
 
@@ -525,7 +526,7 @@ def set_filter_thresholds(self):
         score_file = get_score_file(
             self.files, [{name: params} for name, params in self.filters_to_add], self.inter_dir, self.sample_size,
             overwrite=self.overwrite, max_length=self.max_length)
-        self.scoredata = ScoreClusters(score_file)
+        self.scoredata = ScoreClusters(score_file, k=self.k)
         self._set_parameters(self.scoredata.get_result_df())
         if os.path.isfile(self.label_file_path) and not self.overwrite:
             logger.info('Label file "%s" exists, not overwriting', self.label_file_path)
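A hypothetical usage sketch showing how the new parameter threads through; the file names below are illustrative stand-ins, and any further constructor arguments required by DataBasedFiltersABC (e.g. a working directory) are omitted:

# Hypothetical usage sketch; corpus file names are stand-ins, not from the diff.
from opusfilter.autogen import ClusterFilters

cf = ClusterFilters(files=['corpus.src', 'corpus.tgt'], k=3)
cf.set_filter_thresholds()  # passes k=3 on to ScoreClusters internally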
27 changes: 17 additions & 10 deletions opusfilter/autogen_cluster.py
@@ -26,7 +26,8 @@ class ScoreClusters:
     """
 
-    def __init__(self, score_file, n=2):
+    def __init__(self, score_file, k=2):
+        self.k = k
         self.df = load_dataframe(score_file)
         self.filters = {}
         for name in self.df.columns:
@@ -36,8 +37,8 @@ def __init__(self, score_file, n=2):
         self.scaler = preprocessing.StandardScaler()
         self.standard_data = self.scaler.fit_transform(self.df)
 
-        logger.info('Training KMeans with %s clusters', n)
-        self.kmeans = KMeans(n_clusters=n, random_state=0, init='k-means++', n_init=1).fit(self.standard_data)
+        logger.info('Training KMeans with %s clusters', self.k)
+        self.kmeans = KMeans(n_clusters=self.k, random_state=0, init='k-means++', n_init=1).fit(self.standard_data)
         self.labels = self.kmeans.labels_
         self.cluster_centers = self.scaler.inverse_transform(self.kmeans.cluster_centers_)
         self._noisy_label = self._get_noisy_label()
@@ -48,9 +49,9 @@ def noisy_label(self):
         return self._noisy_label
 
     @property
-    def clean_label(self):
-        """Cluster label for clean data"""
-        return np.abs(self._noisy_label - 1)
+    def clean_labels(self):
+        """Cluster labels for clean data"""
+        return [idx for idx in range(self.k) if idx != self._noisy_label]
 
     def _get_flipped_centers(self):
         """Get centers with values flipped when low score indicates clean data"""
@@ -130,17 +131,23 @@ def get_result_df(self):
     def plot(self, plt):
         """Plot clustering and histograms"""
         plt.figure(figsize=(10, 10))
-        data_t = PCA(n_components=2).fit_transform(self.standard_data)
-        for label_id in [self.noisy_label, self.clean_label]:
+        pca = PCA(n_components=2)
+        data_t = pca.fit_transform(self.standard_data)
+        centroids = pca.transform(self.kmeans.cluster_centers_)
+        for label_id in range(self.k):
             points = np.where(self.labels == label_id)
             plt.scatter(data_t[points, 0], data_t[points, 1],
                         c='orange' if label_id == self.noisy_label else 'blue',
                         label='noisy' if label_id == self.noisy_label else 'clean',
-                        marker=',', s=1)
+                        marker=',', s=1, alpha=0.3)
+        for label_id in range(self.k):
+            plt.scatter(centroids[label_id, 0], centroids[label_id, 1], s=100, alpha=1,
+                        marker='+', c='darkorange' if label_id == self.noisy_label else 'darkblue',
+                        label='noisy centroid' if label_id == self.noisy_label else 'clean centroid')
         plt.legend()
         plt.title('Clusters')
         noisy_samples = self.df.iloc[np.where(self.labels == self.noisy_label)]
-        clean_samples = self.df.iloc[np.where(self.labels == self.clean_label)]
+        clean_samples = self.df.iloc[np.where(self.labels != self.noisy_label)]
         noisy_samples.hist(bins=100, figsize=(10, 10))
         plt.suptitle('Histograms for noisy samples')
         clean_samples.hist(bins=100, figsize=(10, 10))
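The central detail of the plotting change is that the cluster centroids are projected with the same fitted PCA as the data points, so both land in one coordinate system. A minimal sketch of that pattern on synthetic data (not OpusFilter's code):

# Fit PCA once on the data, then reuse the fitted projection for the centroids
# so they share axes with their clusters' points.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
data = rng.normal(size=(500, 6))                       # synthetic standardized scores
kmeans = KMeans(n_clusters=3, random_state=0, n_init=1).fit(data)

pca = PCA(n_components=2)
data_2d = pca.fit_transform(data)                      # fit on the data...
centroids_2d = pca.transform(kmeans.cluster_centers_)  # ...reuse for the centroids

# Each projected centroid now sits near the mean of its cluster's points.
for label in range(3):
    pts = data_2d[kmeans.labels_ == label]
    print(label, pts.mean(axis=0), centroids_2d[label])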
