Skip to content

Commit

Permalink
add --clusters option to opusfilter-autogen
Browse files Browse the repository at this point in the history
  • Loading branch information
svirpioj committed Sep 6, 2023
1 parent bbbf6b0 commit 34ce881
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 20 deletions.
9 changes: 6 additions & 3 deletions bin/opusfilter-autogen
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,13 @@ parser.add_argument('--scripts', nargs='+', metavar='SCRIPT', help=(
'If omitted, CharacterScoreFilter will not be used.'))
parser.add_argument('--method', choices=['defaults', 'percentiles', 'clustering'], default='clustering',
help='Method for selecting filter thresholds (default: %(default)s)')
parser.add_argument('--sample-size', default=100000, type=int,
parser.add_argument('--sample-size', default=100000, type=int, metavar='INT',
help='Max number of sentence pairs used for data-based methods (default %(default)s)')
parser.add_argument('--noisy-percentile', default=0.001, type=float,
parser.add_argument('--noisy-percentile', default=0.001, type=float, metavar='FLOAT',
help='Proportion of the data considered to be noisy; only for percentiles method (default %(default)s)')
parser.add_argument('--clusters', '-k', default=2, type=int, metavar='INT',
help=('Number of clusters for the clustering method; try increasing if too much data is clustered '
'as noisy (default %(default)s)'))
parser.add_argument('--work-dir', default='work',
help='Location of the source and target files for the generated configuration (default %(default)s)')
parser.add_argument('--inter-dir', help='Save intermediate files in this directory (use a temporary directory if not given)')
Expand All @@ -60,7 +63,7 @@ filters = [(name, json.loads(jsonstr)) for name, jsonstr in args.add_filter] if
if args.method == 'clustering':
filtergen = ClusterFilters(
files=args.files, langs=args.langs, scripts=args.scripts, filters=filters,
sample_size=args.sample_size, inter_dir=args.inter_dir, overwrite=args.overwrite)
sample_size=args.sample_size, k=args.clusters, inter_dir=args.inter_dir, overwrite=args.overwrite)
elif args.method == 'percentiles':
filtergen = PercentileFilters(
files=args.files, langs=args.langs, scripts=args.scripts, filters=filters,
Expand Down
13 changes: 9 additions & 4 deletions docs/automatic_configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,15 @@ options:
--method {defaults,percentiles,clustering}
Method for selecting filter thresholds (default:
clustering)
--sample-size SAMPLE_SIZE
Max number of sentence pairs used for data-based
--sample-size INT Max number of sentence pairs used for data-based
methods (default 100000)
--noisy-percentile NOISY_PERCENTILE
--noisy-percentile FLOAT
Proportion of the data considered to be noisy; only
for percentiles method (default 0.001)
--clusters INT, -k INT
Number of clusters for the clustering method; try
increasing if too much data is clustered as noisy
(default 2)
--work-dir WORK_DIR Location of the source and target files for the
generated configuration (default work)
--inter-dir INTER_DIR
Expand Down Expand Up @@ -83,9 +86,11 @@ First, we remove duplicates and empty sentences from the input
corpus. Next, we take a subset (`--sample-size`, 100k sentence pairs
by default) of the corpus and produce scores for each sentence pair in
the subset with the previously mentioned filters. These scores are
used as features for K-means clustering to classify the sentence pairs
used as features for K-means clustering to group the sentence pairs
into clean and noisy pairs. The values of the noisy cluster center are
used as the filter threshold parameters in the generated config file.
If it looks like too many samples are clustered as noisy, increasing
the number of clusters (`--clusters`) may help.

Figures of the clustering and the score histograms are plotted when the
`--plot` option is given. If you also want to save the intermediate files, make
Expand Down
2 changes: 1 addition & 1 deletion docs/filters/custom_filters.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ The `accept` method takes a single output yielded by the `score`
method, and returns whether the sentence pair should be accepted based
on the score.

The `score_direction` should be one of the following contants defined
The `score_direction` should be one of the following constants defined
in the `opusfilter` module depending on the output of the `score()`
method:

Expand Down
5 changes: 3 additions & 2 deletions opusfilter/autogen.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,8 +515,9 @@ class ClusterFilters(DataBasedFiltersABC):
('LanguageIDFilter', {'id_method': 'cld2'}),
'TerminalPunctuationFilter']

def __init__(self, files, max_length=150, **kwargs):
def __init__(self, files, k=2, max_length=150, **kwargs):
    """Set up threshold selection based on score clustering.

    Args:
        files: Input files; forwarded to the parent initializer.
        k: Number of clusters; stored as ``self.k`` and later handed to
            the score clustering step.
        max_length: Forwarded to the parent initializer.
        **kwargs: Remaining keyword arguments for the parent initializer.

    """
    # Forward the caller's value. A literal 150 used to be passed here,
    # which silently ignored any non-default ``max_length`` argument.
    super().__init__(files, max_length=max_length, **kwargs)
    self.k = k
    # NOTE(review): self.inter_dir is presumably set by the parent
    # initializer -- confirm against DataBasedFiltersABC.
    self.label_file_path = os.path.join(self.inter_dir, 'labels.txt')
    # No score data yet; assigned later when thresholds are computed.
    self.scoredata = None

Expand All @@ -525,7 +526,7 @@ def set_filter_thresholds(self):
score_file = get_score_file(
self.files, [{name: params} for name, params in self.filters_to_add], self.inter_dir, self.sample_size,
overwrite=self.overwrite, max_length=self.max_length)
self.scoredata = ScoreClusters(score_file)
self.scoredata = ScoreClusters(score_file, k=self.k)
self._set_parameters(self.scoredata.get_result_df())
if os.path.isfile(self.label_file_path) and not self.overwrite:
logger.info('Label file "%s" exits, not overwriting', self.label_file_path)
Expand Down
27 changes: 17 additions & 10 deletions opusfilter/autogen_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ class ScoreClusters:
"""

def __init__(self, score_file, n=2):
def __init__(self, score_file, k=2):
self.k = k
self.df = load_dataframe(score_file)
self.filters = {}
for name in self.df.columns:
Expand All @@ -36,8 +37,8 @@ def __init__(self, score_file, n=2):
self.scaler = preprocessing.StandardScaler()
self.standard_data = self.scaler.fit_transform(self.df)

logger.info('Training KMeans with %s clusters', n)
self.kmeans = KMeans(n_clusters=n, random_state=0, init='k-means++', n_init=1).fit(self.standard_data)
logger.info('Training KMeans with %s clusters', self.k)
self.kmeans = KMeans(n_clusters=self.k, random_state=0, init='k-means++', n_init=1).fit(self.standard_data)
self.labels = self.kmeans.labels_
self.cluster_centers = self.scaler.inverse_transform(self.kmeans.cluster_centers_)
self._noisy_label = self._get_noisy_label()
Expand All @@ -48,9 +49,9 @@ def noisy_label(self):
return self._noisy_label

@property
def clean_label(self):
"""Cluster label for clean data"""
return np.abs(self._noisy_label - 1)
def clean_labels(self):
"""Cluster labels for clean data"""
return [idx for idx in range(self.k) if idx != self._noisy_label]

def _get_flipped_centers(self):
"""Get centers with values flipped when low score indicates clean data"""
Expand Down Expand Up @@ -130,17 +131,23 @@ def get_result_df(self):
def plot(self, plt):
"""Plot clustering and histograms"""
plt.figure(figsize=(10, 10))
data_t = PCA(n_components=2).fit_transform(self.standard_data)
for label_id in [self.noisy_label, self.clean_label]:
pca = PCA(n_components=2)
data_t = pca.fit_transform(self.standard_data)
centroids = pca.transform(self.kmeans.cluster_centers_)
for label_id in range(self.k):
points = np.where(self.labels == label_id)
plt.scatter(data_t[points, 0], data_t[points, 1],
c='orange' if label_id == self.noisy_label else 'blue',
label='noisy' if label_id == self.noisy_label else 'clean',
marker=',', s=1)
marker=',', s=1, alpha=0.3)
for label_id in range(self.k):
plt.scatter(centroids[label_id, 0], centroids[label_id, 1], s=100, alpha=1,
marker='+', c='darkorange' if label_id == self.noisy_label else 'darkblue',
label='noisy centroid' if label_id == self.noisy_label else 'clean centroid')
plt.legend()
plt.title('Clusters')
noisy_samples = self.df.iloc[np.where(self.labels == self.noisy_label)]
clean_samples = self.df.iloc[np.where(self.labels == self.clean_label)]
clean_samples = self.df.iloc[np.where(self.labels != self.noisy_label)]
noisy_samples.hist(bins=100, figsize=(10, 10))
plt.suptitle('Histograms for noisy samples')
clean_samples.hist(bins=100, figsize=(10, 10))
Expand Down

0 comments on commit 34ce881

Please sign in to comment.