From 5d5931d376a8cfe91b37fa6b8631b339bf0a991b Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 21 Jun 2023 16:53:20 +0300 Subject: [PATCH 01/12] refactor and unify clustering code --- .pylintrc | 2 +- bin/opusfilter-autogen | 11 ++- opusfilter/autogen_cluster.py | 67 +++++++++-------- opusfilter/classifier.py | 1 - opusfilter/filters.py | 4 +- tests/test_autogen.py | 136 ++++++++++++++++++++++++++++++++++ tests/test_autogen_cluster.py | 84 --------------------- 7 files changed, 181 insertions(+), 124 deletions(-) delete mode 100644 tests/test_autogen_cluster.py diff --git a/.pylintrc b/.pylintrc index 40c01f9..535543a 100644 --- a/.pylintrc +++ b/.pylintrc @@ -521,7 +521,7 @@ max-bool-expr=5 max-branches=12 # Maximum number of locals for function / method body. -max-locals=15 +max-locals=16 # Maximum number of parents for a class (see R0901). max-parents=7 diff --git a/bin/opusfilter-autogen b/bin/opusfilter-autogen index 780461c..753a9e0 100644 --- a/bin/opusfilter-autogen +++ b/bin/opusfilter-autogen @@ -50,16 +50,15 @@ if args.filter_params == 'unsupervised': filtergen = FilterThresholdFinder( files=args.files, langs=args.langs, scripts=args.scripts, sample_size=args.sample_size, inter_dir=args.inter_dir, overwrite=args.overwrite) - filters, scoredata = filtergen.find_thresholds() - if args.plot: - scoredata.plot(plt) - plt.show() elif args.filter_params == 'percentiles': filtergen = PercentileFilters(files=args.files, excluded_percentile=args.noisy_percentile) - filters = filtergen.get_thresholds() else: filtergen = DefaultParameterFilters() - filters = filtergen.get_thresholds() +filters = filtergen.get_thresholds() + +if args.filter_params == 'unsupervised' and args.plot: + filtergen.scoredata.plot(plt) + plt.show() generator = ConfigurationGenerator( files=[os.path.abspath(f) for f in args.files], langs=args.langs, workdir=args.work_dir) diff --git a/opusfilter/autogen_cluster.py b/opusfilter/autogen_cluster.py index 455067e..ed06350 100644 --- a/opusfilter/autogen_cluster.py +++ b/opusfilter/autogen_cluster.py @@ -28,7 +28,11 @@ class ScoreClusters: - """Cluster segments by filter scores""" + """Cluster segments by filter scores + + Train k-means clustering and take thresholds based on the noisy cluster center. + + """ def __init__(self, score_file, n=2): self.df = load_dataframe(score_file) @@ -43,20 +47,14 @@ def __init__(self, score_file, n=2): logger.info('Training KMeans with %s clusters', n) self.kmeans = KMeans(n_clusters=n, random_state=0, n_init='auto').fit(self.standard_data) self.labels = self.kmeans.labels_ - - noisy_label, thresholds = self._get_noisy_label_and_thresholds() - self.noisy_label = noisy_label + self.cluster_centers = self.scaler.inverse_transform(self.kmeans.cluster_centers_) + self.noisy_label = self._get_noisy_label() self.clean_label = np.abs(self.noisy_label - 1) - self.thresholds = thresholds - - def _get_noisy_label_and_thresholds(self): - """Find filter thresholds - - Train k-means clustering and take thresholds from the noisy cluster center. 
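# Editor's sketch (not part of the patch): a minimal illustration of the
# clustering idea summarized in the docstring above, assuming that a higher
# score always means cleaner data (the real ScoreClusters additionally flips
# CLEAN_LOW score directions before comparing cluster centers). The `scores`
# array is synthetic stand-in data; the scikit-learn calls mirror the ones
# used in this diff.
import numpy as np
from sklearn import preprocessing
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
scores = rng.random((1000, 4))  # stand-in for per-segment filter scores

scaler = preprocessing.StandardScaler()
standard_data = scaler.fit_transform(scores)
kmeans = KMeans(n_clusters=2, random_state=0, n_init='auto').fit(standard_data)
# The cluster whose center has the lower mean is treated as the noisy one.
noisy_label = int(np.argmin(kmeans.cluster_centers_.mean(axis=1)))
# Mapping its center back to the original scale gives one threshold per score.
thresholds = scaler.inverse_transform(kmeans.cluster_centers_)[noisy_label].round(6).tolist()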
- """ + def _get_noisy_label(self): + """Find label for the noisy cluster""" centers = self.kmeans.cluster_centers_ - inv_centers = self.scaler.inverse_transform(centers) + inv_centers = self.cluster_centers # Flip values if low score indicates clean data dir_fixed_centers = [] @@ -80,14 +78,18 @@ def _get_noisy_label_and_thresholds(self): # Cluster center of the noisiest cluster based on average features noisy_mean = np.min(means) noisy_label = np.argmin(means) - logger.info('Cluster center of the noisiest cluster (%s)', np.round(noisy_mean, 2)) logger.info('Noisy label: %s', noisy_label) noisy_labels = np.where(self.labels == noisy_label)[0] logger.info('Number of noisy labels: %s', f'{len(noisy_labels)}/{len(self.labels)} ({round(100*len(noisy_labels)/len(self.labels), 2)}%)') - thresholds = inv_centers[noisy_label].round(3).tolist() - return noisy_label, thresholds + return noisy_label + + def get_thresholds(self, method='noisy_center', precision=6): + """Return thresholds for noisy samples""" + if method != 'noisy_center': + raise ValueError(f'Method {method} for thresholds not implemented') + return self.cluster_centers[self.noisy_label].round(precision).tolist() def get_rejects(self): """Train random forest classifier to find important features""" @@ -158,29 +160,29 @@ def __init__(self, files, langs, scripts, sample_size, inter_dir, overwrite): self.filter_params['CharacterScoreFilter'] = {'scripts': self.scripts} if len(self.input_files) == 2: self.filter_params['TerminalPunctuationFilter'] = {} + self.scoredata = None - def find_thresholds(self): - """Find suitable filter thresholds - - Returns a dict of filter parameters and a ScoreClusters object - - """ - score_file = self._prepare_data() - scoreclusters = ScoreClusters(os.path.join(self.inter_dir, score_file)) - self._set_parameters(scoreclusters.thresholds, scoreclusters.get_rejects()) + def get_thresholds(self): + """Get filter configuration with thresholds""" + self.scoredata = ScoreClusters(self._get_score_file()) + self._set_parameters(self.scoredata.get_thresholds(), self.scoredata.get_rejects()) if os.path.isfile(self.label_file_path) and not self.overwrite: logger.info('Label file "%s" exits, not overwriting', self.label_file_path) else: with open(self.label_file_path, 'w', encoding='utf-8') as label_file: - for label in scoreclusters.labels: + for label in self.scoredata.labels: label_file.write(str(label)+'\n') if self.use_tmp: shutil.rmtree(self.inter_dir) filters = [{k.split('.', maxsplit=1)[0]: v} for k, v in self.filter_params.items()] - return filters, scoreclusters + return filters + + def _get_score_file(self): + """Calculate filter scores and return score file + + Remove duplicates and empty lines, take a sample of size n, produce filter scores - def _prepare_data(self): - """Remove duplicates and empty lines, take a sample of size n, produce filter scores""" + """ config_gen = ConfigurationGenerator(files=[os.path.abspath(f) for f in self.input_files], workdir=self.inter_dir) config_gen.add_remove_duplicates() config_gen.add_filter([{'LengthFilter': {'unit': 'word', 'min_length': 1, 'max_length': 150}}]) @@ -190,10 +192,15 @@ def _prepare_data(self): yaml.dump(pre_config, pathlib.Path(os.path.join(self.inter_dir, 'config.yaml'))) opusf = OpusFilter(pre_config) opusf.execute_steps(overwrite=self.overwrite) - return score_file + return os.path.join(self.inter_dir, score_file) def _set_parameters(self, thresholds, rejects): - """Set filter parameters based on thresholds and rejects""" + """Set filter 
parameters based on thresholds and rejects + + thresholds: list of threshold values + rejects: boolean-valued dictionary, dataframe columns as keys + + """ for i, name in enumerate(rejects): fullname = name name_parts = name.split('.') diff --git a/opusfilter/classifier.py b/opusfilter/classifier.py index 8944b42..9e4796f 100644 --- a/opusfilter/classifier.py +++ b/opusfilter/classifier.py @@ -189,7 +189,6 @@ def __init__(self, training_scores=None, dev_scores=None, model_type=None, self.df_training_data = load_dataframe(training_scores) self.orig_data = self.df_training_data.copy() - self.group_config = features self.feature_config = {} found_in_data = set() for t_key in self.df_training_data.keys(): diff --git a/opusfilter/filters.py b/opusfilter/filters.py index 2acdea6..d3dff4b 100644 --- a/opusfilter/filters.py +++ b/opusfilter/filters.py @@ -540,9 +540,9 @@ class RepetitionFilter(FilterABC): def __init__(self, threshold=2, min_length=3, max_length=100, **kwargs): if threshold < 1: - raise ConfigurationError("threshold for RepetitionFilter has to be at least one") + raise ConfigurationError(f"threshold for RepetitionFilter has to be at least one, got {threshold}") if min_length < 1: - raise ConfigurationError("min_length for RepetitionFilter has to be at least one") + raise ConfigurationError(f"min_length for RepetitionFilter has to be at least one, got {min_length}") self._threshold = threshold self._min_length = min_length self._max_length = max_length diff --git a/tests/test_autogen.py b/tests/test_autogen.py index d72fd89..e8a4711 100644 --- a/tests/test_autogen.py +++ b/tests/test_autogen.py @@ -6,8 +6,144 @@ import tempfile import unittest +import opustools + from opusfilter import FilterABC, ConfigurationError, filters, pipeline from opusfilter.autogen import * +from opusfilter.autogen_cluster import FilterThresholdFinder +from opusfilter.opusfilter import OpusFilter + + +default_params = {'AlphabetRatioFilter': {}, + 'CharacterScoreFilter': {'scripts': ['latin', 'latin']}, + 'LanguageIDFilter': {'name': 'cld2', 'id_method': 'cld2', 'languages': ['en', 'de']}, + 'LengthRatioFilter.char': {'name': 'char', 'unit': 'char'}, + 'LengthRatioFilter.word': {'name': 'word', 'unit': 'word'}, + 'NonZeroNumeralsFilter': {}, + 'TerminalPunctuationFilter': {}} + +example_params = {'AlphabetRatioFilter': {'threshold': 1}, + 'CharacterScoreFilter': {'scripts': ['latin', 'latin'], 'thresholds': [1, 1]}, + 'LanguageIDFilter': {'name': 'cld2', 'id_method': 'cld2', + 'languages': ['en', 'de'], + 'thresholds': [1, 1]}, + 'LengthRatioFilter.char': {'name': 'char', 'threshold': 1, 'unit': 'char'}, + 'LengthRatioFilter.word': {'name': 'word', 'threshold': 1, 'unit': 'word'}, + 'NonZeroNumeralsFilter': {'threshold': 1}, + 'TerminalPunctuationFilter': {'threshold': 1}} + +default_rejects = {'TerminalPunctuationFilter': False, + 'AlphabetRatioFilter.0': False, + 'AlphabetRatioFilter.1': False, + 'CharacterScoreFilter.0': False, + 'CharacterScoreFilter.1': False, + 'LanguageIDFilter.0': False, + 'LanguageIDFilter.1': False, + 'LengthRatioFilter.char': False, + 'LengthRatioFilter.word': False, + 'NonZeroNumeralsFilter.0': False} + + +class TestAutogen(unittest.TestCase): + + @classmethod + def setUpClass(self): + self.tempdir = tempfile.mkdtemp() + self.source = 'en' + self.target = 'sv' + self.src_out = os.path.join(self.tempdir, f'sents.{self.source}') + self.tgt_out = os.path.join(self.tempdir, f'sents.{self.target}') + opus_reader = opustools.OpusRead( + directory='RF', + source=self.source, + 
target=self.target, + release='v1', + suppress_prompts=True, + preprocess='raw', + write_mode='moses', + write=[self.src_out, self.tgt_out], + leave_non_alignments_out=True, + download_dir=self.tempdir) + opus_reader.printPairs() + + @classmethod + def tearDownClass(self): + shutil.rmtree(self.tempdir) + + def _test_filters(self, filters): + generator = ConfigurationGenerator([self.src_out, self.tgt_out], workdir='work', langs=[self.source, self.target]) + generator.add_filter(filters) + configuration = generator.get_config() + of = OpusFilter(configuration) + of.execute_steps(overwrite=True) + + def test_default_filters(self): + filtergen = DefaultParameterFilters() + filters = filtergen.get_thresholds() + self._test_filters(filters) + + @unittest.expectedFailure + def test_percentile_filters(self): + filtergen = PercentileFilters(files=[self.src_out, self.tgt_out], excluded_percentile=0.05) + filters = filtergen.get_thresholds() + self._test_filters(filters) + + def test_threshold_finder(self): + filtergen = FilterThresholdFinder( + files=[self.src_out, self.tgt_out], langs=[self.source, self.target], scripts=['Latin', 'Latin'], + sample_size=180, inter_dir=self.tempdir, overwrite=True) + filters = filtergen.get_thresholds() + self._test_filters(filters) + + +class TestThresholdFinder(unittest.TestCase): + + def test_set_default_parameters(self): + tf = FilterThresholdFinder([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) + self.assertEqual(tf.filter_params, default_params) + + def test_reject_all_parameters(self): + tf = FilterThresholdFinder([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) + tf._set_parameters([1 for i in range(10)], {k: True for k in default_rejects.keys()}) + self.assertEqual(tf.filter_params, {}) + + def test_set_all_parameters(self): + tf = FilterThresholdFinder([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) + tf._set_parameters([1 for i in range(10)], default_rejects) + self.assertEqual(tf.filter_params, example_params) + + def test_set_parameters_reject_one_side(self): + tf = FilterThresholdFinder([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) + rejects = copy.deepcopy(default_rejects) + rejects['LanguageIDFilter.0'] = True + params = copy.deepcopy(example_params) + params['LanguageIDFilter']['thresholds'][0] = -1 + tf._set_parameters([1 for i in range(10)], rejects) + self.assertEqual(tf.filter_params, params) + + tf.filter_params = copy.deepcopy(default_params) + rejects = copy.deepcopy(default_rejects) + rejects['LanguageIDFilter.1'] = True + params = copy.deepcopy(example_params) + params['LanguageIDFilter']['thresholds'][1] = -1 + tf._set_parameters([1 for i in range(10)], rejects) + self.assertEqual(tf.filter_params, params) + + tf.filter_params = copy.deepcopy(default_params) + rejects = copy.deepcopy(default_rejects) + rejects['CharacterScoreFilter.0'] = True + params = copy.deepcopy(example_params) + params['CharacterScoreFilter']['thresholds'][0] = -1 + tf._set_parameters([1 for i in range(10)], rejects) + self.assertEqual(tf.filter_params, params) + + tf.filter_params = copy.deepcopy(default_params) + rejects = copy.deepcopy(default_rejects) + rejects['CharacterScoreFilter.1'] = True + params = copy.deepcopy(example_params) + params['CharacterScoreFilter']['thresholds'][1] = -1 + tf._set_parameters([1 for i in range(10)], rejects) + self.assertEqual(tf.filter_params, params) class TestGenericFilterAdjuster(unittest.TestCase): diff --git a/tests/test_autogen_cluster.py 
b/tests/test_autogen_cluster.py deleted file mode 100644 index 74b1b24..0000000 --- a/tests/test_autogen_cluster.py +++ /dev/null @@ -1,84 +0,0 @@ -import unittest -import pprint -import copy - -from opusfilter.autogen_cluster import FilterThresholdFinder - -default_params = {'AlphabetRatioFilter': {}, - 'CharacterScoreFilter': {'scripts': ['latin', 'latin']}, - 'LanguageIDFilter': {'name': 'cld2', 'id_method': 'cld2', 'languages': ['en', 'de']}, - 'LengthRatioFilter.char': {'name': 'char', 'unit': 'char'}, - 'LengthRatioFilter.word': {'name': 'word', 'unit': 'word'}, - 'NonZeroNumeralsFilter': {}, - 'TerminalPunctuationFilter': {}} - -example_params = {'AlphabetRatioFilter': {'threshold': 1}, - 'CharacterScoreFilter': {'scripts': ['latin', 'latin'], 'thresholds': [1, 1]}, - 'LanguageIDFilter': {'name': 'cld2', 'id_method': 'cld2', - 'languages': ['en', 'de'], - 'thresholds': [1, 1]}, - 'LengthRatioFilter.char': {'name': 'char', 'threshold': 1, 'unit': 'char'}, - 'LengthRatioFilter.word': {'name': 'word', 'threshold': 1, 'unit': 'word'}, - 'NonZeroNumeralsFilter': {'threshold': 1}, - 'TerminalPunctuationFilter': {'threshold': 1}} - -default_rejects = {'TerminalPunctuationFilter': False, - 'AlphabetRatioFilter.0': False, - 'AlphabetRatioFilter.1': False, - 'CharacterScoreFilter.0': False, - 'CharacterScoreFilter.1': False, - 'LanguageIDFilter.0': False, - 'LanguageIDFilter.1': False, - 'LengthRatioFilter.char': False, - 'LengthRatioFilter.word': False, - 'NonZeroNumeralsFilter.0': False} - - -class TestThresholdFinder(unittest.TestCase): - - def test_set_default_parameters(self): - tf = FilterThresholdFinder([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) - self.assertEqual(tf.filter_params, default_params) - - def test_reject_all_parameters(self): - tf = FilterThresholdFinder([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) - tf._set_parameters([1 for i in range(10)], {k: True for k in default_rejects.keys()}) - self.assertEqual(tf.filter_params, {}) - - def test_set_all_parameters(self): - tf = FilterThresholdFinder([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) - tf._set_parameters([1 for i in range(10)], default_rejects) - self.assertEqual(tf.filter_params, example_params) - - def test_set_parameters_reject_one_side(self): - tf = FilterThresholdFinder([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) - rejects = copy.deepcopy(default_rejects) - rejects['LanguageIDFilter.0'] = True - params = copy.deepcopy(example_params) - params['LanguageIDFilter']['thresholds'][0] = -1 - tf._set_parameters([1 for i in range(10)], rejects) - self.assertEqual(tf.filter_params, params) - - tf.filter_params = copy.deepcopy(default_params) - rejects = copy.deepcopy(default_rejects) - rejects['LanguageIDFilter.1'] = True - params = copy.deepcopy(example_params) - params['LanguageIDFilter']['thresholds'][1] = -1 - tf._set_parameters([1 for i in range(10)], rejects) - self.assertEqual(tf.filter_params, params) - - tf.filter_params = copy.deepcopy(default_params) - rejects = copy.deepcopy(default_rejects) - rejects['CharacterScoreFilter.0'] = True - params = copy.deepcopy(example_params) - params['CharacterScoreFilter']['thresholds'][0] = -1 - tf._set_parameters([1 for i in range(10)], rejects) - self.assertEqual(tf.filter_params, params) - - tf.filter_params = copy.deepcopy(default_params) - rejects = copy.deepcopy(default_rejects) - rejects['CharacterScoreFilter.1'] = True - params = copy.deepcopy(example_params) - 
params['CharacterScoreFilter']['thresholds'][1] = -1 - tf._set_parameters([1 for i in range(10)], rejects) - self.assertEqual(tf.filter_params, params) From d0e56578fb6df6e93bdb37427c683448310ae673 Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 2 Aug 2023 17:12:08 +0300 Subject: [PATCH 02/12] refactor and unify autogen code --- bin/opusfilter-autogen | 35 ++-- opusfilter/autogen.py | 340 +++++++++++++++++++++++++--------- opusfilter/autogen_cluster.py | 121 ------------ opusfilter/filters.py | 10 +- tests/test_autogen.py | 111 ++++++----- 5 files changed, 348 insertions(+), 269 deletions(-) diff --git a/bin/opusfilter-autogen b/bin/opusfilter-autogen index 753a9e0..d6b4bba 100644 --- a/bin/opusfilter-autogen +++ b/bin/opusfilter-autogen @@ -3,11 +3,11 @@ import argparse import logging import os +import sys import matplotlib.pyplot as plt -from opusfilter.autogen import ConfigurationGenerator, DefaultParameterFilters, PercentileFilters -from opusfilter.autogen_cluster import FilterThresholdFinder +from opusfilter.autogen import ClusterFilters, ConfigurationGenerator, DefaultParameterFilters, PercentileFilters from opusfilter.util import yaml try: @@ -24,14 +24,16 @@ parser = argparse.ArgumentParser( prog='opusfilter-autogen', description='Generate initial configuration based on parallel text data') +#parser.add_argument('--add-filter', nargs=2, action='append', default=[], metavar=('CLASS', 'JSON'), +# help='Add filter of CLASS with JSON parameters object') parser.add_argument('--files', required=True, nargs='+', metavar='TEXTFILE', help='parallel text input file(s)') parser.add_argument('--langs', nargs='+', metavar='LANGCODE', help='Language codes corresponding to the input files. If omitted, LanguageIDFilters will not be used.') parser.add_argument('--scripts', nargs='+', metavar='SCRIPT', help=( 'Alphabetic scripts (e.g. Latin) corresponding to the input files. 
' 'If omitted, CharacterScoreFilter will not be used.')) -parser.add_argument('--filter-params', choices=['default', 'percentiles', 'unsupervised'], default='unsupervised', - help='Method for selecting filter parameters (default: %(default)s)') +parser.add_argument('--method', choices=['defaults', 'percentiles', 'clustering'], default='clustering', + help='Method for selecting filter thresholds (default: %(default)s)') parser.add_argument('--sample-size', default=100000, type=int, help='Max number of sentence pairs used for clustering (default %(default)s)') parser.add_argument('--noisy-percentile', default=0.001, type=float, @@ -41,26 +43,35 @@ parser.add_argument('--work-dir', default='work', parser.add_argument('--inter-dir', help='Save intermediate files in this directory (use a temporary directory if not given)') parser.add_argument('--plot', action='store_true', help='Show a scatter plot of the clustering and histograms of feature data distributions') +parser.add_argument('--list-defaults', action='store_true', help='List default filters of the method to the output and quit') parser.add_argument('--overwrite', action='store_true', help='Overwrite existing config file and intermediate files') parser.add_argument('-o', '--output', type=argparse.FileType('w'), default='-', metavar='CONFIGFILE', help='Output configuration file (default %(default)s)') args = parser.parse_args() -if args.filter_params == 'unsupervised': - filtergen = FilterThresholdFinder( +if args.method == 'clustering': + filtergen = ClusterFilters( files=args.files, langs=args.langs, scripts=args.scripts, sample_size=args.sample_size, inter_dir=args.inter_dir, overwrite=args.overwrite) -elif args.filter_params == 'percentiles': - filtergen = PercentileFilters(files=args.files, excluded_percentile=args.noisy_percentile) +elif args.method == 'percentiles': + filtergen = PercentileFilters( + files=args.files, langs=args.langs, scripts=args.scripts, + excluded_percentile=args.noisy_percentile, sample_size=args.sample_size, + inter_dir=args.inter_dir, overwrite=args.overwrite) else: - filtergen = DefaultParameterFilters() -filters = filtergen.get_thresholds() + filtergen = DefaultParameterFilters(langs=args.langs, scripts=args.scripts) + +if args.list_defaults: + yaml.dump(filtergen.DEFAULT_FILTERS, args.output) + sys.exit(0) + +filters = filtergen.set_filter_thresholds() -if args.filter_params == 'unsupervised' and args.plot: +if args.method == 'clustering' and args.plot: filtergen.scoredata.plot(plt) plt.show() generator = ConfigurationGenerator( files=[os.path.abspath(f) for f in args.files], langs=args.langs, workdir=args.work_dir) -generator.add_filter(filters) +generator.add_filter(filtergen.filters) yaml.dump(generator.get_config(), args.output) diff --git a/opusfilter/autogen.py b/opusfilter/autogen.py index 490543e..914ae7b 100644 --- a/opusfilter/autogen.py +++ b/opusfilter/autogen.py @@ -3,20 +3,24 @@ import copy import inspect import logging +import os +import pathlib +import shutil +import tempfile -from pandas import json_normalize -from tqdm import tqdm +import ruamel.yaml from . import CLEAN_LOW, CLEAN_HIGH, CLEAN_BETWEEN, CLEAN_TRUE, CLEAN_FALSE from . import OpusFilterError, ConfigurationError from . import filters as filtermodule -from . 
import pipeline -from .classifier import lists_to_dicts -from .util import file_open +from .autogen_cluster import ScoreClusters +from .classifier import load_dataframe +from .opusfilter import OpusFilter logger = logging.getLogger(__name__) logger.setLevel('WARNING') +yaml = ruamel.yaml.YAML() class FilterArgumentFailure(OpusFilterError): @@ -114,87 +118,171 @@ def get_config(self): return config +def get_score_file(input_files, filters, outputdir, sample_size, overwrite=False, max_length=150): + """Calculate filter scores and return score file + + Remove duplicates and empty lines, take a sample of size n, + produce filter scores file, and return its path. + + """ + config_gen = ConfigurationGenerator(files=[os.path.abspath(f) for f in input_files], workdir=outputdir) + config_gen.add_remove_duplicates() + config_gen.add_filter([{'LengthFilter': {'unit': 'word', 'min_length': 1, 'max_length': max_length}}]) + config_gen.add_subset(sample_size, 1) + score_file = config_gen.add_score(filters) + pre_config = config_gen.get_config() + yaml.dump(pre_config, pathlib.Path(os.path.join(outputdir, 'config.yaml'))) + opusf = OpusFilter(pre_config) + opusf.execute_steps(overwrite=overwrite) + return os.path.join(outputdir, score_file) + + +def get_default_parameters(filter_name): + """Get default parameters for a filter + + Uses the signature of the class. Arguments without default + values are ignored and will cause a failure. + + """ + filter_cls = getattr(filtermodule, filter_name) + default_parameters = {} + sig = inspect.signature(filter_cls) + logger.info("signature: %s%s", filter_name, sig) + for key, parameter in sig.parameters.items(): + if parameter.default == inspect.Signature.empty: + if key != 'kwargs': + logger.warning("Ignoring argument without default: %s", key) + continue + default_parameters[key] = parameter.default + return default_parameters + + +def parse_filter_specs(specs): + """Return classname, params tuple for filter specifications""" + if isinstance(specs, str): + name = specs + params = {} + else: + name, params = copy.deepcopy(specs) + if '.' 
in name: + name, subname = name.split('.', maxsplit=1) + params['name'] = subname + return name, params + + class DefaultParameterFilters: """Filter configuration with default parameters""" - def __init__(self, filters=None): - self.filters_to_add = filters if filters is not None else [ - 'LengthFilter', 'LengthRatioFilter', 'LongWordFilter', 'HtmlTagFilter', - 'AverageWordLengthFilter', 'AlphabetRatioFilter', - 'TerminalPunctuationFilter', 'NonZeroNumeralsFilter', - 'LongestCommonSubstringFilter', 'SimilarityFilter', 'RepetitionFilter' - ] - - def get_thresholds(self): + DEFAULT_FILTERS = ['LengthFilter', + ('LengthRatioFilter.word', {'unit': 'word'}), ('LengthRatioFilter.char', {'unit': 'char'}), + 'LongWordFilter', 'HtmlTagFilter', + 'AverageWordLengthFilter', 'AlphabetRatioFilter', + 'TerminalPunctuationFilter', 'NonZeroNumeralsFilter', + 'LongestCommonSubstringFilter', 'SimilarityFilter', 'RepetitionFilter', + 'CharacterScoreFilter', ('LanguageIDFilter', {'id_method': 'cld2'})] + + def __init__(self, langs=None, scripts=None, filters=None): + if filters is None: + filters = self.DEFAULT_FILTERS + filters = [parse_filter_specs(spec) for spec in filters] + self.filters_to_add = [] + for filter_name, filter_params in filters: + if filter_name == 'CharacterScoreFilter' and 'scripts' not in filter_params: + if not scripts: + logger.warning('Cannot add CharacterScoreFilter (no scripts provided)') + continue + filter_params['scripts'] = scripts + if filter_name == 'LanguageIDFilter' and 'languages' not in filter_params: + if not langs: + logger.warning('Cannot add LanguageIDFilter (no languages provided)') + continue + filter_params['languages'] = langs + self.filters_to_add.append((filter_name, filter_params)) + self._filters = [] # Final filters + + @property + def filters(self): """Get filter configuration with thresholds""" - filters = [] - for filterclass in self.filters_to_add: + return self._filters + + def set_filter_thresholds(self): + """Set filter thresholds""" + for filter_name, filter_params in self.filters_to_add: try: - filter_config = self.get_filter_parameters(filterclass) + filter_config = self.get_filter_parameters(filter_name, filter_params) except FilterArgumentFailure as err: - logger.error("Unusable default arguments for %s: %s", filterclass, err) + logger.error("Unusable default arguments for %s: %s", filter_name, err) continue - filters.append(filter_config) - return filters + self._filters.append(filter_config) @staticmethod - def get_filter_parameters(filterclass): - """Return default parameters for filter of the given class""" - adjuster = GenericFilterAdjuster(filterclass) - filter_cls = getattr(filtermodule, filterclass) + def get_filter_parameters(filter_name, filter_params): + """Return parameters for filter of the given class""" + filter_cls = getattr(filtermodule, filter_name) + defaults = get_default_parameters(filter_name) + defaults.update(filter_params) try: - filter_cls(**adjuster.default_parameters) + filter_cls(**defaults) except ConfigurationError as err: raise FilterArgumentFailure(err) from err - filter_config = {filterclass: adjuster.default_parameters} + filter_config = {filter_name: defaults} return filter_config -class PercentileFilters: - """Configuration generator based on filter score percentiles""" +class PercentileFilters(DefaultParameterFilters): + """Filter configuration based on filter score percentiles""" - def __init__(self, files, filters=None, excluded_percentile=0.001, sample_size=100000): + def __init__(self, files, langs=None, 
scripts=None, filters=None, excluded_percentile=0.001, + sample_size=100000, inter_dir=None, overwrite=False): + super().__init__(langs=langs, scripts=scripts, filters=filters) self.files = files self.sample_size = sample_size self.excluded_percentile = excluded_percentile - self.filters_to_add = filters if filters is not None else [ - 'LengthFilter', 'LengthRatioFilter', 'LongWordFilter', 'HtmlTagFilter', - 'AverageWordLengthFilter', 'AlphabetRatioFilter', - 'TerminalPunctuationFilter', 'NonZeroNumeralsFilter', - 'LongestCommonSubstringFilter', 'SimilarityFilter', 'RepetitionFilter' - ] - - def get_thresholds(self): - """Get filter configuration with thresholds""" - filters = [] - for filterclass in self.filters_to_add: + if inter_dir: + self.use_tmp = False + self.inter_dir = inter_dir + if not os.path.exists(self.inter_dir): + os.makedirs(self.inter_dir) + else: + self.use_tmp = True + self.inter_dir = tempfile.mkdtemp() + self.overwrite = overwrite + self.max_length = 1000 + self.df = None + + def set_filter_thresholds(self): + """Set filter thresholds""" + score_file = get_score_file( + self.files, [{name: params} for name, params in self.filters_to_add], self.inter_dir, self.sample_size, + overwrite=self.overwrite, max_length=self.max_length) + self.df = load_dataframe(score_file) + for filter_name, filter_params in self.filters_to_add: try: - filter_config = self.get_filter_parameters(filterclass) + filter_config = self.get_filter_parameters(filter_name, filter_params) except FilterArgumentFailure as err: - logger.error("Unusable default arguments for %s: %s", filterclass, err) + logger.error("Unusable default arguments for %s: %s", filter_name, err) continue - filters.append(filter_config) - return filters - - def get_filter_parameters(self, filterclass): - """Return suitable parameters for filter of the given class""" - adjuster = GenericFilterAdjuster(filterclass) - filter_cls = getattr(filtermodule, filterclass) + self._filters.append(filter_config) + if self.use_tmp: + shutil.rmtree(self.inter_dir) + + def get_filter_parameters(self, filter_name, filter_params): + """Return parameters for filter of the given class""" + adjuster = GenericFilterAdjuster(filter_name, filter_params) + filter_cls = getattr(filtermodule, filter_name) try: filter_cls(**adjuster.default_parameters) except ConfigurationError as err: raise FilterArgumentFailure(err) from err + column_prefix = filter_name + if 'name' in filter_params: + column_prefix += '.' 
+ filter_params['name'] + columns = [col for col in self.df.columns if col.startswith(column_prefix)] new_parameters = adjuster.get_adjusted_parameters( - self.read_lines(), excluded_percentile=self.excluded_percentile) - filter_config = {filterclass: new_parameters} + self.df[columns], excluded_percentile=self.excluded_percentile) + filter_config = {filter_name: new_parameters} return filter_config - def read_lines(self): - """Read segments without newlines""" - infs = [file_open(infile, 'r') for infile in self.files] - for pair in tqdm(zip(*infs)): - yield [segment.rstrip() for segment in pair] - class GenericFilterAdjuster: """Class for guessing suitable parameters for a filter""" @@ -205,32 +293,16 @@ class GenericFilterAdjuster: MIN_MAX_ARGUMENTS = [('min_length', 'max_length')] ALL_THRESHOLD_ARGUMENTS = SINGLE_THRESHOLD_ARGUMENTS + MULTI_THRESHOLD_ARGUMENTS + MIN_MAX_ARGUMENTS - def __init__(self, filterclass): + def __init__(self, filterclass, filter_parameters=None): if isinstance(filterclass, str): self.filter_name = filterclass self.filter_cls = getattr(filtermodule, self.filter_name) else: self.filter_name = filterclass.__name__ self.filter_cls = filterclass - self.default_parameters = self.get_default_parameters() - - def get_default_parameters(self): - """Get default parameters for the filter - - Uses the signature of the class. Arguments without default - values are ignored and will cause a failure. - - """ - default_parameters = {} - sig = inspect.signature(self.filter_cls) - logger.info("signature: %s%s", self.filter_name, sig) - for key, parameter in sig.parameters.items(): - if parameter.default == inspect.Signature.empty: - if key != 'kwargs': - logger.warning("Ignoring argument without default: %s", key) - continue - default_parameters[key] = parameter.default - return default_parameters + self.default_parameters = get_default_parameters(self.filter_name) + if filter_parameters: + self.default_parameters.update(filter_parameters) @staticmethod def _locate_arguments(candidates, arguments): @@ -259,7 +331,7 @@ def is_adjustable(self): return True return False - def get_adjusted_parameters(self, data, excluded_percentile=0.01): + def get_adjusted_parameters(self, df, excluded_percentile=0.01): """Estimate parameters for the filter using data Assumes that excluded_percentile amount of the data should be @@ -279,9 +351,6 @@ def get_adjusted_parameters(self, data, excluded_percentile=0.01): return parameters score_dir = self.filter_cls.score_direction logger.info("score type for %s: %s", self.filter_name, score_dir) - filter_config = {self.filter_name: self.default_parameters} - filter_pipe = pipeline.FilterPipeline.from_config([filter_config]) - df = self.get_score_df(filter_pipe, data) if score_dir == CLEAN_LOW: percentiles = [1 - excluded_percentile] pct_keys = [f'{100*(1-excluded_percentile):g}%'] @@ -337,7 +406,108 @@ def get_adjusted_parameters(self, data, excluded_percentile=0.01): logger.warning("Threshold adjusting not supported") return parameters - @staticmethod - def get_score_df(filter_pipe, data): - """Return dataframe containing filter scores from the data""" - return json_normalize([lists_to_dicts(scores_obj) for scores_obj in filter_pipe.score(data)]) + +class ClusterFilters: + """Filter configuration based on score clustering""" + + def __init__(self, files, langs, scripts, sample_size, inter_dir, overwrite): + self.files = files + self.sample_size = sample_size + self.max_length = 150 + self.langs = langs + self.scripts = scripts + if inter_dir: + 
self.use_tmp = False + self.inter_dir = inter_dir + if not os.path.exists(self.inter_dir): + os.makedirs(self.inter_dir) + else: + self.use_tmp = True + self.inter_dir = tempfile.mkdtemp() + self.label_file_path = os.path.join(self.inter_dir, 'labels.txt') + self.overwrite = overwrite + self.filter_params = { + 'AlphabetRatioFilter': {}, + 'LengthRatioFilter.char': { + 'name': 'char', + 'unit': 'char'}, + 'LengthRatioFilter.word': { + 'name': 'word', + 'unit': 'word'}, + 'NonZeroNumeralsFilter': {}, + } + if self.langs: + self.filter_params['LanguageIDFilter'] = { + 'name': 'cld2', + 'id_method': 'cld2', + 'languages': langs + } + if self.scripts: + self.filter_params['CharacterScoreFilter'] = {'scripts': self.scripts} + if len(self.files) == 2: + self.filter_params['TerminalPunctuationFilter'] = {} + self.scoredata = None + self._filters = [] + + @property + def filters(self): + """Get filter configuration with thresholds""" + return self._filters + + def set_filter_thresholds(self): + """Get filter configuration with thresholds""" + score_file = get_score_file( + self.files, [{k.split('.', maxsplit=1)[0]: v} for k, v in self.filter_params.items()], + self.inter_dir, self.sample_size, overwrite=self.overwrite, max_length=self.max_length) + self.scoredata = ScoreClusters(score_file) + self._set_parameters(self.scoredata.get_thresholds(), self.scoredata.get_rejects()) + if os.path.isfile(self.label_file_path) and not self.overwrite: + logger.info('Label file "%s" exits, not overwriting', self.label_file_path) + else: + with open(self.label_file_path, 'w', encoding='utf-8') as label_file: + for label in self.scoredata.labels: + label_file.write(str(label)+'\n') + if self.use_tmp: + shutil.rmtree(self.inter_dir) + self._filters = [{k.split('.', maxsplit=1)[0]: v} for k, v in self.filter_params.items()] + + def _set_parameters(self, thresholds, rejects): + """Set filter parameters based on thresholds and rejects + + thresholds: list of threshold values + rejects: boolean-valued dictionary, dataframe columns as keys + + """ + for i, name in enumerate(rejects): + fullname = name + name_parts = name.split('.') + filter_name = name_parts[0] + filter_cls = getattr(filtermodule, filter_name) + filt_args = inspect.signature(filter_cls).parameters + endp = name_parts[-1] + if endp.isnumeric(): + # numeric last part is language index + name = '.'.join(name_parts[:-1]) + if 'thresholds' in filt_args: + parameter = self.filter_params.get(filter_name) + if 'thresholds' not in parameter: + parameter['thresholds'] = [] + if rejects[fullname]: + # FIXME: -1 may not work for all filters + parameter['thresholds'].insert(int(endp), -1) + else: + parameter['thresholds'].insert(int(endp), thresholds[i]) + if len(parameter['thresholds']) == 2: + if all(v == -1 for v in parameter['thresholds']): + del self.filter_params[filter_name] + elif 'threshold' in filt_args: + parameter = self.filter_params.get(name) + if rejects[fullname]: + if name in self.filter_params: + del self.filter_params[name] + continue + if parameter is None: + continue + prev_t = parameter.get('threshold') + if prev_t is None or thresholds[i] < prev_t: + parameter['threshold'] = thresholds[i] diff --git a/opusfilter/autogen_cluster.py b/opusfilter/autogen_cluster.py index ed06350..aadd48c 100644 --- a/opusfilter/autogen_cluster.py +++ b/opusfilter/autogen_cluster.py @@ -1,12 +1,7 @@ """Unsupervised threshold selection for filters""" -import inspect -import os -import shutil from collections import Counter import logging -import pathlib -import 
tempfile from sklearn.cluster import KMeans from sklearn import preprocessing @@ -14,17 +9,13 @@ from sklearn.ensemble import RandomForestClassifier from sklearn.inspection import permutation_importance import numpy as np -import ruamel.yaml from . import CLEAN_LOW from . import filters as filtermodule -from .autogen import ConfigurationGenerator from .classifier import load_dataframe -from .opusfilter import OpusFilter logger = logging.getLogger(__name__) -yaml = ruamel.yaml.YAML() class ScoreClusters: @@ -122,115 +113,3 @@ def plot(self, plt): plt.suptitle('Histograms for noisy samples') clean_samples.hist(bins=100, figsize=(10, 10)) plt.suptitle('Histograms for clean samples') - - -class FilterThresholdFinder: - """Find thresholds for filters based on score clustering""" - - def __init__(self, files, langs, scripts, sample_size, inter_dir, overwrite): - self.input_files = files - self.sample_size = sample_size - self.langs = langs - self.scripts = scripts - if inter_dir: - self.use_tmp = False - self.inter_dir = inter_dir - else: - self.use_tmp = True - self.inter_dir = tempfile.mkdtemp() - self.label_file_path = os.path.join(self.inter_dir, 'labels.txt') - self.overwrite = overwrite - self.filter_params = { - 'AlphabetRatioFilter': {}, - 'LengthRatioFilter.char': { - 'name': 'char', - 'unit': 'char'}, - 'LengthRatioFilter.word': { - 'name': 'word', - 'unit': 'word'}, - 'NonZeroNumeralsFilter': {}, - } - if self.langs: - self.filter_params['LanguageIDFilter'] = { - 'name': 'cld2', - 'id_method': 'cld2', - 'languages': langs - } - if self.scripts: - self.filter_params['CharacterScoreFilter'] = {'scripts': self.scripts} - if len(self.input_files) == 2: - self.filter_params['TerminalPunctuationFilter'] = {} - self.scoredata = None - - def get_thresholds(self): - """Get filter configuration with thresholds""" - self.scoredata = ScoreClusters(self._get_score_file()) - self._set_parameters(self.scoredata.get_thresholds(), self.scoredata.get_rejects()) - if os.path.isfile(self.label_file_path) and not self.overwrite: - logger.info('Label file "%s" exits, not overwriting', self.label_file_path) - else: - with open(self.label_file_path, 'w', encoding='utf-8') as label_file: - for label in self.scoredata.labels: - label_file.write(str(label)+'\n') - if self.use_tmp: - shutil.rmtree(self.inter_dir) - filters = [{k.split('.', maxsplit=1)[0]: v} for k, v in self.filter_params.items()] - return filters - - def _get_score_file(self): - """Calculate filter scores and return score file - - Remove duplicates and empty lines, take a sample of size n, produce filter scores - - """ - config_gen = ConfigurationGenerator(files=[os.path.abspath(f) for f in self.input_files], workdir=self.inter_dir) - config_gen.add_remove_duplicates() - config_gen.add_filter([{'LengthFilter': {'unit': 'word', 'min_length': 1, 'max_length': 150}}]) - config_gen.add_subset(self.sample_size, 1) - score_file = config_gen.add_score([{k.split('.', maxsplit=1)[0]: v} for k, v in self.filter_params.items()]) - pre_config = config_gen.get_config() - yaml.dump(pre_config, pathlib.Path(os.path.join(self.inter_dir, 'config.yaml'))) - opusf = OpusFilter(pre_config) - opusf.execute_steps(overwrite=self.overwrite) - return os.path.join(self.inter_dir, score_file) - - def _set_parameters(self, thresholds, rejects): - """Set filter parameters based on thresholds and rejects - - thresholds: list of threshold values - rejects: boolean-valued dictionary, dataframe columns as keys - - """ - for i, name in enumerate(rejects): - fullname = name - 
name_parts = name.split('.')
-            filter_name = name_parts[0]
-            filter_cls = getattr(filtermodule, filter_name)
-            filt_args = inspect.signature(filter_cls).parameters
-            endp = name_parts[-1]
-            if endp.isnumeric():
-                # numeric last part is language index
-                name = '.'.join(name_parts[:-1])
-            if 'thresholds' in filt_args:
-                parameter = self.filter_params.get(filter_name)
-                if 'thresholds' not in parameter:
-                    parameter['thresholds'] = []
-                if rejects[fullname]:
-                    # FIXME: -1 may not work for all filters
-                    parameter['thresholds'].insert(int(endp), -1)
-                else:
-                    parameter['thresholds'].insert(int(endp), thresholds[i])
-                if len(parameter['thresholds']) == 2:
-                    if all(v == -1 for v in parameter['thresholds']):
-                        del self.filter_params[filter_name]
-            elif 'threshold' in filt_args:
-                parameter = self.filter_params.get(name)
-                if rejects[fullname]:
-                    if name in self.filter_params:
-                        del self.filter_params[name]
-                    continue
-                if parameter is None:
-                    continue
-                prev_t = parameter.get('threshold')
-                if prev_t is None or thresholds[i] < prev_t:
-                    parameter['threshold'] = thresholds[i]
diff --git a/opusfilter/filters.py b/opusfilter/filters.py
index d3dff4b..6dcb386 100644
--- a/opusfilter/filters.py
+++ b/opusfilter/filters.py
@@ -491,7 +491,7 @@ def __init__(self, threshold=0.9, weights=(1, 1, 1), unit='char', lowercase=Fals
                  require_all=True, **kwargs):
         if unit not in self.VALID_UNITS:
             raise ConfigurationError(
-                f"Value of 'unit' are not one of the allowed choices {self.VALID_UNITS}: {unit}")
+                f"Value of 'unit' is not one of the allowed choices {self.VALID_UNITS}: {unit}")
         self.threshold = threshold
         self.weights = weights
         self.unit = unit
@@ -524,10 +524,11 @@ def accept(self, score):
 class RepetitionFilter(FilterABC):
     """Filter segments with repeated content
 
-    Filter segments with substrings of min_length to max_length
+    Filter out segments with substrings of min_length to max_length
     characters that are repeated at least threshold number of times.
     The first occurrence is not counted to the threshold, i.e.,
-    threshold 2 means that the substring has to occur three times.
+    threshold 2 means that a segment is rejected if any substring
+    occurs three times in a row.
 
     There may be optional space character(s) between the repeated
     strings that are not counted to the length.
The repeated string @@ -537,9 +538,10 @@ class RepetitionFilter(FilterABC): """ score_direction = CLEAN_LOW + min_threshold = 1 def __init__(self, threshold=2, min_length=3, max_length=100, **kwargs): - if threshold < 1: + if threshold < self.min_threshold: raise ConfigurationError(f"threshold for RepetitionFilter has to be at least one, got {threshold}") if min_length < 1: raise ConfigurationError(f"min_length for RepetitionFilter has to be at least one, got {min_length}") diff --git a/tests/test_autogen.py b/tests/test_autogen.py index e8a4711..19902d4 100644 --- a/tests/test_autogen.py +++ b/tests/test_autogen.py @@ -10,38 +10,43 @@ from opusfilter import FilterABC, ConfigurationError, filters, pipeline from opusfilter.autogen import * -from opusfilter.autogen_cluster import FilterThresholdFinder from opusfilter.opusfilter import OpusFilter -default_params = {'AlphabetRatioFilter': {}, - 'CharacterScoreFilter': {'scripts': ['latin', 'latin']}, - 'LanguageIDFilter': {'name': 'cld2', 'id_method': 'cld2', 'languages': ['en', 'de']}, - 'LengthRatioFilter.char': {'name': 'char', 'unit': 'char'}, - 'LengthRatioFilter.word': {'name': 'word', 'unit': 'word'}, - 'NonZeroNumeralsFilter': {}, - 'TerminalPunctuationFilter': {}} - -example_params = {'AlphabetRatioFilter': {'threshold': 1}, - 'CharacterScoreFilter': {'scripts': ['latin', 'latin'], 'thresholds': [1, 1]}, - 'LanguageIDFilter': {'name': 'cld2', 'id_method': 'cld2', - 'languages': ['en', 'de'], - 'thresholds': [1, 1]}, - 'LengthRatioFilter.char': {'name': 'char', 'threshold': 1, 'unit': 'char'}, - 'LengthRatioFilter.word': {'name': 'word', 'threshold': 1, 'unit': 'word'}, - 'NonZeroNumeralsFilter': {'threshold': 1}, - 'TerminalPunctuationFilter': {'threshold': 1}} - -default_rejects = {'TerminalPunctuationFilter': False, - 'AlphabetRatioFilter.0': False, - 'AlphabetRatioFilter.1': False, - 'CharacterScoreFilter.0': False, - 'CharacterScoreFilter.1': False, - 'LanguageIDFilter.0': False, - 'LanguageIDFilter.1': False, - 'LengthRatioFilter.char': False, - 'LengthRatioFilter.word': False, - 'NonZeroNumeralsFilter.0': False} +default_params = { + 'AlphabetRatioFilter': {}, + 'CharacterScoreFilter': {'scripts': ['latin', 'latin']}, + 'LanguageIDFilter': {'name': 'cld2', 'id_method': 'cld2', 'languages': ['en', 'de']}, + 'LengthRatioFilter.char': {'name': 'char', 'unit': 'char'}, + 'LengthRatioFilter.word': {'name': 'word', 'unit': 'word'}, + 'NonZeroNumeralsFilter': {}, + 'TerminalPunctuationFilter': {} +} + +example_params = { + 'AlphabetRatioFilter': {'threshold': 1}, + 'CharacterScoreFilter': {'scripts': ['latin', 'latin'], 'thresholds': [1, 1]}, + 'LanguageIDFilter': {'name': 'cld2', 'id_method': 'cld2', + 'languages': ['en', 'de'], + 'thresholds': [1, 1]}, + 'LengthRatioFilter.char': {'name': 'char', 'threshold': 1, 'unit': 'char'}, + 'LengthRatioFilter.word': {'name': 'word', 'threshold': 1, 'unit': 'word'}, + 'NonZeroNumeralsFilter': {'threshold': 1}, + 'TerminalPunctuationFilter': {'threshold': 1} +} + +default_rejects = { + 'TerminalPunctuationFilter': False, + 'AlphabetRatioFilter.0': False, + 'AlphabetRatioFilter.1': False, + 'CharacterScoreFilter.0': False, + 'CharacterScoreFilter.1': False, + 'LanguageIDFilter.0': False, + 'LanguageIDFilter.1': False, + 'LengthRatioFilter.char': False, + 'LengthRatioFilter.word': False, + 'NonZeroNumeralsFilter.0': False +} class TestAutogen(unittest.TestCase): @@ -71,49 +76,61 @@ def tearDownClass(self): shutil.rmtree(self.tempdir) def _test_filters(self, filters): - generator = 
ConfigurationGenerator([self.src_out, self.tgt_out], workdir='work', langs=[self.source, self.target]) + self.assertTrue(filters) + logging.info(filters) + generator = ConfigurationGenerator( + [self.src_out, self.tgt_out], workdir=os.path.join(self.tempdir, 'work'), langs=[self.source, self.target]) generator.add_filter(filters) configuration = generator.get_config() of = OpusFilter(configuration) of.execute_steps(overwrite=True) + self.assertTrue(os.path.isfile(os.path.join(self.tempdir, 'work', f'filtered.{self.source}.gz'))) + self.assertTrue(os.path.isfile(os.path.join(self.tempdir, 'work', f'filtered.{self.target}.gz'))) def test_default_filters(self): - filtergen = DefaultParameterFilters() - filters = filtergen.get_thresholds() - self._test_filters(filters) - + filtergen = DefaultParameterFilters( + langs=[self.source, self.target], scripts=['Latin', 'Latin']) + filtergen.set_filter_thresholds() + for spec in DefaultParameterFilters.DEFAULT_FILTERS: + filter_name, _ = parse_filter_specs(spec) + self.assertTrue(any(filter_name in f for f in filtergen.filters)) + self._test_filters(filtergen.filters) + @unittest.expectedFailure def test_percentile_filters(self): - filtergen = PercentileFilters(files=[self.src_out, self.tgt_out], excluded_percentile=0.05) - filters = filtergen.get_thresholds() - self._test_filters(filters) - + filtergen = PercentileFilters( + files=[self.src_out, self.tgt_out], langs=[self.source, self.target], scripts=['Latin', 'Latin'], + excluded_percentile=0.05) + filtergen.set_filter_thresholds() + logging.info(filtergen.filters) + self._test_filters(filtergen.filters) + def test_threshold_finder(self): - filtergen = FilterThresholdFinder( + filtergen = ClusterFilters( files=[self.src_out, self.tgt_out], langs=[self.source, self.target], scripts=['Latin', 'Latin'], sample_size=180, inter_dir=self.tempdir, overwrite=True) - filters = filtergen.get_thresholds() - self._test_filters(filters) + filtergen.set_filter_thresholds() + self._test_filters(filtergen.filters) class TestThresholdFinder(unittest.TestCase): def test_set_default_parameters(self): - tf = FilterThresholdFinder([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) + tf = ClusterFilters([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) self.assertEqual(tf.filter_params, default_params) def test_reject_all_parameters(self): - tf = FilterThresholdFinder([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) + tf = ClusterFilters([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) tf._set_parameters([1 for i in range(10)], {k: True for k in default_rejects.keys()}) self.assertEqual(tf.filter_params, {}) def test_set_all_parameters(self): - tf = FilterThresholdFinder([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) + tf = ClusterFilters([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) tf._set_parameters([1 for i in range(10)], default_rejects) self.assertEqual(tf.filter_params, example_params) def test_set_parameters_reject_one_side(self): - tf = FilterThresholdFinder([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) + tf = ClusterFilters([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) rejects = copy.deepcopy(default_rejects) rejects['LanguageIDFilter.0'] = True params = copy.deepcopy(example_params) @@ -128,7 +145,7 @@ def test_set_parameters_reject_one_side(self): params['LanguageIDFilter']['thresholds'][1] = -1 tf._set_parameters([1 for i in range(10)], rejects) 
self.assertEqual(tf.filter_params, params) - + tf.filter_params = copy.deepcopy(default_params) rejects = copy.deepcopy(default_rejects) rejects['CharacterScoreFilter.0'] = True @@ -136,7 +153,7 @@ def test_set_parameters_reject_one_side(self): params['CharacterScoreFilter']['thresholds'][0] = -1 tf._set_parameters([1 for i in range(10)], rejects) self.assertEqual(tf.filter_params, params) - + tf.filter_params = copy.deepcopy(default_params) rejects = copy.deepcopy(default_rejects) rejects['CharacterScoreFilter.1'] = True From 76ee664570b8fc18467b7cfc631d10830576cb0f Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 9 Aug 2023 16:11:34 +0300 Subject: [PATCH 03/12] unify and cleanup autogen code --- opusfilter/autogen.py | 244 +++++++++++++++++----------------- opusfilter/autogen_cluster.py | 63 ++++++--- tests/test_autogen.py | 138 +++++++++---------- 3 files changed, 231 insertions(+), 214 deletions(-) diff --git a/opusfilter/autogen.py b/opusfilter/autogen.py index 914ae7b..ac248f9 100644 --- a/opusfilter/autogen.py +++ b/opusfilter/autogen.py @@ -1,5 +1,6 @@ """Configuration generation tools""" +import abc import copy import inspect import logging @@ -170,18 +171,12 @@ def parse_filter_specs(specs): return name, params -class DefaultParameterFilters: - """Filter configuration with default parameters""" +class AutoFiltersABC(metaclass=abc.ABCMeta): + """Abstract base class for automatic filter configuration""" - DEFAULT_FILTERS = ['LengthFilter', - ('LengthRatioFilter.word', {'unit': 'word'}), ('LengthRatioFilter.char', {'unit': 'char'}), - 'LongWordFilter', 'HtmlTagFilter', - 'AverageWordLengthFilter', 'AlphabetRatioFilter', - 'TerminalPunctuationFilter', 'NonZeroNumeralsFilter', - 'LongestCommonSubstringFilter', 'SimilarityFilter', 'RepetitionFilter', - 'CharacterScoreFilter', ('LanguageIDFilter', {'id_method': 'cld2'})] + DEFAULT_FILTERS = [] - def __init__(self, langs=None, scripts=None, filters=None): + def __init__(self, langs=None, scripts=None, filters=None, **kwargs): if filters is None: filters = self.DEFAULT_FILTERS filters = [parse_filter_specs(spec) for spec in filters] @@ -199,12 +194,31 @@ def __init__(self, langs=None, scripts=None, filters=None): filter_params['languages'] = langs self.filters_to_add.append((filter_name, filter_params)) self._filters = [] # Final filters + if kwargs: + logger.warning("Unused arguments: %s", kwargs) @property def filters(self): """Get filter configuration with thresholds""" return self._filters + @abc.abstractmethod + def set_filter_thresholds(self): + """Set filter thresholds""" + + +class DefaultParameterFilters(AutoFiltersABC): + """Filter configuration with default parameters""" + + DEFAULT_FILTERS = ['LengthFilter', + ('LengthRatioFilter.char', {'unit': 'char'}), + ('LengthRatioFilter.word', {'unit': 'word'}), + 'LongWordFilter', 'HtmlTagFilter', + 'AverageWordLengthFilter', 'AlphabetRatioFilter', + 'TerminalPunctuationFilter', 'NonZeroNumeralsFilter', + 'LongestCommonSubstringFilter', 'SimilarityFilter', 'RepetitionFilter', + 'CharacterScoreFilter', ('LanguageIDFilter', {'id_method': 'cld2'})] + def set_filter_thresholds(self): """Set filter thresholds""" for filter_name, filter_params in self.filters_to_add: @@ -229,15 +243,13 @@ def get_filter_parameters(filter_name, filter_params): return filter_config -class PercentileFilters(DefaultParameterFilters): - """Filter configuration based on filter score percentiles""" +class DataBasedFiltersABC(AutoFiltersABC, metaclass=abc.ABCMeta): + """Abstract base class for filter 
configuration based on data""" - def __init__(self, files, langs=None, scripts=None, filters=None, excluded_percentile=0.001, - sample_size=100000, inter_dir=None, overwrite=False): - super().__init__(langs=langs, scripts=scripts, filters=filters) + def __init__(self, files, sample_size=100000, max_length=1000, inter_dir=None, overwrite=False, **kwargs): + super().__init__(**kwargs) self.files = files self.sample_size = sample_size - self.excluded_percentile = excluded_percentile if inter_dir: self.use_tmp = False self.inter_dir = inter_dir @@ -247,7 +259,15 @@ def __init__(self, files, langs=None, scripts=None, filters=None, excluded_perce self.use_tmp = True self.inter_dir = tempfile.mkdtemp() self.overwrite = overwrite - self.max_length = 1000 + self.max_length = max_length + + +class PercentileFilters(DataBasedFiltersABC): + """Filter configuration based on filter score percentiles""" + + def __init__(self, files, excluded_percentile=0.001, **kwargs): + super().__init__(files, **kwargs) + self.excluded_percentile = excluded_percentile self.df = None def set_filter_thresholds(self): @@ -268,10 +288,10 @@ def set_filter_thresholds(self): def get_filter_parameters(self, filter_name, filter_params): """Return parameters for filter of the given class""" - adjuster = GenericFilterAdjuster(filter_name, filter_params) + adjuster = PercentileAdjuster(filter_name, filter_params) filter_cls = getattr(filtermodule, filter_name) try: - filter_cls(**adjuster.default_parameters) + filter_cls(**adjuster.initial_parameters) except ConfigurationError as err: raise FilterArgumentFailure(err) from err column_prefix = filter_name @@ -284,8 +304,8 @@ def get_filter_parameters(self, filter_name, filter_params): return filter_config -class GenericFilterAdjuster: - """Class for guessing suitable parameters for a filter""" +class FilterInspect: + """Helper methods for parameters a single filter""" # Lists of possible filter threshold arguments SINGLE_THRESHOLD_ARGUMENTS = ['threshold'] @@ -300,9 +320,9 @@ def __init__(self, filterclass, filter_parameters=None): else: self.filter_name = filterclass.__name__ self.filter_cls = filterclass - self.default_parameters = get_default_parameters(self.filter_name) + self.initial_parameters = get_default_parameters(self.filter_name) if filter_parameters: - self.default_parameters.update(filter_parameters) + self.initial_parameters.update(filter_parameters) @staticmethod def _locate_arguments(candidates, arguments): @@ -326,11 +346,38 @@ def is_adjustable(self): if self.filter_cls.score_direction in {CLEAN_TRUE, CLEAN_FALSE}: # Nothing to estimate for boolean return False - if self._locate_arguments(self.ALL_THRESHOLD_ARGUMENTS, self.default_parameters): + if self._locate_arguments(self.ALL_THRESHOLD_ARGUMENTS, self.initial_parameters): # Known threshold parameters to adjust return True return False + def find_threshold_keys(self, number): + """Return threshold parameters compatible with the number of thresholds""" + score_dir = self.filter_cls.score_direction + if score_dir in {CLEAN_LOW, CLEAN_HIGH}: + # Clean is below or above threshold + if number > 1: + # Multiple thresholds allowed (or needed) + threshold_key = self._locate_arguments(self.MULTI_THRESHOLD_ARGUMENTS, self.initial_parameters) + else: + # Single threshold + threshold_key = self._locate_arguments(self.SINGLE_THRESHOLD_ARGUMENTS, self.initial_parameters) + if not threshold_key: + logger.warning("Cannot find threshold parameter from %s", list(self.initial_parameters)) + elif score_dir == CLEAN_BETWEEN: + # 
Clean is between minimum and maximum + threshold_key = self._locate_arguments(self.MIN_MAX_ARGUMENTS, self.initial_parameters) + if not threshold_key: + logger.warning("Cannot find threshold parameter from %s", list(self.initial_parameters)) + else: + threshold_key = None + logger.warning("Threshold adjusting not supported for %s", self.filter_name) + return threshold_key + + +class PercentileAdjuster(FilterInspect): + """Class for setting filters to remove given percentile of data""" + def get_adjusted_parameters(self, df, excluded_percentile=0.01): """Estimate parameters for the filter using data @@ -346,7 +393,7 @@ def get_adjusted_parameters(self, df, excluded_percentile=0.01): # values are 1, the selected threshold 1 might remove all data # if the condition for accepting the value is being greater # than the threshold. - parameters = copy.deepcopy(self.default_parameters) + parameters = copy.deepcopy(self.initial_parameters) if not self.is_adjustable(): return parameters score_dir = self.filter_cls.score_direction @@ -364,32 +411,20 @@ def get_adjusted_parameters(self, df, excluded_percentile=0.01): else: raise ValueError(f"Unknown score type '{score_dir}'") score_dim = len(df.columns) + threshold_key = self.find_threshold_keys(score_dim) + if threshold_key is None: + return parameters if score_dir in {CLEAN_LOW, CLEAN_HIGH}: # Clean is below or above threshold - if score_dim > 1: - # Multiple thresholds allowed (or needed) - threshold_key = self._locate_arguments(self.MULTI_THRESHOLD_ARGUMENTS, parameters) - else: - # Single threshold - threshold_key = self._locate_arguments(self.SINGLE_THRESHOLD_ARGUMENTS, parameters) - if not threshold_key: - logger.warning("Cannot find threshold parameter from %s", list(parameters)) - return parameters values = [] for column in df.columns: stats = df[column].describe(percentiles=percentiles) logger.info(stats) values.append(stats.loc[pct_keys[0]].item()) - if score_dim == 1: - values = values[0] - logger.info("Selected value %s for %s", values, threshold_key) - parameters[threshold_key] = values + logger.info("Selected values %s for %s", values, threshold_key) + parameters[threshold_key] = values[0] if score_dim == 1 else values elif score_dir == CLEAN_BETWEEN: # Clean is between minimum and maximum - threshold_key = self._locate_arguments(self.MIN_MAX_ARGUMENTS, parameters) - if not threshold_key: - logger.warning("Cannot find threshold parameter from %s", list(parameters)) - return parameters min_values, max_values = [], [] for column in df.columns: stats = df[column].describe(percentiles=percentiles) @@ -407,60 +442,32 @@ def get_adjusted_parameters(self, df, excluded_percentile=0.01): return parameters -class ClusterFilters: +class ClusterFilters(DataBasedFiltersABC): """Filter configuration based on score clustering""" - def __init__(self, files, langs, scripts, sample_size, inter_dir, overwrite): - self.files = files - self.sample_size = sample_size - self.max_length = 150 - self.langs = langs - self.scripts = scripts - if inter_dir: - self.use_tmp = False - self.inter_dir = inter_dir - if not os.path.exists(self.inter_dir): - os.makedirs(self.inter_dir) - else: - self.use_tmp = True - self.inter_dir = tempfile.mkdtemp() + DEFAULT_FILTERS = ['AlphabetRatioFilter', + ('LengthRatioFilter.char', {'unit': 'char'}), + ('LengthRatioFilter.word', {'unit': 'word'}), + 'NonZeroNumeralsFilter', + 'CharacterScoreFilter', + ('LanguageIDFilter', {'id_method': 'cld2'}), + 'TerminalPunctuationFilter'] + + def __init__(self, files, max_length=150, 
**kwargs):
+        super().__init__(files, max_length=max_length, **kwargs)
         self.label_file_path = os.path.join(self.inter_dir, 'labels.txt')
-        self.overwrite = overwrite
-        self.filter_params = {
-            'AlphabetRatioFilter': {},
-            'LengthRatioFilter.char': {
-                'name': 'char',
-                'unit': 'char'},
-            'LengthRatioFilter.word': {
-                'name': 'word',
-                'unit': 'word'},
-            'NonZeroNumeralsFilter': {},
-        }
-        if self.langs:
-            self.filter_params['LanguageIDFilter'] = {
-                'name': 'cld2',
-                'id_method': 'cld2',
-                'languages': langs
-            }
-        if self.scripts:
-            self.filter_params['CharacterScoreFilter'] = {'scripts': self.scripts}
-        if len(self.files) == 2:
-            self.filter_params['TerminalPunctuationFilter'] = {}
         self.scoredata = None
-        self._filters = []
-
-    @property
-    def filters(self):
-        """Get filter configuration with thresholds"""
-        return self._filters
 
     def set_filter_thresholds(self):
         """Get filter configuration with thresholds"""
         score_file = get_score_file(
-            self.files, [{k.split('.', maxsplit=1)[0]: v} for k, v in self.filter_params.items()],
-            self.inter_dir, self.sample_size, overwrite=self.overwrite, max_length=self.max_length)
+            self.files, [{name: params} for name, params in self.filters_to_add], self.inter_dir, self.sample_size,
+            overwrite=self.overwrite, max_length=self.max_length)
+        # score_file = get_score_file(
+        #     self.files, [{k.split('.', maxsplit=1)[0]: v} for k, v in self.filter_params.items()],
+        #     self.inter_dir, self.sample_size, overwrite=self.overwrite, max_length=self.max_length)
         self.scoredata = ScoreClusters(score_file)
-        self._set_parameters(self.scoredata.get_thresholds(), self.scoredata.get_rejects())
+        self._set_parameters(self.scoredata.get_result_df())
         if os.path.isfile(self.label_file_path) and not self.overwrite:
             logger.info('Label file "%s" exists, not overwriting', self.label_file_path)
         else:
@@ -469,45 +476,38 @@ def set_filter_thresholds(self):
                     label_file.write(str(label)+'\n')
         if self.use_tmp:
             shutil.rmtree(self.inter_dir)
-        self._filters = [{k.split('.', maxsplit=1)[0]: v} for k, v in self.filter_params.items()]
+        # self._filters = [{k.split('.', maxsplit=1)[0]: v} for k, v in self.filter_params.items()]
 
-    def _set_parameters(self, thresholds, rejects):
-        """Set filter parameters based on thresholds and rejects
+    def _set_parameters(self, df):
+        """Set filter parameters based on ScoreClusters
 
         df: dataframe with 'name', 'threshold', and 'reject' columns,
            as returned by ScoreClusters.get_result_df()
 
         """
-        for i, name in enumerate(rejects):
-            fullname = name
-            name_parts = name.split('.')
-            filter_name = name_parts[0]
-            filter_cls = getattr(filtermodule, filter_name)
-            filt_args = inspect.signature(filter_cls).parameters
-            endp = name_parts[-1]
-            if endp.isnumeric():
-                # numeric last part is language index
-                name = '.'.join(name_parts[:-1])
-            if 'thresholds' in filt_args:
-                parameter = self.filter_params.get(filter_name)
-                if 'thresholds' not in parameter:
-                    parameter['thresholds'] = []
-                if rejects[fullname]:
+        self._filters = []
+        for classname, params in self.filters_to_add:
+            new_params = copy.deepcopy(params)
+            filter_inspect = FilterInspect(classname, new_params)
+            column_prefix = classname
+            if 'name' in params:
+                column_prefix += '.' 
+ params['name'] + df_part = df[df.name.str.startswith(column_prefix)] + logger.warning(column_prefix) + logger.warning(df_part) + if all(df_part.reject): + continue + threshold_key = filter_inspect.find_threshold_keys(len(df_part)) + logger.warning(threshold_key) + if threshold_key is None: + continue + thresholds = list(df_part['threshold']) + for i, reject in enumerate(df_part.reject): + if reject: # FIXME: -1 may not work for all filters - parameter['thresholds'].insert(int(endp), -1) - else: - parameter['thresholds'].insert(int(endp), thresholds[i]) - if len(parameter['thresholds']) == 2: - if all(v == -1 for v in parameter['thresholds']): - del self.filter_params[filter_name] - elif 'threshold' in filt_args: - parameter = self.filter_params.get(name) - if rejects[fullname]: - if name in self.filter_params: - del self.filter_params[name] - continue - if parameter is None: - continue - prev_t = parameter.get('threshold') - if prev_t is None or thresholds[i] < prev_t: - parameter['threshold'] = thresholds[i] + thresholds[i] = -1 + logger.warning(thresholds) + new_params[threshold_key] = thresholds if len(thresholds) > 1 else thresholds[0] + logger.warning({classname: new_params}) + self._filters.append({classname: new_params}) + logger.info("Filters: %s", self.filters) diff --git a/opusfilter/autogen_cluster.py b/opusfilter/autogen_cluster.py index aadd48c..fe604e6 100644 --- a/opusfilter/autogen_cluster.py +++ b/opusfilter/autogen_cluster.py @@ -3,6 +3,7 @@ from collections import Counter import logging +import pandas as pd from sklearn.cluster import KMeans from sklearn import preprocessing from sklearn.decomposition import PCA @@ -39,29 +40,39 @@ def __init__(self, score_file, n=2): self.kmeans = KMeans(n_clusters=n, random_state=0, n_init='auto').fit(self.standard_data) self.labels = self.kmeans.labels_ self.cluster_centers = self.scaler.inverse_transform(self.kmeans.cluster_centers_) - self.noisy_label = self._get_noisy_label() - self.clean_label = np.abs(self.noisy_label - 1) + self._noisy_label = self._get_noisy_label() - def _get_noisy_label(self): - """Find label for the noisy cluster""" - centers = self.kmeans.cluster_centers_ - inv_centers = self.cluster_centers + @property + def noisy_label(self): + """Cluster label for noisy data""" + return self._noisy_label + + @property + def clean_label(self): + """Cluster label for clean data""" + return np.abs(self._noisy_label - 1) - # Flip values if low score indicates clean data + def _get_flipped_centers(self): + """Get centers with values flipped when low score indicates clean data""" dir_fixed_centers = [] - for center in centers: + for center in self.kmeans.cluster_centers_: fixed_center = [] - for j, name in enumerate(self.df.columns): - value = center[j].copy() + for i, name in enumerate(self.df.columns): + value = center[i].copy() if self.filters[name].score_direction == CLEAN_LOW: value *= -1 fixed_center.append(value) dir_fixed_centers.append(fixed_center) - means = np.mean(dir_fixed_centers, axis=1) + return dir_fixed_centers + + def _get_noisy_label(self): + """Find label for the noisy cluster""" + means = np.mean(self._get_flipped_centers(), axis=1) + # Output some cluster information nlabels = Counter(self.labels) - for k, (center, inv_center, mean) in enumerate(zip(centers, inv_centers, means)): - logger.info('Cluster #%s - number of samples: %s', k, nlabels[k]) + for i, (center, inv_center, mean) in enumerate(zip(self.kmeans.cluster_centers_, self.cluster_centers, means)): + logger.info('Cluster #%s - number of 
samples: %s', i, nlabels[i]) for j, val in enumerate(center): logger.info('%s\t%s\t%s', self.df.columns[j], round(val, 2), round(inv_center[j], 2)) logger.info('Average center\t%s', np.round(mean, 2)) @@ -76,14 +87,22 @@ def _get_noisy_label(self): f'{len(noisy_labels)}/{len(self.labels)} ({round(100*len(noisy_labels)/len(self.labels), 2)}%)') return noisy_label + def get_columns(self): + """Return data column names""" + return self.df.columns + def get_thresholds(self, method='noisy_center', precision=6): - """Return thresholds for noisy samples""" + """Return a list of thresholds for noisy samples""" if method != 'noisy_center': raise ValueError(f'Method {method} for thresholds not implemented') return self.cluster_centers[self.noisy_label].round(precision).tolist() def get_rejects(self): - """Train random forest classifier to find important features""" + """Train random forest classifier to find important features + + Returns a list of booleans (True = reject). + + """ logger.info('Training random forest') clf = RandomForestClassifier(random_state=1) clf.fit(self.standard_data, self.labels) @@ -93,13 +112,21 @@ def get_rejects(self): rej_coef = 0.1 logger.info('mean importance: %s', round(importance_mean_mean, 3)) logger.info('rejection coefficient: %s', rej_coef) - rejects = {} + rejects = [] for i, k in enumerate(self.df.columns): importance = feature_importances['importances_mean'][i] - rejects[k] = importance < importance_mean_mean * rej_coef - logger.info('%s\t%s\t%s', k, round(importance, 3), 'reject' if rejects[k] else 'keep') + reject = importance < importance_mean_mean * rej_coef + logger.info('%s\t%s\t%s', k, round(importance, 3), 'reject' if reject else 'keep') + rejects.append(reject) return rejects + def get_result_df(self): + """Return dataframe containing the thresholds and reject booleans""" + return pd.DataFrame.from_dict( + {'name': self.get_columns(), + 'threshold': self.get_thresholds(), + 'reject': self.get_rejects()}) + def plot(self, plt): """Plot clustering and histograms""" plt.figure(figsize=(10, 10)) diff --git a/tests/test_autogen.py b/tests/test_autogen.py index 19902d4..8290097 100644 --- a/tests/test_autogen.py +++ b/tests/test_autogen.py @@ -1,54 +1,19 @@ import inspect import logging import os -import requests import shutil import tempfile import unittest +import pandas as pd + import opustools -from opusfilter import FilterABC, ConfigurationError, filters, pipeline +from opusfilter import FilterABC, ConfigurationError, filters from opusfilter.autogen import * from opusfilter.opusfilter import OpusFilter -default_params = { - 'AlphabetRatioFilter': {}, - 'CharacterScoreFilter': {'scripts': ['latin', 'latin']}, - 'LanguageIDFilter': {'name': 'cld2', 'id_method': 'cld2', 'languages': ['en', 'de']}, - 'LengthRatioFilter.char': {'name': 'char', 'unit': 'char'}, - 'LengthRatioFilter.word': {'name': 'word', 'unit': 'word'}, - 'NonZeroNumeralsFilter': {}, - 'TerminalPunctuationFilter': {} -} - -example_params = { - 'AlphabetRatioFilter': {'threshold': 1}, - 'CharacterScoreFilter': {'scripts': ['latin', 'latin'], 'thresholds': [1, 1]}, - 'LanguageIDFilter': {'name': 'cld2', 'id_method': 'cld2', - 'languages': ['en', 'de'], - 'thresholds': [1, 1]}, - 'LengthRatioFilter.char': {'name': 'char', 'threshold': 1, 'unit': 'char'}, - 'LengthRatioFilter.word': {'name': 'word', 'threshold': 1, 'unit': 'word'}, - 'NonZeroNumeralsFilter': {'threshold': 1}, - 'TerminalPunctuationFilter': {'threshold': 1} -} - -default_rejects = { - 'TerminalPunctuationFilter': False, - 
'AlphabetRatioFilter.0': False, - 'AlphabetRatioFilter.1': False, - 'CharacterScoreFilter.0': False, - 'CharacterScoreFilter.1': False, - 'LanguageIDFilter.0': False, - 'LanguageIDFilter.1': False, - 'LengthRatioFilter.char': False, - 'LengthRatioFilter.word': False, - 'NonZeroNumeralsFilter.0': False -} - - class TestAutogen(unittest.TestCase): @classmethod @@ -115,55 +80,80 @@ def test_threshold_finder(self): class TestThresholdFinder(unittest.TestCase): + col_names = [ + 'AlphabetRatioFilter.0', + 'AlphabetRatioFilter.1', + 'LengthRatioFilter.char', + 'LengthRatioFilter.word', + 'NonZeroNumeralsFilter.0', + 'CharacterScoreFilter.0', + 'CharacterScoreFilter.1', + 'LanguageIDFilter.0', + 'LanguageIDFilter.1', + 'TerminalPunctuationFilter' + ] + + example_params = [ + {'AlphabetRatioFilter': {'threshold': [1, 1]}}, + {'LengthRatioFilter': {'name': 'char', 'threshold': 1, 'unit': 'char'}}, + {'LengthRatioFilter': {'name': 'word', 'threshold': 1, 'unit': 'word'}}, + {'NonZeroNumeralsFilter': {'threshold': 1}}, + {'CharacterScoreFilter': {'scripts': ['latin', 'latin'], 'thresholds': [1, 1]}}, + {'LanguageIDFilter': {'id_method': 'cld2', 'languages': ['en', 'de'], 'thresholds': [1, 1]}}, + {'TerminalPunctuationFilter': {'threshold': 1}} + ] + + def _make_df(self, names, thresholds, rejects): + return pd.DataFrame.from_dict( + {'name': names, 'threshold': thresholds, 'reject': rejects}) + def test_set_default_parameters(self): - tf = ClusterFilters([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) - self.assertEqual(tf.filter_params, default_params) + tf = ClusterFilters([None, None], langs=['en', 'de'], scripts=['latin', 'latin']) + self.assertEqual(tf.filters, []) def test_reject_all_parameters(self): - tf = ClusterFilters([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) - tf._set_parameters([1 for i in range(10)], {k: True for k in default_rejects.keys()}) - self.assertEqual(tf.filter_params, {}) + tf = ClusterFilters([None, None], langs=['en', 'de'], scripts=['latin', 'latin']) + tf._set_parameters(self._make_df(self.col_names, [1] * len(self.col_names), [True] * len(self.col_names))) + self.assertEqual(tf.filters, []) def test_set_all_parameters(self): - tf = ClusterFilters([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) - tf._set_parameters([1 for i in range(10)], default_rejects) - self.assertEqual(tf.filter_params, example_params) + tf = ClusterFilters([None, None], langs=['en', 'de'], scripts=['latin', 'latin']) + tf._set_parameters(self._make_df(self.col_names, [1] * len(self.col_names), [False] * len(self.col_names))) + self.assertSequenceEqual(tf.filters, self.example_params) def test_set_parameters_reject_one_side(self): - tf = ClusterFilters([None, None], ['en', 'de'], ['latin', 'latin'], None, None, None) + tf = ClusterFilters([None, None], langs=['en', 'de'], scripts=['latin', 'latin']) + default_rejects = [False] * len(self.col_names) rejects = copy.deepcopy(default_rejects) - rejects['LanguageIDFilter.0'] = True - params = copy.deepcopy(example_params) - params['LanguageIDFilter']['thresholds'][0] = -1 - tf._set_parameters([1 for i in range(10)], rejects) - self.assertEqual(tf.filter_params, params) + rejects[7] = True # 'LanguageIDFilter.0' + params = copy.deepcopy(self.example_params) + params[5]['LanguageIDFilter']['thresholds'][0] = -1 + tf._set_parameters(self._make_df(self.col_names, [1] * len(self.col_names), rejects)) + self.assertEqual(tf.filters, params) - tf.filter_params = copy.deepcopy(default_params) rejects = 
copy.deepcopy(default_rejects) - rejects['LanguageIDFilter.1'] = True - params = copy.deepcopy(example_params) - params['LanguageIDFilter']['thresholds'][1] = -1 - tf._set_parameters([1 for i in range(10)], rejects) - self.assertEqual(tf.filter_params, params) + rejects[8] = True # LanguageIDFilter.1 + params = copy.deepcopy(self.example_params) + params[5]['LanguageIDFilter']['thresholds'][1] = -1 + tf._set_parameters(self._make_df(self.col_names, [1] * len(self.col_names), rejects)) + self.assertEqual(tf.filters, params) - tf.filter_params = copy.deepcopy(default_params) rejects = copy.deepcopy(default_rejects) - rejects['CharacterScoreFilter.0'] = True - params = copy.deepcopy(example_params) - params['CharacterScoreFilter']['thresholds'][0] = -1 - tf._set_parameters([1 for i in range(10)], rejects) - self.assertEqual(tf.filter_params, params) + rejects[5] = True # 'CharacterScoreFilter.0' + params = copy.deepcopy(self.example_params) + params[4]['CharacterScoreFilter']['thresholds'][0] = -1 + tf._set_parameters(self._make_df(self.col_names, [1] * len(self.col_names), rejects)) + self.assertEqual(tf.filters, params) - tf.filter_params = copy.deepcopy(default_params) rejects = copy.deepcopy(default_rejects) - rejects['CharacterScoreFilter.1'] = True - params = copy.deepcopy(example_params) - params['CharacterScoreFilter']['thresholds'][1] = -1 - tf._set_parameters([1 for i in range(10)], rejects) - self.assertEqual(tf.filter_params, params) + rejects[6] = True # 'CharacterScoreFilter.1' + params = copy.deepcopy(self.example_params) + params[4]['CharacterScoreFilter']['thresholds'][1] = -1 + tf._set_parameters(self._make_df(self.col_names, [1] * len(self.col_names), rejects)) + self.assertEqual(tf.filters, params) -class TestGenericFilterAdjuster(unittest.TestCase): +class TestPercentileAdjuster(unittest.TestCase): # These have arguments without defaults expected_failures = {'CharacterScoreFilter', 'CrossEntropyFilter', 'CrossEntropyDifferenceFilter', @@ -173,8 +163,8 @@ def test_default_parameters(self): for filter_name, filter_cls in inspect.getmembers(filters, inspect.isclass): if not issubclass(filter_cls, FilterABC) or filter_cls == FilterABC: continue - adjuster = GenericFilterAdjuster(filter_name) - params = adjuster.default_parameters + adjuster = PercentileAdjuster(filter_name) + params = adjuster.initial_parameters logging.info("%s %s", filter_name, params) if filter_name in self.expected_failures: with self.assertRaises((ConfigurationError, ModuleNotFoundError)): @@ -195,7 +185,7 @@ def test_adjusted_parameters(self): continue if filter_name in self.expected_failures: continue - adjuster = GenericFilterAdjuster(filter_name) + adjuster = PercentileAdjuster(filter_name) if not adjuster.is_adjustable(): continue params = adjuster.get_adjusted_parameters(data, excluded_percentile=0.1) From 48faa6c6bab438f5614cb0e0208d007d7f2ce36f Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 23 Aug 2023 16:08:39 +0300 Subject: [PATCH 04/12] add accept_threshold and reject_threshold properties for filters --- docs/CHANGELOG.md | 2 +- docs/filters/custom_filters.md | 41 ++++++++++++++++++++-- docs/filters/sentence_embedding_filters.md | 2 +- opusfilter/__init__.py | 22 ++++++++++++ opusfilter/autogen.py | 10 ++---- opusfilter/embeddings.py | 2 ++ opusfilter/filters.py | 29 +++++++++++++-- opusfilter/lm.py | 6 ++++ opusfilter/word_alignment.py | 3 ++ tests/test_autogen.py | 8 ++--- 10 files changed, 107 insertions(+), 18 deletions(-) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 
5e9bce2..566245a 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
 - `opusfilter-autogen` script for automatic filter config generation
-- `score_direction` property for filters
+- `score_direction`, `accept_threshold`, and `reject_threshold` properties for filters
 
 ### Changed
 
diff --git a/docs/filters/custom_filters.md b/docs/filters/custom_filters.md
index fe9890a..58889ca 100644
--- a/docs/filters/custom_filters.md
+++ b/docs/filters/custom_filters.md
@@ -5,7 +5,10 @@ the filter configuration entries.
 
 The custom filters should inherit the abstract base class `FilterABC`
 from the `opusfilter` package. They should implement two abstract
-methods: `score` and `accept`.
+methods, `score` and `accept`, and one abstract property,
+`score_direction`. Additionally, for filters with adjustable
+thresholds, defining `accept_threshold` and `reject_threshold`
+properties is recommended.
 
 The `score` method is a generator that takes an iterator over tuples
 of parallel sentences, and yields a score object for each pair. The
@@ -16,6 +19,17 @@ The `accept` method takes a single output yielded by the `score`
 method, and returns whether the sentence pair should be accepted based
 on the score.
 
+The `score_direction` should be one of the following constants defined
+in the `opusfilter` module, depending on the output of the `score()`
+method:
+
+* `CLEAN_LOW`: scores below a threshold parameter indicate clean data
+* `CLEAN_HIGH`: scores above a threshold parameter indicate clean data
+* `CLEAN_BETWEEN`: scores between minimum and maximum thresholds
+  indicate clean data
+* `CLEAN_TRUE`: score value `True` indicates clean data
+* `CLEAN_FALSE`: score value `False` indicates clean data
+
 If the filter requires any parameters (e.g. score thresholds for the
 `accept` method), the class should implement also the `__init__`
 method. Arbitrary keyword arguments should be accepted (with
@@ -24,6 +38,25 @@ should be called with the remaining keyword arguments. The keyword
 argument `name` is reserved for giving names to the filters and
 `workdir` for a location for non-temporary files.
 
+For compatibility with the included [automatic configuration generation
+tools](../automatic_configuration.md), also the following should be
+considered:
+
+* If there is a threshold value used by `accept`, the argument should
+  be named as `threshold` (a single global threshold) or `thresholds`
+  (multiple thresholds, e.g. one per language). The `accept_threshold`
+  and `reject_threshold` properties should have threshold values that
+  force all inputs to be accepted or rejected, respectively. That is,
+  a sensible threshold value will always be between `accept_threshold`
+  and `reject_threshold`.
+* If there are lower and upper thresholds used by `accept`
+  (i.e. `score_direction` is `CLEAN_BETWEEN`), the respective
+  arguments should be named as `min_threshold` and `max_threshold` or
+  `min_length` and `max_length`. The `accept_threshold` and
+  `reject_threshold` properties should have tuples of two threshold
+  values (for lower and upper thresholds) that force all inputs to be
+  accepted or rejected, respectively (see the sketch below).
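
As an illustration of the `CLEAN_BETWEEN` case, the properties could be
filled in as follows (a minimal sketch; `DigitRatioFilter` is a
hypothetical example and not part of OpusFilter):

```python
import opusfilter


class DigitRatioFilter(opusfilter.FilterABC):
    """Hypothetical filter: clean if the digit ratio is between two bounds"""

    score_direction = opusfilter.CLEAN_BETWEEN
    accept_threshold = (0, 1)  # covers all possible ratios: accepts everything
    reject_threshold = (1, 0)  # empty interval: rejects everything

    def __init__(self, min_threshold=0.0, max_threshold=0.5, **kwargs):
        self.min_threshold = min_threshold
        self.max_threshold = max_threshold
        super().__init__(**kwargs)

    def score(self, pairs):
        for pair in pairs:
            yield [sum(char.isdigit() for char in segment) / len(segment)
                   if segment else 0.0 for segment in pair]

    def accept(self, score):
        return all(self.min_threshold <= ratio <= self.max_threshold
                   for ratio in score)
```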
+ Based on the `score` and `accept` methods, the abstract class `FilterABC` implements the following three generators that take iterator over segment pairs as input: @@ -44,6 +77,10 @@ import opusfilter class UppercaseFilter(opusfilter.FilterABC): + score_direction = opusfilter.CLEAN_LOW + accept_threshold = 1 + 10**-6 + reject_threshold = 0 + def __init__(self, threshold=0.5, **kwargs): self.threshold = threshold super().__init__(**kwargs) @@ -88,4 +125,4 @@ If a filter requires external resources files (e.g. for model parameters), or stores non-temporary files itself, they should be located in the path defined the attribute `workdir`. The implementation of the filter should join `workdir` with relative file -paths using `os.path.join()`. \ No newline at end of file +paths using `os.path.join()`. diff --git a/docs/filters/sentence_embedding_filters.md b/docs/filters/sentence_embedding_filters.md index d9110eb..64288ef 100644 --- a/docs/filters/sentence_embedding_filters.md +++ b/docs/filters/sentence_embedding_filters.md @@ -18,7 +18,7 @@ calculate the similarity of the embeddings. If `nn_model` is provided, the similarities are normalized by the average similarity to K nearest neighbors in a reference corpus; see [train_nearest_neighbors](train_nearest_neighbors) for training a -model. With normalized scores, threshold around 1.0 is likely more +model. With normalized scores, threshold closer to 1.0 is likely more suitable than the default 0.5. Especially with the nearest neighbor normalization, this filter can be diff --git a/opusfilter/__init__.py b/opusfilter/__init__.py index 467acf8..33bd123 100644 --- a/opusfilter/__init__.py +++ b/opusfilter/__init__.py @@ -71,6 +71,28 @@ def filterfalse(self, pairs): def score_direction(self): """Hint for which score values indicate accept""" + @property + def accept_threshold(self): + """Threshold value for which accept() is always true + + If not applicable, the value is None. If score_direction is + CLEAN_BETWEEN, the value is a tuple of lower and upper + thresholds. + + """ + return None + + @property + def reject_threshold(self): + """Threshold value for which accept() is always false + + If not applicable, the value is None. If score_direction is + CLEAN_BETWEEN, the value is a tuple of lower and upper + thresholds. 
+ + """ + return None + class PreprocessorABC(metaclass=abc.ABCMeta): """Abstract base class for preprocessors""" diff --git a/opusfilter/autogen.py b/opusfilter/autogen.py index ac248f9..53d154b 100644 --- a/opusfilter/autogen.py +++ b/opusfilter/autogen.py @@ -310,7 +310,7 @@ class FilterInspect: # Lists of possible filter threshold arguments SINGLE_THRESHOLD_ARGUMENTS = ['threshold'] MULTI_THRESHOLD_ARGUMENTS = ['threshold', 'thresholds'] - MIN_MAX_ARGUMENTS = [('min_length', 'max_length')] + MIN_MAX_ARGUMENTS = [('min_length', 'max_length'), ('min_threshold', 'max_threshold')] ALL_THRESHOLD_ARGUMENTS = SINGLE_THRESHOLD_ARGUMENTS + MULTI_THRESHOLD_ARGUMENTS + MIN_MAX_ARGUMENTS def __init__(self, filterclass, filter_parameters=None): @@ -463,9 +463,6 @@ def set_filter_thresholds(self): score_file = get_score_file( self.files, [{name: params} for name, params in self.filters_to_add], self.inter_dir, self.sample_size, overwrite=self.overwrite, max_length=self.max_length) - # score_file = get_score_file( - # self.files, [{k.split('.', maxsplit=1)[0]: v} for k, v in self.filter_params.items()], - # self.inter_dir, self.sample_size, overwrite=self.overwrite, max_length=self.max_length) self.scoredata = ScoreClusters(score_file) self._set_parameters(self.scoredata.get_result_df()) if os.path.isfile(self.label_file_path) and not self.overwrite: @@ -476,7 +473,6 @@ def set_filter_thresholds(self): label_file.write(str(label)+'\n') if self.use_tmp: shutil.rmtree(self.inter_dir) - # self._filters = [{k.split('.', maxsplit=1)[0]: v} for k, v in self.filter_params.items()] def _set_parameters(self, df): """Set filter parameters based on ScoreClusters @@ -504,8 +500,8 @@ def _set_parameters(self, df): thresholds = list(df_part['threshold']) for i, reject in enumerate(df_part.reject): if reject: - # FIXME: -1 may not work for all filters - thresholds[i] = -1 + # Set a threshold that accepts all input + thresholds[i] = filter_inspect.filter_cls.accept_threshold logger.warning(thresholds) new_params[threshold_key] = thresholds if len(thresholds) > 1 else thresholds[0] logger.warning({classname: new_params}) diff --git a/opusfilter/embeddings.py b/opusfilter/embeddings.py index 5942d46..2f993c8 100644 --- a/opusfilter/embeddings.py +++ b/opusfilter/embeddings.py @@ -68,6 +68,8 @@ class SentenceEmbeddingFilter(FilterABC): """ score_direction = CLEAN_HIGH + accept_threshold = 0 + reject_threshold = 1 + 10**-6 def __init__(self, languages=None, threshold=0.5, nn_model=None, chunksize=200, **kwargs): try: diff --git a/opusfilter/filters.py b/opusfilter/filters.py index 6dcb386..4e0ffff 100644 --- a/opusfilter/filters.py +++ b/opusfilter/filters.py @@ -24,6 +24,8 @@ class LengthFilter(FilterABC): """Sentence length filter""" score_direction = CLEAN_BETWEEN + accept_threshold = (0, math.inf) + reject_threshold = (math.inf, 0) def __init__(self, min_length=1, max_length=100, unit='word', pass_empty=False, **kwargs): min_length, max_length, unit = check_args_compability( @@ -57,6 +59,8 @@ class LengthRatioFilter(FilterABC): """Character length ratio""" score_direction = CLEAN_LOW + accept_threshold = math.inf + reject_threshold = 0 def __init__(self, threshold=3, unit='word', **kwargs): self.threshold = threshold @@ -89,6 +93,8 @@ class LongWordFilter(FilterABC): """Word length filter""" score_direction = CLEAN_LOW + accept_threshold = math.inf + reject_threshold = 1 def __init__(self, threshold=40, **kwargs): self.threshold = check_args_compability(threshold, required_types=[(int, float)], names=['threshold']) 
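# A quick sanity check of what the new properties promise (a sketch, not part
# of this patch): with threshold=accept_threshold every pair passes and with
# threshold=reject_threshold none does, assuming accept() compares the score
# to the threshold with a strict inequality, as LengthRatioFilter does.
from opusfilter.filters import LengthRatioFilter

pairs = [('a b c', 'a'), ('a', 'a b c d e f')]
accept_all = LengthRatioFilter(threshold=LengthRatioFilter.accept_threshold, unit='word')
reject_all = LengthRatioFilter(threshold=LengthRatioFilter.reject_threshold, unit='word')
assert list(accept_all.filter(pairs)) == pairs
assert not list(reject_all.filter(pairs))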
@@ -111,6 +117,8 @@ class AverageWordLengthFilter(FilterABC): """ score_direction = CLEAN_BETWEEN + accept_threshold = (0, math.inf) + reject_threshold = (math.inf, 0) def __init__(self, min_length=2, max_length=20, pass_empty=False, **kwargs): min_length, max_length = check_args_compability( @@ -202,6 +210,8 @@ class AlphabetRatioFilter(FilterABC): """Proportion of alphabetic characters in the segment""" score_direction = CLEAN_HIGH + accept_threshold = 0 + reject_threshold = 1 + 10**-6 def __init__(self, threshold=0.75, exclude_whitespace=False, **kwargs): self.threshold = check_args_compability(threshold, required_types=[(float, int)], names=['threshold']) @@ -236,6 +246,8 @@ class CharacterScoreFilter(FilterABC): """ score_direction = CLEAN_HIGH + accept_threshold = 0 + reject_threshold = 1 + 10**-6 def __init__(self, scripts=None, thresholds=None, **kwargs): if scripts is None: @@ -279,6 +291,8 @@ class LanguageIDFilter(FilterABC): """ score_direction = CLEAN_HIGH + accept_threshold = -1 + reject_threshold = 1 def __init__(self, languages=None, id_method='langid', thresholds=None, fasttext_model_path=None, langid_languages=None, cld2_options=None, @@ -379,6 +393,8 @@ class TerminalPunctuationFilter(FilterABC): """ score_direction = CLEAN_HIGH + accept_threshold = 0 + reject_threshold = math.inf def __init__(self, threshold=-2, **kwargs): self.threshold = threshold @@ -391,7 +407,7 @@ def score(self, pairs): sent1, sent2 = pair spun = len([c for c in sent1 if c in ['.', '?', '!', '…']]) tpun = len([c for c in sent2 if c in ['.', '?', '!', '…']]) - score = abs(spun-tpun) + score = abs(spun - tpun) if spun > 1: score += spun - 1 if tpun > 1: @@ -416,6 +432,8 @@ class NonZeroNumeralsFilter(FilterABC): """ score_direction = CLEAN_HIGH + accept_threshold = 0 + reject_threshold = 1 + 10**-6 def __init__(self, threshold=0.5, require_all=True, **kwargs): self.threshold = threshold @@ -448,6 +466,8 @@ class LongestCommonSubstringFilter(FilterABC): """ score_direction = CLEAN_LOW + accept_threshold = 1 + 10**-6 + reject_threshold = 0 def __init__(self, threshold=0.9, require_all=True, **kwargs): self.threshold = threshold @@ -484,6 +504,8 @@ class SimilarityFilter(FilterABC): """ score_direction = CLEAN_LOW + accept_threshold = 1 + 10**-6 + reject_threshold = 0 VALID_UNITS = ('word', 'char', 'character') @@ -538,10 +560,11 @@ class RepetitionFilter(FilterABC): """ score_direction = CLEAN_LOW - min_threshold = 1 + accept_threshold = math.inf + reject_threshold = 1 def __init__(self, threshold=2, min_length=3, max_length=100, **kwargs): - if threshold < self.min_threshold: + if threshold < self.reject_threshold: raise ConfigurationError(f"threshold for RepetitionFilter has to be at least one, got {threshold}") if min_length < 1: raise ConfigurationError(f"min_length for RepetitionFilter has to be at least one, got {min_length}") diff --git a/opusfilter/lm.py b/opusfilter/lm.py index 3cbbf09..70a7cfc 100644 --- a/opusfilter/lm.py +++ b/opusfilter/lm.py @@ -293,6 +293,8 @@ class CrossEntropyFilter(FilterABC): score_direction = CLEAN_LOW score_types = {'entropy', 'perplexity', 'logprob'} + accept_threshold = math.inf + reject_threshold = 0 def __init__(self, lm_params=None, score_type='entropy', thresholds=None, low_thresholds=None, diff_threshold=10.0, @@ -348,6 +350,8 @@ class CrossEntropyDifferenceFilter(FilterABC): """ score_direction = CLEAN_LOW + accept_threshold = math.inf + reject_threshold = -math.inf def __init__(self, id_lm_params=None, nd_lm_params=None, thresholds=None, 
score_for_empty=False, **kwargs): super().__init__(**kwargs) @@ -404,6 +408,8 @@ class LMClassifierFilter(FilterABC): """ score_direction = CLEAN_HIGH + accept_threshold = 0 + reject_threshold = 1 + 10**-6 def __init__(self, labels=None, lm_params=None, thresholds=None, relative_score=False, **kwargs): super().__init__(**kwargs) diff --git a/opusfilter/word_alignment.py b/opusfilter/word_alignment.py index 9f492ad..bb6f9a5 100644 --- a/opusfilter/word_alignment.py +++ b/opusfilter/word_alignment.py @@ -3,6 +3,7 @@ import contextlib import json import logging +import math import os import tempfile @@ -66,6 +67,8 @@ class WordAlignFilter(FilterABC): """ score_direction = CLEAN_LOW + accept_threshold = math.inf + reject_threshold = -math.inf _empty_pair_sentinel = object() def __init__(self, src_threshold=0, tgt_threshold=0, priors=None, model=3, diff --git a/tests/test_autogen.py b/tests/test_autogen.py index 8290097..331184c 100644 --- a/tests/test_autogen.py +++ b/tests/test_autogen.py @@ -127,28 +127,28 @@ def test_set_parameters_reject_one_side(self): rejects = copy.deepcopy(default_rejects) rejects[7] = True # 'LanguageIDFilter.0' params = copy.deepcopy(self.example_params) - params[5]['LanguageIDFilter']['thresholds'][0] = -1 + params[5]['LanguageIDFilter']['thresholds'][0] = filters.LanguageIDFilter.accept_threshold tf._set_parameters(self._make_df(self.col_names, [1] * len(self.col_names), rejects)) self.assertEqual(tf.filters, params) rejects = copy.deepcopy(default_rejects) rejects[8] = True # LanguageIDFilter.1 params = copy.deepcopy(self.example_params) - params[5]['LanguageIDFilter']['thresholds'][1] = -1 + params[5]['LanguageIDFilter']['thresholds'][1] = filters.LanguageIDFilter.accept_threshold tf._set_parameters(self._make_df(self.col_names, [1] * len(self.col_names), rejects)) self.assertEqual(tf.filters, params) rejects = copy.deepcopy(default_rejects) rejects[5] = True # 'CharacterScoreFilter.0' params = copy.deepcopy(self.example_params) - params[4]['CharacterScoreFilter']['thresholds'][0] = -1 + params[4]['CharacterScoreFilter']['thresholds'][0] = filters.CharacterScoreFilter.accept_threshold tf._set_parameters(self._make_df(self.col_names, [1] * len(self.col_names), rejects)) self.assertEqual(tf.filters, params) rejects = copy.deepcopy(default_rejects) rejects[6] = True # 'CharacterScoreFilter.1' params = copy.deepcopy(self.example_params) - params[4]['CharacterScoreFilter']['thresholds'][1] = -1 + params[4]['CharacterScoreFilter']['thresholds'][1] = filters.CharacterScoreFilter.accept_threshold tf._set_parameters(self._make_df(self.col_names, [1] * len(self.col_names), rejects)) self.assertEqual(tf.filters, params) From c52b55dedd8a98a2e1c53b22ec5dd2dc13d37c6b Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 30 Aug 2023 20:00:49 +0300 Subject: [PATCH 05/12] refactor and improve percentiles method and fix unit tests --- bin/opusfilter-autogen | 2 - docs/automatic_configuration.md | 57 +++++++++------ opusfilter/autogen.py | 126 +++++++++++++++++++++++--------- opusfilter/classifier.py | 56 +------------- opusfilter/filters.py | 14 +++- opusfilter/util.py | 58 ++++++++++++++- tests/test_autogen.py | 26 +++++-- 7 files changed, 213 insertions(+), 126 deletions(-) diff --git a/bin/opusfilter-autogen b/bin/opusfilter-autogen index d6b4bba..136d655 100644 --- a/bin/opusfilter-autogen +++ b/bin/opusfilter-autogen @@ -24,8 +24,6 @@ parser = argparse.ArgumentParser( prog='opusfilter-autogen', description='Generate initial configuration based on parallel text data') 
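# Roughly the same flow as the command-line script is available
# programmatically (a sketch; the file names and language codes below
# are placeholders):
from opusfilter.autogen import ConfigurationGenerator, DefaultParameterFilters

filtergen = DefaultParameterFilters(langs=['en', 'de'], scripts=['Latin', 'Latin'])
filtergen.set_filter_thresholds()
generator = ConfigurationGenerator(files=['data.en', 'data.de'], langs=['en', 'de'], workdir='work')
generator.add_filter(filtergen.filters)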
-#parser.add_argument('--add-filter', nargs=2, action='append', default=[], metavar=('CLASS', 'JSON'), -# help='Add filter of CLASS with JSON parameters object') parser.add_argument('--files', required=True, nargs='+', metavar='TEXTFILE', help='parallel text input file(s)') parser.add_argument('--langs', nargs='+', metavar='LANGCODE', help='Language codes corresponding to the input files. If omitted, LanguageIDFilters will not be used.') diff --git a/docs/automatic_configuration.md b/docs/automatic_configuration.md index 1c76bad..3be02db 100644 --- a/docs/automatic_configuration.md +++ b/docs/automatic_configuration.md @@ -7,11 +7,14 @@ step, with a few options for determining the filter parameters. The usage description for the script is as follows: ```text usage: opusfilter-autogen [-h] --files TEXTFILE [TEXTFILE ...] - [--langs LANGCODE [LANGCODE ...]] [--scripts SCRIPT [SCRIPT ...]] - [--filter-params {cluster,default,percentiles}] - [--sample-size SAMPLE_SIZE] [--noisy-percentile NOISY_PERCENTILE] - [--work-dir WORK_DIR] [--inter-dir INTER_DIR] [--plot] - [--overwrite] [-o CONFIGFILE] + [--langs LANGCODE [LANGCODE ...]] + [--scripts SCRIPT [SCRIPT ...]] + [--method {defaults,percentiles,clustering}] + [--sample-size SAMPLE_SIZE] + [--noisy-percentile NOISY_PERCENTILE] + [--work-dir WORK_DIR] [--inter-dir INTER_DIR] + [--plot] [--list-defaults] [--overwrite] + [-o CONFIGFILE] Generate initial configuration based on parallel text data @@ -20,37 +23,43 @@ options: --files TEXTFILE [TEXTFILE ...] parallel text input file(s) --langs LANGCODE [LANGCODE ...] - Language codes corresponding to the input files. If omitted, - LanguageIDFilters will not be used. + Language codes corresponding to the input files. If + omitted, LanguageIDFilters will not be used. --scripts SCRIPT [SCRIPT ...] - Alphabetic scripts (e.g. Latin) corresponding to the input files. - If omitted, CharacterScoreFilter will not be used. - --filter-params {default,percentiles,unsupervised} - Method for selecting filter parameters (default: unsupervised) + Alphabetic scripts (e.g. Latin) corresponding to the + input files. If omitted, CharacterScoreFilter will not + be used. 
+  --method {defaults,percentiles,clustering}
+                        Method for selecting filter thresholds (default:
+                        clustering)
   --sample-size SAMPLE_SIZE
-                        Max number of sentence pairs used for clustering (default 100000)
+                        Max number of sentence pairs used for clustering
+                        (default 100000)
   --noisy-percentile NOISY_PERCENTILE
-                        Proportion of the data considered to be noisy; only for percentiles
-                        method (default 0.001)
+                        Proportion of the data considered to be noisy; only
+                        for percentiles method (default 0.001)
-  --work-dir WORK_DIR   Location of the source and target files for the generated
-                        configuration (default work)
+  --work-dir WORK_DIR   Location of the source and target files for the
+                        generated configuration (default work)
   --inter-dir INTER_DIR
-                        Save intermediate files in this directory (use a temporary
-                        directory if not given)
-  --plot                Show a scatter plot of the clustering and histograms of feature
-                        data distributions
+                        Save intermediate files in this directory (use a
+                        temporary directory if not given)
+  --plot                Show a scatter plot of the clustering and histograms
+                        of feature data distributions
+  --list-defaults       List default filters of the method to the output and
+                        quit
   --overwrite           Overwrite existing config file and intermediate files
   -o CONFIGFILE, --output CONFIGFILE
                         Output configuration file (default -)
 ```
 
-The `--filter-params` options sets how the filter parameters are set.
-The option `default` uses the default parameters defined in the filter
+The `--method` option sets how the filter parameters are set. The
+option `defaults` uses the default parameters defined in the filter
 classes. The option `percentiles` assumes that a proportion of the
 data (set by `--noisy-percentile`) is noisy, and sets the thresholds
 for each filter independently based on the percentile. The
-`unsupervised` option is likely the most useful of the three, and
-described in more detail below.
+`clustering` option is likely the most useful of the three, and is
+described in more detail below. However, it is applicable to a more
+limited set of filters.
 
 ## Unsupervised feature selection for filters
 
diff --git a/opusfilter/autogen.py b/opusfilter/autogen.py
index 53d154b..8b93b39 100644
--- a/opusfilter/autogen.py
+++ b/opusfilter/autogen.py
@@ -265,6 +265,15 @@ def __init__(self, files, sample_size=100000, max_length=1000, inter_dir=None, o
 
 class PercentileFilters(DataBasedFiltersABC):
     """Filter configuration based on filter score percentiles"""
 
+    DEFAULT_FILTERS = ['LengthFilter',
+                       ('LengthRatioFilter.char', {'unit': 'char'}),
+                       ('LengthRatioFilter.word', {'unit': 'word'}),
+                       'LongWordFilter', 'HtmlTagFilter',
+                       'AverageWordLengthFilter', 'AlphabetRatioFilter',
+                       'TerminalPunctuationFilter', 'NonZeroNumeralsFilter',
+                       'LongestCommonSubstringFilter', 'SimilarityFilter', 'RepetitionFilter',
+                       'CharacterScoreFilter', ('LanguageIDFilter', {'id_method': 'cld2'})]
+
     def __init__(self, files, excluded_percentile=0.001, **kwargs):
         super().__init__(files, **kwargs)
         self.excluded_percentile = excluded_percentile
@@ -378,6 +387,86 @@ def find_threshold_keys(self, number):
 class PercentileAdjuster(FilterInspect):
     """Class for setting filters to remove given percentile of data"""
 
+    @staticmethod
+    def _find_index(target, sorted_value_list):
+        """Locate the index of the first value equal to or exceeding target in a sorted list
+
+        If no value in the list reaches target, the index of the last
+        value is returned. 
+ + """ + for idx, value2 in enumerate(sorted_value_list): + if value2 >= target: + return idx + return len(sorted_value_list) - 1 + + def _select_value(self, values, percentile, stats_key, reverse): + """Select threshold value for single column""" + sorted_values = sorted(values.unique(), reverse=reverse) + if len(sorted_values) == 1: + # No variation -> accept all + return self.filter_cls.accept_threshold + stats = values.describe(percentiles=[percentile]) + logger.info(stats) + value = stats.loc[stats_key] + idx = self._find_index(value, sorted_values) + if idx < len(sorted_values) - 1: + # Using the next value ensures that the filter will not + # exclude everything + value = sorted_values[idx + 1] + return value.item() + + def _select_values(self, df, score_dir, excluded_percentile): + """Select threshold values from the distribution in dataframe + + df: DataFrame containing the data values for the filter + score_dir: filter's score direction (CLEAN_LOW or CLEAN_HIGH) + excluded_percentile: target percentile to exclude as noisy + + """ + if score_dir == CLEAN_LOW: + percentile = 1 - excluded_percentile + stats_key = f'{100*(1-excluded_percentile):g}%' + reverse = False + else: + percentile = excluded_percentile + stats_key = f'{100*excluded_percentile:g}%' + reverse = True + return [self._select_value(df[column], percentile, stats_key, reverse) for column in df.columns] + + def _select_values_between(self, df, excluded_percentile): + """Select min-max threshold values from the distribution in dataframe + + df: DataFrame containing the data values for the filter + excluded_percentile: target percentile to exclude as noisy + + """ + half_pct = excluded_percentile / 2 + percentiles = [half_pct, 1 - half_pct] + pct_keys = [f'{100*half_pct:g}%', f'{100*(1-half_pct):g}%'] + min_values, max_values = [], [] + for column in df.columns: + sorted_values = sorted(df[column].unique(), reverse=False) + if len(sorted_values) == 1: + # No variation -> accept all + min_value, max_value = self.filter_cls.accept_threshold + else: + stats = df[column].describe(percentiles=percentiles) + logger.info(stats) + min_value = stats.loc[pct_keys[0]].item() + min_idx = self._find_index(min_value, sorted_values) + if min_idx > 0: + # Ensure that the filter will not exclude everything + min_value = sorted_values[min_idx - 1].item() + max_value = stats.loc[pct_keys[1]].item() + max_idx = self._find_index(max_value, sorted_values) + if max_idx < len(sorted_values) - 1: + # Ensure that the filter will not exclude everything + max_value = sorted_values[max_idx + 1].item() + min_values.append(min_value) + max_values.append(max_value) + return min_values, max_values + def get_adjusted_parameters(self, df, excluded_percentile=0.01): """Estimate parameters for the filter using data @@ -387,50 +476,23 @@ def get_adjusted_parameters(self, df, excluded_percentile=0.01): the lowest and highest values. """ - # TODO: It should be checked that the selected threshold does - # not remove too much data. E.g. if the clean values are high, - # excluded_percentile is 1%, and the highest 99.1% of the - # values are 1, the selected threshold 1 might remove all data - # if the condition for accepting the value is being greater - # than the threshold. 
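# A toy demonstration of the guard implemented above in _select_value and
# _find_index (a sketch, assuming a clean-low filter that accepts scores
# strictly below its threshold):
import pandas as pd

values = pd.Series([1] * 999 + [2])                   # 99.9% of the scores are 1
raw = values.describe(percentiles=[0.99]).loc['99%']  # 1.0: would reject every row
sorted_unique = sorted(values.unique())               # [1, 2]
idx = next(i for i, v in enumerate(sorted_unique) if v >= raw)
threshold = sorted_unique[idx + 1] if idx < len(sorted_unique) - 1 else raw
# threshold is now 2, so only the outlier row is excluded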
parameters = copy.deepcopy(self.initial_parameters) if not self.is_adjustable(): return parameters score_dir = self.filter_cls.score_direction logger.info("score type for %s: %s", self.filter_name, score_dir) - if score_dir == CLEAN_LOW: - percentiles = [1 - excluded_percentile] - pct_keys = [f'{100*(1-excluded_percentile):g}%'] - elif score_dir == CLEAN_HIGH: - percentiles = [excluded_percentile] - pct_keys = [f'{100*excluded_percentile:g}%'] - elif score_dir == CLEAN_BETWEEN: - half_pct = excluded_percentile / 2 - percentiles = [half_pct, 1 - half_pct] - pct_keys = [f'{100*half_pct:g}%', f'{100*(1-half_pct):g}%'] - else: - raise ValueError(f"Unknown score type '{score_dir}'") score_dim = len(df.columns) threshold_key = self.find_threshold_keys(score_dim) if threshold_key is None: return parameters if score_dir in {CLEAN_LOW, CLEAN_HIGH}: # Clean is below or above threshold - values = [] - for column in df.columns: - stats = df[column].describe(percentiles=percentiles) - logger.info(stats) - values.append(stats.loc[pct_keys[0]].item()) + values = self._select_values(df, score_dir, excluded_percentile) logger.info("Selected values %s for %s", values, threshold_key) parameters[threshold_key] = values[0] if score_dim == 1 else values elif score_dir == CLEAN_BETWEEN: # Clean is between minimum and maximum - min_values, max_values = [], [] - for column in df.columns: - stats = df[column].describe(percentiles=percentiles) - logger.info(stats) - min_values.append(stats.loc[pct_keys[0]].item()) - max_values.append(stats.loc[pct_keys[1]].item()) + min_values, max_values = self._select_values_between(df, excluded_percentile) if score_dim == 1: min_values = min_values[0] max_values = max_values[0] @@ -489,12 +551,9 @@ def _set_parameters(self, df): if 'name' in params: column_prefix += '.' + params['name'] df_part = df[df.name.str.startswith(column_prefix)] - logger.warning(column_prefix) - logger.warning(df_part) if all(df_part.reject): continue threshold_key = filter_inspect.find_threshold_keys(len(df_part)) - logger.warning(threshold_key) if threshold_key is None: continue thresholds = list(df_part['threshold']) @@ -502,8 +561,5 @@ def _set_parameters(self, df): if reject: # Set a threshold that accepts all input thresholds[i] = filter_inspect.filter_cls.accept_threshold - logger.warning(thresholds) new_params[threshold_key] = thresholds if len(thresholds) > 1 else thresholds[0] - logger.warning({classname: new_params}) self._filters.append({classname: new_params}) - logger.info("Filters: %s", self.filters) diff --git a/opusfilter/classifier.py b/opusfilter/classifier.py index 9e4796f..5a44d64 100644 --- a/opusfilter/classifier.py +++ b/opusfilter/classifier.py @@ -1,6 +1,5 @@ """Filter classifier""" -import json import logging import collections import functools @@ -9,70 +8,17 @@ import numpy as np import pandas as pd -from pandas import json_normalize import sklearn.linear_model from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, log_loss from . import CLEAN_LOW, CLEAN_HIGH, CLEAN_TRUE, CLEAN_FALSE from . 
import filters as filtermodule -from .util import file_open, grouper, import_class +from .util import file_open, import_class, load_dataframe, load_dataframe_in_chunks logger = logging.getLogger(__name__) -def lists_to_dicts(obj): - """Convert lists in a JSON-style object to dicts recursively - - Examples: - - >>> lists_to_dicts([3, 4]) - {"0": 3, "1": 4} - >>> lists_to_dicts([3, [4, 5]]) - {"0": 3, "1": {"0": 4, "1": 5}} - >>> lists_to_dicts({"a": [3, 4], "b": []}) - {"a": {"0": 3, "1": 4}, "b": {}} - - """ - if isinstance(obj, dict): - return {key: lists_to_dicts(value) for key, value in obj.items()} - if isinstance(obj, list): - return {str(idx): lists_to_dicts(value) for idx, value in enumerate(obj)} - return obj - - -def load_dataframe(data_file): - """Load normalized scores dataframe from a JSON lines file""" - data = [] - with file_open(data_file) as dfile: - for line in dfile: - try: - data.append(lists_to_dicts(json.loads(line))) - except json.decoder.JSONDecodeError as err: - logger.error(line) - raise err - return pd.DataFrame(json_normalize(data)) - - -def load_dataframe_in_chunks(data_file, chunksize): - """Yield normalized scores dataframes from a chunked JSON lines file - - Use instead of load_dataframe if the data is too large to fit in memory. - - """ - with file_open(data_file) as dfile: - for num, chunk in enumerate(grouper(dfile, chunksize)): - data = [] - for line in chunk: - try: - data.append(lists_to_dicts(json.loads(line))) - except json.decoder.JSONDecodeError as err: - logger.error(line) - raise err - logger.info("Processing chunk %s with %s lines", num, len(data)) - yield pd.DataFrame(json_normalize(data)) - - def standardize_dataframe_scores(dataframe, features, means_stds=None): """Normalize, zero average, and set direction for scores in each column""" new_df = pd.DataFrame() diff --git a/opusfilter/filters.py b/opusfilter/filters.py index 4e0ffff..b44dd1b 100644 --- a/opusfilter/filters.py +++ b/opusfilter/filters.py @@ -60,7 +60,7 @@ class LengthRatioFilter(FilterABC): score_direction = CLEAN_LOW accept_threshold = math.inf - reject_threshold = 0 + reject_threshold = 1 def __init__(self, threshold=3, unit='word', **kwargs): self.threshold = threshold @@ -564,14 +564,18 @@ class RepetitionFilter(FilterABC): reject_threshold = 1 def __init__(self, threshold=2, min_length=3, max_length=100, **kwargs): - if threshold < self.reject_threshold: - raise ConfigurationError(f"threshold for RepetitionFilter has to be at least one, got {threshold}") if min_length < 1: raise ConfigurationError(f"min_length for RepetitionFilter has to be at least one, got {min_length}") + if threshold < self.reject_threshold: + raise ConfigurationError(f"threshold for RepetitionFilter has to be at least one, got {threshold}") self._threshold = threshold self._min_length = min_length self._max_length = max_length - self._regexp = self._get_regexp() + if threshold == self.accept_threshold: + logger.warning("threshold for RepetitionFilter set to %s, filter disabled", threshold) + self._regexp = None + else: + self._regexp = self._get_regexp() super().__init__(**kwargs) @property @@ -604,6 +608,8 @@ def get_repetitions(self, segment): None are returned. 
""" + if not self._regexp: + return 0, None match = self._regexp.search(segment) if match: full = match.group(0) diff --git a/opusfilter/util.py b/opusfilter/util.py index dd4a0ed..6b994a6 100644 --- a/opusfilter/util.py +++ b/opusfilter/util.py @@ -5,10 +5,13 @@ import importlib import io import itertools +import json import logging import lzma import os +import pandas as pd +from pandas import json_normalize from tqdm import tqdm import ruamel.yaml @@ -18,6 +21,58 @@ logger = logging.getLogger(__name__) +def lists_to_dicts(obj): + """Convert lists in a JSON-style object to dicts recursively + + Examples: + + >>> lists_to_dicts([3, 4]) + {"0": 3, "1": 4} + >>> lists_to_dicts([3, [4, 5]]) + {"0": 3, "1": {"0": 4, "1": 5}} + >>> lists_to_dicts({"a": [3, 4], "b": []}) + {"a": {"0": 3, "1": 4}, "b": {}} + + """ + if isinstance(obj, dict): + return {key: lists_to_dicts(value) for key, value in obj.items()} + if isinstance(obj, list): + return {str(idx): lists_to_dicts(value) for idx, value in enumerate(obj)} + return obj + + +def load_dataframe(data_file): + """Load normalized scores dataframe from a JSON lines file""" + data = [] + with file_open(data_file) as dfile: + for line in dfile: + try: + data.append(lists_to_dicts(json.loads(line))) + except json.decoder.JSONDecodeError as err: + logger.error(line) + raise err + return pd.DataFrame(json_normalize(data)) + + +def load_dataframe_in_chunks(data_file, chunksize): + """Yield normalized scores dataframes from a chunked JSON lines file + + Use instead of load_dataframe if the data is too large to fit in memory. + + """ + with file_open(data_file) as dfile: + for num, chunk in enumerate(grouper(dfile, chunksize)): + data = [] + for line in chunk: + try: + data.append(lists_to_dicts(json.loads(line))) + except json.decoder.JSONDecodeError as err: + logger.error(line) + raise err + logger.info("Processing chunk %s with %s lines", num, len(data)) + yield pd.DataFrame(json_normalize(data)) + + def import_class(config_dict, default_modules): """Import class from default modules or custom module defined in config @@ -67,7 +122,8 @@ def check_args_compability(*args, required_types=None, choices=None, names=None) def type_error_msg(idx, type_, value): name = names[idx] if names else str(idx + 1) - return f"Values of argument '{name}' are not of the type {type_.__name__}: {value}" + typestr = ' or '.join(t.__name__ for t in type_) if isinstance(type_, tuple) else type_.__name__ + return f"Values of argument '{name}' are not of the type {typestr}: {value}" def value_error_msg(idx, choices, value): name = names[idx] if names else str(idx + 1) diff --git a/tests/test_autogen.py b/tests/test_autogen.py index 331184c..5fecd5f 100644 --- a/tests/test_autogen.py +++ b/tests/test_autogen.py @@ -1,3 +1,4 @@ +import copy import inspect import logging import os @@ -6,12 +7,16 @@ import unittest import pandas as pd +from pandas import json_normalize import opustools from opusfilter import FilterABC, ConfigurationError, filters -from opusfilter.autogen import * +from opusfilter.autogen import ConfigurationGenerator, DefaultParameterFilters, \ + parse_filter_specs, PercentileFilters, PercentileAdjuster, ClusterFilters from opusfilter.opusfilter import OpusFilter +from opusfilter.pipeline import FilterPipeline +from opusfilter.util import lists_to_dicts class TestAutogen(unittest.TestCase): @@ -61,7 +66,6 @@ def test_default_filters(self): self.assertTrue(any(filter_name in f for f in filtergen.filters)) self._test_filters(filtergen.filters) - 
@unittest.expectedFailure
     def test_percentile_filters(self):
         filtergen = PercentileFilters(
             files=[self.src_out, self.tgt_out], langs=[self.source, self.target], scripts=['Latin', 'Latin'],
@@ -173,13 +177,24 @@ def test_default_parameters(self):
             try:
                 obj = filter_cls(**params)
             except ModuleNotFoundError:
-                logger.info("Skipping test for %s: Requred module not found", filter_name)
+                logging.info("Skipping test for %s: Required module not found", filter_name)
+
+    def _get_score_df(self, filter_cls, data):
+        pipeline = FilterPipeline([filter_cls()])
+        df_data = [lists_to_dicts(score) for score in pipeline.score(data)]
+        return pd.DataFrame(json_normalize(df_data))
 
-    @unittest.expectedFailure
     def test_adjusted_parameters(self):
         src_data = ['a'] * 11 + ['a bbbbb'] * 78 + ['a bbbbb cccc'] * 11
         tgt_data = [seg.upper() for seg in src_data]
         data = list(zip(src_data, tgt_data))
+        self._test_adjusted_parameters(data)
+        src_data += ['a bbbbb']
+        tgt_data += ['A']
+        data = list(zip(src_data, tgt_data))
+        self._test_adjusted_parameters(data)
+
+    def _test_adjusted_parameters(self, data):
         for filter_name, filter_cls in inspect.getmembers(filters, inspect.isclass):
             if not issubclass(filter_cls, FilterABC) or filter_cls == FilterABC:
                 continue
@@ -188,7 +203,8 @@ def test_adjusted_parameters(self):
             adjuster = PercentileAdjuster(filter_name)
             if not adjuster.is_adjustable():
                 continue
-            params = adjuster.get_adjusted_parameters(data, excluded_percentile=0.1)
+            df = self._get_score_df(filter_cls, data)
+            params = adjuster.get_adjusted_parameters(df, excluded_percentile=0.1)
             logging.info("%s %s", filter_name, params)
             obj = filter_cls(**params)
             filtered = list(obj.filter(data))

From 4206df6e512d61f2686cceb5ef6005095aaeeb5b Mon Sep 17 00:00:00 2001
From: Sami Virpioja
Date: Wed, 30 Aug 2023 21:54:50 +0300
Subject: [PATCH 06/12] add option to select filters for opusfilter-autogen

---
 bin/opusfilter-autogen          | 28 +++++++++++++++++++---------
 docs/automatic_configuration.md | 18 +++++++++++++-----
 2 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/bin/opusfilter-autogen b/bin/opusfilter-autogen
index 136d655..1315905 100644
--- a/bin/opusfilter-autogen
+++ b/bin/opusfilter-autogen
@@ -2,6 +2,7 @@
 
 import argparse
 import logging
+import json
 import os
 import sys
@@ -33,31 +34,40 @@ parser.add_argument('--scripts', nargs='+', metavar='SCRIPT', help=(
 parser.add_argument('--method', choices=['defaults', 'percentiles', 'clustering'], default='clustering',
                     help='Method for selecting filter thresholds (default: %(default)s)')
 parser.add_argument('--sample-size', default=100000, type=int,
-                    help='Max number of sentence pairs used for clustering (default %(default)s)')
+                    help='Max number of sentence pairs used for data-based methods (default %(default)s)')
 parser.add_argument('--noisy-percentile', default=0.001, type=float,
                     help='Proportion of the data considered to be noisy; only for percentiles method (default %(default)s)')
 parser.add_argument('--work-dir', default='work',
                     help='Location of the source and target files for the generated configuration (default %(default)s)')
 parser.add_argument('--inter-dir', help='Save intermediate files in this directory (use a temporary directory if not given)')
 parser.add_argument('--plot', action='store_true',
-                    help='Show a scatter plot of the clustering and histograms of feature data distributions')
+                    help=('Show a scatter plot of the clustering and histograms of feature data distributions; '
+                          'only for the clustering method'))
 parser.add_argument('--list-defaults', 
action='store_true', help='List default filters of the method to the output and quit') -parser.add_argument('--overwrite', action='store_true', help='Overwrite existing config file and intermediate files') -parser.add_argument('-o', '--output', type=argparse.FileType('w'), default='-', metavar='CONFIGFILE', - help='Output configuration file (default %(default)s)') +parser.add_argument('--add-filter', nargs=2, action='append', default=[], metavar=('CLASS', 'JSON'), + help=('Instead of using default filters, add a filter of CLASS with JSON parameters object ' + '("{}" for default parameters). The class name may be followed by a dot and a unique ' + 'filter identifier in order to allow multiple filters of the same class. Example: ' + '--add-filter LanguageIDFilter.cld2 \'{"id_method": "cld2"}\'')) +parser.add_argument('--overwrite', action='store_true', + help='Overwrite existing config file and intermediate files') +parser.add_argument('-o', '--output', type=argparse.FileType('w'), + default='-', metavar='CONFIGFILE', help='Output configuration file (default %(default)s)') args = parser.parse_args() +filters = [(name, json.loads(jsonstr)) for name, jsonstr in args.add_filter] if args.add_filter else None + if args.method == 'clustering': filtergen = ClusterFilters( - files=args.files, langs=args.langs, scripts=args.scripts, sample_size=args.sample_size, - inter_dir=args.inter_dir, overwrite=args.overwrite) + files=args.files, langs=args.langs, scripts=args.scripts, filters=filters, + sample_size=args.sample_size, inter_dir=args.inter_dir, overwrite=args.overwrite) elif args.method == 'percentiles': filtergen = PercentileFilters( - files=args.files, langs=args.langs, scripts=args.scripts, + files=args.files, langs=args.langs, scripts=args.scripts, filters=filters, excluded_percentile=args.noisy_percentile, sample_size=args.sample_size, inter_dir=args.inter_dir, overwrite=args.overwrite) else: - filtergen = DefaultParameterFilters(langs=args.langs, scripts=args.scripts) + filtergen = DefaultParameterFilters(langs=args.langs, scripts=args.scripts, filters=filters) if args.list_defaults: yaml.dump(filtergen.DEFAULT_FILTERS, args.output) diff --git a/docs/automatic_configuration.md b/docs/automatic_configuration.md index 3be02db..9d65a1f 100644 --- a/docs/automatic_configuration.md +++ b/docs/automatic_configuration.md @@ -13,8 +13,8 @@ usage: opusfilter-autogen [-h] --files TEXTFILE [TEXTFILE ...] 
[--sample-size SAMPLE_SIZE] [--noisy-percentile NOISY_PERCENTILE] [--work-dir WORK_DIR] [--inter-dir INTER_DIR] - [--plot] [--list-defaults] [--overwrite] - [-o CONFIGFILE] + [--plot] [--list-defaults] [--add-filter CLASS JSON] + [--overwrite] [-o CONFIGFILE] Generate initial configuration based on parallel text data @@ -33,8 +33,8 @@ options: Method for selecting filter thresholds (default: clustering) --sample-size SAMPLE_SIZE - Max number of sentence pairs used for clustering - (default 100000) + Max number of sentence pairs used for data-based + methods (default 100000) --noisy-percentile NOISY_PERCENTILE Proportion of the data considered to be noisy; only for percentiles method (default 0.001) @@ -44,9 +44,17 @@ options: Save intermediate files in this directory (use a temporary directory if not given) --plot Show a scatter plot of the clustering and histograms - of feature data distributions + of feature data distributions; only for the clustering + method --list-defaults List default filters of the method to the output and quit + --add-filter CLASS JSON + Instead of using default filters, add a filter of + CLASS with JSON parameters object ("{}" for default + parameters). The class name may be followed by a dot + and a unique filter identifier in order to allow + multiple filters of the same class. Example: --add- + filter LanguageIDFilter.cld2 '{"id_method": "cld2"}' --overwrite Overwrite existing config file and intermediate files -o CONFIGFILE, --output CONFIGFILE Output configuration file (default -) From 0eef64f9178ac96e46799b8ab8bd8ac1f3324456 Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 30 Aug 2023 21:55:12 +0300 Subject: [PATCH 07/12] add legend to cluster plot --- opusfilter/autogen_cluster.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/opusfilter/autogen_cluster.py b/opusfilter/autogen_cluster.py index fe604e6..d25e0b5 100644 --- a/opusfilter/autogen_cluster.py +++ b/opusfilter/autogen_cluster.py @@ -131,8 +131,13 @@ def plot(self, plt): """Plot clustering and histograms""" plt.figure(figsize=(10, 10)) data_t = PCA(n_components=2).fit_transform(self.standard_data) - colors = ['orange' if lbl == self.noisy_label else 'blue' for lbl in self.labels] - plt.scatter(data_t[:, 0], data_t[:, 1], c=colors, marker=',', s=1) + for label_id in [self.noisy_label, self.clean_label]: + points = np.where(self.labels == label_id) + plt.scatter(data_t[points, 0], data_t[points, 1], + c='orange' if label_id == self.noisy_label else 'blue', + label='noisy' if label_id == self.noisy_label else 'clean', + marker=',', s=1) + plt.legend() plt.title('Clusters') noisy_samples = self.df.iloc[np.where(self.labels == self.noisy_label)] clean_samples = self.df.iloc[np.where(self.labels == self.clean_label)] From e8a3f4e5f2fefb2ac5a986f9ef06253bbaf27658 Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 6 Sep 2023 09:29:00 +0300 Subject: [PATCH 08/12] fix KMeans arguments to support older sklearn versions --- opusfilter/autogen_cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opusfilter/autogen_cluster.py b/opusfilter/autogen_cluster.py index d25e0b5..1e9f109 100644 --- a/opusfilter/autogen_cluster.py +++ b/opusfilter/autogen_cluster.py @@ -37,7 +37,7 @@ def __init__(self, score_file, n=2): self.standard_data = self.scaler.fit_transform(self.df) logger.info('Training KMeans with %s clusters', n) - self.kmeans = KMeans(n_clusters=n, random_state=0, n_init='auto').fit(self.standard_data) + self.kmeans = KMeans(n_clusters=n, 
random_state=0, init='k-means++', n_init=1).fit(self.standard_data) self.labels = self.kmeans.labels_ self.cluster_centers = self.scaler.inverse_transform(self.kmeans.cluster_centers_) self._noisy_label = self._get_noisy_label() From bbbf6b0158768d95137991cf375120b26c17e097 Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 6 Sep 2023 09:51:39 +0300 Subject: [PATCH 09/12] remove py3.6 support and tests --- .github/workflows/ci.yml | 4 ---- README.md | 2 +- docs/CHANGELOG.md | 1 + docs/CONTRIBUTING.md | 5 +++-- docs/installation.md | 2 +- requirements-py36.txt | 24 ------------------------ setup.py | 2 +- 7 files changed, 7 insertions(+), 33 deletions(-) delete mode 100644 requirements-py36.txt diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d332686..c0b8889 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,10 +20,6 @@ jobs: os: [ubuntu-latest] python-version: ["3.7", "3.8", "3.9", "3.10"] requirements-file: ["requirements.txt"] - include: - - os: ubuntu-20.04 - python-version: "3.6" - requirements-file: "requirements-py36.txt" runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v3 diff --git a/README.md b/README.md index 375d593..13c91e6 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Install from source: ### Troubleshooting -OpusFilter should generally work fine on Python 3.6 to 3.10. In the case of troubles, try installing the exact versions in `requirements.txt`: +OpusFilter should generally work fine on Python 3.7 to 3.10. In the case of troubles, try installing the exact versions in `requirements.txt`: * `pip install -r requirements.txt` diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 566245a..6a0d8fe 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - use xxhash instead of pyhash for hash functions - use opus-fast-mosestokenizer instead of fast-mosestokenizer - install eflomal from PyPI and use the new interface in WordAlignFilter +- remove Python 3.6 support and tests ### Fixed diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index 5e08bdf..8a7ab74 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -5,7 +5,7 @@ issues page. We are also happy to consider pull requests. There are a few rules for pull requests: * Make a pull request to the `develop` branch instead of `master`. -* The code should support at least Python versions from 3.6 to 3.8. +* The code should support at least Python versions from 3.7 to 3.10. * Please follow [PEP 8](https://www.python.org/dev/peps/pep-0008/). Exception: The maximum line length is 127 characters instead of 79. * Especially for new features, please include test cases for unit testing. @@ -19,7 +19,8 @@ work, if you have VariKN and eflomal set up as instructed - `pytest` skips the respective tests if not.) GitHub workflows defined in the project run automatically `flake8` -checks and unit testing with `pytest` using Python 3.6, 3.7, and 3.8. +checks and unit testing with `pytest` using Python 3.7, 3.8, 3.9, and +3.10. Especially for larger contributions, consider using a code analysis tool like [Pylint](https://github.com/PyCQA/pylint). Install it diff --git a/docs/installation.md b/docs/installation.md index 9213b11..f2f2033 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -12,7 +12,7 @@ Install from source: Note that all required libraries are not available to install via PyPI on Windows OS. 
On Linux and MacOS, it should work directly for Python -versions from 3.6 to 3.10. +versions from 3.7 to 3.10. ## Required libraries diff --git a/requirements-py36.txt b/requirements-py36.txt deleted file mode 100644 index 833f1e3..0000000 --- a/requirements-py36.txt +++ /dev/null @@ -1,24 +0,0 @@ -setuptools==59.6.0 -setuptools_scm==6.4.2 -opustools -jieba>=0.42 -beautifulsoup4>=4.8.2 -graphviz>=0.16 -langid==1.1.6 -matplotlib>=3.3.0 -opus-fast-mosestokenizer==0.0.8.3 -pandas>=1.0.0 -pycld2==0.41 -xxhash==3.2.0 -rapidfuzz>=2.0.5 -regex>=2019.11.1 -requests>=2.22.0 -ruamel.yaml>=0.15.0 -scikit-learn>=0.24.0 -sentence-splitter==1.4 -tqdm>=4.38.0 -fasttext==0.9.2 -mecab-python3==1.0.5 -unidic-lite==1.0.8 -subword-nmt==0.3.8 -Morfessor==2.0.6 diff --git a/setup.py b/setup.py index 92c25f6..f567c42 100644 --- a/setup.py +++ b/setup.py @@ -84,5 +84,5 @@ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ), - python_requires=">=3.6", + python_requires=">=3.7", ) From 34ce88139685c54f231e6b94fcedf0d262881e48 Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 6 Sep 2023 16:17:37 +0300 Subject: [PATCH 10/12] add --clusters option to opusfilter-autogen --- bin/opusfilter-autogen | 9 ++++++--- docs/automatic_configuration.md | 13 +++++++++---- docs/filters/custom_filters.md | 2 +- opusfilter/autogen.py | 5 +++-- opusfilter/autogen_cluster.py | 27 +++++++++++++++++---------- 5 files changed, 36 insertions(+), 20 deletions(-) diff --git a/bin/opusfilter-autogen b/bin/opusfilter-autogen index 1315905..a40cafd 100644 --- a/bin/opusfilter-autogen +++ b/bin/opusfilter-autogen @@ -33,10 +33,13 @@ parser.add_argument('--scripts', nargs='+', metavar='SCRIPT', help=( 'If omitted, CharacterScoreFilter will not be used.')) parser.add_argument('--method', choices=['defaults', 'percentiles', 'clustering'], default='clustering', help='Method for selecting filter thresholds (default: %(default)s)') -parser.add_argument('--sample-size', default=100000, type=int, +parser.add_argument('--sample-size', default=100000, type=int, metavar='INT', help='Max number of sentence pairs used for data-based methods (default %(default)s)') -parser.add_argument('--noisy-percentile', default=0.001, type=float, +parser.add_argument('--noisy-percentile', default=0.001, type=float, metavar='FLOAT', help='Proportion of the data considered to be noisy; only for percentiles method (default %(default)s)') +parser.add_argument('--clusters', '-k', default=2, type=int, metavar='INT', + help=('Number of clusters for the clustering method; try increasing if too much data is clustered ' + 'as noisy (default %(default)s)')) parser.add_argument('--work-dir', default='work', help='Location of the source and target files for the generated configuration (default %(default)s)') parser.add_argument('--inter-dir', help='Save intermediate files in this directory (use a temporary directory if not given)') @@ -60,7 +63,7 @@ filters = [(name, json.loads(jsonstr)) for name, jsonstr in args.add_filter] if if args.method == 'clustering': filtergen = ClusterFilters( files=args.files, langs=args.langs, scripts=args.scripts, filters=filters, - sample_size=args.sample_size, inter_dir=args.inter_dir, overwrite=args.overwrite) + sample_size=args.sample_size, k=args.clusters, inter_dir=args.inter_dir, overwrite=args.overwrite) elif args.method == 'percentiles': filtergen = PercentileFilters( files=args.files, langs=args.langs, scripts=args.scripts, filters=filters, diff --git a/docs/automatic_configuration.md 
b/docs/automatic_configuration.md index 9d65a1f..acad8f0 100644 --- a/docs/automatic_configuration.md +++ b/docs/automatic_configuration.md @@ -32,12 +32,15 @@ options: --method {defaults,percentiles,clustering} Method for selecting filter thresholds (default: clustering) - --sample-size SAMPLE_SIZE - Max number of sentence pairs used for data-based + --sample-size INT Max number of sentence pairs used for data-based methods (default 100000) - --noisy-percentile NOISY_PERCENTILE + --noisy-percentile FLOAT Proportion of the data considered to be noisy; only for percentiles method (default 0.001) + --clusters INT, -k INT + Number of clusters for the clustering method; try + increasing if too much data is clustered as noisy + (default 2) --work-dir WORK_DIR Location of the source and target files for the generated configuration (default work) --inter-dir INTER_DIR @@ -83,9 +86,11 @@ First, we remove duplicates and empty sentences from the input corpus. Next, we take a subset (`--sample-size`, 100k sentence pairs by default) of the corpus and produce scores for each sentence pair in the subset with the previously mentioned filters. These scores are -used as features for K-means clustering to classify the sentence pairs +used as features for K-means clustering to group the sentence pairs into clean and noisy pairs. The values of the noisy cluster center are used as the filter threshold parameters in the generated config file. +If it looks like too many samples are clustered as noisy, increasing +the number of clusters (`--clusters`) may help. Figures from the clustering and score histograms are plotted given the `--plot` option. If you want also to save the intermediate files, make diff --git a/docs/filters/custom_filters.md b/docs/filters/custom_filters.md index 58889ca..b23c341 100644 --- a/docs/filters/custom_filters.md +++ b/docs/filters/custom_filters.md @@ -19,7 +19,7 @@ The `accept` method takes a single output yielded by the `score` method, and returns whether the sentence pair should be accepted based on the score. 
-The `score_direction` should be one of the following contants defined +The `score_direction` should be one of the following constants defined in the `opusfilter` module depending on the output of the `score()` method: diff --git a/opusfilter/autogen.py b/opusfilter/autogen.py index 8b93b39..b718486 100644 --- a/opusfilter/autogen.py +++ b/opusfilter/autogen.py @@ -515,8 +515,9 @@ class ClusterFilters(DataBasedFiltersABC): ('LanguageIDFilter', {'id_method': 'cld2'}), 'TerminalPunctuationFilter'] - def __init__(self, files, max_length=150, **kwargs): + def __init__(self, files, k=2, max_length=150, **kwargs): super().__init__(files, max_length=150, **kwargs) + self.k = k self.label_file_path = os.path.join(self.inter_dir, 'labels.txt') self.scoredata = None @@ -525,7 +526,7 @@ def set_filter_thresholds(self): score_file = get_score_file( self.files, [{name: params} for name, params in self.filters_to_add], self.inter_dir, self.sample_size, overwrite=self.overwrite, max_length=self.max_length) - self.scoredata = ScoreClusters(score_file) + self.scoredata = ScoreClusters(score_file, k=self.k) self._set_parameters(self.scoredata.get_result_df()) if os.path.isfile(self.label_file_path) and not self.overwrite: logger.info('Label file "%s" exits, not overwriting', self.label_file_path) diff --git a/opusfilter/autogen_cluster.py b/opusfilter/autogen_cluster.py index 1e9f109..4e59c63 100644 --- a/opusfilter/autogen_cluster.py +++ b/opusfilter/autogen_cluster.py @@ -26,7 +26,8 @@ class ScoreClusters: """ - def __init__(self, score_file, n=2): + def __init__(self, score_file, k=2): + self.k = k self.df = load_dataframe(score_file) self.filters = {} for name in self.df.columns: @@ -36,8 +37,8 @@ def __init__(self, score_file, n=2): self.scaler = preprocessing.StandardScaler() self.standard_data = self.scaler.fit_transform(self.df) - logger.info('Training KMeans with %s clusters', n) - self.kmeans = KMeans(n_clusters=n, random_state=0, init='k-means++', n_init=1).fit(self.standard_data) + logger.info('Training KMeans with %s clusters', self.k) + self.kmeans = KMeans(n_clusters=self.k, random_state=0, init='k-means++', n_init=1).fit(self.standard_data) self.labels = self.kmeans.labels_ self.cluster_centers = self.scaler.inverse_transform(self.kmeans.cluster_centers_) self._noisy_label = self._get_noisy_label() @@ -48,9 +49,9 @@ def noisy_label(self): return self._noisy_label @property - def clean_label(self): - """Cluster label for clean data""" - return np.abs(self._noisy_label - 1) + def clean_labels(self): + """Cluster labels for clean data""" + return [idx for idx in range(self.k) if idx != self._noisy_label] def _get_flipped_centers(self): """Get centers with values flipped when low score indicates clean data""" @@ -130,17 +131,23 @@ def get_result_df(self): def plot(self, plt): """Plot clustering and histograms""" plt.figure(figsize=(10, 10)) - data_t = PCA(n_components=2).fit_transform(self.standard_data) - for label_id in [self.noisy_label, self.clean_label]: + pca = PCA(n_components=2) + data_t = pca.fit_transform(self.standard_data) + centroids = pca.transform(self.kmeans.cluster_centers_) + for label_id in range(self.k): points = np.where(self.labels == label_id) plt.scatter(data_t[points, 0], data_t[points, 1], c='orange' if label_id == self.noisy_label else 'blue', label='noisy' if label_id == self.noisy_label else 'clean', - marker=',', s=1) + marker=',', s=1, alpha=0.3) + for label_id in range(self.k): + plt.scatter(centroids[label_id, 0], centroids[label_id, 1], s=100, alpha=1, + 
marker='+', c='darkorange' if label_id == self.noisy_label else 'darkblue', + label='noisy centroid' if label_id == self.noisy_label else 'clean centroid') plt.legend() plt.title('Clusters') noisy_samples = self.df.iloc[np.where(self.labels == self.noisy_label)] - clean_samples = self.df.iloc[np.where(self.labels == self.clean_label)] + clean_samples = self.df.iloc[np.where(self.labels != self.noisy_label)] noisy_samples.hist(bins=100, figsize=(10, 10)) plt.suptitle('Histograms for noisy samples') clean_samples.hist(bins=100, figsize=(10, 10)) From 35edc8856ac0d23333d83bd241e8193723971b86 Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 20 Sep 2023 10:36:04 +0300 Subject: [PATCH 11/12] simplify direction flipping for clustering --- opusfilter/autogen_cluster.py | 58 ++++++++++++++++------------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/opusfilter/autogen_cluster.py b/opusfilter/autogen_cluster.py index 4e59c63..b30f4f3 100644 --- a/opusfilter/autogen_cluster.py +++ b/opusfilter/autogen_cluster.py @@ -5,8 +5,7 @@ import pandas as pd from sklearn.cluster import KMeans -from sklearn import preprocessing -from sklearn.decomposition import PCA +from sklearn import preprocessing, random_projection from sklearn.ensemble import RandomForestClassifier from sklearn.inspection import permutation_importance import numpy as np @@ -35,12 +34,13 @@ def __init__(self, score_file, k=2): filter_cls = getattr(filtermodule, first_part) self.filters[name] = filter_cls self.scaler = preprocessing.StandardScaler() - self.standard_data = self.scaler.fit_transform(self.df) + self.standard_data = self.scaler.fit_transform(self.df.mul(self.direction_vector)) logger.info('Training KMeans with %s clusters', self.k) - self.kmeans = KMeans(n_clusters=self.k, random_state=0, init='k-means++', n_init=1).fit(self.standard_data) + self.kmeans = KMeans(n_clusters=self.k, random_state=0, init='k-means++', n_init=1) + self.kmeans.fit(self.standard_data) self.labels = self.kmeans.labels_ - self.cluster_centers = self.scaler.inverse_transform(self.kmeans.cluster_centers_) + self.cluster_centers = self.scaler.inverse_transform(self.kmeans.cluster_centers_) * self.direction_vector self._noisy_label = self._get_noisy_label() @property @@ -53,34 +53,29 @@ def clean_labels(self): """Cluster labels for clean data""" return [idx for idx in range(self.k) if idx != self._noisy_label] - def _get_flipped_centers(self): - """Get centers with values flipped when low score indicates clean data""" - dir_fixed_centers = [] - for center in self.kmeans.cluster_centers_: - fixed_center = [] - for i, name in enumerate(self.df.columns): - value = center[i].copy() - if self.filters[name].score_direction == CLEAN_LOW: - value *= -1 - fixed_center.append(value) - dir_fixed_centers.append(fixed_center) - return dir_fixed_centers + @property + def direction_vector(self): + """Direction vector for the features (1 for CLEAN_LOW, -1 for CLEAN_HIGH)""" + return np.array([1 if self.filters[name].score_direction == CLEAN_LOW else -1 + for name in self.df.columns]) def _get_noisy_label(self): """Find label for the noisy cluster""" - means = np.mean(self._get_flipped_centers(), axis=1) + means = np.mean(self.kmeans.cluster_centers_, axis=1) # Output some cluster information nlabels = Counter(self.labels) for i, (center, inv_center, mean) in enumerate(zip(self.kmeans.cluster_centers_, self.cluster_centers, means)): - logger.info('Cluster #%s - number of samples: %s', i, nlabels[i]) + logger.info('Cluster #%s', i) + 
logger.info('* number of samples: %s', nlabels[i]) + logger.info('* centroid (score, scaled value, original value):') for j, val in enumerate(center): - logger.info('%s\t%s\t%s', self.df.columns[j], round(val, 2), round(inv_center[j], 2)) + logger.info(' %s\t%s\t%s', self.df.columns[j].ljust(25), round(val, 2), round(inv_center[j], 2)) logger.info('Average center\t%s', np.round(mean, 2)) # Cluster center of the noisiest cluster based on average features - noisy_mean = np.min(means) - noisy_label = np.argmin(means) + noisy_mean = np.max(means) + noisy_label = np.argmax(means) logger.info('Cluster center of the noisiest cluster (%s)', np.round(noisy_mean, 2)) logger.info('Noisy label: %s', noisy_label) noisy_labels = np.where(self.labels == noisy_label)[0] @@ -111,13 +106,14 @@ def get_rejects(self): feature_importances = permutation_importance(clf, self.standard_data, self.labels) importance_mean_mean = np.mean(feature_importances.importances_mean) rej_coef = 0.1 - logger.info('mean importance: %s', round(importance_mean_mean, 3)) - logger.info('rejection coefficient: %s', rej_coef) + logger.info('* mean importance: %s', round(importance_mean_mean, 3)) + logger.info('* rejection coefficient: %s', rej_coef) + logger.info('* decisions:') rejects = [] - for i, k in enumerate(self.df.columns): + for i, col in enumerate(self.df.columns): importance = feature_importances['importances_mean'][i] reject = importance < importance_mean_mean * rej_coef - logger.info('%s\t%s\t%s', k, round(importance, 3), 'reject' if reject else 'keep') + logger.info(' %s\t%s\t%s', col.ljust(25), round(importance, 3), 'reject' if reject else 'keep') rejects.append(reject) return rejects @@ -131,18 +127,18 @@ def get_result_df(self): def plot(self, plt): """Plot clustering and histograms""" plt.figure(figsize=(10, 10)) - pca = PCA(n_components=2) - data_t = pca.fit_transform(self.standard_data) - centroids = pca.transform(self.kmeans.cluster_centers_) + projection = random_projection.GaussianRandomProjection(n_components=2) + data_t = projection.fit_transform(self.standard_data) + centroids = projection.transform(self.kmeans.cluster_centers_) for label_id in range(self.k): points = np.where(self.labels == label_id) plt.scatter(data_t[points, 0], data_t[points, 1], c='orange' if label_id == self.noisy_label else 'blue', label='noisy' if label_id == self.noisy_label else 'clean', - marker=',', s=1, alpha=0.3) + marker=',', s=1, alpha=0.1) for label_id in range(self.k): plt.scatter(centroids[label_id, 0], centroids[label_id, 1], s=100, alpha=1, - marker='+', c='darkorange' if label_id == self.noisy_label else 'darkblue', + marker='+', c='brown' if label_id == self.noisy_label else 'darkblue', label='noisy centroid' if label_id == self.noisy_label else 'clean centroid') plt.legend() plt.title('Clusters') From 954d28d8d8cc43f4488dd3fd37df0f2a4aa1aa26 Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 20 Sep 2023 16:28:40 +0300 Subject: [PATCH 12/12] update documentation --- bin/opusfilter-autogen | 2 +- docs/automatic_configuration.md | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/bin/opusfilter-autogen b/bin/opusfilter-autogen index a40cafd..9124170 100644 --- a/bin/opusfilter-autogen +++ b/bin/opusfilter-autogen @@ -53,7 +53,7 @@ parser.add_argument('--add-filter', nargs=2, action='append', default=[], metava 'filter identifier in order to allow multiple filters of the same class. 
Example: '
                          '--add-filter LanguageIDFilter.cld2 \'{"id_method": "cld2"}\''))
 parser.add_argument('--overwrite', action='store_true',
-                    help='Overwrite existing config file and intermediate files')
+                    help='Overwrite existing intermediate files')
 parser.add_argument('-o', '--output', type=argparse.FileType('w'),
                     default='-', metavar='CONFIGFILE', help='Output configuration file (default %(default)s)')
 args = parser.parse_args()
diff --git a/docs/automatic_configuration.md b/docs/automatic_configuration.md
index acad8f0..a40229b 100644
--- a/docs/automatic_configuration.md
+++ b/docs/automatic_configuration.md
@@ -58,7 +58,7 @@ options:
                         and a unique filter identifier in order to allow
                         multiple filters of the same class. Example: --add-
                         filter LanguageIDFilter.cld2 '{"id_method": "cld2"}'
-  --overwrite           Overwrite existing config file and intermediate files
+  --overwrite           Overwrite existing intermediate files
   -o CONFIGFILE, --output CONFIGFILE
                         Output configuration file (default -)
 ```
@@ -68,11 +68,11 @@ option `default` uses the default parameters defined in the filter
 classes. The option `percentiles` assumes that a proportion of the
 data (set by `--noisy-percentile`) is noisy, and sets the thresholds
 for each filter independently based on the percentile. The
-`clustering` option is likely the most useful of the three, and
-described in more detail below. However, it is applicable to a more
-limited set of filters.
+`clustering` option may be the most useful of the three, and is
+described in more detail below. However, it is applicable to a more
+limited set of filters.
 
-## Unsupervised feature selection for filters
+## Unsupervised threshold selection for filters
 
 This implements the method introduced by {cite:t}`aulamo-etal-2023-unsupervised`.
 It takes a parallel corpus as an input and tries to separate the clean
@@ -95,3 +95,7 @@ the number of clusters (`--clusters`) may help.
 Figures from the clustering and score histograms are plotted given the
 `--plot` option. If you want also to save the intermediate files, make
 sure to use the `--inter-dir` argument.
+
+*Note: The method should be considered experimental, and it is not
+expected to give good results on all corpora. If you try it, please
+consider giving feedback on the project issues page.*
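
As an illustration of the direction-flipped k-means threshold selection that patch 11 above arrives at, here is a minimal, self-contained sketch. The two feature columns, their score directions and the synthetic data are invented for the example, and the snippet mirrors only the core steps; the actual implementation is the `ScoreClusters` class in `opusfilter/autogen_cluster.py`.

```python
import numpy as np
from sklearn import preprocessing
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)

# Synthetic scores for 1000 sentence pairs with two invented features:
# column 0 behaves like a language id confidence (high score = clean),
# column 1 like a length ratio (low score = clean).
scores = np.vstack([
    np.column_stack([rng.normal(0.9, 0.05, 900), rng.normal(1.1, 0.2, 900)]),  # mostly clean
    np.column_stack([rng.normal(0.4, 0.10, 100), rng.normal(3.0, 0.5, 100)]),  # mostly noisy
])

# Direction vector as in patch 11: 1 for CLEAN_LOW features, -1 for
# CLEAN_HIGH features, so that after the flip a larger value always
# means a noisier sample.
direction = np.array([-1, 1])

scaler = preprocessing.StandardScaler()
standard = scaler.fit_transform(scores * direction)
kmeans = KMeans(n_clusters=2, random_state=0, init='k-means++', n_init=1).fit(standard)

# With every feature flipped towards "larger = noisier", the noisy
# cluster is simply the centroid with the largest mean.
noisy_label = int(np.argmax(kmeans.cluster_centers_.mean(axis=1)))

# Undo the scaling and the flip to express the thresholds in the
# original score space, as the cluster_centers attribute does above.
thresholds = scaler.inverse_transform(kmeans.cluster_centers_)[noisy_label] * direction
print('noisy cluster:', noisy_label)
print('thresholds:', thresholds.round(3))
```

Flipping the feature signs once up front is what lets patch 11 replace the earlier per-column flipping loop with a single `argmax` over the centroid means, and it keeps the reported thresholds, the cluster logging and the plot consistent with each other.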