
Refactor autogen code #64

Merged 12 commits on Sep 20, 2023
4 changes: 0 additions & 4 deletions .github/workflows/ci.yml
@@ -20,10 +20,6 @@ jobs:
os: [ubuntu-latest]
python-version: ["3.7", "3.8", "3.9", "3.10"]
requirements-file: ["requirements.txt"]
include:
- os: ubuntu-20.04
python-version: "3.6"
requirements-file: "requirements-py36.txt"
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion .pylintrc
@@ -521,7 +521,7 @@ max-bool-expr=5
max-branches=12

# Maximum number of locals for function / method body.
max-locals=15
max-locals=16

# Maximum number of parents for a class (see R0901).
max-parents=7
2 changes: 1 addition & 1 deletion README.md
@@ -25,7 +25,7 @@ Install from source:

### Troubleshooting

OpusFilter should generally work fine on Python 3.6 to 3.10. In the case of troubles, try installing the exact versions in `requirements.txt`:
OpusFilter should generally work fine on Python 3.7 to 3.10. In the case of troubles, try installing the exact versions in `requirements.txt`:

* `pip install -r requirements.txt`

69 changes: 45 additions & 24 deletions bin/opusfilter-autogen
@@ -2,12 +2,13 @@

import argparse
import logging
import json
import os
import sys

import matplotlib.pyplot as plt

from opusfilter.autogen import ConfigurationGenerator, DefaultParameterFilters, PercentileFilters
from opusfilter.autogen_cluster import FilterThresholdFinder
from opusfilter.autogen import ClusterFilters, ConfigurationGenerator, DefaultParameterFilters, PercentileFilters
from opusfilter.util import yaml

try:
@@ -30,38 +31,58 @@ parser.add_argument('--langs', nargs='+', metavar='LANGCODE',
parser.add_argument('--scripts', nargs='+', metavar='SCRIPT', help=(
'Alphabetic scripts (e.g. Latin) corresponding to the input files. '
'If omitted, CharacterScoreFilter will not be used.'))
parser.add_argument('--filter-params', choices=['default', 'percentiles', 'unsupervised'], default='unsupervised',
help='Method for selecting filter parameters (default: %(default)s)')
parser.add_argument('--sample-size', default=100000, type=int,
help='Max number of sentence pairs used for clustering (default %(default)s)')
parser.add_argument('--noisy-percentile', default=0.001, type=float,
parser.add_argument('--method', choices=['defaults', 'percentiles', 'clustering'], default='clustering',
help='Method for selecting filter thresholds (default: %(default)s)')
parser.add_argument('--sample-size', default=100000, type=int, metavar='INT',
help='Max number of sentence pairs used for data-based methods (default %(default)s)')
parser.add_argument('--noisy-percentile', default=0.001, type=float, metavar='FLOAT',
help='Proportion of the data considered to be noisy; only for percentiles method (default %(default)s)')
parser.add_argument('--clusters', '-k', default=2, type=int, metavar='INT',
help=('Number of clusters for the clustering method; try increasing if too much data is clustered '
'as noisy (default %(default)s)'))
parser.add_argument('--work-dir', default='work',
help='Location of the source and target files for the generated configuration (default %(default)s)')
parser.add_argument('--inter-dir', help='Save intermediate files in this directory (use a temporary directory if not given)')
parser.add_argument('--plot', action='store_true',
help='Show a scatter plot of the clustering and histograms of feature data distributions')
parser.add_argument('--overwrite', action='store_true', help='Overwrite existing config file and intermediate files')
parser.add_argument('-o', '--output', type=argparse.FileType('w'), default='-', metavar='CONFIGFILE',
help='Output configuration file (default %(default)s)')
help=('Show a scatter plot of the clustering and histograms of feature data distributions; '
'only for the clustering method'))
parser.add_argument('--list-defaults', action='store_true', help='List default filters of the method to the output and quit')
parser.add_argument('--add-filter', nargs=2, action='append', default=[], metavar=('CLASS', 'JSON'),
help=('Instead of using default filters, add a filter of CLASS with JSON parameters object '
'("{}" for default parameters). The class name may be followed by a dot and a unique '
'filter identifier in order to allow multiple filters of the same class. Example: '
'--add-filter LanguageIDFilter.cld2 \'{"id_method": "cld2"}\''))
parser.add_argument('--overwrite', action='store_true',
help='Overwrite existing intermediate files')
parser.add_argument('-o', '--output', type=argparse.FileType('w'),
default='-', metavar='CONFIGFILE', help='Output configuration file (default %(default)s)')
args = parser.parse_args()

if args.filter_params == 'unsupervised':
filtergen = FilterThresholdFinder(
files=args.files, langs=args.langs, scripts=args.scripts, sample_size=args.sample_size,
filters = [(name, json.loads(jsonstr)) for name, jsonstr in args.add_filter] if args.add_filter else None

if args.method == 'clustering':
filtergen = ClusterFilters(
files=args.files, langs=args.langs, scripts=args.scripts, filters=filters,
sample_size=args.sample_size, k=args.clusters, inter_dir=args.inter_dir, overwrite=args.overwrite)
elif args.method == 'percentiles':
filtergen = PercentileFilters(
files=args.files, langs=args.langs, scripts=args.scripts, filters=filters,
excluded_percentile=args.noisy_percentile, sample_size=args.sample_size,
inter_dir=args.inter_dir, overwrite=args.overwrite)
filters, scoredata = filtergen.find_thresholds()
if args.plot:
scoredata.plot(plt)
plt.show()
elif args.filter_params == 'percentiles':
filtergen = PercentileFilters(files=args.files, excluded_percentile=args.noisy_percentile)
filters = filtergen.get_thresholds()
else:
filtergen = DefaultParameterFilters()
filters = filtergen.get_thresholds()
filtergen = DefaultParameterFilters(langs=args.langs, scripts=args.scripts, filters=filters)

if args.list_defaults:
yaml.dump(filtergen.DEFAULT_FILTERS, args.output)
sys.exit(0)

filters = filtergen.set_filter_thresholds()

if args.method == 'clustering' and args.plot:
filtergen.scoredata.plot(plt)
plt.show()

generator = ConfigurationGenerator(
files=[os.path.abspath(f) for f in args.files], langs=args.langs, workdir=args.work_dir)
generator.add_filter(filters)
generator.add_filter(filtergen.filters)
yaml.dump(generator.get_config(), args.output)
3 changes: 2 additions & 1 deletion docs/CHANGELOG.md
@@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

- `opusfilter-autogen` script for automatic filter config generation
- `score_direction` property for filters
- `score_direction`, `accept_threshold`, and `reject_threshold` properties for filters

### Changed

@@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- use xxhash instead of pyhash for hash functions
- use opus-fast-mosestokenizer instead of fast-mosestokenizer
- install eflomal from PyPI and use the new interface in WordAlignFilter
- remove Python 3.6 support and tests

### Fixed

5 changes: 3 additions & 2 deletions docs/CONTRIBUTING.md
@@ -5,7 +5,7 @@ issues page. We are also happy to consider pull requests. There are a
few rules for pull requests:

* Make a pull request to the `develop` branch instead of `master`.
* The code should support at least Python versions from 3.6 to 3.8.
* The code should support at least Python versions from 3.7 to 3.10.
* Please follow [PEP 8](https://www.python.org/dev/peps/pep-0008/). Exception: The maximum line length is 127 characters instead of 79.
* Especially for new features, please include test cases for unit testing.

@@ -19,7 +19,8 @@ work, if you have VariKN and eflomal set up as instructed - `pytest`
skips the respective tests if not.)

GitHub workflows defined in the project automatically run `flake8`
checks and unit testing with `pytest` using Python 3.6, 3.7, and 3.8.
checks and unit testing with `pytest` using Python 3.7, 3.8, 3.9, and
3.10.

Especially for larger contributions, consider using a code analysis
tool like [Pylint](https://github.com/PyCQA/pylint). Install it
82 changes: 54 additions & 28 deletions docs/automatic_configuration.md
@@ -7,10 +7,13 @@ step, with a few options for determining the filter parameters.
The usage description for the script is as follows:
```text
usage: opusfilter-autogen [-h] --files TEXTFILE [TEXTFILE ...]
[--langs LANGCODE [LANGCODE ...]] [--scripts SCRIPT [SCRIPT ...]]
[--filter-params {cluster,default,percentiles}]
[--sample-size SAMPLE_SIZE] [--noisy-percentile NOISY_PERCENTILE]
[--work-dir WORK_DIR] [--inter-dir INTER_DIR] [--plot]
[--langs LANGCODE [LANGCODE ...]]
[--scripts SCRIPT [SCRIPT ...]]
[--method {defaults,percentiles,clustering}]
[--sample-size SAMPLE_SIZE]
[--noisy-percentile NOISY_PERCENTILE]
[--work-dir WORK_DIR] [--inter-dir INTER_DIR]
[--plot] [--list-defaults] [--add-filter CLASS JSON]
[--overwrite] [-o CONFIGFILE]

Generate initial configuration based on parallel text data
@@ -20,39 +23,56 @@ options:
--files TEXTFILE [TEXTFILE ...]
parallel text input file(s)
--langs LANGCODE [LANGCODE ...]
Language codes corresponding to the input files. If omitted,
LanguageIDFilters will not be used.
Language codes corresponding to the input files. If
omitted, LanguageIDFilters will not be used.
--scripts SCRIPT [SCRIPT ...]
Alphabetic scripts (e.g. Latin) corresponding to the input files.
If omitted, CharacterScoreFilter will not be used.
--filter-params {default,percentiles,unsupervised}
Method for selecting filter parameters (default: unsupervised)
--sample-size SAMPLE_SIZE
Max number of sentence pairs used for clustering (default 100000)
--noisy-percentile NOISY_PERCENTILE
Proportion of the data considered to be noisy; only for percentiles
method (default 0.001)
--work-dir WORK_DIR Location of the source and target files for the generated
configuration (default work)
Alphabetic scripts (e.g. Latin) corresponding to the
input files. If omitted, CharacterScoreFilter will not
be used.
--method {defaults,percentiles,clustering}
Method for selecting filter thresholds (default:
clustering)
--sample-size INT Max number of sentence pairs used for data-based
methods (default 100000)
--noisy-percentile FLOAT
Proportion of the data considered to be noisy; only
for percentiles method (default 0.001)
--clusters INT, -k INT
Number of clusters for the clustering method; try
increasing if too much data is clustered as noisy
(default 2)
--work-dir WORK_DIR Location of the source and target files for the
generated configuration (default work)
--inter-dir INTER_DIR
Save intermediate files in this directory (use a temporary
directory if not given)
--plot Show a scatter plot of the clustering and histograms of feature
data distributions
--overwrite Overwrite existing config file and intermediate files
Save intermediate files in this directory (use a
temporary directory if not given)
--plot Show a scatter plot of the clustering and histograms
of feature data distributions; only for the clustering
method
--list-defaults List default filters of the method to the output and
quit
--add-filter CLASS JSON
Instead of using default filters, add a filter of
CLASS with JSON parameters object ("{}" for default
parameters). The class name may be followed by a dot
and a unique filter identifier in order to allow
multiple filters of the same class. Example: --add-
filter LanguageIDFilter.cld2 '{"id_method": "cld2"}'
--overwrite Overwrite existing intermediate files
-o CONFIGFILE, --output CONFIGFILE
Output configuration file (default -)
```
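As a concrete illustration, a typical invocation with the clustering method might look like the following (the corpus file names here are hypothetical):

```text
opusfilter-autogen --files corpus.en corpus.de --langs en de \
    --scripts Latin Latin --method clustering -k 2 \
    --inter-dir inter --plot -o config.yaml
```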

The `--filter-params` options sets how the filter parameters are set.
The option `default` uses the default parameters defined in the filter
The `--method` option sets how the filter parameters are set. The
option `defaults` uses the default parameters defined in the filter
classes. The option `percentiles` assumes that a proportion of the
data (set by `--noisy-percentile`) is noisy, and sets the thresholds
for each filter independently based on the percentile. The
`unsupervised` option is likely the most useful of the three, and
described in more detail below.
`clustering` option may be the most useful of the three, and is
described in more detail below. However, it applies to a more limited
set of filters.

## Unsupervised feature selection for filters
## Unsupervised threshold selection for filters

This implements the method introduced by {cite:t}`aulamo-etal-2023-unsupervised`.
It takes a parallel corpus as an input and tries to separate the clean
@@ -66,10 +86,16 @@ First, we remove duplicates and empty sentences from the input
corpus. Next, we take a subset (`--sample-size`, 100k sentence pairs
by default) of the corpus and produce scores for each sentence pair in
the subset with the previously mentioned filters. These scores are
used as features for K-means clustering to classify the sentence pairs
used as features for K-means clustering to group the sentence pairs
into clean and noisy pairs. The values of the noisy cluster center are
used as the filter threshold parameters in the generated config file.
If it looks like too many samples are clustered as noisy, increasing
the number of clusters (`--clusters`) may help.
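The clustering step can be sketched in a few lines of plain Python. This is a toy stand-in for illustration only, not OpusFilter's actual implementation; the two score features and their value ranges are invented:

```python
import random
import statistics

def kmeans(points, k=2, iters=50, seed=0):
    """Plain Lloyd's-algorithm k-means on tuples of floats."""
    rng = random.Random(seed)
    centers = rng.sample(points, k)
    for _ in range(iters):
        groups = [[] for _ in range(k)]
        for p in points:
            nearest = min(range(k),
                          key=lambda i: sum((a - b) ** 2 for a, b in zip(p, centers[i])))
            groups[nearest].append(p)
        # Move each center to the mean of its group (keep it if the group is empty).
        centers = [tuple(statistics.fmean(d) for d in zip(*g)) if g else centers[i]
                   for i, g in enumerate(groups)]
    return centers

rng = random.Random(42)
# Two hypothetical "lower is cleaner" scores per sentence pair,
# e.g. a length-ratio score and a non-alphabetic character ratio.
clean = [(rng.uniform(0.0, 0.3), rng.uniform(0.0, 0.2)) for _ in range(80)]
noisy = [(rng.uniform(0.7, 1.0), rng.uniform(0.6, 1.0)) for _ in range(20)]
centers = kmeans(clean + noisy, k=2)
# The cluster whose center has the higher mean score is taken as noisy;
# its coordinates serve as the filter thresholds.
thresholds = max(centers, key=sum)
```

With well-separated data like this, the noisy cluster center lands near the upper score range, so the derived thresholds cut off roughly the noisy fifth of the sample.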

Figures of the clustering and histograms of the score distributions
are plotted when the `--plot` option is given. If you also want to
save the intermediate files, use the `--inter-dir` argument.

*Note: The method should be considered experimental, and it is not
expected to give good results on all corpora. If you try it, please
consider giving feedback on the project issues page.*
41 changes: 39 additions & 2 deletions docs/filters/custom_filters.md
@@ -5,7 +5,10 @@ the filter configuration entries.

The custom filters should inherit the abstract base class `FilterABC`
from the `opusfilter` package. They should implement two abstract
methods: `score` and `accept`.
methods, `score` and `accept`, and one abstract property,
`score_direction`. Additionally, for filters with adjustable
thresholds, defining `accept_threshold` and `reject_threshold`
properties is recommended.

The `score` method is a generator that takes an iterator over tuples
of parallel sentences, and yields a score object for each pair. The
@@ -16,6 +19,17 @@ The `accept` method takes a single output yielded by the `score`
method, and returns whether the sentence pair should be accepted based
on the score.

The `score_direction` should be one of the following constants defined
in the `opusfilter` module depending on the output of the `score()`
method:

* `CLEAN_LOW`: scores below a threshold parameter indicate clean data
* `CLEAN_HIGH`: scores above a threshold parameter indicate clean data
* `CLEAN_BETWEEN`: scores between minimum and maximum thresholds
indicate clean data
* `CLEAN_TRUE`: score value `True` indicates clean data
* `CLEAN_FALSE`: score value `False` indicates clean data
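As an illustration, the way a score and a threshold combine under each direction can be sketched as follows. The constant values here are illustrative stand-ins, not the actual `opusfilter` constants, and whether boundaries are inclusive or exclusive varies by filter:

```python
# Stand-in values for the five score-direction constants.
CLEAN_LOW, CLEAN_HIGH, CLEAN_BETWEEN, CLEAN_TRUE, CLEAN_FALSE = range(5)

def accept(score, direction, threshold=None, min_threshold=None, max_threshold=None):
    """Decide whether a score indicates clean data for a given direction."""
    if direction == CLEAN_LOW:
        return score < threshold
    if direction == CLEAN_HIGH:
        return score > threshold
    if direction == CLEAN_BETWEEN:
        return min_threshold < score < max_threshold
    if direction == CLEAN_TRUE:
        return score is True
    return score is False  # CLEAN_FALSE
```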

If the filter requires any parameters (e.g. score thresholds for the
`accept` method), the class should also implement the `__init__`
method. Arbitrary keyword arguments should be accepted (with
@@ -24,6 +38,25 @@ should be called with the remaining keyword arguments. The keyword
argument `name` is reserved for giving names to the filters and
`workdir` for a location for non-temporary files.

For compatibility with the included [automatic configuration
generation tools](../automatic_configuration.md), the following should
also be considered:

* If there is a threshold value used by `accept`, the argument should
be named `threshold` (a single global threshold) or `thresholds`
(multiple thresholds, e.g. one per language). The `accept_threshold`
and `reject_threshold` properties should have threshold values that
force all inputs to be accepted or rejected, respectively. That is,
a sensible threshold value will always be between `accept_threshold`
and `reject_threshold`.
* If there are lower and upper thresholds used by `accept`
(i.e. `score_direction` is `CLEAN_BETWEEN`), the respective
arguments should be named `min_threshold` and `max_threshold` or
`min_length` and `max_length`. The `accept_threshold` and
`reject_threshold` properties should have tuples of two threshold
values (for lower and upper thresholds) that force all inputs to be
accepted or rejected, respectively.
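A minimal sketch of a filter following these conventions might look as below. No base class is used so the example runs without OpusFilter installed; a real filter would inherit `opusfilter.FilterABC` and use the `opusfilter.CLEAN_BETWEEN` constant:

```python
import math

CLEAN_BETWEEN = 'clean_between'  # stand-in for opusfilter.CLEAN_BETWEEN

class SentenceLengthFilter:
    """Accept pairs whose word counts fall between min_length and max_length."""

    score_direction = CLEAN_BETWEEN
    # Tuples of (lower, upper) thresholds that force every input to be
    # accepted or rejected, respectively.
    accept_threshold = (0, math.inf)
    reject_threshold = (1, 0)

    def __init__(self, min_length=1, max_length=100, **kwargs):
        self.min_length = min_length
        self.max_length = max_length

    def score(self, pairs):
        for pair in pairs:
            yield [len(segment.split()) for segment in pair]

    def accept(self, score):
        return all(self.min_length <= length <= self.max_length
                   for length in score)

f = SentenceLengthFilter(min_length=2, max_length=5)
scores = list(f.score([("a short pair", "ein kurzes Paar"), ("one", "word")]))
decisions = [f.accept(s) for s in scores]  # → [True, False]
```

Note how `accept_threshold` accepts any length and `reject_threshold` (lower bound above upper bound) rejects everything, so any sensible `(min_length, max_length)` setting lies between them.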

Based on the `score` and `accept` methods, the abstract class
`FilterABC` implements the following three generators that take
iterator over segment pairs as input:
@@ -44,6 +77,10 @@ import opusfilter

class UppercaseFilter(opusfilter.FilterABC):

score_direction = opusfilter.CLEAN_LOW
accept_threshold = 1 + 10**-6
reject_threshold = 0

def __init__(self, threshold=0.5, **kwargs):
self.threshold = threshold
super().__init__(**kwargs)
@@ -88,4 +125,4 @@ If a filter requires external resource files (e.g. for model
parameters), or stores non-temporary files itself, they should be
located in the path defined by the attribute `workdir`. The
implementation of the filter should join `workdir` with relative file
paths using `os.path.join()`.
paths using `os.path.join()`.
2 changes: 1 addition & 1 deletion docs/filters/sentence_embedding_filters.md
@@ -18,7 +18,7 @@ calculate the similarity of the embeddings. If `nn_model` is
provided, the similarities are normalized by the average similarity to
K nearest neighbors in a reference corpus; see
[train_nearest_neighbors](train_nearest_neighbors) for training a
model. With normalized scores, threshold around 1.0 is likely more
model. With normalized scores, a threshold closer to 1.0 is likely more
suitable than the default 0.5.

Especially with the nearest neighbor normalization, this filter can be
2 changes: 1 addition & 1 deletion docs/installation.md
@@ -12,7 +12,7 @@ Install from source:

Note that not all required libraries are available from PyPI on
Windows. On Linux and macOS, it should work directly for Python
versions from 3.6 to 3.10.
versions from 3.7 to 3.10.

## Required libraries
