From 9bf90f19b512de245a43fe10a3af3519e655fb4a Mon Sep 17 00:00:00 2001
From: SimonLangerQC
Date: Tue, 4 Jun 2024 18:35:28 +0200
Subject: [PATCH] adding configuration options to uniques functionality (#224)

Docs update

update doc string on null columns everywhere and fix typo

filternull docs clarification

replace assert by raise ValueError

shorten name to apply_output_formatting

add unit tests for new utils functions

set default to limit 100 elements

ensure all relevant tests run for impala and ensure they pass

disable extralong test for bigquery due to slow speed

capitalization test

handle parallel if table already created

Co-authored-by: Kevin Klein <7267523+kklein@users.noreply.github.com>
---
 .github/workflows/ci.yaml             |   8 +-
 run_integration_tests_postgres.sh     |  15 +
 src/datajudge/constraints/base.py     |  25 +-
 src/datajudge/constraints/miscs.py    |  11 +-
 src/datajudge/constraints/uniques.py  | 204 ++------
 src/datajudge/requirements.py         | 216 +++++----
 src/datajudge/utils.py                | 142 +++++-
 start_postgres.sh                     |   2 +-
 tests/integration/conftest.py         |  59 ++-
 tests/integration/test_integration.py | 669 +++++++++----------------
 tests/unit/test_utils.py              | 276 ++++++++++-
 11 files changed, 909 insertions(+), 718 deletions(-)
 create mode 100755 run_integration_tests_postgres.sh

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 4d22e901..d44df6fc 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -262,7 +262,7 @@ jobs:
         uses: ./.github/actions/pytest
         with:
           backend: bigquery
-          args: -n auto tests/integration
+          args: -n 16 -v tests/integration
 
   impala-column:
     if: ${{ contains(github.event.pull_request.labels.*.name, 'impala') || contains(github.event.pull_request.labels.*.name, 'ready') || github.ref == 'refs/heads/main' }}
@@ -275,7 +275,11 @@
     strategy:
       matrix:
         PYTHON_VERSION: [ '3.8' ]
        SA_VERSION: ["<2.0"]
-        PYTEST_ARG: ["tests/integration/test_column_capitalization.py", "tests/integration/test_data_source.py", 
"tests/integration/test_integration.py -k row", "tests/integration/test_integration.py -k uniques", "tests/integration/test_integration.py -k date", "tests/integration/test_integration.py -k varchar", "tests/integration/test_integration.py -k numeric"] + # PYTEST_ARG: ["tests/integration/test_column_capitalization.py", "tests/integration/test_data_source.py", "tests/integration/test_integration.py -k row", "tests/integration/test_integration.py -k uniques", "tests/integration/test_integration.py -k date", "tests/integration/test_integration.py -k varchar", "tests/integration/test_integration.py -k numeric"] + + # more comprehensive matching; note that tests which start with test_i and not test_integer are not matched and must be added here + + PYTEST_ARG: ["tests/integration/test_integration.py -k 'test_a or test_b or test_c or test_d or test_e or test_f or test_g or test_h or test_integer or test_j or test_k or test_l or test_m'", "tests/integration/test_integration.py -k 'test_n or test_o or test_p or test_q or test_r or test_s or test_t or test_u or test_v or test_w or test_x or test_y or test_z'"] steps: - name: Checkout branch diff --git a/run_integration_tests_postgres.sh b/run_integration_tests_postgres.sh new file mode 100755 index 00000000..ace0d316 --- /dev/null +++ b/run_integration_tests_postgres.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +docker stop $(docker ps -q --filter name=postgres_datajudge) + +./start_postgres.sh & +bash -c "while true; do printf '\nPress enter once postgres is ready: '; sleep 1; done" & + +read -p "Press enter to once postgres is ready: " +kill %% + +echo "STARTING PYTEST" +pytest tests/integration -vv --backend=postgres "$@" + +docker stop $(docker ps -q --filter name=postgres_datajudge) + diff --git a/src/datajudge/constraints/base.py b/src/datajudge/constraints/base.py index 0fbb3555..e9aad829 100644 --- a/src/datajudge/constraints/base.py +++ b/src/datajudge/constraints/base.py @@ -1,12 +1,13 @@ import abc from dataclasses import dataclass, field from functools import lru_cache -from typing import Any, Callable, List, Optional, Tuple, TypeVar +from typing import Any, Callable, Collection, List, Optional, Tuple, TypeVar, Union import sqlalchemy as sa from ..db_access import DataReference from ..formatter import Formatter +from ..utils import OutputProcessor, output_processor_limit DEFAULT_FORMATTER = Formatter() @@ -113,7 +114,15 @@ class Constraint(abc.ABC): """ def __init__( - self, ref: DataReference, *, ref2=None, ref_value: Any = None, name: str = None + self, + ref: DataReference, + *, + ref2=None, + ref_value: Any = None, + name: str = None, + output_processors: Optional[ + Union[OutputProcessor, List[OutputProcessor]] + ] = output_processor_limit, ): self._check_if_valid_between_or_within(ref2, ref_value) self.ref = ref @@ -125,6 +134,12 @@ def __init__( self.factual_queries: Optional[List[str]] = None self.target_queries: Optional[List[str]] = None + if (output_processors is not None) and ( + not isinstance(output_processors, list) + ): + output_processors = [output_processors] + self.output_processors = output_processors + def _check_if_valid_between_or_within( self, ref2: Optional[DataReference], ref_value: Optional[Any] ): @@ -241,6 +256,12 @@ def test(self, engine: sa.engine.Engine) -> TestResult: target_queries, ) + def apply_output_formatting(self, values: Collection) -> Collection: + if self.output_processors is not None: + for output_processor in self.output_processors: + values, _ = output_processor(values) + return values + def 
format_sample(sample, ref: DataReference) -> str: """Build a string from a database row indicating its column values.""" diff --git a/src/datajudge/constraints/miscs.py b/src/datajudge/constraints/miscs.py index db8c88c4..3ea35312 100644 --- a/src/datajudge/constraints/miscs.py +++ b/src/datajudge/constraints/miscs.py @@ -130,8 +130,15 @@ def test(self, engine: sa.engine.Engine) -> TestResult: return TestResult.success() assertion_text = ( - f"{self.ref} has violations of functional dependence, e.g.:\n" - + "\n".join([f"{tuple(violation)}" for violation in violations][:5]) + f"{self.ref} has violations of functional dependence (in total {len(violations)} rows):\n" + + "\n".join( + [ + f"{violation}" + for violation in self.apply_output_formatting( + [tuple(elem) for elem in violations] + ) + ] + ) ) return TestResult.failure(assertion_text) diff --git a/src/datajudge/constraints/uniques.py b/src/datajudge/constraints/uniques.py index 3f9b0130..e8bcb4ad 100644 --- a/src/datajudge/constraints/uniques.py +++ b/src/datajudge/constraints/uniques.py @@ -9,6 +9,7 @@ from .. import db_access from ..db_access import DataReference +from ..utils import OutputProcessor, filternull_element, output_processor_limit from .base import Constraint, OptionalSelections, T, TestResult, ToleranceGetter @@ -36,109 +37,6 @@ def _subset_violation_counts( return len(remainder) == 0, remainder -def util_output_postprocessing_sorter( - collection: Collection, counts: Optional[Collection] = None -): - """ - Sorts a collection of tuple elements in descending order of their counts, - and for ties, makes use of the ascending order of the elements themselves. - - If the first element is not instanceof tuple, - each element will be transparently packaged into a 1-tuple for processing; - this process is not visible to the caller. - - Handles None values as described in `sort_tuple_none_aware`. 
- """ - collection = list(collection) - if not isinstance(collection[0], tuple): - # package into a 1 tuple and pass into the method again - packaged_list = [(elem,) for elem in collection] - res_main, res_counts = util_output_postprocessing_sorter(packaged_list, counts) - return [elem[0] for elem in res_main], res_counts - - if counts is None: - return sort_tuple_none_aware(collection), counts - - assert len(collection) == len( - counts - ), "collection and counts must have the same length" - - if len(collection) <= 1: - return collection, counts # empty or 1 element lists are always sorted - - lst = sort_tuple_none_aware( - [(-count, *elem) for count, elem in zip(counts, collection)] - ) - return [elem[1:] for elem in lst], [-elem[0] for elem in lst] - - -def util_filternull_default_deprecated(values: List[T]) -> List[T]: - return list(filter(lambda value: value is not None, values)) - - -def util_filternull_never(values: List[T]) -> List[T]: - return values - - -def util_filternull_element_or_tuple_all(values: List[T]) -> List[T]: - return list( - filter( - lambda value: (value is not None) - and (not (isinstance(value, tuple) and all(x is None for x in value))), - values, - ) - ) - - -def util_filternull_element_or_tuple_any(values: List[T]) -> List[T]: - return list( - filter( - lambda value: (value is not None) - and (not (isinstance(value, tuple) and any(x is None for x in value))), - values, - ) - ) - - -def sort_tuple_none_aware(collection: Collection[Tuple], ascending=True): - """ - Sorts a collection of either tuples or single elements, - where `None` is considered the same as the default value of the respective column's type. - For ints/floats `int()`/`float()` yield `0`/`0.0`, for strings `str()` yields `''`. - The constructor is determined by calling type() on the first non-`None` element of the respective column. - - Checks and requires all elements in collection are tuples, and that all tuples have the same length. - """ - lst = list(collection) - - if len(lst) <= 1: - return lst # empty or 1 element lists are always sorted - - assert all( - isinstance(elem, tuple) and len(elem) == len(lst[0]) for elem in lst - ), "all elements must be tuples and have the same length" - - dtypes_each_tupleelement: List[Optional[type]] = [None] * len(lst[0]) - for dtypeidx in range(len(dtypes_each_tupleelement)): - for elem in lst: - if elem[dtypeidx] is not None: - dtypes_each_tupleelement[dtypeidx] = type(elem[dtypeidx]) - break - else: - # if all entries are None, just use a constant int() == 0 - dtypes_each_tupleelement[dtypeidx] = int - - def replace_None_with_default(elem): - return tuple( - (dtype() if subelem is None else subelem) - for dtype, subelem in zip(dtypes_each_tupleelement, elem) - ) - - return sorted( - lst, key=lambda elem: replace_None_with_default(elem), reverse=not ascending - ) - - class Uniques(Constraint, abc.ABC): """Uniques is an abstract class for comparisons between unique values of a column and a reference. @@ -146,17 +44,20 @@ class Uniques(Constraint, abc.ABC): are part of a reference set of expected values - either externally supplied through parameter `uniques` or obtained from another `DataSource`. - Null values in the column are ignored by default. To assert the non-existence of them use - the `NullAbsence` constraint via the `add_null_absence_constraint` helper method for - `WithinRequirement`. + Null values in the columns ``columns`` are ignored. 
To assert the non-existence of them use
+    the :meth:`~datajudge.requirements.WithinRequirement.add_null_absence_constraint` helper method
+    for ``WithinRequirement``.
+    By default, the null filtering does not trigger if multiple columns are fetched at once.
-    It can be configured in more detail by supplying a custom `filter_func` function.
-    Some exemplary implementations are available in this module as `util_filternull_default_deprecated`,
-    `util_filternull_never`, `util_filternull_element_or_tuple_all`, `util_filternull_element_or_tuple_any`.
-    For new deployments, using one of the above filters or a custom one is recommended.
-    Passing None as the argument is equivalent to `util_filternull_default_deprecated`, but triggers a warning.
-    The deprecated default may change in future versions.
-    To silence the warning, set `filter_func` explicitly.
+    It can be configured in more detail by supplying a custom ``filter_func`` function.
+    Some exemplary implementations are available as :func:`~datajudge.utils.filternull_element`,
+    :func:`~datajudge.utils.filternull_never`, :func:`~datajudge.utils.filternull_element_or_tuple_all`,
+    :func:`~datajudge.utils.filternull_element_or_tuple_any`.
+    Passing ``None`` as the argument is equivalent to :func:`~datajudge.utils.filternull_element`, but triggers a warning.
+    The current default of :func:`~datajudge.utils.filternull_element`
+    can cause (possibly unintended) changes in behavior when the user adds a second column
+    (filtering can then no longer trigger at all).
+    The default will be changed to :func:`~datajudge.utils.filternull_element_or_tuple_all` in future versions.
+    To silence the warning, set ``filter_func`` explicitly.

     There are two ways to do some post processing of the data obtained from the
@@ -188,18 +89,16 @@
     or if `max_relative_violations` is 0.

     By default, the assertion messages make use of sets,
-    thus, they may differ from run to run despite the exact same situation being present.
-    To enforce a reproducible output via (e.g.) sorting, set `output_postprocessing_sorter` to a callable
-    which takes in two collections, and returns modified (e.g. sorted) versions of them.
+    thus, they may differ from run to run despite the exact same situation being present,
+    and can have an arbitrary length.
+    To enforce a reproducible, limited output via (e.g.) sorting and slicing,
+    set `output_processors` to a callable or a list of callables.
+    By default, only the first 100 elements are displayed (:func:`~datajudge.utils.output_processor_limit`).
+
+    Each callable takes in two collections, and returns modified (e.g. sorted) versions of them.
     In most cases, the second argument is simply None,
     but for `UniquesSubset` it is the counts of each of the elements.
-    The suggested function is `util_output_postprocessing_sorter` from this file,
-    - see its documentation for details.
-
-    By default, the number of subset or superset remainders (excess or missing values)
-    for `UniquesSubset` and `UniquesSuperset` is sliced by [:5] (i.e. the first 5) in the assertion message.
-    This can be configured using `output_remainder_slicer`.
-    This argument does not have an effect for `UniquesEquality`.
+    The suggested functions are :func:`~datajudge.utils.output_processor_sort` and :func:`~datajudge.utils.output_processor_limit`
+    - see their respective docstrings for details.

     One use of this constraint is to test for consistency in columns with
     expected categorical values.
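+
+    A minimal usage sketch of how these options compose (``requirement`` stands for a
+    hypothetical ``WithinRequirement`` and the column and uniques values are made up;
+    the parameter names are the real ones introduced in this patch)::
+
+        import functools
+
+        from datajudge.utils import (
+            filternull_element_or_tuple_all,
+            output_processor_limit,
+            output_processor_sort,
+        )
+
+        requirement.add_uniques_subset_constraint(
+            ["color"],
+            uniques=["blue", "red"],
+            # Choosing a null filter explicitly also silences the deprecation warning.
+            filter_func=filternull_element_or_tuple_all,
+            # Sort counterexamples deterministically, then cap the output at 10 entries.
+            output_processors=[
+                output_processor_sort,
+                functools.partial(output_processor_limit, limit=10),
+            ],
+        )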
@@ -209,6 +108,9 @@ def __init__( self, ref: DataReference, name: str = None, + output_processors: Optional[ + Union[OutputProcessor, List[OutputProcessor]] + ] = output_processor_limit, *, ref2: DataReference = None, uniques: Collection = None, @@ -217,39 +119,26 @@ def __init__( reduce_func: Callable[[Collection], Collection] = None, max_relative_violations=0, compare_distinct=False, - output_postprocessing_sorter: Callable[ - [Collection, Optional[Collection]], Collection - ] = None, - output_remainder_slicer: slice = slice(5), ): ref_value: Optional[Tuple[Collection, List]] ref_value = (uniques, []) if uniques else None - super().__init__(ref, ref2=ref2, ref_value=ref_value, name=name) + super().__init__( + ref, + ref2=ref2, + ref_value=ref_value, + name=name, + output_processors=output_processors, + ) if filter_func is None: - warnings.warn( - "Using deprecated default null filter function. " - "Set filter_func explicitly to disable this warning." - ) - filter_func = util_filternull_default_deprecated + warnings.warn("Using deprecated default null filter function.") + filter_func = filternull_element self.filter_func = filter_func self.local_func = map_func self.global_func = reduce_func self.max_relative_violations = max_relative_violations self.compare_distinct = compare_distinct - self.output_postprocessing_sorter = output_postprocessing_sorter - self.output_remainder_slicer = output_remainder_slicer - - def apply_output_formatting_no_counts( - self, values: Collection[T], apply_remainder_limit=False - ) -> Collection[T]: - if self.output_postprocessing_sorter is not None: - values, _ = self.output_postprocessing_sorter(values) # type: ignore[call-arg] - if apply_remainder_limit: - values = list(values) - values = values[self.output_remainder_slicer] - return values def retrieve( self, engine: sa.engine.Engine, ref: DataReference @@ -293,22 +182,22 @@ def compare( if not is_subset and not is_superset: assertion_text = ( f"{self.ref} doesn't have the element(s) " - f"'{self.apply_output_formatting_no_counts(lacking_values)}' and has the excess element(s) " - f"'{self.apply_output_formatting_no_counts(excess_values)}' when compared with the reference values. " + f"'{self.apply_output_formatting(lacking_values)}' and has the excess element(s) " + f"'{self.apply_output_formatting(excess_values)}' when compared with the reference values. " f"{self.condition_string}" ) return False, assertion_text if not is_subset: assertion_text = ( f"{self.ref} has the excess element(s) " - f"'{self.apply_output_formatting_no_counts(excess_values)}' when compared with the reference values. " + f"'{self.apply_output_formatting(excess_values)}' when compared with the reference values. " f"{self.condition_string}" ) return False, assertion_text if not is_superset: assertion_text = ( f"{self.ref} doesn't have the element(s) " - f"'{self.apply_output_formatting_no_counts(lacking_values)}' when compared with the reference values. " + f"'{self.apply_output_formatting(lacking_values)}' when compared with the reference values. 
" f"{self.condition_string}" ) return False, assertion_text @@ -342,17 +231,16 @@ def compare( output_elemes, output_counts = list(remainder.keys()), list( remainder.values() ) - if self.output_postprocessing_sorter is not None: - output_elemes, output_counts = self.output_postprocessing_sorter( - output_elemes, output_counts - ) - output_elemes = output_elemes[self.output_remainder_slicer] - output_counts = output_counts[self.output_remainder_slicer] + if self.output_processors is not None: + for output_processor in self.output_processors: + output_elemes, output_counts = output_processor( + output_elemes, output_counts + ) assertion_text = ( f"{self.ref} has a fraction of {relative_violations} > " f"{self.max_relative_violations} {'DISTINCT ' if self.compare_distinct else ''}values ({n_violations} / {n_rows}) not being an element of " - f"'{self.apply_output_formatting_no_counts(set(target_values))}'. It has e.g. ({self.output_remainder_slicer}) excess elements " + f"'{self.apply_output_formatting(set(target_values))}'. It has excess elements " f"'{output_elemes}' " f"with counts {output_counts}." f"{self.condition_string}" @@ -387,8 +275,8 @@ def compare( assertion_text = ( f"{self.ref} has a fraction of " f"{relative_violations} > {self.max_relative_violations} ({n_violations} / {n_rows}) " - f"lacking unique values of '{self.apply_output_formatting_no_counts(set(target_values))}'. E.g. ({self.output_remainder_slicer}) it " - f"doesn't have the unique value(s) '{self.apply_output_formatting_no_counts(list(remainder), apply_remainder_limit=True)}'." + f"lacking unique values of '{self.apply_output_formatting(set(target_values))}'. It " + f"doesn't have the unique value(s) '{self.apply_output_formatting(list(remainder))}'." f"{self.condition_string}" ) return False, assertion_text diff --git a/src/datajudge/requirements.py b/src/datajudge/requirements.py index 96cb5dae..210d5151 100644 --- a/src/datajudge/requirements.py +++ b/src/datajudge/requirements.py @@ -34,6 +34,7 @@ TableDataSource, get_date_growth_rate, ) +from .utils import OutputProcessor, output_processor_limit T = TypeVar("T") @@ -262,9 +263,9 @@ def add_uniques_equality_constraint( filter_func: Callable[[List[T]], List[T]] = None, map_func: Callable[[T], T] = None, reduce_func: Callable[[Collection], Collection] = None, - output_postprocessing_sorter: Callable[ - [Collection, Optional[Collection]], Collection - ] = None, + output_processors: Optional[ + Union[OutputProcessor, List[OutputProcessor]] + ] = output_processor_limit, condition: Condition = None, name: str = None, ): @@ -274,20 +275,23 @@ def add_uniques_equality_constraint( of a ``DataSource`` are strictly the ones of a reference set of expected values, specified via the ``uniques`` parameter. - Null values in the column are ignored. To assert the non-existence of them use - the ``NullAbsence`` constraint via the ``add_null_absence_constraint`` helper method + Null values in the columns ``columns`` are ignored. To assert the non-existence of them use + the :meth:`~datajudge.requirements.WithinRequirement.add_null_absence_constraint`` helper method for ``WithinRequirement``. By default, the null filtering does not trigger if multiple columns are fetched at once. - It can be configured in more detail by supplying a custom `filter_func` function. - Some exemplary implementations are available in this module as `util_filternull_default_deprecated`, - `util_filternull_never`, `util_filternull_element_or_tuple_all`, `util_filternull_element_or_tuple_any`. 
-        For new deployments, using one of the above filters or a custom one is recommended.
-        Passing None as the argument is equivalent to `util_filternull_default_deprecated`, but triggers a warning.
-        The deprecated default may change in future versions.
-        To silence the warning, set `filter_func` explicitly.
+        It can be configured in more detail by supplying a custom ``filter_func`` function.
+        Some exemplary implementations are available as :func:`~datajudge.utils.filternull_element`,
+        :func:`~datajudge.utils.filternull_never`, :func:`~datajudge.utils.filternull_element_or_tuple_all`,
+        :func:`~datajudge.utils.filternull_element_or_tuple_any`.
+        Passing ``None`` as the argument is equivalent to :func:`~datajudge.utils.filternull_element`, but triggers a warning.
+        The current default of :func:`~datajudge.utils.filternull_element`
+        can cause (possibly unintended) changes in behavior when the user adds a second column
+        (filtering can then no longer trigger at all).
+        The default will be changed to :func:`~datajudge.utils.filternull_element_or_tuple_all` in future versions.
+        To silence the warning, set ``filter_func`` explicitly.

         See the ``Uniques`` class for further parameter details on ``map_func`` and
-        ``reduce_func``, and ``output_postprocessing_sorter``.
+        ``reduce_func``, and ``output_processors``.
         """

         ref = DataReference(self.data_source, columns, condition)
@@ -298,7 +302,7 @@ def add_uniques_equality_constraint(
                 filter_func=filter_func,
                 map_func=map_func,
                 reduce_func=reduce_func,
-                output_postprocessing_sorter=output_postprocessing_sorter,
+                output_processors=output_processors,
                 name=name,
             )
         )
@@ -311,29 +315,31 @@ def add_uniques_superset_constraint(
         filter_func: Callable[[List[T]], List[T]] = None,
         map_func: Callable[[T], T] = None,
         reduce_func: Callable[[Collection], Collection] = None,
-        output_postprocessing_sorter: Callable[
-            [Collection, Optional[Collection]], Collection
-        ] = None,
-        output_remainder_slicer=slice(5),
         condition: Condition = None,
         name: str = None,
+        output_processors: Optional[
+            Union[OutputProcessor, List[OutputProcessor]]
+        ] = output_processor_limit,
     ):
         """Check if unique values of columns are contained in the reference data.

         The ``UniquesSuperset`` constraint asserts that a reference set of expected values,
         specified via ``uniques``, is contained in given columns of a ``DataSource``.

-        Null values in the column are ignored. To assert the non-existence of them use
-        the ``NullAbsence`` constraint via the ``add_null_absence_constraint`` helper method
+        Null values in the columns ``columns`` are ignored. To assert the non-existence of them use
+        the :meth:`~datajudge.requirements.WithinRequirement.add_null_absence_constraint` helper method
         for ``WithinRequirement``.
         By default, the null filtering does not trigger if multiple columns are fetched at once.
-        It can be configured in more detail by supplying a custom `filter_func` function.
-        Some exemplary implementations are available in this module as `util_filternull_default_deprecated`,
-        `util_filternull_never`, `util_filternull_element_or_tuple_all`, `util_filternull_element_or_tuple_any`.
-        For new deployments, using one of the above filters or a custom one is recommended.
-        Passing None as the argument is equivalent to `util_filternull_default_deprecated`, but triggers a warning.
-        The deprecated default may change in future versions.
-        To silence the warning, set `filter_func` explicitly.
+        It can be configured in more detail by supplying a custom ``filter_func`` function.
+        Some exemplary implementations are available as :func:`~datajudge.utils.filternull_element`,
+        :func:`~datajudge.utils.filternull_never`, :func:`~datajudge.utils.filternull_element_or_tuple_all`,
+        :func:`~datajudge.utils.filternull_element_or_tuple_any`.
+        Passing ``None`` as the argument is equivalent to :func:`~datajudge.utils.filternull_element`, but triggers a warning.
+        The current default of :func:`~datajudge.utils.filternull_element`
+        can cause (possibly unintended) changes in behavior when the user adds a second column
+        (filtering can then no longer trigger at all).
+        The default will be changed to :func:`~datajudge.utils.filternull_element_or_tuple_all` in future versions.
+        To silence the warning, set ``filter_func`` explicitly.

         ``max_relative_violations`` indicates what fraction of unique values of the given
         ``DataSource`` are not represented in the reference set of unique values. Please
         note that ``UniquesSubset`` and ``UniquesSuperset`` are not symmetrical in this regard.

         One use of this constraint is to test for consistency in columns with expected
         categorical values.

         See ``Uniques`` for further details on ``map_func``, ``reduce_func``,
-        ``output_postprocessing_sorter``, and ``output_remainder_slicer``.
+        and ``output_processors``.
         """

         ref = DataReference(self.data_source, columns, condition)
@@ -355,8 +361,7 @@ def add_uniques_superset_constraint(
                 filter_func=filter_func,
                 map_func=map_func,
                 reduce_func=reduce_func,
-                output_postprocessing_sorter=output_postprocessing_sorter,
-                output_remainder_slicer=output_remainder_slicer,
+                output_processors=output_processors,
                 name=name,
             )
         )
@@ -370,12 +375,11 @@ def add_uniques_subset_constraint(
         compare_distinct: bool = False,
         map_func: Callable[[T], T] = None,
         reduce_func: Callable[[Collection], Collection] = None,
-        output_postprocessing_sorter: Callable[
-            [Collection, Optional[Collection]], Collection
-        ] = None,
-        output_remainder_slicer=slice(5),
         condition: Condition = None,
         name: str = None,
+        output_processors: Optional[
+            Union[OutputProcessor, List[OutputProcessor]]
+        ] = output_processor_limit,
     ):
         """Check if the data's unique values are contained in a given set of values.

         The ``UniquesSubset`` constraint asserts that the values contained in given columns of
         a ``DataSource`` are part of a reference set of expected values, specified via
         ``uniques``.

-        Null values in the column are ignored. To assert the non-existence of them use
-        the ``NullAbsence`` constraint via the ``add_null_absence_constraint`` helper method
+        Null values in the columns ``columns`` are ignored. To assert the non-existence of them use
+        the :meth:`~datajudge.requirements.WithinRequirement.add_null_absence_constraint` helper method
         for ``WithinRequirement``.
         By default, the null filtering does not trigger if multiple columns are fetched at once.
-        It can be configured in more detail by supplying a custom `filter_func` function.
+        It can be configured in more detail by supplying a custom ``filter_func`` function.
+        Some exemplary implementations are available as :func:`~datajudge.utils.filternull_element`,
+        :func:`~datajudge.utils.filternull_never`, :func:`~datajudge.utils.filternull_element_or_tuple_all`,
+        :func:`~datajudge.utils.filternull_element_or_tuple_any`.
+        Passing ``None`` as the argument is equivalent to :func:`~datajudge.utils.filternull_element`, but triggers a warning.
+        The current default of :func:`~datajudge.utils.filternull_element`
+        can cause (possibly unintended) changes in behavior when the user adds a second column
+        (filtering can then no longer trigger at all).
+        The default will be changed to :func:`~datajudge.utils.filternull_element_or_tuple_all` in future versions.
+        To silence the warning, set ``filter_func`` explicitly.

         ``max_relative_violations`` indicates what fraction of rows of the given table
         may have values not included in the reference set of unique values. Please note
         that ``UniquesSubset`` and ``UniquesSuperset`` are not symmetrical in this regard.
         By default, the number of occurrences affects the computed fraction of violations.
         To disable this weighting, set `compare_distinct=True`.
         This argument does not have an effect on the test results for other `Uniques` constraints,
         or if `max_relative_violations` is 0.

         See ``Uniques`` for further details on ``map_func``, ``reduce_func``,
-        ``output_postprocessing_sorter``, and ``output_remainder_slicer``.
+        and ``output_processors``.
         """

         ref = DataReference(self.data_source, columns, condition)
@@ -419,8 +426,7 @@ def add_uniques_subset_constraint(
                 compare_distinct=compare_distinct,
                 map_func=map_func,
                 reduce_func=reduce_func,
-                output_postprocessing_sorter=output_postprocessing_sorter,
-                output_remainder_slicer=output_remainder_slicer,
+                output_processors=output_processors,
                 name=name,
             )
         )
@@ -879,6 +885,9 @@ def add_functional_dependency_constraint(
         value_columns: List[str],
         condition: Condition = None,
         name: str = None,
+        output_processors: Optional[
+            Union[OutputProcessor, List[OutputProcessor]]
+        ] = output_processor_limit,
     ):
         """
         Expresses a functional dependency, a constraint where the `value_columns` are uniquely determined by the `key_columns`.
@@ -888,6 +897,9 @@
         and all other columns are included in `value_columns`.
         This constraint allows for a more general definition of functional dependencies,
         where the `key_columns` are not necessarily a primary key.
+        An additional configuration option (for details, see the analogous parameter for ``Uniques`` constraints)
+        on how the output is sorted and how many counterexamples are shown is available as ``output_processors``.
+
         For more information on functional dependencies, see https://en.wikipedia.org/wiki/Functional_dependency.
         """
         relevant_columns = key_columns + value_columns
@@ -896,6 +908,7 @@
             miscs_constraints.FunctionalDependency(
                 ref,
                 key_columns=key_columns,
+                output_processors=output_processors,
                 name=name,
             )
         )
@@ -1471,9 +1484,9 @@ def add_uniques_equality_constraint(
         filter_func: Callable[[List[T]], List[T]] = None,
         map_func: Callable[[T], T] = None,
         reduce_func: Callable[[Collection], Collection] = None,
-        output_postprocessing_sorter: Callable[
-            [Collection, Optional[Collection]], Collection
-        ] = None,
+        output_processors: Optional[
+            Union[OutputProcessor, List[OutputProcessor]]
+        ] = output_processor_limit,
         condition1: Condition = None,
         condition2: Condition = None,
         name: str = None,
     ):
         """Check if the data's unique values in given columns are equal.

         The ``UniquesEquality`` constraint asserts that the values contained in given columns
         of a ``DataSource`` are strictly the ones of another ``DataSource``'s columns.

-        Null values in the column are ignored. To assert the non-existence of them use
-        the ``NullAbsence`` constraint via the ``add_null_absence_constraint`` helper method
+        Null values in the given columns are ignored. To assert the non-existence of them use
+        the :meth:`~datajudge.requirements.WithinRequirement.add_null_absence_constraint` helper method
         for ``WithinRequirement``.
         By default, the null filtering does not trigger if multiple columns are fetched at once.
-        It can be configured in more detail by supplying a custom `filter_func` function.
-        Some exemplary implementations are available in this module as `util_filternull_default_deprecated`,
-        `util_filternull_never`, `util_filternull_element_or_tuple_all`, `util_filternull_element_or_tuple_any`.
-        For new deployments, using one of the above filters or a custom one is recommended.
-        Passing None as the argument is equivalent to `util_filternull_default_deprecated`, but triggers a warning.
-        The deprecated default may change in future versions.
-        To silence the warning, set `filter_func` explicitly.
+        It can be configured in more detail by supplying a custom ``filter_func`` function.
+        Some exemplary implementations are available as :func:`~datajudge.utils.filternull_element`,
+        :func:`~datajudge.utils.filternull_never`, :func:`~datajudge.utils.filternull_element_or_tuple_all`,
+        :func:`~datajudge.utils.filternull_element_or_tuple_any`.
+        Passing ``None`` as the argument is equivalent to :func:`~datajudge.utils.filternull_element`, but triggers a warning.
+        The current default of :func:`~datajudge.utils.filternull_element`
+        can cause (possibly unintended) changes in behavior when the user adds a second column
+        (filtering can then no longer trigger at all).
+        The default will be changed to :func:`~datajudge.utils.filternull_element_or_tuple_all` in future versions.
+        To silence the warning, set ``filter_func`` explicitly.

-        See the ``Uniques`` class for further parameter details on ``map_func``,
-        ``reduce_func``, and ``output_postprocessing_sorter``.
+        See :class:`~datajudge.constraints.uniques.Uniques` for further parameter details on ``map_func``,
+        ``reduce_func``, and ``output_processors``.
         """

         ref = DataReference(self.data_source, columns1, condition1)
@@ -1509,7 +1525,7 @@ def add_uniques_equality_constraint(
                 filter_func=filter_func,
                 map_func=map_func,
                 reduce_func=reduce_func,
-                output_postprocessing_sorter=output_postprocessing_sorter,
+                output_processors=output_processors,
                 name=name,
             )
         )
@@ -1522,13 +1538,12 @@ def add_uniques_superset_constraint(
         filter_func: Callable[[List[T]], List[T]] = None,
         map_func: Callable[[T], T] = None,
         reduce_func: Callable[[Collection], Collection] = None,
-        output_postprocessing_sorter: Callable[
-            [Collection, Optional[Collection]], Collection
-        ] = None,
-        output_remainder_slicer=slice(5),
         condition1: Condition = None,
         condition2: Condition = None,
         name: str = None,
+        output_processors: Optional[
+            Union[OutputProcessor, List[OutputProcessor]]
+        ] = output_processor_limit,
     ):
         """Check if unique values of columns are contained in the reference data.

         The ``UniquesSuperset`` constraint asserts that a reference set of expected values,
         derived from the unique values in given columns of the reference ``DataSource``,
         is contained in given columns of a ``DataSource``.

-        Null values in the column are ignored. To assert the non-existence of them use
-        the ``NullAbsence`` constraint via the ``add_null_absence_constraint`` helper method
+        Null values in the given columns are ignored. To assert the non-existence of them use
+        the :meth:`~datajudge.requirements.WithinRequirement.add_null_absence_constraint` helper method
         for ``WithinRequirement``.
         By default, the null filtering does not trigger if multiple columns are fetched at once.
-        It can be configured in more detail by supplying a custom `filter_func` function.
-        Some exemplary implementations are available in this module as `util_filternull_default_deprecated`,
-        `util_filternull_never`, `util_filternull_element_or_tuple_all`, `util_filternull_element_or_tuple_any`.
-        For new deployments, using one of the above filters or a custom one is recommended.
-        Passing None as the argument is equivalent to `util_filternull_default_deprecated`, but triggers a warning.
-        The deprecated default may change in future versions.
-        To silence the warning, set `filter_func` explicitly.
+        It can be configured in more detail by supplying a custom ``filter_func`` function.
+        Some exemplary implementations are available as :func:`~datajudge.utils.filternull_element`,
+        :func:`~datajudge.utils.filternull_never`, :func:`~datajudge.utils.filternull_element_or_tuple_all`,
+        :func:`~datajudge.utils.filternull_element_or_tuple_any`.
+        Passing ``None`` as the argument is equivalent to :func:`~datajudge.utils.filternull_element`, but triggers a warning.
+        The current default of :func:`~datajudge.utils.filternull_element`
+        can cause (possibly unintended) changes in behavior when the user adds a second column
+        (filtering can then no longer trigger at all).
+        The default will be changed to :func:`~datajudge.utils.filternull_element_or_tuple_all` in future versions.
+        To silence the warning, set ``filter_func`` explicitly.

         ``max_relative_violations`` indicates what fraction of unique values of the given
         ``DataSource`` are not represented in the reference set of unique values. Please
         note that ``UniquesSubset`` and ``UniquesSuperset`` are not symmetrical in this regard.

         One use of this constraint is to test for consistency in columns with expected
         categorical values.

-        See ``Uniques`` for further details on ``map_func``, ``reduce_func``,
-        ``output_postprocessing_sorter``, and ``output_remainder_slicer``.
+        See :class:`~datajudge.constraints.uniques.Uniques` for further details on ``map_func``, ``reduce_func``,
+        and ``output_processors``.
         """

         ref = DataReference(self.data_source, columns1, condition1)
@@ -1569,8 +1587,7 @@ def add_uniques_superset_constraint(
                 filter_func=filter_func,
                 map_func=map_func,
                 reduce_func=reduce_func,
-                output_postprocessing_sorter=output_postprocessing_sorter,
-                output_remainder_slicer=output_remainder_slicer,
+                output_processors=output_processors,
                 name=name,
             )
         )
@@ -1584,13 +1601,12 @@ def add_uniques_subset_constraint(
         compare_distinct: bool = False,
         map_func: Callable[[T], T] = None,
         reduce_func: Callable[[Collection], Collection] = None,
-        output_postprocessing_sorter: Callable[
-            [Collection, Optional[Collection]], Collection
-        ] = None,
-        output_remainder_slicer=slice(5),
         condition1: Condition = None,
         condition2: Condition = None,
         name: str = None,
+        output_processors: Optional[
+            Union[OutputProcessor, List[OutputProcessor]]
+        ] = output_processor_limit,
     ):
         """Check if the given columns' unique values are contained in the reference data.

         The ``UniquesSubset`` constraint asserts that the values contained in given columns of
         a ``DataSource`` are part of the unique values of given columns of
         another ``DataSource``.

-        Null values in the column are ignored. To assert the non-existence of them use
-        the ``NullAbsence`` constraint via the ``add_null_absence_constraint`` helper method
+        Null values in the given columns are ignored. To assert the non-existence of them use
+        the :meth:`~datajudge.requirements.WithinRequirement.add_null_absence_constraint` helper method
         for ``WithinRequirement``.
         By default, the null filtering does not trigger if multiple columns are fetched at once.
+        It can be configured in more detail by supplying a custom ``filter_func`` function.
+        Some exemplary implementations are available as :func:`~datajudge.utils.filternull_element`,
+        :func:`~datajudge.utils.filternull_never`, :func:`~datajudge.utils.filternull_element_or_tuple_all`,
+        :func:`~datajudge.utils.filternull_element_or_tuple_any`.
+        Passing ``None`` as the argument is equivalent to :func:`~datajudge.utils.filternull_element`, but triggers a warning.
+        The current default of :func:`~datajudge.utils.filternull_element`
+        can cause (possibly unintended) changes in behavior when the user adds a second column
+        (filtering can then no longer trigger at all).
+        The default will be changed to :func:`~datajudge.utils.filternull_element_or_tuple_all` in future versions.
+        To silence the warning, set ``filter_func`` explicitly.

         ``max_relative_violations`` indicates what fraction of rows of the given table
         may have values not included in the reference set of unique values. Please note
         that ``UniquesSubset`` and ``UniquesSuperset`` are not symmetrical in this regard.
         By default, the number of occurrences affects the computed fraction of violations.
-        To disable this weighting, set `compare_distinct=True`.
-        This argument does not have an effect on the test results for other `Uniques` constraints,
-        or if `max_relative_violations` is 0.
+        To disable this weighting, set ``compare_distinct=True``.
+        This argument does not have an effect on the test results for other :class:`~datajudge.constraints.uniques.Uniques` constraints,
+        or if ``max_relative_violations`` is 0.

-        See ``Uniques`` for further details on ``map_func``, ``reduce_func``,
-        ``output_postprocessing_sorter``, and ``output_remainder_slicer``.
+        See :class:`~datajudge.constraints.uniques.Uniques` for further details on ``map_func``, ``reduce_func``,
+        and ``output_processors``.
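+
+        A minimal usage sketch (``req`` stands for a hypothetical ``BetweenRequirement``
+        comparing two tables; the parameter names are the real ones from this patch)::
+
+            import functools
+
+            from datajudge.utils import output_processor_limit, output_processor_sort
+
+            req.add_uniques_subset_constraint(
+                ["col_int"],
+                ["col_int"],
+                output_processors=[
+                    output_processor_sort,
+                    functools.partial(output_processor_limit, limit=10),
+                ],
+            )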
""" ref = DataReference(self.data_source, columns1, condition1) @@ -1634,8 +1653,7 @@ def add_uniques_subset_constraint( filter_func=filter_func, map_func=map_func, reduce_func=reduce_func, - output_postprocessing_sorter=output_postprocessing_sorter, - output_remainder_slicer=output_remainder_slicer, + output_processors=output_processors, name=name, ) ) diff --git a/src/datajudge/utils.py b/src/datajudge/utils.py index aa968e32..66b2ae11 100644 --- a/src/datajudge/utils.py +++ b/src/datajudge/utils.py @@ -1,4 +1,4 @@ -from typing import Tuple, Union +from typing import Collection, List, Optional, Protocol, Tuple, Union def _fmt_diff_part(s, d): @@ -42,3 +42,143 @@ def format_difference( f"{s1[:diff_idx]}{_fmt_diff_part(s1, diff_idx)}", f"{s2[:diff_idx]}{_fmt_diff_part(s2, diff_idx)}", ) + + +class OutputProcessor(Protocol): + def __call__( # noqa: E704 + self, + collection: Collection, + counts: Optional[Collection] = None, + ) -> Collection: ... + + +def output_processor_sort( + collection: Collection, counts: Optional[Collection] = None +) -> Tuple[Collection, Optional[Collection]]: + """ + Sorts a collection of tuple elements in descending order of their counts, + and for ties, makes use of the ascending order of the elements themselves. + + If the first element is not instanceof tuple, + each element will be transparently packaged into a 1-tuple for processing; + this process is not visible to the caller. + + Handles ``None`` values as described in ``sort_tuple_none_aware``. + """ + collection = list(collection) + if not isinstance(collection[0], tuple): + # package into a 1 tuple and pass into the method again + packaged_list = [(elem,) for elem in collection] + res_main, res_counts = output_processor_sort(packaged_list, counts) + return [elem[0] for elem in res_main], res_counts + + if counts is None: + return sort_tuple_none_aware(collection), counts + + if len(collection) != len(counts): + raise ValueError("collection and counts must have the same length") + + if len(collection) <= 1: + return collection, counts # empty or 1 element lists are always sorted + + lst = sort_tuple_none_aware( + [(-count, *elem) for count, elem in zip(counts, collection)] + ) + return [elem[1:] for elem in lst], [-elem[0] for elem in lst] + + +def output_processor_limit( + collection: Collection, counts: Optional[Collection] = None, limit: int = 100 +) -> Tuple[Collection, Optional[Collection]]: + """ + Limits the collection to the first ``limit`` elements. + If the list was shortened, + will add a ``limit+1``-th string element, + informing the user of the truncation. 
The default limit of ``100`` can be adjusted using ``functools.partial``.
+    """
+    collection = list(collection)
+
+    ret_collection = collection[:limit]
+    ret_counts = None if counts is None else list(counts)[:limit]
+    if len(collection) > limit:
+        ret_collection.append(
+            f"<SHORTENED OUTPUT, displaying the first {limit} out of {len(collection)} elements above>"
+        )
+        if ret_counts is not None:
+            ret_counts.append(
+                f"<SHORTENED OUTPUT, displaying the first {limit} out of {len(counts)} counts above>"
+            )
+
+    return ret_collection, ret_counts
+
+
+def filternull_element(values: List) -> List:
+    return [value for value in values if value is not None]
+
+
+def filternull_never(values: List) -> List:
+    return values
+
+
+def filternull_element_or_tuple_all(values: List) -> List:
+    return [
+        value
+        for value in values
+        if value is not None
+        and not (isinstance(value, tuple) and all(x is None for x in value))
+    ]
+
+
+def filternull_element_or_tuple_any(values: List) -> List:
+    return [
+        value
+        for value in values
+        if value is not None
+        and not (isinstance(value, tuple) and any(x is None for x in value))
+    ]
+
+
+def sort_tuple_none_aware(
+    collection: Collection[Tuple], ascending=True
+) -> Collection[Tuple]:
+    """
+    Stable sort of a collection of tuples.
+    Each tuple in the collection must have the same length,
+    since they are treated as rows in a table,
+    with ``elem[0]`` being the first column,
+    ``elem[1]`` the second, etc. for each ``elem`` in ``collection``.
+    For sorting, ``None`` is considered the same as the default value of the respective column's type.
+
+    For ints and floats, ``int()`` and ``float()`` yield ``0`` and ``0.0`` respectively; for strings, ``str()`` yields ``''``.
+    The constructor is determined by calling ``type`` on the first non-``None`` element of the respective column.
+
+    Validates that all elements in collection are tuples and that all tuples have the same length.
+    """
+    lst = list(collection)
+
+    if len(lst) <= 1:
+        return lst  # empty or 1 element lists are always sorted
+
+    if not all(isinstance(elem, tuple) and len(elem) == len(lst[0]) for elem in lst):
+        raise ValueError("all elements must be tuples and have the same length")
+
+    dtypes_each_tupleelement: List[Optional[type]] = [None] * len(lst[0])
+    for dtypeidx in range(len(dtypes_each_tupleelement)):
+        for elem in lst:
+            if elem[dtypeidx] is not None:
+                dtypes_each_tupleelement[dtypeidx] = type(elem[dtypeidx])
+                break
+        else:
+            # if all entries are None, just use a constant int() == 0
+            dtypes_each_tupleelement[dtypeidx] = int
+
+    def replace_None_with_default(elem):
+        return tuple(
+            (dtype() if subelem is None else subelem)
+            for dtype, subelem in zip(dtypes_each_tupleelement, elem)
+        )
+
+    return sorted(
+        lst, key=lambda elem: replace_None_with_default(elem), reverse=not ascending
+    )
diff --git a/start_postgres.sh b/start_postgres.sh
index 46224a8b..80d21c09 100755
--- a/start_postgres.sh
+++ b/start_postgres.sh
@@ -2,4 +2,4 @@
 
 set -e
 
-docker run -e POSTGRES_DB=datajudge -e POSTGRES_USER=datajudge -e POSTGRES_PASSWORD=datajudge -p 5432:5432 postgres:11
+docker run --name postgres_datajudge --rm -e POSTGRES_DB=datajudge -e POSTGRES_USER=datajudge -e POSTGRES_PASSWORD=datajudge -p 5432:5432 postgres:11
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index e8c69075..e47afa49 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -716,6 +716,26 @@ def unique_table2(engine, metadata):
     return TEST_DB_NAME, SCHEMA, table_name
 
 
+@pytest.fixture(scope="module")
+def unique_table_extralong(engine, metadata):
+    if is_impala(engine):
+        pytest.skip(
+            "Skipping this larger output check for impala due to it being quite 
brittle" + ) + if is_bigquery(engine): + pytest.skip( + "Skipping this larger output check for bigquery since creating the table is very slow" + ) + table_name = "unique_table_extralong" + columns = [ + sa.Column("col_int", sa.Integer()), + sa.Column("col_varchar", _string_column(engine)), + ] + data = [{"col_int": i // 2, "col_varchar": f"hi{i // 3}"} for i in range(12345)] + _handle_table(engine, metadata, table_name, columns, data) + return TEST_DB_NAME, SCHEMA, table_name + + @pytest.fixture(scope="module") def nested_table(engine, metadata): table_name = "nested_table" @@ -760,22 +780,40 @@ def functional_dependency_table_multi_key(engine, metadata): # ab -> c # ab -/-> d + # ab -/-> ce columns = [ sa.Column("a", sa.Integer()), sa.Column("b", sa.Integer()), sa.Column("c", sa.Integer()), sa.Column("d", sa.Integer()), + sa.Column("e", sa.Integer()), ] + + # fmt: off data = [ - {"a": 1, "b": 1, "c": 2, "d": 3}, - {"a": 1, "b": 1, "c": 2, "d": 4}, - {"a": 1, "b": 2, "c": 3, "d": 5}, - {"a": 1, "b": 2, "c": 3, "d": 6}, - {"a": 2, "b": 1, "c": 4, "d": 7}, - {"a": 2, "b": 1, "c": 4, "d": 8}, - {"a": 2, "b": 2, "c": 5, "d": 9}, - {"a": 2, "b": 2, "c": 5, "d": 10}, - ] + {"a": 1, "b": 1, "c": 2, "d": 3, "e": 2, }, + {"a": 1, "b": 1, "c": 2, "d": 4, "e": 2, }, + {"a": 1, "b": 2, "c": 3, "d": 5, "e": 3, }, + {"a": 1, "b": 2, "c": 3, "d": 6, "e": 3, }, + {"a": 2, "b": 1, "c": 4, "d": 7, "e": 4, }, + {"a": 2, "b": 1, "c": 4, "d": 8, "e": 4, }, + {"a": 2, "b": 2, "c": 5, "d": 9, "e": 5, }, + {"a": 2, "b": 2, "c": 5, "d": 10, "e": 5, }, + + # if NULL is on the LHS, this is not considered a functional dependency violation + {"a": None, "b": None, "c": 6, "d": 10, "e": 6, }, + {"a": None, "b": None, "c": 6, "d": 11, "e": 8, }, + + {"a": None, "b": 99, "c": 6, "d": 10, "e": 6, }, + {"a": None, "b": 99, "c": 6, "d": 11, "e": 8, }, + {"a": 42, "b": None, "c": 6, "d": 11, "e": 6, }, + {"a": None, "b": 42, "c": 6, "d": 11, "e": 6, }, + {"a": 43, "b": 43, "c": 6, "d": 12, "e": 6, }, + {"a": 43, "b": 43, "c": 6, "d": 12, "e": 7, }, + {"a": 44, "b": 44, "c": None, "d": 12, "e": None, }, + {"a": 44, "b": 44, "c": None, "d": 13, "e": 99, }, + ] + # fmt: on _handle_table(engine, metadata, table_name, columns, data) return TEST_DB_NAME, SCHEMA, table_name @@ -1028,7 +1066,10 @@ def capitalization_table(engine, metadata): primary_key = "" else: str_datatype = "TEXT" + with engine.begin() as connection: + if sa.inspect(connection).has_table(table_name, schema=SCHEMA): + return TEST_DB_NAME, SCHEMA, table_name, uppercase_column, lowercase_column connection.execute(sa.text(f"DROP TABLE IF EXISTS {SCHEMA}.{table_name}")) connection.execute( sa.text( diff --git a/tests/integration/test_integration.py b/tests/integration/test_integration.py index aefe3389..2f0f3b70 100644 --- a/tests/integration/test_integration.py +++ b/tests/integration/test_integration.py @@ -1,16 +1,10 @@ import functools +import textwrap import pytest import sqlalchemy as sa import datajudge.requirements as requirements -from datajudge.constraints.uniques import ( - util_filternull_default_deprecated, - util_filternull_element_or_tuple_all, - util_filternull_element_or_tuple_any, - util_filternull_never, - util_output_postprocessing_sorter, -) from datajudge.db_access import ( Condition, is_bigquery, @@ -20,6 +14,14 @@ is_postgresql, is_snowflake, ) +from datajudge.utils import ( + filternull_element, + filternull_element_or_tuple_all, + filternull_element_or_tuple_any, + filternull_never, + output_processor_limit, + output_processor_sort, +) def 
skip_if_mssql(engine): @@ -318,9 +320,9 @@ def test_uniques_equality_within(engine, unique_table1, data): negation, ["col_int", "col_varchar"], [(0, "hi0"), (1, "hi0")], - util_filternull_element_or_tuple_any, + filternull_element_or_tuple_any, None, - util_output_postprocessing_sorter, + output_processor_sort, None, "column(s) 'col_int', 'col_varchar' has the excess element(s) '[(1, 'hi1'), (2, 'hi1'), (3, 'hi2'), (4, 'hi2'), (4, 'hi3'), (5, 'hi3'), (6, 'hi4'), (7, 'hi4'), (7, 'hi5'), (8, 'hi5'), (9, 'hi6'), (10, 'hi6'), (10, 'hi7'), (11, 'hi7'), (12, 'hi8'), (13, 'hi8'), (13, 'hi9'), (14, 'hi9'), (15, 'hi10'), (16, 'hi10'), (16, 'hi11'), (17, 'hi11'), (18, 'hi12'), (19, 'hi12'), (19, 'hi13'), (20, 'hi13'), (21, 'hi14'), (22, 'hi14'), (22, 'hi15'), (23, 'hi15'), (24, 'hi16'), (25, 'hi16'), (25, 'hi17'), (26, 'hi17'), (27, 'hi18'), (28, 'hi18'), (28, 'hi19'), (29, 'hi19')]' when compared with the reference values. ", ), @@ -333,7 +335,7 @@ def test_uniques_equality_within_with_outputcheck(engine, unique_table1, data): uniques, filter_func, map_func, - output_postprocessing_sorter, + output_processors, condition, failure_message_suffix, ) = data @@ -344,7 +346,7 @@ def test_uniques_equality_within_with_outputcheck(engine, unique_table1, data): condition=condition, filter_func=filter_func, map_func=map_func, - output_postprocessing_sorter=output_postprocessing_sorter, + output_processors=output_processors, ) test_result = req[0].test(engine) assert operation(test_result.outcome), test_result.failure_message @@ -421,14 +423,12 @@ def test_uniques_equality_between(engine, unique_table1, unique_table2, data): negation, ["col_int", "col_varchar"], ["col_int", "col_varchar"], - util_filternull_element_or_tuple_all, + filternull_element_or_tuple_all, None, - util_output_postprocessing_sorter, + [output_processor_sort], None, None, - "column(s) 'col_int', 'col_varchar' has the excess element(s) '[(None, 'hi'), (20, 'hi13'), (21, 'hi14'), " - "(22, 'hi14'), (22, 'hi15'), (23, 'hi15'), (24, 'hi16'), (25, 'hi16'), (25, 'hi17'), (26, 'hi17'), (27, " - "'hi18'), (28, 'hi18'), (28, 'hi19'), (29, 'hi19')]' when compared with the reference values. ", + "column(s) 'col_int', 'col_varchar' has the excess element(s) '[(None, 'hi'), (20, 'hi13'), (21, 'hi14'), (22, 'hi14'), (22, 'hi15'), (23, 'hi15'), (24, 'hi16'), (25, 'hi16'), (25, 'hi17'), (26, 'hi17'), (27, 'hi18'), (28, 'hi18'), (28, 'hi19'), (29, 'hi19')]' when compared with the reference values. ", ), ], ) @@ -441,7 +441,7 @@ def test_uniques_equality_between_with_outputcheck( columns2, filter_func, map_func, - output_postprocessing_sorter, + output_processors, condition1, condition2, failure_message_suffix, @@ -452,7 +452,7 @@ def test_uniques_equality_between_with_outputcheck( columns2, filter_func=filter_func, map_func=map_func, - output_postprocessing_sorter=output_postprocessing_sorter, + output_processors=output_processors, condition1=condition1, condition2=condition2, ) @@ -512,14 +512,11 @@ def test_uniques_superset_within(engine, unique_table1, data): ["col_int", "col_varchar"], [(1337, "hi0"), (None, "hi"), (None, None)], 0, - util_filternull_never, - util_output_postprocessing_sorter, - slice(None), + filternull_never, + [output_processor_sort], None, None, - "column(s) 'col_int', 'col_varchar' has a fraction of 0.3333333333333333 > 0 (1 / 3) lacking " - "unique values of '[(None, None), (None, 'hi'), (1337, 'hi0')]'. E.g. 
(slice(None, None, " - "None)) it doesn't have the unique value(s) '[(1337, 'hi0')]'.", + "column(s) 'col_int', 'col_varchar' has a fraction of 0.3333333333333333 > 0 (1 / 3) lacking unique values of '[(None, None), (None, 'hi'), (1337, 'hi0')]'. It doesn't have the unique value(s) '[(1337, 'hi0')]'.", ), ( negation, @@ -527,41 +524,32 @@ def test_uniques_superset_within(engine, unique_table1, data): [(1337, "hi0"), (None, "hi"), (None, None)], 0, None, - util_output_postprocessing_sorter, - slice(None), + [output_processor_sort], None, None, - "column(s) 'col_int', 'col_varchar' has a fraction of 0.3333333333333333 > 0 (1 / 3) lacking " - "unique values of '[(None, None), (None, 'hi'), (1337, 'hi0')]'. E.g. (slice(None, None, " - "None)) it doesn't have the unique value(s) '[(1337, 'hi0')]'.", + "column(s) 'col_int', 'col_varchar' has a fraction of 0.3333333333333333 > 0 (1 / 3) lacking unique values of '[(None, None), (None, 'hi'), (1337, 'hi0')]'. It doesn't have the unique value(s) '[(1337, 'hi0')]'.", ), ( negation, ["col_int", "col_varchar"], [(1337, "hi0"), (None, "hi"), (None, None)], 0, - util_filternull_element_or_tuple_all, - util_output_postprocessing_sorter, - slice(None), + filternull_element_or_tuple_all, + [output_processor_sort], None, None, - "column(s) 'col_int', 'col_varchar' has a fraction of 0.6666666666666666 > 0 (2 / 3) lacking " - "unique values of '[(None, None), (None, 'hi'), (1337, 'hi0')]'. E.g. (slice(None, None, " - "None)) it doesn't have the unique value(s) '[(None, None), (1337, 'hi0')]'.", + "column(s) 'col_int', 'col_varchar' has a fraction of 0.6666666666666666 > 0 (2 / 3) lacking unique values of '[(None, None), (None, 'hi'), (1337, 'hi0')]'. It doesn't have the unique value(s) '[(None, None), (1337, 'hi0')]'.", ), ( negation, ["col_int", "col_varchar"], [(1337, "hi0"), (None, "hi"), (None, None)], 0, - util_filternull_element_or_tuple_any, - util_output_postprocessing_sorter, - slice(None), + filternull_element_or_tuple_any, + [output_processor_sort], None, None, - "column(s) 'col_int', 'col_varchar' has a fraction of 1.0 > 0 (3 / 3) lacking unique values of " - "'[(None, None), (None, 'hi'), (1337, 'hi0')]'. E.g. (slice(None, None, None)) it doesn't have " - "the unique value(s) '[(None, None), (None, 'hi'), (1337, 'hi0')]'.", + "column(s) 'col_int', 'col_varchar' has a fraction of 1.0 > 0 (3 / 3) lacking unique values of '[(None, None), (None, 'hi'), (1337, 'hi0')]'. It doesn't have the unique value(s) '[(None, None), (None, 'hi'), (1337, 'hi0')]'.", ), ( negation, @@ -576,15 +564,11 @@ def test_uniques_superset_within(engine, unique_table1, data): (9999, "hi4"), ], 0, - util_filternull_element_or_tuple_any, - util_output_postprocessing_sorter, - slice(5), + filternull_element_or_tuple_any, + [output_processor_sort], None, None, - "column(s) 'col_int', 'col_varchar' has a fraction of 0.8571428571428571 > 0 (6 / 7) lacking " - "unique values of '[(0, 'hi0'), (1234, 'hi'), (1234, 'hi2'), (1234, 'hi3'), (1234, 'hi4'), " - "(1234, 'hi5'), (9999, 'hi4')]'. E.g. (slice(None, 5, None)) it doesn't have the unique value(s) " - "'[(1234, 'hi'), (1234, 'hi2'), (1234, 'hi3'), (1234, 'hi4'), (1234, 'hi5')]'.", + "column(s) 'col_int', 'col_varchar' has a fraction of 0.8571428571428571 > 0 (6 / 7) lacking unique values of '[(0, 'hi0'), (1234, 'hi'), (1234, 'hi2'), (1234, 'hi3'), (1234, 'hi4'), (1234, 'hi5'), (9999, 'hi4')]'. 
It doesn't have the unique value(s) '[(1234, 'hi'), (1234, 'hi2'), (1234, 'hi3'), (1234, 'hi4'), (1234, 'hi5'), (9999, 'hi4')]'.", ), ], ) @@ -595,8 +579,7 @@ def test_uniques_superset_within_with_outputcheck(engine, unique_table1, data): uniques, max_relative_violations, filter_func, - output_postprocessing_sorter, - output_remainder_slicer, + output_processors, function, condition, failure_message_suffix, @@ -609,8 +592,7 @@ def test_uniques_superset_within_with_outputcheck(engine, unique_table1, data): filter_func=filter_func, condition=condition, map_func=function, - output_postprocessing_sorter=output_postprocessing_sorter, - output_remainder_slicer=output_remainder_slicer, + output_processors=output_processors, ) test_result = req[0].test(engine) assert operation(test_result.outcome), test_result.failure_message @@ -627,13 +609,12 @@ def test_uniques_superset_within_with_outputcheck(engine, unique_table1, data): ["col_int", "col_varchar"], ["col_int", "col_varchar"], 0, - util_filternull_element_or_tuple_any, - util_output_postprocessing_sorter, - slice(None), + filternull_element_or_tuple_any, + [output_processor_sort], None, Condition(raw_string="col_int < 19"), None, - "column(s) 'col_int', 'col_varchar' has a fraction of 0.375 > 0 (15 / 40) lacking unique values of '[(0, 'hi0'), (1, 'hi0'), (1, 'hi1'), (2, 'hi1'), (3, 'hi2'), (4, 'hi2'), (4, 'hi3'), (5, 'hi3'), (6, 'hi4'), (7, 'hi4'), (7, 'hi5'), (8, 'hi5'), (9, 'hi6'), (10, 'hi6'), (10, 'hi7'), (11, 'hi7'), (12, 'hi8'), (13, 'hi8'), (13, 'hi9'), (14, 'hi9'), (15, 'hi10'), (16, 'hi10'), (16, 'hi11'), (17, 'hi11'), (18, 'hi12'), (19, 'hi12'), (19, 'hi13'), (20, 'hi13'), (21, 'hi14'), (22, 'hi14'), (22, 'hi15'), (23, 'hi15'), (24, 'hi16'), (25, 'hi16'), (25, 'hi17'), (26, 'hi17'), (27, 'hi18'), (28, 'hi18'), (28, 'hi19'), (29, 'hi19')]'. E.g. (slice(None, None, None)) it doesn't have the unique value(s) '[(19, 'hi12'), (19, 'hi13'), (20, 'hi13'), (21, 'hi14'), (22, 'hi14'), (22, 'hi15'), (23, 'hi15'), (24, 'hi16'), (25, 'hi16'), (25, 'hi17'), (26, 'hi17'), (27, 'hi18'), (28, 'hi18'), (28, 'hi19'), (29, 'hi19')]'.Condition on first table: WHERE col_int < 19; ", + "column(s) 'col_int', 'col_varchar' has a fraction of 0.375 > 0 (15 / 40) lacking unique values of '[(0, 'hi0'), (1, 'hi0'), (1, 'hi1'), (2, 'hi1'), (3, 'hi2'), (4, 'hi2'), (4, 'hi3'), (5, 'hi3'), (6, 'hi4'), (7, 'hi4'), (7, 'hi5'), (8, 'hi5'), (9, 'hi6'), (10, 'hi6'), (10, 'hi7'), (11, 'hi7'), (12, 'hi8'), (13, 'hi8'), (13, 'hi9'), (14, 'hi9'), (15, 'hi10'), (16, 'hi10'), (16, 'hi11'), (17, 'hi11'), (18, 'hi12'), (19, 'hi12'), (19, 'hi13'), (20, 'hi13'), (21, 'hi14'), (22, 'hi14'), (22, 'hi15'), (23, 'hi15'), (24, 'hi16'), (25, 'hi16'), (25, 'hi17'), (26, 'hi17'), (27, 'hi18'), (28, 'hi18'), (28, 'hi19'), (29, 'hi19')]'. 
It doesn't have the unique value(s) '[(19, 'hi12'), (19, 'hi13'), (20, 'hi13'), (21, 'hi14'), (22, 'hi14'), (22, 'hi15'), (23, 'hi15'), (24, 'hi16'), (25, 'hi16'), (25, 'hi17'), (26, 'hi17'), (27, 'hi18'), (28, 'hi18'), (28, 'hi19'), (29, 'hi19')]'.Condition on first table: WHERE col_int < 19; ", ), ], ) @@ -646,8 +627,7 @@ def test_uniques_superset_between_with_outputcheck( columns2, max_relative_violations, filter_func, - output_postprocessing_sorter, - output_remainder_slicer, + output_processors, map_func, condition1, condition2, @@ -659,8 +639,7 @@ def test_uniques_superset_between_with_outputcheck( columns2, max_relative_violations=max_relative_violations, filter_func=filter_func, - output_postprocessing_sorter=output_postprocessing_sorter, - output_remainder_slicer=output_remainder_slicer, + output_processors=output_processors, map_func=map_func, condition1=condition1, condition2=condition2, @@ -811,14 +790,10 @@ def test_uniques_subset_within(engine, unique_table1, data): 0, None, False, - util_output_postprocessing_sorter, - slice(5), + [output_processor_sort], None, None, - "column(s) 'col_int', 'col_varchar' has a fraction of 0.9516129032258065 > 0 values (59 / 62) " - "not being an element of '[(0, 'hi0'), (1, 'hi0')]'. " - "It has e.g. (slice(None, 5, None)) excess elements " - "'[(2, 'hi1'), (3, 'hi2'), (5, 'hi3'), (6, 'hi4'), (8, 'hi5')]' with counts [2, 2, 2, 2, 2].", + "column(s) 'col_int', 'col_varchar' has a fraction of 0.9516129032258065 > 0 values (59 / 62) not being an element of '[(0, 'hi0'), (1, 'hi0')]'. It has excess elements '[(2, 'hi1'), (3, 'hi2'), (5, 'hi3'), (6, 'hi4'), (8, 'hi5'), (9, 'hi6'), (11, 'hi7'), (12, 'hi8'), (14, 'hi9'), (15, 'hi10'), (17, 'hi11'), (18, 'hi12'), (20, 'hi13'), (21, 'hi14'), (23, 'hi15'), (24, 'hi16'), (26, 'hi17'), (27, 'hi18'), (29, 'hi19'), (None, None), (None, 'hi'), (1, 'hi1'), (4, 'hi2'), (4, 'hi3'), (7, 'hi4'), (7, 'hi5'), (10, 'hi6'), (10, 'hi7'), (13, 'hi8'), (13, 'hi9'), (16, 'hi10'), (16, 'hi11'), (19, 'hi12'), (19, 'hi13'), (22, 'hi14'), (22, 'hi15'), (25, 'hi16'), (25, 'hi17'), (28, 'hi18'), (28, 'hi19')]' with counts [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1].", ), ( negation, @@ -827,433 +802,142 @@ def test_uniques_subset_within(engine, unique_table1, data): 0, None, True, - util_output_postprocessing_sorter, - slice(None), + [output_processor_sort], None, None, - "column(s) 'col_int', 'col_varchar' has a fraction of 0.9523809523809523 > 0 DISTINCT values " - "(40 / 42) not being an element of '[(0, 'hi0'), (1, 'hi0')]'. " - "It has e.g. (slice(None, None, None)) excess elements " - "'[(2, 'hi1'), (3, 'hi2'), (5, 'hi3'), (6, 'hi4'), (8, 'hi5'), (9, 'hi6'), (11, 'hi7'), (12, " - "'hi8'), (14, 'hi9'), (15, 'hi10'), (17, 'hi11'), (18, 'hi12'), (20, 'hi13'), (21, 'hi14'), (23, " - "'hi15'), (24, 'hi16'), (26, 'hi17'), (27, 'hi18'), (29, 'hi19'), (None, None), (None, 'hi'), " - "(1, 'hi1'), (4, 'hi2'), (4, 'hi3'), (7, 'hi4'), (7, 'hi5'), (10, 'hi6'), (10, 'hi7'), (13, " - "'hi8'), (13, 'hi9'), (16, 'hi10'), (16, 'hi11'), (19, 'hi12'), (19, 'hi13'), (22, 'hi14'), (22, " - "'hi15'), (25, 'hi16'), (25, 'hi17'), (28, 'hi18'), (28, 'hi19')]' with counts [2, 2, 2, 2, 2, " - "2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, " - "1, 1, 1].", + "column(s) 'col_int', 'col_varchar' has a fraction of 0.9523809523809523 > 0 DISTINCT values (40 / 42) not being an element of '[(0, 'hi0'), (1, 'hi0')]'. 
It has excess elements '[(2, 'hi1'), (3, 'hi2'), (5, 'hi3'), (6, 'hi4'), (8, 'hi5'), (9, 'hi6'), (11, 'hi7'), (12, 'hi8'), (14, 'hi9'), (15, 'hi10'), (17, 'hi11'), (18, 'hi12'), (20, 'hi13'), (21, 'hi14'), (23, 'hi15'), (24, 'hi16'), (26, 'hi17'), (27, 'hi18'), (29, 'hi19'), (None, None), (None, 'hi'), (1, 'hi1'), (4, 'hi2'), (4, 'hi3'), (7, 'hi4'), (7, 'hi5'), (10, 'hi6'), (10, 'hi7'), (13, 'hi8'), (13, 'hi9'), (16, 'hi10'), (16, 'hi11'), (19, 'hi12'), (19, 'hi13'), (22, 'hi14'), (22, 'hi15'), (25, 'hi16'), (25, 'hi17'), (28, 'hi18'), (28, 'hi19')]' with counts [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1].", ), ( negation, ["col_int"], - [ - 0, - 1, - 2, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - ], + [i for i in range(30) if i != 3], 0, None, False, - util_output_postprocessing_sorter, - slice(None), + [output_processor_sort], None, None, - "column(s) 'col_int' has a fraction of 0.03333333333333333 > 0 values (2 / 60) " - "not being an element of '[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, " - "21, 22, 23, 24, 25, 26, 27, 28, 29]'. " - "It has e.g. (slice(None, None, None)) excess elements '[3]' with counts [2].", + "column(s) 'col_int' has a fraction of 0.03333333333333333 > 0 values (2 / 60) not being an element of '[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]'. It has excess elements '[3]' with counts [2].", ), ( negation, ["col_int"], - [ - 0, - 1, - 2, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - ], + [i for i in range(30) if i != 3], 0, None, False, - util_output_postprocessing_sorter, - slice(None), + [output_processor_sort], None, None, - "column(s) 'col_int' has a fraction of 0.03333333333333333 > 0 values (2 / 60) " - "not being an element of '[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, " - "21, 22, 23, 24, 25, 26, 27, 28, 29]'. " - "It has e.g. (slice(None, None, None)) excess elements '[3]' with counts [2].", + "column(s) 'col_int' has a fraction of 0.03333333333333333 > 0 values (2 / 60) not being an element of '[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]'. It has excess elements '[3]' with counts [2].", ), ( negation, ["col_int"], - [ - 0, - 1, - 2, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - ], + [i for i in range(30) if i != 3], 0, None, True, - util_output_postprocessing_sorter, - slice(None), + [output_processor_sort], None, None, - "column(s) 'col_int' has a fraction of 0.03333333333333333 > 0 DISTINCT values (1 / 30) " - "not being an element of '[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, " - "21, 22, 23, 24, 25, 26, 27, 28, 29]'. " - "It has e.g. (slice(None, None, None)) excess elements '[3]' with counts [2].", + "column(s) 'col_int' has a fraction of 0.03333333333333333 > 0 DISTINCT values (1 / 30) not being an element of '[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]'. 
It has excess elements '[3]' with counts [2].", ), ( negation, ["col_int"], - [ - 0, - 1, - 2, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - ], + [i for i in range(30) if i != 3], 0, - util_filternull_default_deprecated, + filternull_element, False, - util_output_postprocessing_sorter, - slice(None), + [output_processor_sort], None, None, - "column(s) 'col_int' has a fraction of 0.03333333333333333 > 0 values (2 / 60) " - "not being an element of '[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, " - "21, 22, 23, 24, 25, 26, 27, 28, 29]'. " - "It has e.g. (slice(None, None, None)) excess elements '[3]' with counts [2].", + "column(s) 'col_int' has a fraction of 0.03333333333333333 > 0 values (2 / 60) not being an element of '[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]'. It has excess elements '[3]' with counts [2].", ), ( negation, ["col_int"], - [ - 0, - 1, - 2, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - ], + [i for i in range(30) if i != 3], 0, - util_filternull_element_or_tuple_all, + filternull_element_or_tuple_all, False, - util_output_postprocessing_sorter, - slice(None), + [output_processor_sort], None, None, - "column(s) 'col_int' has a fraction of 0.03333333333333333 > 0 values (2 / 60) " - "not being an element of '[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, " - "21, 22, 23, 24, 25, 26, 27, 28, 29]'. " - "It has e.g. (slice(None, None, None)) excess elements '[3]' with counts [2].", + "column(s) 'col_int' has a fraction of 0.03333333333333333 > 0 values (2 / 60) not being an element of '[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]'. It has excess elements '[3]' with counts [2].", ), ( negation, ["col_int"], - [ - 0, - 1, - 2, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - ], + [i for i in range(30) if i != 3], 0, - util_filternull_element_or_tuple_any, + filternull_element_or_tuple_any, False, - util_output_postprocessing_sorter, - slice(None), + [output_processor_sort], None, None, - "column(s) 'col_int' has a fraction of 0.03333333333333333 > 0 values (2 / 60) " - "not being an element of '[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, " - "21, 22, 23, 24, 25, 26, 27, 28, 29]'. " - "It has e.g. (slice(None, None, None)) excess elements '[3]' with counts [2].", + "column(s) 'col_int' has a fraction of 0.03333333333333333 > 0 values (2 / 60) not being an element of '[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]'. 
It has excess elements '[3]' with counts [2].", ), ( negation, ["col_int"], - [ - 0, - 1, - 2, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - ], + [i for i in range(30) if i != 3], 0, - util_filternull_never, + filternull_never, False, - util_output_postprocessing_sorter, - slice(None), + [output_processor_sort], None, None, - "column(s) 'col_int' has a fraction of 0.06451612903225806 > 0 values (4 / 62) " - "not being an element of '[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, " - "21, 22, 23, 24, 25, 26, 27, 28, 29]'. " - "It has e.g. (slice(None, None, None)) excess elements '[None, 3]' with counts [2, 2].", + "column(s) 'col_int' has a fraction of 0.06451612903225806 > 0 values (4 / 62) not being an element of '[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]'. It has excess elements '[None, 3]' with counts [2, 2].", ), ( negation, ["col_int", "col_varchar"], [(0, "hi0"), (1, "hi0")], 0, - util_filternull_default_deprecated, + filternull_element, True, - util_output_postprocessing_sorter, - slice(None), + [output_processor_sort], None, None, - "column(s) 'col_int', 'col_varchar' has a fraction of 0.9523809523809523 > 0 DISTINCT values " - "(40 / 42) not being an element of '[(0, 'hi0'), (1, 'hi0')]'. " - "It has e.g. (slice(None, None, None)) excess elements " - "'[(2, 'hi1'), (3, 'hi2'), (5, 'hi3'), (6, 'hi4'), (8, 'hi5'), (9, 'hi6'), (11, 'hi7'), (12, " - "'hi8'), (14, 'hi9'), (15, 'hi10'), (17, 'hi11'), (18, 'hi12'), (20, 'hi13'), (21, 'hi14'), (23, " - "'hi15'), (24, 'hi16'), (26, 'hi17'), (27, 'hi18'), (29, 'hi19'), (None, None), (None, 'hi'), " - "(1, 'hi1'), (4, 'hi2'), (4, 'hi3'), (7, 'hi4'), (7, 'hi5'), (10, 'hi6'), (10, 'hi7'), (13, " - "'hi8'), (13, 'hi9'), (16, 'hi10'), (16, 'hi11'), (19, 'hi12'), (19, 'hi13'), (22, 'hi14'), (22, " - "'hi15'), (25, 'hi16'), (25, 'hi17'), (28, 'hi18'), (28, 'hi19')]' with counts [2, 2, 2, 2, 2, " - "2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, " - "1, 1, 1].", + "column(s) 'col_int', 'col_varchar' has a fraction of 0.9523809523809523 > 0 DISTINCT values (40 / 42) not being an element of '[(0, 'hi0'), (1, 'hi0')]'. It has excess elements '[(2, 'hi1'), (3, 'hi2'), (5, 'hi3'), (6, 'hi4'), (8, 'hi5'), (9, 'hi6'), (11, 'hi7'), (12, 'hi8'), (14, 'hi9'), (15, 'hi10'), (17, 'hi11'), (18, 'hi12'), (20, 'hi13'), (21, 'hi14'), (23, 'hi15'), (24, 'hi16'), (26, 'hi17'), (27, 'hi18'), (29, 'hi19'), (None, None), (None, 'hi'), (1, 'hi1'), (4, 'hi2'), (4, 'hi3'), (7, 'hi4'), (7, 'hi5'), (10, 'hi6'), (10, 'hi7'), (13, 'hi8'), (13, 'hi9'), (16, 'hi10'), (16, 'hi11'), (19, 'hi12'), (19, 'hi13'), (22, 'hi14'), (22, 'hi15'), (25, 'hi16'), (25, 'hi17'), (28, 'hi18'), (28, 'hi19')]' with counts [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1].", ), ( negation, ["col_int", "col_varchar"], [(0, "hi0"), (1, "hi0")], 0, - util_filternull_never, + filternull_never, True, - util_output_postprocessing_sorter, - slice(None), + [output_processor_sort], None, None, - "column(s) 'col_int', 'col_varchar' has a fraction of 0.9523809523809523 > 0 DISTINCT values " - "(40 / 42) not being an element of '[(0, 'hi0'), (1, 'hi0')]'. " - "It has e.g. 
(slice(None, None, None)) excess elements " - "'[(2, 'hi1'), (3, 'hi2'), (5, 'hi3'), (6, 'hi4'), (8, 'hi5'), (9, 'hi6'), (11, 'hi7'), (12, " - "'hi8'), (14, 'hi9'), (15, 'hi10'), (17, 'hi11'), (18, 'hi12'), (20, 'hi13'), (21, 'hi14'), (23, " - "'hi15'), (24, 'hi16'), (26, 'hi17'), (27, 'hi18'), (29, 'hi19'), (None, None), (None, 'hi'), " - "(1, 'hi1'), (4, 'hi2'), (4, 'hi3'), (7, 'hi4'), (7, 'hi5'), (10, 'hi6'), (10, 'hi7'), (13, " - "'hi8'), (13, 'hi9'), (16, 'hi10'), (16, 'hi11'), (19, 'hi12'), (19, 'hi13'), (22, 'hi14'), (22, " - "'hi15'), (25, 'hi16'), (25, 'hi17'), (28, 'hi18'), (28, 'hi19')]' with counts [2, 2, 2, 2, 2, " - "2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, " - "1, 1, 1].", + "column(s) 'col_int', 'col_varchar' has a fraction of 0.9523809523809523 > 0 DISTINCT values (40 / 42) not being an element of '[(0, 'hi0'), (1, 'hi0')]'. It has excess elements '[(2, 'hi1'), (3, 'hi2'), (5, 'hi3'), (6, 'hi4'), (8, 'hi5'), (9, 'hi6'), (11, 'hi7'), (12, 'hi8'), (14, 'hi9'), (15, 'hi10'), (17, 'hi11'), (18, 'hi12'), (20, 'hi13'), (21, 'hi14'), (23, 'hi15'), (24, 'hi16'), (26, 'hi17'), (27, 'hi18'), (29, 'hi19'), (None, None), (None, 'hi'), (1, 'hi1'), (4, 'hi2'), (4, 'hi3'), (7, 'hi4'), (7, 'hi5'), (10, 'hi6'), (10, 'hi7'), (13, 'hi8'), (13, 'hi9'), (16, 'hi10'), (16, 'hi11'), (19, 'hi12'), (19, 'hi13'), (22, 'hi14'), (22, 'hi15'), (25, 'hi16'), (25, 'hi17'), (28, 'hi18'), (28, 'hi19')]' with counts [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1].", ), ( negation, ["col_int", "col_varchar"], [(0, "hi0"), (1, "hi0")], 0, - util_filternull_element_or_tuple_all, + filternull_element_or_tuple_all, True, - util_output_postprocessing_sorter, - slice(None), + [output_processor_sort], None, None, - "column(s) 'col_int', 'col_varchar' has a fraction of 0.9512195121951219 > 0 DISTINCT values " - "(39 / 41) not being an element of '[(0, 'hi0'), (1, 'hi0')]'. " - "It has e.g. (slice(None, None, None)) excess elements '[(2, 'hi1'), (3, 'hi2'), (5, 'hi3'), (6, " - "'hi4'), (8, 'hi5'), (9, 'hi6'), (11, 'hi7'), (12, 'hi8'), (14, 'hi9'), (15, 'hi10'), (17, " - "'hi11'), (18, 'hi12'), (20, 'hi13'), (21, 'hi14'), (23, 'hi15'), (24, 'hi16'), (26, 'hi17'), " - "(27, 'hi18'), (29, 'hi19'), (None, 'hi'), (1, 'hi1'), (4, 'hi2'), (4, 'hi3'), (7, 'hi4'), (7, " - "'hi5'), (10, 'hi6'), (10, 'hi7'), (13, 'hi8'), (13, 'hi9'), (16, 'hi10'), (16, 'hi11'), (19, " - "'hi12'), (19, 'hi13'), (22, 'hi14'), (22, 'hi15'), (25, 'hi16'), (25, 'hi17'), (28, 'hi18'), " - "(28, 'hi19')]' with counts [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, " - "1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1].", + "column(s) 'col_int', 'col_varchar' has a fraction of 0.9512195121951219 > 0 DISTINCT values (39 / 41) not being an element of '[(0, 'hi0'), (1, 'hi0')]'. 
It has excess elements '[(2, 'hi1'), (3, 'hi2'), (5, 'hi3'), (6, 'hi4'), (8, 'hi5'), (9, 'hi6'), (11, 'hi7'), (12, 'hi8'), (14, 'hi9'), (15, 'hi10'), (17, 'hi11'), (18, 'hi12'), (20, 'hi13'), (21, 'hi14'), (23, 'hi15'), (24, 'hi16'), (26, 'hi17'), (27, 'hi18'), (29, 'hi19'), (None, 'hi'), (1, 'hi1'), (4, 'hi2'), (4, 'hi3'), (7, 'hi4'), (7, 'hi5'), (10, 'hi6'), (10, 'hi7'), (13, 'hi8'), (13, 'hi9'), (16, 'hi10'), (16, 'hi11'), (19, 'hi12'), (19, 'hi13'), (22, 'hi14'), (22, 'hi15'), (25, 'hi16'), (25, 'hi17'), (28, 'hi18'), (28, 'hi19')]' with counts [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1].", ), ( negation, ["col_int", "col_varchar"], [(0, "hi0"), (1, "hi0")], 0, - util_filternull_element_or_tuple_any, + filternull_element_or_tuple_any, True, - util_output_postprocessing_sorter, - slice(None), + [output_processor_sort], None, None, - "column(s) 'col_int', 'col_varchar' has a fraction of 0.95 > 0 DISTINCT values " - "(38 / 40) not being an element of '[(0, 'hi0'), (1, 'hi0')]'. " - "It has e.g. (slice(None, None, None)) excess elements '[(2, 'hi1'), (3, 'hi2'), (5, 'hi3'), (6, " - "'hi4'), (8, 'hi5'), (9, 'hi6'), (11, 'hi7'), (12, 'hi8'), (14, 'hi9'), (15, 'hi10'), (17, " - "'hi11'), (18, 'hi12'), (20, 'hi13'), (21, 'hi14'), (23, 'hi15'), (24, 'hi16'), (26, 'hi17'), " - "(27, 'hi18'), (29, 'hi19'), (1, 'hi1'), (4, 'hi2'), (4, 'hi3'), (7, 'hi4'), (7, 'hi5'), (10, " - "'hi6'), (10, 'hi7'), (13, 'hi8'), (13, 'hi9'), (16, 'hi10'), (16, 'hi11'), (19, 'hi12'), (19, " - "'hi13'), (22, 'hi14'), (22, 'hi15'), (25, 'hi16'), (25, 'hi17'), (28, 'hi18'), (28, " - "'hi19')]' with counts [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, " - "1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1].", + "column(s) 'col_int', 'col_varchar' has a fraction of 0.95 > 0 DISTINCT values (38 / 40) not being an element of '[(0, 'hi0'), (1, 'hi0')]'. 
It has excess elements '[(2, 'hi1'), (3, 'hi2'), (5, 'hi3'), (6, 'hi4'), (8, 'hi5'), (9, 'hi6'), (11, 'hi7'), (12, 'hi8'), (14, 'hi9'), (15, 'hi10'), (17, 'hi11'), (18, 'hi12'), (20, 'hi13'), (21, 'hi14'), (23, 'hi15'), (24, 'hi16'), (26, 'hi17'), (27, 'hi18'), (29, 'hi19'), (1, 'hi1'), (4, 'hi2'), (4, 'hi3'), (7, 'hi4'), (7, 'hi5'), (10, 'hi6'), (10, 'hi7'), (13, 'hi8'), (13, 'hi9'), (16, 'hi10'), (16, 'hi11'), (19, 'hi12'), (19, 'hi13'), (22, 'hi14'), (22, 'hi15'), (25, 'hi16'), (25, 'hi17'), (28, 'hi18'), (28, 'hi19')]' with counts [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1].", ), ], ) @@ -1265,8 +949,7 @@ def test_uniques_subset_within_complex_with_outputcheck(engine, unique_table1, d max_relative_violations, filter_func, compare_distinct, - output_postprocessing_sorter, - output_remainder_slicer, + output_processors, function, condition, failure_message_suffix, @@ -1278,8 +961,7 @@ def test_uniques_subset_within_complex_with_outputcheck(engine, unique_table1, d max_relative_violations, filter_func=filter_func, compare_distinct=compare_distinct, - output_postprocessing_sorter=output_postprocessing_sorter, - output_remainder_slicer=output_remainder_slicer, + output_processors=output_processors, condition=condition, map_func=function, ) @@ -1299,70 +981,106 @@ def test_uniques_subset_within_complex_with_outputcheck(engine, unique_table1, d ( negation, ["col_int", "col_varchar"], + [(0, "hi0"), (1, "hi0")], + 0, + filternull_element_or_tuple_any, + True, + [output_processor_sort, output_processor_limit], + None, + None, + "column(s) 'col_int', 'col_varchar' has a fraction of 0.9997569866342649 > 0 DISTINCT values (8228 / 8230) not being an element of '[(0, 'hi0'), (1, 'hi0')]'. 
It has excess elements '[(2, 'hi1'), (3, 'hi2'), (5, 'hi3'), (6, 'hi4'), (8, 'hi5'), (9, 'hi6'), (11, 'hi7'), (12, 'hi8'), (14, 'hi9'), (15, 'hi10'), (17, 'hi11'), (18, 'hi12'), (20, 'hi13'), (21, 'hi14'), (23, 'hi15'), (24, 'hi16'), (26, 'hi17'), (27, 'hi18'), (29, 'hi19'), (30, 'hi20'), (32, 'hi21'), (33, 'hi22'), (35, 'hi23'), (36, 'hi24'), (38, 'hi25'), (39, 'hi26'), (41, 'hi27'), (42, 'hi28'), (44, 'hi29'), (45, 'hi30'), (47, 'hi31'), (48, 'hi32'), (50, 'hi33'), (51, 'hi34'), (53, 'hi35'), (54, 'hi36'), (56, 'hi37'), (57, 'hi38'), (59, 'hi39'), (60, 'hi40'), (62, 'hi41'), (63, 'hi42'), (65, 'hi43'), (66, 'hi44'), (68, 'hi45'), (69, 'hi46'), (71, 'hi47'), (72, 'hi48'), (74, 'hi49'), (75, 'hi50'), (77, 'hi51'), (78, 'hi52'), (80, 'hi53'), (81, 'hi54'), (83, 'hi55'), (84, 'hi56'), (86, 'hi57'), (87, 'hi58'), (89, 'hi59'), (90, 'hi60'), (92, 'hi61'), (93, 'hi62'), (95, 'hi63'), (96, 'hi64'), (98, 'hi65'), (99, 'hi66'), (101, 'hi67'), (102, 'hi68'), (104, 'hi69'), (105, 'hi70'), (107, 'hi71'), (108, 'hi72'), (110, 'hi73'), (111, 'hi74'), (113, 'hi75'), (114, 'hi76'), (116, 'hi77'), (117, 'hi78'), (119, 'hi79'), (120, 'hi80'), (122, 'hi81'), (123, 'hi82'), (125, 'hi83'), (126, 'hi84'), (128, 'hi85'), (129, 'hi86'), (131, 'hi87'), (132, 'hi88'), (134, 'hi89'), (135, 'hi90'), (137, 'hi91'), (138, 'hi92'), (140, 'hi93'), (141, 'hi94'), (143, 'hi95'), (144, 'hi96'), (146, 'hi97'), (147, 'hi98'), (149, 'hi99'), (150, 'hi100'), '']' with counts [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ''].", + ), + ( + negation, ["col_int", "col_varchar"], + [(0, "hi0"), (1, "hi0")], 0, - util_filternull_element_or_tuple_any, + filternull_element_or_tuple_any, True, - util_output_postprocessing_sorter, - slice(None), + [output_processor_sort, functools.partial(output_processor_limit, limit=5)], None, None, + "column(s) 'col_int', 'col_varchar' has a fraction of 0.9997569866342649 > 0 DISTINCT values (8228 / 8230) not being an element of '[(0, 'hi0'), (1, 'hi0')]'. 
It has excess elements '[(2, 'hi1'), (3, 'hi2'), (5, 'hi3'), (6, 'hi4'), (8, 'hi5'), '']' with counts [2, 2, 2, 2, 2, ''].", + ), + ], +) +def test_uniques_subset_within_complex_with_outputcheck_extralong( + engine, unique_table_extralong, data +): + ( + operation, + columns, + uniques, + max_relative_violations, + filter_func, + compare_distinct, + output_processors, + function, + condition, + failure_message_suffix, + ) = data + req = requirements.WithinRequirement.from_table(*unique_table_extralong) + req.add_uniques_subset_constraint( + columns, + uniques, + max_relative_violations, + filter_func=filter_func, + compare_distinct=compare_distinct, + output_processors=output_processors, + condition=condition, + map_func=function, + ) + + test_result = req[0].test(engine) + print(test_result) + print(test_result.failure_message) + assert operation(test_result.outcome), test_result.failure_message + assert test_result.failure_message.endswith( + failure_message_suffix + ), test_result.failure_message + + +@pytest.mark.parametrize( + "data", + [ + ( + negation, + ["col_int", "col_varchar"], + ["col_int", "col_varchar"], + 0, + filternull_element_or_tuple_any, + True, + [output_processor_sort], + None, None, - "column(s) 'col_int', 'col_varchar' has a fraction of 0.325 > 0 DISTINCT values (13 / 40) " - "not being an element of '[(0, 'hi0'), (1, 'hi0'), (1, 'hi1'), (2, 'hi1'), (3, 'hi2'), (4, " - "'hi2'), (4, 'hi3'), (5, 'hi3'), (6, 'hi4'), (7, 'hi4'), (7, 'hi5'), (8, 'hi5'), (9, " - "'hi6'), (10, 'hi6'), (10, 'hi7'), (11, 'hi7'), (12, 'hi8'), (13, 'hi8'), (13, 'hi9'), " - "(14, 'hi9'), (15, 'hi10'), (16, 'hi10'), (16, 'hi11'), (17, 'hi11'), (18, 'hi12'), (19, " - "'hi12'), (19, 'hi13')]'. It has e.g. (slice(None, None, None)) excess elements '[(20, " - "'hi13'), (21, 'hi14'), (23, 'hi15'), (24, 'hi16'), (26, 'hi17'), (27, 'hi18'), (29, " - "'hi19'), (22, 'hi14'), (22, 'hi15'), (25, 'hi16'), (25, 'hi17'), (28, 'hi18'), (28, " - "'hi19')]' with counts [2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1].", + None, + "column(s) 'col_int', 'col_varchar' has a fraction of 0.325 > 0 DISTINCT values (13 / 40) not being an element of '[(0, 'hi0'), (1, 'hi0'), (1, 'hi1'), (2, 'hi1'), (3, 'hi2'), (4, 'hi2'), (4, 'hi3'), (5, 'hi3'), (6, 'hi4'), (7, 'hi4'), (7, 'hi5'), (8, 'hi5'), (9, 'hi6'), (10, 'hi6'), (10, 'hi7'), (11, 'hi7'), (12, 'hi8'), (13, 'hi8'), (13, 'hi9'), (14, 'hi9'), (15, 'hi10'), (16, 'hi10'), (16, 'hi11'), (17, 'hi11'), (18, 'hi12'), (19, 'hi12'), (19, 'hi13')]'. It has excess elements '[(20, 'hi13'), (21, 'hi14'), (23, 'hi15'), (24, 'hi16'), (26, 'hi17'), (27, 'hi18'), (29, 'hi19'), (22, 'hi14'), (22, 'hi15'), (25, 'hi16'), (25, 'hi17'), (28, 'hi18'), (28, 'hi19')]' with counts [2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1].", ), ( negation, ["col_int", "col_varchar"], ["col_int", "col_varchar"], 0, - util_filternull_element_or_tuple_all, + filternull_element_or_tuple_all, True, - util_output_postprocessing_sorter, - slice(None), + [output_processor_sort], None, None, None, - "column(s) 'col_int', 'col_varchar' has a fraction of 0.34146341463414637 > 0 DISTINCT " - "values (14 / 41) not being an element of '[(0, 'hi0'), (1, 'hi0'), (1, 'hi1'), (2, " - "'hi1'), (3, 'hi2'), (4, 'hi2'), (4, 'hi3'), (5, 'hi3'), (6, 'hi4'), (7, 'hi4'), (7, " - "'hi5'), (8, 'hi5'), (9, 'hi6'), (10, 'hi6'), (10, 'hi7'), (11, 'hi7'), (12, 'hi8'), (13, " - "'hi8'), (13, 'hi9'), (14, 'hi9'), (15, 'hi10'), (16, 'hi10'), (16, 'hi11'), (17, 'hi11'), " - "(18, 'hi12'), (19, 'hi12'), (19, 'hi13')]'. It has e.g. 
(slice(None, None, None)) excess " - "elements '[(20, 'hi13'), (21, 'hi14'), (23, 'hi15'), (24, 'hi16'), (26, 'hi17'), (27, " - "'hi18'), (29, 'hi19'), (None, 'hi'), (22, 'hi14'), (22, 'hi15'), (25, 'hi16'), (25, " - "'hi17'), (28, 'hi18'), (28, 'hi19')]' with counts [2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, " - "1].", + "column(s) 'col_int', 'col_varchar' has a fraction of 0.34146341463414637 > 0 DISTINCT values (14 / 41) not being an element of '[(0, 'hi0'), (1, 'hi0'), (1, 'hi1'), (2, 'hi1'), (3, 'hi2'), (4, 'hi2'), (4, 'hi3'), (5, 'hi3'), (6, 'hi4'), (7, 'hi4'), (7, 'hi5'), (8, 'hi5'), (9, 'hi6'), (10, 'hi6'), (10, 'hi7'), (11, 'hi7'), (12, 'hi8'), (13, 'hi8'), (13, 'hi9'), (14, 'hi9'), (15, 'hi10'), (16, 'hi10'), (16, 'hi11'), (17, 'hi11'), (18, 'hi12'), (19, 'hi12'), (19, 'hi13')]'. It has excess elements '[(20, 'hi13'), (21, 'hi14'), (23, 'hi15'), (24, 'hi16'), (26, 'hi17'), (27, 'hi18'), (29, 'hi19'), (None, 'hi'), (22, 'hi14'), (22, 'hi15'), (25, 'hi16'), (25, 'hi17'), (28, 'hi18'), (28, 'hi19')]' with counts [2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1].", ), ( negation, ["col_int", "col_varchar"], ["col_int", "col_varchar"], 0, - util_filternull_never, + filternull_never, True, - util_output_postprocessing_sorter, - slice(None), + [output_processor_sort], None, None, None, - "column(s) 'col_int', 'col_varchar' has a fraction of 0.35714285714285715 > 0 DISTINCT " - "values (15 / 42) not being an element of '[(0, 'hi0'), (1, 'hi0'), (1, 'hi1'), (2, " - "'hi1'), (3, 'hi2'), (4, 'hi2'), (4, 'hi3'), (5, 'hi3'), (6, 'hi4'), (7, 'hi4'), (7, " - "'hi5'), (8, 'hi5'), (9, 'hi6'), (10, 'hi6'), (10, 'hi7'), (11, 'hi7'), (12, 'hi8'), (13, " - "'hi8'), (13, 'hi9'), (14, 'hi9'), (15, 'hi10'), (16, 'hi10'), (16, 'hi11'), (17, 'hi11'), " - "(18, 'hi12'), (19, 'hi12'), (19, 'hi13')]'. It has e.g. (slice(None, None, None)) excess " - "elements '[(20, 'hi13'), (21, 'hi14'), (23, 'hi15'), (24, 'hi16'), (26, 'hi17'), (27, " - "'hi18'), (29, 'hi19'), (None, None), (None, 'hi'), (22, 'hi14'), (22, 'hi15'), (25, " - "'hi16'), (25, 'hi17'), (28, 'hi18'), (28, 'hi19')]' with counts [2, 2, 2, 2, 2, 2, 2, 1, " - "1, 1, 1, 1, 1, 1, 1].", + "column(s) 'col_int', 'col_varchar' has a fraction of 0.35714285714285715 > 0 DISTINCT values (15 / 42) not being an element of '[(0, 'hi0'), (1, 'hi0'), (1, 'hi1'), (2, 'hi1'), (3, 'hi2'), (4, 'hi2'), (4, 'hi3'), (5, 'hi3'), (6, 'hi4'), (7, 'hi4'), (7, 'hi5'), (8, 'hi5'), (9, 'hi6'), (10, 'hi6'), (10, 'hi7'), (11, 'hi7'), (12, 'hi8'), (13, 'hi8'), (13, 'hi9'), (14, 'hi9'), (15, 'hi10'), (16, 'hi10'), (16, 'hi11'), (17, 'hi11'), (18, 'hi12'), (19, 'hi12'), (19, 'hi13')]'. 
It has excess elements '[(20, 'hi13'), (21, 'hi14'), (23, 'hi15'), (24, 'hi16'), (26, 'hi17'), (27, 'hi18'), (29, 'hi19'), (None, None), (None, 'hi'), (22, 'hi14'), (22, 'hi15'), (25, 'hi16'), (25, 'hi17'), (28, 'hi18'), (28, 'hi19')]' with counts [2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1].", ), ], ) @@ -1376,8 +1094,7 @@ def test_uniques_subset_between_with_outputcheck( max_relative_violations, filter_func, compare_distinct, - output_postprocessing_sorter, - output_remainder_slicer, + output_processors, map_func, condition1, condition2, @@ -1390,8 +1107,7 @@ def test_uniques_subset_between_with_outputcheck( max_relative_violations=max_relative_violations, filter_func=filter_func, compare_distinct=compare_distinct, - output_postprocessing_sorter=output_postprocessing_sorter, - output_remainder_slicer=output_remainder_slicer, + output_processors=output_processors, map_func=map_func, condition1=condition1, condition2=condition2, @@ -1493,6 +1209,73 @@ def test_functional_dependency_within_multi_key( assert operation(req[0].test(engine).outcome) +@pytest.mark.parametrize( + "data", + [ + ( + negation, + ["a", "b"], + ["d"], + [output_processor_sort], + textwrap.dedent( + """\ + column(s) 'a', 'b', 'd' has violations of functional dependence (in total 10 rows): + (1, 1, 3) + (1, 1, 4) + (1, 2, 5) + (1, 2, 6) + (2, 1, 7) + (2, 1, 8) + (2, 2, 9) + (2, 2, 10) + (44, 44, 12) + (44, 44, 13) + """ + ).strip(), + ), + ( + negation, + ["a", "b"], + ["c", "e"], + [output_processor_sort], + textwrap.dedent( + """\ + column(s) 'a', 'b', 'c', 'e' has violations of functional dependence (in total 4 rows): + (43, 43, 6, 6) + (43, 43, 6, 7) + (44, 44, None, None) + (44, 44, None, 99) + """ + ).strip(), + ), + ], +) +def test_functional_dependency_within_multi_key_with_outputcheck( + engine, functional_dependency_table_multi_key, data +): + ( + operation, + key_columns, + value_columns, + output_processors, + failure_message_suffix, + ) = data + req = requirements.WithinRequirement.from_table( + *functional_dependency_table_multi_key + ) + req.add_functional_dependency_constraint( + key_columns, + value_columns, + output_processors=output_processors, + ) + + test_result = req[0].test(engine) + assert operation(test_result.outcome) + assert test_result.failure_message.endswith( + failure_message_suffix + ), test_result.failure_message + + def _flatten_and_filter(data): # Flattening one level res = [] @@ -2945,7 +2728,7 @@ def test_max_null_fraction_between(engine, unique_table1, data): ) def test_column_type_within(engine, mix_table1, data): (operation, col_name, type_name) = data - if is_impala(engine) and type_name is str: + if is_impala(engine) and isinstance(type_name, str): type_name = {"VARCHAR": "string", "INTEGER": "int"}[type_name] req = requirements.WithinRequirement.from_table(*mix_table1) req.add_column_type_constraint(col_name, type_name) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index c56eb6ca..897ebfef 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,6 +1,11 @@ import pytest -from datajudge.utils import format_difference +from datajudge.utils import ( + format_difference, + output_processor_limit, + output_processor_sort, + sort_tuple_none_aware, +) @pytest.mark.parametrize( @@ -53,3 +58,272 @@ def test_diff_color(n1, n2, sep_decimal, expected_n1, expected_n2): expected_n1, expected_n2, ) + + +@pytest.mark.parametrize( + "input_main, input_counts, output_main, output_counts", + [ + ( + [5, None, -2, 42, 1337, 42, 42, -2, None, None], + None, + 
[-2, -2, None, None, None, 5, 42, 42, 42, 1337],
+            None,
+        ),
+        (
+            {5, None, -2, 42, 1337, 42, 42, -2, None, None},
+            None,
+            [-2, None, 5, 42, 1337],
+            None,
+        ),
+        (
+            [5, None, -2, 42, 1337, 42, 42, -2, None, None],
+            [0, 42, 1, 3, 1, 2, 2, 0, 0, -99],
+            [None, 42, 42, 42, -2, 1337, -2, None, 5, None],
+            [42, 3, 2, 2, 1, 1, 0, 0, 0, -99],
+        ),
+        (
+            [(5, 5), (1, None), (1, -2), (3, 42), (4, 1337)],
+            None,
+            [(1, -2), (1, None), (3, 42), (4, 1337), (5, 5)],
+            None,
+        ),
+        (
+            [(5, 5), (1, None), (1, -2), (3, 42), (4, 1337)],
+            [0, 0, 0, 0, 1],
+            [(4, 1337), (1, -2), (1, None), (3, 42), (5, 5)],
+            [1, 0, 0, 0, 0],
+        ),
+        (
+            [
+                [5, 5],
+                [1, 5],
+                [1, -2],
+                [3, 42],
+                [4, 1337],
+            ],
+            None,
+            [[1, -2], [1, 5], [3, 42], [4, 1337], [5, 5]],
+            None,
+        ),
+    ],
+)
+def test_output_processor_sort(input_main, input_counts, output_main, output_counts):
+    input_main_copy = input_main.copy()
+    input_counts_copy = input_counts.copy() if input_counts is not None else None
+    assert output_processor_sort(input_main, input_counts) == (
+        output_main,
+        output_counts,
+    )
+    assert input_main == input_main_copy
+    assert input_counts == input_counts_copy
+
+
+@pytest.mark.parametrize(
+    "input_main, input_counts, output_main, output_counts, error",
+    [
+        (
+            [
+                [5, 5],
+                [1, None],
+                [1, -2],
+                [3, 42],
+                [4, 1337],
+            ],
+            None,
+            None,
+            None,
+            TypeError,
+        ),
+        (
+            [5, None, -2, 42, 1337, 42, 42, -2, None, None],
+            [0, 42],
+            None,
+            None,
+            ValueError,
+        ),
+    ],
+)
+def test_output_processor_sort_error(
+    input_main, input_counts, output_main, output_counts, error
+):
+    with pytest.raises(error):
+        output_processor_sort(input_main, input_counts)
+
+
+def test_output_processor_limit_defaults():
+    input_main = list(range(12345))
+    input_counts = None
+
+    input_main_copy = input_main.copy()
+    input_counts_copy = input_counts.copy() if input_counts is not None else None
+    assert output_processor_limit(input_main, input_counts) == (
+        list(range(100))
+        + [""],
+        None,
+    )
+    assert input_main == input_main_copy
+    assert input_counts == input_counts_copy
+
+
+def test_output_processor_limit_custom():
+    input_main = list(range(12345))
+    input_counts = None
+
+    input_main_copy = input_main.copy()
+    input_counts_copy = input_counts.copy() if input_counts is not None else None
+    assert output_processor_limit(input_main, input_counts, limit=42) == (
+        list(range(42))
+        + [""],
+        None,
+    )
+    assert input_main == input_main_copy
+    assert input_counts == input_counts_copy
+
+
+def test_output_processor_limit_withcounts():
+    input_main = list(range(12345))
+    input_counts = list(range(1, 12345 + 1))
+
+    input_main_copy = input_main.copy()
+    input_counts_copy = input_counts.copy() if input_counts is not None else None
+    assert output_processor_limit(input_main, input_counts, limit=42) == (
+        list(range(42))
+        + [""],
+        list(range(1, 42 + 1))
+        + [""],
+    )
+    assert input_main == input_main_copy
+    assert input_counts == input_counts_copy
+
+
+class CustomObject:
+    def __init__(self, value=42):
+        self.value = value
+
+    def __eq__(self, other):
+        return self.value == other.value
+
+    def __lt__(self, other):
+        return self.value < other.value
+
+    def __repr__(self):
+        return f"CustomObject({self.value})"
+
+
+@pytest.mark.parametrize(
+    "input_main, output_main",
+    [
+        (
+            [
+                (5, -3, 42),
+                (None, None, None),
+                (3, 5, 42),
+                (None, None, None),
+                (3, 5, 42),
+                (3, 5, -5),
+                (-3, 5, 42),
+                (None, None, -1),
+                (0, 0, -1),  # this must occur in between the (None, None, -1) tuples
+                # since sorted(...)
is stable + (0, 0, -2), + (0, 0, 2), + (None, None, -1), + (None, 3, 42), + ], + [ + (-3, 5, 42), + (0, 0, -2), + (None, None, -1), + (0, 0, -1), + (None, None, -1), + (None, None, None), + (None, None, None), + (0, 0, 2), + (None, 3, 42), + (3, 5, -5), + (3, 5, 42), + (3, 5, 42), + (5, -3, 42), + ], + ), + ( + [ + (5, 3.14, None, None, None), + (None, None, "abc", CustomObject(13), None), + (-3, -3.14, "äöü", CustomObject(1337), None), + ], + [ + ( + -3, + -3.14, + "äöü", + CustomObject(1337), + None, + ), + ( + None, + None, + "abc", + CustomObject(13), + None, + ), + ( + 5, + 3.14, + None, + None, + None, + ), + ], + ), + ( + [(3.14,), (None,), (-1,)], + [(-1,), (None,), (3.14,)], + ), + ( + [(None,), ("ÄÖÜ",), ("abc",)], + [(None,), ("abc",), ("ÄÖÜ",)], + ), + ( + [(None,), (CustomObject(13),), (CustomObject(1337),)], + [(CustomObject(13),), (None,), (CustomObject(1337),)], + ), + ( + [(None, 5), (None, -2), (None, None)], + [(None, -2), (None, None), (None, 5)], + ), + ], +) +def test_sort_tuple_none_aware(input_main, output_main): + input_main_copy = input_main.copy() + assert sort_tuple_none_aware(input_main) == output_main + assert input_main == input_main_copy + + assert sort_tuple_none_aware(input_main, ascending=False) == output_main[::-1] + assert input_main == input_main_copy + + +@pytest.mark.parametrize( + "input_main, output_main, error", + [ + ( + [ + (5, -3, 42), + [None, None, None], + ], + None, + ValueError, + ), + ( + [ + (5, -3, 42), + (None, None), + ], + None, + ValueError, + ), + ], +) +def test_sort_tuple_none_error(input_main, output_main, error): + with pytest.raises(error): + sort_tuple_none_aware(input_main)
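
For reviewers: a minimal sketch of how the reworked `output_processors` option is meant to be used, assuming the usual `WithinRequirement.from_table` entry point. The table coordinates (`my_db`, `my_schema`, `my_table`) and the custom `output_processor_reverse` helper are hypothetical; the `(values, counts) -> (values, counts)` contract for custom processors is the one exercised against `output_processor_sort` and `output_processor_limit` in the unit tests above.

import functools

from datajudge import WithinRequirement
from datajudge.utils import output_processor_limit, output_processor_sort


def output_processor_reverse(values, counts=None):
    # Hypothetical custom processor: any callable taking the collection of
    # reported values (and, optionally, their counts) and returning both
    # should be usable, mirroring the built-in processors tested above.
    values = list(values)[::-1]
    counts = list(counts)[::-1] if counts is not None else None
    return values, counts


req = WithinRequirement.from_table(
    db_name="my_db", schema_name="my_schema", table_name="my_table"
)
req.add_uniques_subset_constraint(
    ["col_int", "col_varchar"],
    [(0, "hi0"), (1, "hi0")],
    max_relative_violations=0,
    # Processors are applied in list order: sort the reported violations
    # ascending, flip them to descending via the custom helper, then truncate
    # the report to its first 5 entries (output_processor_limit defaults to
    # limit=100, as the unit tests above check).
    output_processors=[
        output_processor_sort,
        output_processor_reverse,
        functools.partial(output_processor_limit, limit=5),
    ],
)

A bare callable, e.g. output_processors=output_processor_sort, works as well, as in the first test_uniques_equality_within_with_outputcheck case; a list is only needed when several processors should be chained.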