adding configuration options to uniques functionality #224

Merged (43 commits, merged Jun 12, 2024)
Changes from 10 commits
Commits (43)
9ea8866
adding configuration options to uniques functionality
SimonLangerQC Jun 3, 2024
92f2933
improve docstrings
SimonLangerQC Jun 4, 2024
63e9634
move util_ functions to datajudge.utils
SimonLangerQC Jun 4, 2024
3782955
updates following comments
SimonLangerQC Jun 5, 2024
6cad13e
add configuration options to functional dependency checks, and utilit…
SimonLangerQC Jun 5, 2024
7dec26e
fix typo in run_integration_tests_postgres.sh
SimonLangerQC Jun 5, 2024
6322310
rename to output_processor
SimonLangerQC Jun 5, 2024
308ff99
output_processor only
SimonLangerQC Jun 5, 2024
c1fec1a
allow for single output processor
SimonLangerQC Jun 5, 2024
91ea51c
add output_processor_limit
SimonLangerQC Jun 6, 2024
0f94589
Docs update
SimonLangerQC Jun 7, 2024
693b29b
Docs update
SimonLangerQC Jun 7, 2024
5c0c03b
Docs update
SimonLangerQC Jun 7, 2024
52f993d
Docs update
SimonLangerQC Jun 7, 2024
887b0e6
Docs update
SimonLangerQC Jun 7, 2024
0699ddb
Docs update
SimonLangerQC Jun 7, 2024
3ef980e
Docs update
SimonLangerQC Jun 7, 2024
13d866f
Docs update
SimonLangerQC Jun 7, 2024
582af61
Docs update
SimonLangerQC Jun 7, 2024
a522268
Docs update
SimonLangerQC Jun 7, 2024
0c87d34
Docs update
SimonLangerQC Jun 7, 2024
cdd6e1f
Docs update
SimonLangerQC Jun 7, 2024
658c8ac
Docs update
SimonLangerQC Jun 7, 2024
b5c1a1f
Docs update
SimonLangerQC Jun 7, 2024
f40c5c0
Docs update
SimonLangerQC Jun 7, 2024
151d53b
Docs update
SimonLangerQC Jun 7, 2024
2f99478
Docs update
SimonLangerQC Jun 7, 2024
ea326ad
Docs update
SimonLangerQC Jun 7, 2024
c712205
Docs update
SimonLangerQC Jun 7, 2024
9eb3433
update doc string on null columns everywhere and fix typo
SimonLangerQC Jun 7, 2024
e6c396a
Update docs
SimonLangerQC Jun 7, 2024
3ca2003
Update docs
SimonLangerQC Jun 7, 2024
4ddda10
Update docs
SimonLangerQC Jun 7, 2024
0502720
docs updates
SimonLangerQC Jun 7, 2024
cf42e38
update docs
SimonLangerQC Jun 7, 2024
409b611
filternull docs clarification
SimonLangerQC Jun 7, 2024
536096a
replace assert by raise ValueError
SimonLangerQC Jun 7, 2024
0067b84
shorten name to apply_output_formatting
SimonLangerQC Jun 10, 2024
143f0f9
add unit tests for new utils functions
SimonLangerQC Jun 10, 2024
b8842a7
set default to limit 100 elements
SimonLangerQC Jun 10, 2024
0041e99
ensure all relevant tests run for impala and ensure they pass
SimonLangerQC Jun 11, 2024
cb63bef
disable extralong test for bigquery due to slow speed
SimonLangerQC Jun 11, 2024
62f6877
capitalization test handle parallel if table already created
SimonLangerQC Jun 11, 2024
15 changes: 15 additions & 0 deletions run_integration_tests_postgres.sh
@@ -0,0 +1,15 @@
#!/bin/bash

docker stop $(docker ps -q --filter name=postgres_datajudge)

./start_postgres.sh &
bash -c "while true; do printf '\nPress enter once postgres is ready: '; sleep 1; done" &

read -p "Press enter once postgres is ready: "
kill %%

echo "STARTING PYTEST"
pytest tests/integration -vv --backend=postgres "$@"

docker stop $(docker ps -q --filter name=postgres_datajudge)

25 changes: 23 additions & 2 deletions src/datajudge/constraints/base.py
@@ -1,12 +1,13 @@
import abc
from dataclasses import dataclass, field
from functools import lru_cache
from typing import Any, Callable, List, Optional, Tuple, TypeVar
from typing import Any, Callable, Collection, List, Optional, Tuple, TypeVar, Union

import sqlalchemy as sa

from ..db_access import DataReference
from ..formatter import Formatter
from ..utils import OutputProcessor

DEFAULT_FORMATTER = Formatter()

@@ -113,7 +114,15 @@ class Constraint(abc.ABC):
"""

def __init__(
self, ref: DataReference, *, ref2=None, ref_value: Any = None, name: str = None
self,
ref: DataReference,
*,
ref2=None,
ref_value: Any = None,
name: str = None,
output_processors: Optional[
Union[OutputProcessor, List[OutputProcessor]]
] = None,
):
self._check_if_valid_between_or_within(ref2, ref_value)
self.ref = ref
@@ -125,6 +134,12 @@ def __init__(
self.factual_queries: Optional[List[str]] = None
self.target_queries: Optional[List[str]] = None

if (output_processors is not None) and (
not isinstance(output_processors, list)
):
output_processors = [output_processors]
self.output_processors = output_processors

def _check_if_valid_between_or_within(
self, ref2: Optional[DataReference], ref_value: Optional[Any]
):
@@ -241,6 +256,12 @@ def test(self, engine: sa.engine.Engine) -> TestResult:
target_queries,
)

def apply_output_formatting_no_counts(self, values: Collection) -> Collection:
if self.output_processors is not None:
for output_processor in self.output_processors:
values, _ = output_processor(values)
return values


def format_sample(sample, ref: DataReference) -> str:
"""Build a string from a database row indicating its column values."""
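
To make the additions above easier to follow, here is a minimal, self-contained sketch of the intended behavior: a single callable passed as `output_processors` is wrapped into a one-element list, and `apply_output_formatting_no_counts` threads the values through each processor in turn, discarding the counts slot. The `sort_processor`, `limit_processor`, and `DemoConstraint` names below are hypothetical stand-ins for illustration, not datajudge code; only the two mirrored code paths come from the diff.

from typing import Collection, List, Optional, Tuple

def sort_processor(
    values: Collection, counts: Optional[List] = None
) -> Tuple[Collection, Optional[List]]:
    # Hypothetical processor: return the values sorted, counts passed through untouched.
    return sorted(values), counts

def limit_processor(
    values: Collection, counts: Optional[List] = None, limit: int = 3
) -> Tuple[Collection, Optional[List]]:
    # Hypothetical processor: keep only the first `limit` elements (and matching counts).
    limited_counts = None if counts is None else list(counts)[:limit]
    return list(values)[:limit], limited_counts

class DemoConstraint:
    # Stand-in class mirroring only the two code paths added to Constraint in this PR.
    def __init__(self, output_processors=None):
        # A single callable is wrapped into a one-element list.
        if (output_processors is not None) and (
            not isinstance(output_processors, list)
        ):
            output_processors = [output_processors]
        self.output_processors = output_processors

    def apply_output_formatting_no_counts(self, values: Collection) -> Collection:
        # Run the values through each processor in order; the counts slot is ignored here.
        if self.output_processors is not None:
            for output_processor in self.output_processors:
                values, _ = output_processor(values)
        return values

print(DemoConstraint(sort_processor).apply_output_formatting_no_counts({3, 1, 2}))
# [1, 2, 3]
print(
    DemoConstraint(
        [sort_processor, limit_processor]
    ).apply_output_formatting_no_counts({5, 4, 3, 2, 1})
)
# [1, 2, 3]
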
11 changes: 9 additions & 2 deletions src/datajudge/constraints/miscs.py
@@ -130,8 +130,15 @@ def test(self, engine: sa.engine.Engine) -> TestResult:
return TestResult.success()

assertion_text = (
f"{self.ref} has violations of functional dependence, e.g.:\n"
+ "\n".join([f"{tuple(violation)}" for violation in violations][:5])
f"{self.ref} has violations of functional dependence (in total {len(violations)} rows):\n"
+ "\n".join(
[
f"{violation}"
for violation in self.apply_output_formatting_no_counts(
[tuple(elem) for elem in violations]
)
]
)
)
return TestResult.failure(assertion_text)

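
As a rough illustration of the reworked failure message, here is the same formatting logic evaluated on toy data outside the class, with no output processors configured, so the formatting step is a no-op; the placeholder <ref> stands in for the DataReference string.

violations = [("a", 1), ("a", 2), ("b", 7), ("b", 9)]

# With output_processors=None, apply_output_formatting_no_counts returns its input unchanged.
formatted = [tuple(elem) for elem in violations]

assertion_text = (
    "<ref> has violations of functional dependence "
    f"(in total {len(violations)} rows):\n"
    + "\n".join(f"{violation}" for violation in formatted)
)
print(assertion_text)
# <ref> has violations of functional dependence (in total 4 rows):
# ('a', 1)
# ('a', 2)
# ('b', 7)
# ('b', 9)
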
116 changes: 99 additions & 17 deletions src/datajudge/constraints/uniques.py
@@ -1,4 +1,5 @@
import abc
import warnings
from collections import Counter
from itertools import zip_longest
from math import ceil, floor
@@ -8,6 +9,7 @@

from .. import db_access
from ..db_access import DataReference
from ..utils import OutputProcessor, filternull_element
from .base import Constraint, OptionalSelections, T, TestResult, ToleranceGetter


@@ -42,9 +44,18 @@ class Uniques(Constraint, abc.ABC):
are part of a reference set of expected values - either externally supplied
through parameter `uniques` or obtained from another `DataSource`.

Null values in the column are ignored. To assert the non-existence of them use
Null values in the column are ignored by default. To assert the non-existence of them use
the `NullAbsence` constraint via the `add_null_absence_constraint` helper method for
`WithinRequirement`.
By default, the null filtering does not trigger if multiple columns are fetched at once.
It can be configured in more detail by supplying a custom ``filter_func`` function.
Some exemplary implementations are available in this module as ``datajudge.utils.filternull_element``,
``datajudge.utils.filternull_never``, ``datajudge.utils.filternull_element_or_tuple_all``, ``datajudge.utils.filternull_element_or_tuple_any``.
For new deployments, using one of the above filters or a custom one is recommended.

Collaborator: The word "deployment" in this line seems odd to me. I'm not sure that this sentence is actually needed at all.

Contributor (author): Removed.

Passing None as the argument is equivalent to ``datajudge.utils.filternull_element``, but triggers a warning.
The deprecated default may change in future versions.
To silence the warning, set ``filter_func`` explicitly.

Collaborator: One could silence it by configuring the warnings module to do so. I think the fact that it's a warning indicating eventual future deprecation should be enough.

There are two ways to do some post processing of the data obtained from the
database by providing a function to be executed. In general, no postprocessing
@@ -63,6 +74,30 @@
(eager or lazy) of the same type as the type of the values of the column (in their
Python equivalent).

Furthermore, the `max_relative_violations` parameter can be used to set a tolerance
threshold for the proportion of elements in the data that can violate the constraint
(default: 0).
Setting this argument is currently not supported for `UniquesEquality`.

For `UniquesSubset`, by default,
the number of occurrences affects the computed fraction of violations.
To disable this weighting, set `compare_distinct=True`.
This argument does not have an effect on the test results for other `Uniques` constraints,
or if `max_relative_violations` is 0.

By default, the assertion messages make use of sets,
thus, they may differ from run to run despite the exact same situation being present,
and can have an arbitrary length.
To enforce a reproducible, limited output via (e.g.) sorting and slicing,
set `output_processors` to a list of callables.

Each callable takes in two collections, and returns modified (e.g. sorted) versions of them.
In most cases, the second argument is simply None,
but for `UniquesSubset` it is the counts of each of the elements.
The suggested functions are ``datajudge.utils.output_processor_sort``
and ``datajudge.utils.output_processor_limit``
- see their respective docstrings for details.

One use of this constraint is to test for consistency in columns with expected
categorical values.
"""
@@ -71,26 +106,44 @@ def __init__(
self,
ref: DataReference,
name: str = None,
output_processors: Optional[
Union[OutputProcessor, List[OutputProcessor]]
] = None,
*,
ref2: DataReference = None,
uniques: Collection = None,
filter_func: Callable[[List[T]], List[T]] = None,
map_func: Callable[[T], T] = None,
reduce_func: Callable[[Collection], Collection] = None,
max_relative_violations=0,
compare_distinct=False,
):
ref_value: Optional[Tuple[Collection, List]]
ref_value = (uniques, []) if uniques else None
super().__init__(ref, ref2=ref2, ref_value=ref_value, name=name)
super().__init__(
ref,
ref2=ref2,
ref_value=ref_value,
name=name,
output_processors=output_processors,
)

if filter_func is None:
warnings.warn("Using deprecated default null filter function.")
filter_func = filternull_element

self.filter_func = filter_func
self.local_func = map_func
self.global_func = reduce_func
self.max_relative_violations = max_relative_violations
self.compare_distinct = compare_distinct

def retrieve(
self, engine: sa.engine.Engine, ref: DataReference
) -> Tuple[Tuple[List[T], List[int]], OptionalSelections]:
uniques, selection = db_access.get_uniques(engine, ref)
values = list(uniques.keys())
values = list(filter(lambda value: value is not None, values))
values = self.filter_func(values)
counts = [uniques[value] for value in values]
if self.local_func:
values = list(map(self.local_func, values))
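
The `retrieve` step above now applies `self.filter_func` to the fetched values. For concreteness, here is a sketch of what the four filter helpers named in the class docstring plausibly do, inferred from their names and the surrounding text; the actual implementations in ``datajudge.utils`` may differ in detail.

from typing import List, TypeVar

T = TypeVar("T")

def filternull_never(values: List[T]) -> List[T]:
    # Keep everything, including NULLs.
    return values

def filternull_element(values: List[T]) -> List[T]:
    # Drop elements that are exactly None; tuples containing None are kept.
    return [value for value in values if value is not None]

def filternull_element_or_tuple_all(values: List[T]) -> List[T]:
    # Additionally drop tuples whose entries are all None.
    return [
        value
        for value in values
        if value is not None
        and not (isinstance(value, tuple) and all(v is None for v in value))
    ]

def filternull_element_or_tuple_any(values: List[T]) -> List[T]:
    # Additionally drop tuples containing at least one None entry.
    return [
        value
        for value in values
        if value is not None
        and not (isinstance(value, tuple) and any(v is None for v in value))
    ]

rows = [("a", 1), ("b", None), (None, None), None]
print(filternull_element(rows))               # [('a', 1), ('b', None), (None, None)]
print(filternull_element_or_tuple_all(rows))  # [('a', 1), ('b', None)]
print(filternull_element_or_tuple_any(rows))  # [('a', 1)]
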
@@ -106,7 +159,11 @@ def retrieve(
class UniquesEquality(Uniques):
def __init__(self, args, name: str = None, **kwargs):
if kwargs.get("max_relative_violations"):
raise RuntimeError("Some useful message")
raise RuntimeError(
"max_relative_violations is not supported for UniquesEquality."
)
if kwargs.get("compare_distinct"):
raise RuntimeError("compare_distinct is not supported for UniquesEquality.")
super().__init__(args, name=name, **kwargs)

def compare(
@@ -123,22 +180,22 @@ def compare(
if not is_subset and not is_superset:
assertion_text = (
f"{self.ref} doesn't have the element(s) "
f"'{lacking_values}' and has the excess element(s) "
f"'{excess_values}' when compared with the reference values. "
f"'{self.apply_output_formatting_no_counts(lacking_values)}' and has the excess element(s) "
f"'{self.apply_output_formatting_no_counts(excess_values)}' when compared with the reference values. "
f"{self.condition_string}"
)
return False, assertion_text
if not is_subset:
assertion_text = (
f"{self.ref} has the excess element(s) "
f"'{excess_values}' when compared with the reference values. "
f"'{self.apply_output_formatting_no_counts(excess_values)}' when compared with the reference values. "
f"{self.condition_string}"
)
return False, assertion_text
if not is_superset:
assertion_text = (
f"{self.ref} doesn't have the element(s) "
f"'{lacking_values}' when compared with the reference values. "
f"'{self.apply_output_formatting_no_counts(lacking_values)}' when compared with the reference values. "
f"{self.condition_string}"
)
return False, assertion_text
@@ -153,28 +210,49 @@ def compare(
) -> Tuple[bool, Optional[str]]:
factual_values, factual_counts = factual
target_values, _ = target

is_subset, remainder = _subset_violation_counts(
factual_values, factual_counts, target_values
)
n_rows = sum(factual_counts)
n_violations = sum(remainder.values())
if not self.compare_distinct:
n_rows = sum(factual_counts)
n_violations = sum(remainder.values())
else:
n_rows = len(factual_values)
n_violations = len(remainder)

if (
n_rows > 0
and (relative_violations := (n_violations / n_rows))
> self.max_relative_violations
):
output_elemes, output_counts = list(remainder.keys()), list(
remainder.values()
)
if self.output_processors is not None:
for output_processor in self.output_processors:
output_elemes, output_counts = output_processor(
output_elemes, output_counts
)

assertion_text = (
f"{self.ref} has a fraction of {relative_violations} > "
f"{self.max_relative_violations} values not being an element of "
f"'{set(target_values)}'. It has e.g. excess elements "
f"'{list(remainder.keys())[:5]}'."
f"{self.max_relative_violations} {'DISTINCT ' if self.compare_distinct else ''}values ({n_violations} / {n_rows}) not being an element of "
f"'{self.apply_output_formatting_no_counts(set(target_values))}'. It has excess elements "
f"'{output_elemes}' "
f"with counts {output_counts}."
f"{self.condition_string}"
)
return False, assertion_text
return True, None


class UniquesSuperset(Uniques):
def __init__(self, args, name: str = None, **kwargs):
if kwargs.get("compare_distinct"):
raise RuntimeError("compare_distinct is not supported for UniquesSuperset.")
super().__init__(args, name=name, **kwargs)

def compare(
self,
factual: Tuple[List[T], List[int]],
@@ -185,14 +263,18 @@ def compare(
is_superset, remainder = _is_superset(factual_values, target_values)
if (
len(factual_values) > 0
and (relative_violations := (len(remainder) / len(target_values)))
and (
relative_violations := (
(n_violations := (len(remainder))) / (n_rows := len(target_values))
)
)
> self.max_relative_violations
):
assertion_text = (
f"{self.ref} has a fraction of "
f"{relative_violations} > {self.max_relative_violations} "
f"lacking unique values of '{set(target_values)}'. E.g. it "
f"doesn't have the unique value(s) '{list(remainder)[:5]}'."
f"{relative_violations} > {self.max_relative_violations} ({n_violations} / {n_rows}) "
f"lacking unique values of '{self.apply_output_formatting_no_counts(set(target_values))}'. It "
f"doesn't have the unique value(s) '{self.apply_output_formatting_no_counts(list(remainder))}'."
f"{self.condition_string}"
)
return False, assertion_text
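
Finally, a hedged end-to-end sketch of how the new options might be used from the requirement level. It assumes that the `WithinRequirement` helper `add_uniques_subset_constraint` forwards the new keyword arguments introduced in this PR; the database, schema, table, and column names are made up.

import datajudge as dj
from datajudge.utils import (
    filternull_element_or_tuple_any,
    output_processor_limit,
    output_processor_sort,
)

within = dj.WithinRequirement.from_table(
    db_name="mydb", schema_name="public", table_name="colors"
)

within.add_uniques_subset_constraint(
    columns=["color"],
    uniques=["red", "green", "blue"],
    max_relative_violations=0.05,  # tolerate up to 5% of violating rows
    compare_distinct=False,        # weight violations by row counts (the default)
    filter_func=filternull_element_or_tuple_any,  # set explicitly to avoid the deprecation warning
    output_processors=[output_processor_sort, output_processor_limit],  # sorted, capped messages
)
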