adding configuration options to uniques functionality #224

Merged (43 commits, merged Jun 12, 2024)
Changes from 10 commits
Commits (43)
9ea8866
adding configuration options to uniques functionality
SimonLangerQC Jun 3, 2024
92f2933
improve docstrings
SimonLangerQC Jun 4, 2024
63e9634
move util_ functions to datajudge.utils
SimonLangerQC Jun 4, 2024
3782955
updates following comments
SimonLangerQC Jun 5, 2024
6cad13e
add configuration options to functional dependency checks, and utilit…
SimonLangerQC Jun 5, 2024
7dec26e
fix typo in run_integration_tests_postgres.sh
SimonLangerQC Jun 5, 2024
6322310
rename to output_processor
SimonLangerQC Jun 5, 2024
308ff99
output_processor only
SimonLangerQC Jun 5, 2024
c1fec1a
allow for single output processor
SimonLangerQC Jun 5, 2024
91ea51c
add output_processor_limit
SimonLangerQC Jun 6, 2024
0f94589
Docs update
SimonLangerQC Jun 7, 2024
693b29b
Docs update
SimonLangerQC Jun 7, 2024
5c0c03b
Docs update
SimonLangerQC Jun 7, 2024
52f993d
Docs update
SimonLangerQC Jun 7, 2024
887b0e6
Docs update
SimonLangerQC Jun 7, 2024
0699ddb
Docs update
SimonLangerQC Jun 7, 2024
3ef980e
Docs update
SimonLangerQC Jun 7, 2024
13d866f
Docs update
SimonLangerQC Jun 7, 2024
582af61
Docs update
SimonLangerQC Jun 7, 2024
a522268
Docs update
SimonLangerQC Jun 7, 2024
0c87d34
Docs update
SimonLangerQC Jun 7, 2024
cdd6e1f
Docs update
SimonLangerQC Jun 7, 2024
658c8ac
Docs update
SimonLangerQC Jun 7, 2024
b5c1a1f
Docs update
SimonLangerQC Jun 7, 2024
f40c5c0
Docs update
SimonLangerQC Jun 7, 2024
151d53b
Docs update
SimonLangerQC Jun 7, 2024
2f99478
Docs update
SimonLangerQC Jun 7, 2024
ea326ad
Docs update
SimonLangerQC Jun 7, 2024
c712205
Docs update
SimonLangerQC Jun 7, 2024
9eb3433
update doc string on null columns everywhere and fix typo
SimonLangerQC Jun 7, 2024
e6c396a
Update docs
SimonLangerQC Jun 7, 2024
3ca2003
Update docs
SimonLangerQC Jun 7, 2024
4ddda10
Update docs
SimonLangerQC Jun 7, 2024
0502720
docs updates
SimonLangerQC Jun 7, 2024
cf42e38
update docs
SimonLangerQC Jun 7, 2024
409b611
filternull docs clarification
SimonLangerQC Jun 7, 2024
536096a
replace assert by raise ValueError
SimonLangerQC Jun 7, 2024
0067b84
shorten name to apply_output_formatting
SimonLangerQC Jun 10, 2024
143f0f9
add unit tests for new utils functions
SimonLangerQC Jun 10, 2024
b8842a7
set default to limit 100 elements
SimonLangerQC Jun 10, 2024
0041e99
ensure all relevant tests run for impala and ensure they pass
SimonLangerQC Jun 11, 2024
cb63bef
disable extralong test for bigquery due to slow speed
SimonLangerQC Jun 11, 2024
62f6877
capitalization test handle parallel if table already created
SimonLangerQC Jun 11, 2024
15 changes: 15 additions & 0 deletions run_integration_tests_postgres.sh
@@ -0,0 +1,15 @@
#!/bin/bash

docker stop $(docker ps -q --filter name=postgres_datajudge)

./start_postgres.sh &
bash -c "while true; do printf '\nPress enter once postgres is ready: '; sleep 1; done" &

read -p "Press enter once postgres is ready: "
kill %%

echo "STARTING PYTEST"
pytest tests/integration -vv --backend=postgres "$@"

docker stop $(docker ps -q --filter name=postgres_datajudge)

25 changes: 23 additions & 2 deletions src/datajudge/constraints/base.py
@@ -1,12 +1,13 @@
import abc
from dataclasses import dataclass, field
from functools import lru_cache
from typing import Any, Callable, List, Optional, Tuple, TypeVar
from typing import Any, Callable, Collection, List, Optional, Tuple, TypeVar, Union

import sqlalchemy as sa

from ..db_access import DataReference
from ..formatter import Formatter
from ..utils import OutputProcessor

DEFAULT_FORMATTER = Formatter()

@@ -113,7 +114,15 @@ class Constraint(abc.ABC):
"""

def __init__(
self, ref: DataReference, *, ref2=None, ref_value: Any = None, name: str = None
self,
ref: DataReference,
*,
ref2=None,
ref_value: Any = None,
name: str = None,
output_processors: Optional[
Union[OutputProcessor, List[OutputProcessor]]
] = None,
):
self._check_if_valid_between_or_within(ref2, ref_value)
self.ref = ref
@@ -125,6 +134,12 @@ def __init__(
self.factual_queries: Optional[List[str]] = None
self.target_queries: Optional[List[str]] = None

if (output_processors is not None) and (
not isinstance(output_processors, list)
):
output_processors = [output_processors]
self.output_processors = output_processors

def _check_if_valid_between_or_within(
self, ref2: Optional[DataReference], ref_value: Optional[Any]
):
@@ -241,6 +256,12 @@ def test(self, engine: sa.engine.Engine) -> TestResult:
target_queries,
)

def apply_output_formatting_no_counts(self, values: Collection) -> Collection:
if self.output_processors is not None:
for output_processor in self.output_processors:
values, _ = output_processor(values)
return values


def format_sample(sample, ref: DataReference) -> str:
"""Build a string from a database row indicating its column values."""
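
To make the additions above easier to follow, here is a minimal, self-contained sketch of the intended behavior: a single callable passed as `output_processors` is wrapped into a one-element list, and `apply_output_formatting_no_counts` threads the values through each processor in turn, discarding the counts slot. The `sort_processor`, `limit_processor`, and `DemoConstraint` names below are hypothetical stand-ins for illustration, not datajudge code; only the two mirrored code paths come from the diff.

from typing import Collection, List, Optional, Tuple

def sort_processor(
    values: Collection, counts: Optional[List] = None
) -> Tuple[Collection, Optional[List]]:
    # Hypothetical processor: return the values sorted, counts passed through untouched.
    return sorted(values), counts

def limit_processor(
    values: Collection, counts: Optional[List] = None, limit: int = 3
) -> Tuple[Collection, Optional[List]]:
    # Hypothetical processor: keep only the first `limit` elements (and matching counts).
    limited_counts = None if counts is None else list(counts)[:limit]
    return list(values)[:limit], limited_counts

class DemoConstraint:
    # Stand-in class mirroring only the two code paths added to Constraint in this PR.
    def __init__(self, output_processors=None):
        # A single callable is wrapped into a one-element list.
        if (output_processors is not None) and (
            not isinstance(output_processors, list)
        ):
            output_processors = [output_processors]
        self.output_processors = output_processors

    def apply_output_formatting_no_counts(self, values: Collection) -> Collection:
        # Run the values through each processor in order; the counts slot is ignored here.
        if self.output_processors is not None:
            for output_processor in self.output_processors:
                values, _ = output_processor(values)
        return values

print(DemoConstraint(sort_processor).apply_output_formatting_no_counts({3, 1, 2}))
# [1, 2, 3]
print(
    DemoConstraint(
        [sort_processor, limit_processor]
    ).apply_output_formatting_no_counts({5, 4, 3, 2, 1})
)
# [1, 2, 3]
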
11 changes: 9 additions & 2 deletions src/datajudge/constraints/miscs.py
@@ -130,8 +130,15 @@ def test(self, engine: sa.engine.Engine) -> TestResult:
return TestResult.success()

assertion_text = (
f"{self.ref} has violations of functional dependence, e.g.:\n"
+ "\n".join([f"{tuple(violation)}" for violation in violations][:5])
f"{self.ref} has violations of functional dependence (in total {len(violations)} rows):\n"
+ "\n".join(
[
f"{violation}"
for violation in self.apply_output_formatting_no_counts(
[tuple(elem) for elem in violations]
)
]
)
)
return TestResult.failure(assertion_text)

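
As a rough illustration of the reworked failure message, here is the same formatting logic evaluated on toy data outside the class, with no output processors configured, so the formatting step is a no-op; the placeholder <ref> stands in for the DataReference string.

violations = [("a", 1), ("a", 2), ("b", 7), ("b", 9)]

# With output_processors=None, apply_output_formatting_no_counts returns its input unchanged.
formatted = [tuple(elem) for elem in violations]

assertion_text = (
    "<ref> has violations of functional dependence "
    f"(in total {len(violations)} rows):\n"
    + "\n".join(f"{violation}" for violation in formatted)
)
print(assertion_text)
# <ref> has violations of functional dependence (in total 4 rows):
# ('a', 1)
# ('a', 2)
# ('b', 7)
# ('b', 9)
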
116 changes: 99 additions & 17 deletions src/datajudge/constraints/uniques.py
@@ -1,4 +1,5 @@
import abc
import warnings
from collections import Counter
from itertools import zip_longest
from math import ceil, floor
@@ -8,6 +9,7 @@

from .. import db_access
from ..db_access import DataReference
from ..utils import OutputProcessor, filternull_element
from .base import Constraint, OptionalSelections, T, TestResult, ToleranceGetter


@@ -42,9 +44,18 @@ class Uniques(Constraint, abc.ABC):
are part of a reference set of expected values - either externally supplied
through parameter `uniques` or obtained from another `DataSource`.

Null values in the column are ignored. To assert the non-existence of them use
Null values in the column are ignored by default. To assert the non-existence of them use
the `NullAbsence` constraint via the `add_null_absence_constraint` helper method for
`WithinRequirement`.
By default, the null filtering does not trigger if multiple columns are fetched at once.
It can be configured in more detail by supplying a custom ``filter_func`` function.
Some exemplary implementations are available in this module as ``datajudge.utils.filternull_element``,
``datajudge.utils.filternull_never``, ``datajudge.utils.filternull_element_or_tuple_all``, ``datajudge.utils.filternull_element_or_tuple_any``.
For new deployments, using one of the above filters or a custom one is recommended.

Collaborator: The word "deployment" in this line seems odd to me. I'm not sure that this sentence is actually needed at all.

Contributor (author): Removed.

Passing None as the argument is equivalent to ``datajudge.utils.filternull_element``, but triggers a warning.
The deprecated default may change in future versions.
To silence the warning, set ``filter_func`` explicitly.

Collaborator: One could silence it by configuring the warnings module to do so. I think the fact that it's a warning indicating eventual future deprecation should be enough.

There are two ways to do some post processing of the data obtained from the
database by providing a function to be executed. In general, no postprocessing
@@ -63,6 +74,30 @@
(eager or lazy) of the same type as the type of the values of the column (in their
Python equivalent).

Furthermore, the `max_relative_violations` parameter can be used to set a tolerance
threshold for the proportion of elements in the data that can violate the constraint
(default: 0).
Setting this argument is currently not supported for `UniquesEquality`.

For `UniquesSubset`, by default,
the number of occurrences affects the computed fraction of violations.
To disable this weighting, set `compare_distinct=True`.
This argument does not have an effect on the test results for other `Uniques` constraints,
or if `max_relative_violations` is 0.

By default, the assertion messages make use of sets,
thus, they may differ from run to run despite the exact same situation being present,
and can have an arbitrary length.
To enforce a reproducible, limited output via (e.g.) sorting and slicing,
set `output_processors` to a list of callables.

Each callable takes in two collections, and returns modified (e.g. sorted) versions of them.
In most cases, the second argument is simply None,
but for `UniquesSubset` it is the counts of each of the elements.
The suggested functions are ``datajudge.utils.output_processor_sort``
and ``datajudge.utils.output_processor_limit``
- see their respective docstrings for details.

One use of this constraint is to test for consistency in columns with expected
categorical values.
"""
@@ -71,26 +106,44 @@ def __init__(
self,
ref: DataReference,
name: str = None,
output_processors: Optional[
Union[OutputProcessor, List[OutputProcessor]]
] = None,
*,
ref2: DataReference = None,
uniques: Collection = None,
filter_func: Callable[[List[T]], List[T]] = None,
map_func: Callable[[T], T] = None,
reduce_func: Callable[[Collection], Collection] = None,
max_relative_violations=0,
compare_distinct=False,
):
ref_value: Optional[Tuple[Collection, List]]
ref_value = (uniques, []) if uniques else None
super().__init__(ref, ref2=ref2, ref_value=ref_value, name=name)
super().__init__(
ref,
ref2=ref2,
ref_value=ref_value,
name=name,
output_processors=output_processors,
)

if filter_func is None:
warnings.warn("Using deprecated default null filter function.")
filter_func = filternull_element

self.filter_func = filter_func
self.local_func = map_func
self.global_func = reduce_func
self.max_relative_violations = max_relative_violations
self.compare_distinct = compare_distinct

def retrieve(
self, engine: sa.engine.Engine, ref: DataReference
) -> Tuple[Tuple[List[T], List[int]], OptionalSelections]:
uniques, selection = db_access.get_uniques(engine, ref)
values = list(uniques.keys())
values = list(filter(lambda value: value is not None, values))
values = self.filter_func(values)
counts = [uniques[value] for value in values]
if self.local_func:
values = list(map(self.local_func, values))
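
The `retrieve` step above now applies `self.filter_func` to the fetched values. For concreteness, here is a sketch of what the four filter helpers named in the class docstring plausibly do, inferred from their names and the surrounding text; the actual implementations in ``datajudge.utils`` may differ in detail.

from typing import List, TypeVar

T = TypeVar("T")

def filternull_never(values: List[T]) -> List[T]:
    # Keep everything, including NULLs.
    return values

def filternull_element(values: List[T]) -> List[T]:
    # Drop elements that are exactly None; tuples containing None are kept.
    return [value for value in values if value is not None]

def filternull_element_or_tuple_all(values: List[T]) -> List[T]:
    # Additionally drop tuples whose entries are all None.
    return [
        value
        for value in values
        if value is not None
        and not (isinstance(value, tuple) and all(v is None for v in value))
    ]

def filternull_element_or_tuple_any(values: List[T]) -> List[T]:
    # Additionally drop tuples containing at least one None entry.
    return [
        value
        for value in values
        if value is not None
        and not (isinstance(value, tuple) and any(v is None for v in value))
    ]

rows = [("a", 1), ("b", None), (None, None), None]
print(filternull_element(rows))               # [('a', 1), ('b', None), (None, None)]
print(filternull_element_or_tuple_all(rows))  # [('a', 1), ('b', None)]
print(filternull_element_or_tuple_any(rows))  # [('a', 1)]
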
@@ -106,7 +159,11 @@ def retrieve(
class UniquesEquality(Uniques):
def __init__(self, args, name: str = None, **kwargs):
if kwargs.get("max_relative_violations"):
raise RuntimeError("Some useful message")
raise RuntimeError(
"max_relative_violations is not supported for UniquesEquality."
)
if kwargs.get("compare_distinct"):
raise RuntimeError("compare_distinct is not supported for UniquesEquality.")
super().__init__(args, name=name, **kwargs)

def compare(
@@ -123,22 +180,22 @@ def compare(
if not is_subset and not is_superset:
assertion_text = (
f"{self.ref} doesn't have the element(s) "
f"'{lacking_values}' and has the excess element(s) "
f"'{excess_values}' when compared with the reference values. "
f"'{self.apply_output_formatting_no_counts(lacking_values)}' and has the excess element(s) "
f"'{self.apply_output_formatting_no_counts(excess_values)}' when compared with the reference values. "
f"{self.condition_string}"
)
return False, assertion_text
if not is_subset:
assertion_text = (
f"{self.ref} has the excess element(s) "
f"'{excess_values}' when compared with the reference values. "
f"'{self.apply_output_formatting_no_counts(excess_values)}' when compared with the reference values. "
f"{self.condition_string}"
)
return False, assertion_text
if not is_superset:
assertion_text = (
f"{self.ref} doesn't have the element(s) "
f"'{lacking_values}' when compared with the reference values. "
f"'{self.apply_output_formatting_no_counts(lacking_values)}' when compared with the reference values. "
f"{self.condition_string}"
)
return False, assertion_text
@@ -153,28 +210,49 @@ def compare(
) -> Tuple[bool, Optional[str]]:
factual_values, factual_counts = factual
target_values, _ = target

is_subset, remainder = _subset_violation_counts(
factual_values, factual_counts, target_values
)
n_rows = sum(factual_counts)
n_violations = sum(remainder.values())
if not self.compare_distinct:
n_rows = sum(factual_counts)
n_violations = sum(remainder.values())
else:
n_rows = len(factual_values)
n_violations = len(remainder)

if (
n_rows > 0
and (relative_violations := (n_violations / n_rows))
> self.max_relative_violations
):
output_elemes, output_counts = list(remainder.keys()), list(
remainder.values()
)
if self.output_processors is not None:
for output_processor in self.output_processors:
output_elemes, output_counts = output_processor(
output_elemes, output_counts
)

assertion_text = (
f"{self.ref} has a fraction of {relative_violations} > "
f"{self.max_relative_violations} values not being an element of "
f"'{set(target_values)}'. It has e.g. excess elements "
f"'{list(remainder.keys())[:5]}'."
f"{self.max_relative_violations} {'DISTINCT ' if self.compare_distinct else ''}values ({n_violations} / {n_rows}) not being an element of "
f"'{self.apply_output_formatting_no_counts(set(target_values))}'. It has excess elements "
f"'{output_elemes}' "
f"with counts {output_counts}."
f"{self.condition_string}"
)
return False, assertion_text
return True, None


class UniquesSuperset(Uniques):
def __init__(self, args, name: str = None, **kwargs):
if kwargs.get("compare_distinct"):
raise RuntimeError("compare_distinct is not supported for UniquesSuperset.")
super().__init__(args, name=name, **kwargs)

def compare(
self,
factual: Tuple[List[T], List[int]],
@@ -185,14 +263,18 @@ def compare(
is_superset, remainder = _is_superset(factual_values, target_values)
if (
len(factual_values) > 0
and (relative_violations := (len(remainder) / len(target_values)))
and (
relative_violations := (
(n_violations := (len(remainder))) / (n_rows := len(target_values))
)
)
> self.max_relative_violations
):
assertion_text = (
f"{self.ref} has a fraction of "
f"{relative_violations} > {self.max_relative_violations} "
f"lacking unique values of '{set(target_values)}'. E.g. it "
f"doesn't have the unique value(s) '{list(remainder)[:5]}'."
f"{relative_violations} > {self.max_relative_violations} ({n_violations} / {n_rows}) "
f"lacking unique values of '{self.apply_output_formatting_no_counts(set(target_values))}'. It "
f"doesn't have the unique value(s) '{self.apply_output_formatting_no_counts(list(remainder))}'."
f"{self.condition_string}"
)
return False, assertion_text
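
Finally, a hedged end-to-end sketch of how the new options might be used from the requirement level. It assumes that the `WithinRequirement` helper `add_uniques_subset_constraint` forwards the new keyword arguments introduced in this PR; the database, schema, table, and column names are made up.

import datajudge as dj
from datajudge.utils import (
    filternull_element_or_tuple_any,
    output_processor_limit,
    output_processor_sort,
)

within = dj.WithinRequirement.from_table(
    db_name="mydb", schema_name="public", table_name="colors"
)

within.add_uniques_subset_constraint(
    columns=["color"],
    uniques=["red", "green", "blue"],
    max_relative_violations=0.05,  # tolerate up to 5% of violating rows
    compare_distinct=False,        # weight violations by row counts (the default)
    filter_func=filternull_element_or_tuple_any,  # set explicitly to avoid the deprecation warning
    output_processors=[output_processor_sort, output_processor_limit],  # sorted, capped messages
)
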