Skip to content

Commit

Permalink
adding configuration options to uniques functionality (#224)
Browse files Browse the repository at this point in the history
Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

Docs update

Co-authored-by: Kevin Klein <[email protected]>

update doc string on null columns everywhere and fix typo

Update docs

Co-authored-by: Kevin Klein <[email protected]>

Update docs

Co-authored-by: Kevin Klein <[email protected]>

Update docs

Co-authored-by: Kevin Klein <[email protected]>

docs updates

update docs

filternull docs clarification

replace assert by raise ValueError

shorten name to apply_output_formatting

add unit tests for new utils functions

set default to limit 100 elements

ensure all relevant tests run for impala and ensure they pass

disable extralong test for bigquery due to slow speed

capitalization test handle parallel if table already created
  • Loading branch information
SimonLangerQC authored and kklein committed Jun 25, 2024
1 parent b8ea385 commit 9bf90f1
Show file tree
Hide file tree
Showing 11 changed files with 909 additions and 718 deletions.
8 changes: 6 additions & 2 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ jobs:
uses: ./.github/actions/pytest
with:
backend: bigquery
args: -n auto tests/integration
args: -n 16 -v tests/integration

impala-column:
if: ${{ contains(github.event.pull_request.labels.*.name, 'impala') || contains(github.event.pull_request.labels.*.name, 'ready') || github.ref == 'refs/heads/main' }}
Expand All @@ -275,7 +275,11 @@ jobs:
matrix:
PYTHON_VERSION: [ '3.8' ]
SA_VERSION: ["<2.0"]
PYTEST_ARG: ["tests/integration/test_column_capitalization.py", "tests/integration/test_data_source.py", "tests/integration/test_integration.py -k row", "tests/integration/test_integration.py -k uniques", "tests/integration/test_integration.py -k date", "tests/integration/test_integration.py -k varchar", "tests/integration/test_integration.py -k numeric"]
# PYTEST_ARG: ["tests/integration/test_column_capitalization.py", "tests/integration/test_data_source.py", "tests/integration/test_integration.py -k row", "tests/integration/test_integration.py -k uniques", "tests/integration/test_integration.py -k date", "tests/integration/test_integration.py -k varchar", "tests/integration/test_integration.py -k numeric"]

# More comprehensive matching. Note: tests whose names start with test_i but not test_integer are not matched by the patterns below and must be added here explicitly.

PYTEST_ARG: ["tests/integration/test_integration.py -k 'test_a or test_b or test_c or test_d or test_e or test_f or test_g or test_h or test_integer or test_j or test_k or test_l or test_m'", "tests/integration/test_integration.py -k 'test_n or test_o or test_p or test_q or test_r or test_s or test_t or test_u or test_v or test_w or test_x or test_y or test_z'"]

steps:
- name: Checkout branch
Expand Down
15 changes: 15 additions & 0 deletions run_integration_tests_postgres.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
#
# Run the integration tests in tests/integration against the postgres backend.
#
# Flow:
#   1. Stop any running container whose name matches postgres_datajudge.
#   2. Launch ./start_postgres.sh in the background.
#   3. Wait for the user to confirm (by pressing enter) that postgres is ready.
#   4. Run pytest, forwarding all script arguments to it.
#   5. Stop the container again and exit with pytest's exit status.

# Stop every running container matching the postgres_datajudge name filter.
# `docker stop` fails when invoked without any container argument, so it is
# only called when the filter actually matched something.
stop_postgres() {
    local containers
    containers=$(docker ps -q --filter name=postgres_datajudge)
    if [ -n "$containers" ]; then
        # Intentionally unquoted: one container ID per word.
        # shellcheck disable=SC2086
        docker stop $containers
    fi
}

stop_postgres

./start_postgres.sh &

# The output of start_postgres.sh may scroll the prompt out of view, so keep
# re-printing a reminder once per second until the user confirms.
bash -c "while true; do printf '\nPress enter once postgres is ready: '; sleep 1; done" &
reminder_pid=$!

read -r -p "Press enter once postgres is ready: "
# Kill the reminder loop by PID rather than by job spec, so the right process
# is targeted regardless of the shell's job bookkeeping.
kill "$reminder_pid"

echo "STARTING PYTEST"
pytest tests/integration -vv --backend=postgres "$@"
pytest_status=$?

stop_postgres

# Report the test result, not the result of the cleanup step.
exit "$pytest_status"

25 changes: 23 additions & 2 deletions src/datajudge/constraints/base.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import abc
from dataclasses import dataclass, field
from functools import lru_cache
from typing import Any, Callable, List, Optional, Tuple, TypeVar
from typing import Any, Callable, Collection, List, Optional, Tuple, TypeVar, Union

import sqlalchemy as sa

from ..db_access import DataReference
from ..formatter import Formatter
from ..utils import OutputProcessor, output_processor_limit

DEFAULT_FORMATTER = Formatter()

Expand Down Expand Up @@ -113,7 +114,15 @@ class Constraint(abc.ABC):
"""

def __init__(
self, ref: DataReference, *, ref2=None, ref_value: Any = None, name: str = None
self,
ref: DataReference,
*,
ref2=None,
ref_value: Any = None,
name: str = None,
output_processors: Optional[
Union[OutputProcessor, List[OutputProcessor]]
] = output_processor_limit,
):
self._check_if_valid_between_or_within(ref2, ref_value)
self.ref = ref
Expand All @@ -125,6 +134,12 @@ def __init__(
self.factual_queries: Optional[List[str]] = None
self.target_queries: Optional[List[str]] = None

if (output_processors is not None) and (
not isinstance(output_processors, list)
):
output_processors = [output_processors]
self.output_processors = output_processors

def _check_if_valid_between_or_within(
self, ref2: Optional[DataReference], ref_value: Optional[Any]
):
Expand Down Expand Up @@ -241,6 +256,12 @@ def test(self, engine: sa.engine.Engine) -> TestResult:
target_queries,
)

def apply_output_formatting(self, values: Collection) -> Collection:
    """Pass ``values`` through every configured output processor, in order.

    When ``self.output_processors`` is ``None`` the input is returned
    untouched. Otherwise each processor is called with the current values
    and is expected to return a pair; the first element becomes the input
    of the next processor and the second element is discarded.
    """
    processors = self.output_processors
    if processors is None:
        return values

    current = values
    for processor in processors:
        # Two-element unpacking is kept so that a processor returning
        # anything other than a pair still fails loudly.
        current, _ = processor(current)
    return current


def format_sample(sample, ref: DataReference) -> str:
"""Build a string from a database row indicating its column values."""
Expand Down
11 changes: 9 additions & 2 deletions src/datajudge/constraints/miscs.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,15 @@ def test(self, engine: sa.engine.Engine) -> TestResult:
return TestResult.success()

assertion_text = (
f"{self.ref} has violations of functional dependence, e.g.:\n"
+ "\n".join([f"{tuple(violation)}" for violation in violations][:5])
f"{self.ref} has violations of functional dependence (in total {len(violations)} rows):\n"
+ "\n".join(
[
f"{violation}"
for violation in self.apply_output_formatting(
[tuple(elem) for elem in violations]
)
]
)
)
return TestResult.failure(assertion_text)

Expand Down
Loading

0 comments on commit 9bf90f1

Please sign in to comment.