Skip to content

Commit

Permalink
Adds basic ability to configure data quality
Browse files Browse the repository at this point in the history
This works both globally and locally. Currently it's only at the node
level, but it shouldn't be too hard to disable specific validators/set
warnings on them.
  • Loading branch information
elijahbenizzy committed Feb 27, 2023
1 parent 42622b0 commit 332fd54
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 7 deletions.
6 changes: 3 additions & 3 deletions data_quality.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ All configuration keys have two components, joined by a `.` The first component
The value will be a dictionary with two possible keys:

1. `importance` -- the importance level of the data quality check. Can be either "warn" or "fail"
2. `enable` -- a boolean indicating whether the data quality check is enabled or not.
2. `enabled` -- a boolean indicating whether the data quality check is enabled or not.

The specific node name will take precedence, and `global` will apply after that. The information in the code
will take third place (although you are unable to disable through code aside from removing/commenting the decorator out).
Expand All @@ -175,7 +175,7 @@ will take third place (although you are unable to disable through code aside fro
# This will globally disable *all* data quality checks
config = {
'data_quality.global': {
'enable': False
'enabled': False
},
}
# This will set the importance of all decorated nodes to "warn"
Expand All @@ -188,7 +188,7 @@ config = {
# This will disable the data quality check for the node `foo`
config = {
'data_quality.foo': {
'enable': False
'enabled': False
},
}

Expand Down
20 changes: 17 additions & 3 deletions hamilton/function_modifiers/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,21 @@ class ValidatorConfig:

@staticmethod
def from_validator(
    validator: dq_base.DataValidator,
    config: Dict[str, Any],
    node_name: str,
) -> "ValidatorConfig":
    """Derive the effective data-quality settings for one validator on one node.

    Two optional config entries are consulted, each expected to be a dict
    that may carry the keys ``"enabled"`` and ``"importance"``:

    * ``data_quality.<node_name>`` -- settings for this specific node.
    * ``data_quality.global`` -- settings applied to every node.

    For *both* keys the node-specific entry wins, then the global entry,
    then the fallback (``True`` for ``enabled``, the validator's own
    importance for ``importance``).

    :param validator: Validator whose ``importance`` is the code-level fallback.
    :param config: Configuration dict that may hold the data-quality entries.
    :param node_name: Name of the decorated node, used to build the node key.
    :return: A ``ValidatorConfig`` carrying ``should_run`` and ``importance``.
    """
    global_config = config.get("data_quality.global", {})
    node_config = config.get(f"data_quality.{node_name}", {})

    def _resolve(key: str, fallback: Any) -> Any:
        # Single lookup rule shared by every setting so "enabled" and
        # "importance" cannot drift apart in precedence: node, then global,
        # then the code-level fallback.
        if key in node_config:
            return node_config[key]
        return global_config.get(key, fallback)

    should_run = _resolve("enabled", True)
    # The importance is resolved as its raw value (e.g. "warn"/"fail") and
    # converted below; presumably DataValidationLevel rejects unknown values
    # -- confirm against dq_base.
    importance = _resolve("importance", validator.importance.value)
    return ValidatorConfig(
        should_run=should_run,
        importance=dq_base.DataValidationLevel(importance),
    )


class BaseDataValidationDecorator(base.NodeTransformer):
Expand Down Expand Up @@ -53,7 +65,9 @@ def transform_node(
validator_nodes = []
validator_name_config_map = {}
for validator in validators:
validator_config = ValidatorConfig.from_validator(validator, config)
validator_config = ValidatorConfig.from_validator(
validator=validator, config=config, node_name=node_.name
)
if not validator_config.should_run:
continue

Expand Down
66 changes: 65 additions & 1 deletion tests/function_modifiers/test_validation.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,23 @@
from typing import Any, Dict

import numpy as np
import pandas as pd
import pytest

from hamilton import node
from hamilton.data_quality.base import DataValidationError, ValidationResult
from hamilton.data_quality.base import (
DataValidationError,
DataValidationLevel,
DataValidator,
ValidationResult,
)
from hamilton.function_modifiers import (
DATA_VALIDATOR_ORIGINAL_OUTPUT_TAG,
IS_DATA_VALIDATOR_TAG,
check_output,
check_output_custom,
)
from hamilton.function_modifiers.validation import ValidatorConfig
from hamilton.node import DependencyType
from tests.resources.dq_dummy_examples import (
DUMMY_VALIDATORS_FOR_TESTING,
Expand Down Expand Up @@ -154,3 +162,59 @@ def test_data_quality_constants_for_api_consistency():
# simple tests to test data quality constants remain the same
assert IS_DATA_VALIDATOR_TAG == "hamilton.data_quality.contains_dq_results"
assert DATA_VALIDATOR_ORIGINAL_OUTPUT_TAG == "hamilton.data_quality.source_node"


@pytest.mark.parametrize(
    "validator,config,node_name,expected_result",
    [
        # No configuration at all: the validator's own importance is used.
        (
            SampleDataValidator2(0, "warn"),
            {},
            "test",
            ValidatorConfig(should_run=True, importance=DataValidationLevel.WARN),
        ),
        (
            SampleDataValidator2(0, "fail"),
            {},
            "test",
            ValidatorConfig(should_run=True, importance=DataValidationLevel.FAIL),
        ),
        # Node-level configuration.
        (
            SampleDataValidator2(0, "warn"),
            {"data_quality.test": {"enabled": False}},
            "test",
            ValidatorConfig(should_run=False, importance=DataValidationLevel.WARN),
        ),
        (
            SampleDataValidator2(0, "warn"),
            {"data_quality.test": {"enabled": True}},
            "test",
            ValidatorConfig(should_run=True, importance=DataValidationLevel.WARN),
        ),
        (
            SampleDataValidator2(0, "fail"),
            {"data_quality.test": {"enabled": False, "importance": "warn"}},
            "test",
            ValidatorConfig(should_run=False, importance=DataValidationLevel.WARN),
        ),
        # Global configuration.
        (
            SampleDataValidator2(0, "warn"),
            {"data_quality.global": {"enabled": False}},
            "test",
            ValidatorConfig(should_run=False, importance=DataValidationLevel.WARN),
        ),
        (
            SampleDataValidator2(0, "warn"),
            {"data_quality.global": {"enabled": False, "importance": "warn"}},
            "test",
            ValidatorConfig(should_run=False, importance=DataValidationLevel.WARN),
        ),
    ],
)
def test_validator_config_derive(
    validator: DataValidator,
    config: Dict[str, Any],
    node_name: str,
    expected_result: ValidatorConfig,
):
    """Check that from_validator merges node/global config with validator defaults."""
    derived = ValidatorConfig.from_validator(
        validator=validator, config=config, node_name=node_name
    )
    assert derived == expected_result

0 comments on commit 332fd54

Please sign in to comment.