Adds basic ability to configure data quality

This works both globally and locally. Currently it's only at the node level, but it shouldn't be too hard to disable specific validators/set warnings on them.
DAGWorks-Inc · Feb 27, 2023 · 332fd54 · 332fd54
1 parent 42622b0
commit 332fd54
Show file tree

Hide file tree

Showing 3 changed files with 85 additions and 7 deletions.
diff --git a/data_quality.md b/data_quality.md
@@ -164,7 +164,7 @@ All configuration keys have two components, joined by a `.` The first component
 The value will be a dictionary with two possible values:
 
 1. `importance` -- the importance level of the data quality check. Can be either "warn" or "fail"
-2. `enable` -- a boolean indicating whether the data quality check is enabled or not.
+2. `enabled` -- a boolean indicating whether the data quality check is enabled or not.
 
 The specific node name will take precedence, and `global` will apply after that. The information in the code
 will take third place (although you are unable to disable through code aside from removing/commenting the decorator out).
@@ -175,7 +175,7 @@ will take third place (although you are unable to disable through code aside fro
 # This will globally disable *all* data quality checks
 config = {
     'data_quality.global': {
-        'enable': False
+        'enabled': False
     },
 }
 # This will set the importance of all decorated nodes to "warn"
@@ -188,7 +188,7 @@ config = {
 # This will disable the data quality check for the node `foo`
 config = {
     'data_quality.foo': {
-        'enable': False
+        'enabled': False
     },
 }
 

diff --git a/hamilton/function_modifiers/validation.py b/hamilton/function_modifiers/validation.py
@@ -20,9 +20,21 @@ class ValidatorConfig:
 
     @staticmethod
     def from_validator(
-        validator: dq_base.DataValidator, config: Dict[str, Any]
+        validator: dq_base.DataValidator,
+        config: Dict[str, Any],
+        node_name: str,
     ) -> "ValidatorConfig":
-        return ValidatorConfig(should_run=True, importance=validator.importance)
+        """Derives run/importance settings: node config first, then global, then validator."""
+        global_config = config.get("data_quality.global", {})
+        node_config = config.get(f"data_quality.{node_name}", {})
+        # "enabled" follows the same precedence as "importance" below; it defaults to True.
+        should_run = node_config.get("enabled", global_config.get("enabled", True))
+        importance = node_config.get(
+            "importance", global_config.get("importance", validator.importance.value)
+        )
+        return ValidatorConfig(
+            should_run=should_run, importance=dq_base.DataValidationLevel(importance)
+        )
 
 
 class BaseDataValidationDecorator(base.NodeTransformer):
@@ -53,7 +65,9 @@ def transform_node(
         validator_nodes = []
         validator_name_config_map = {}
         for validator in validators:
-            validator_config = ValidatorConfig.from_validator(validator, config)
+            validator_config = ValidatorConfig.from_validator(
+                validator=validator, config=config, node_name=node_.name
+            )
             if not validator_config.should_run:
                 continue
 

diff --git a/tests/function_modifiers/test_validation.py b/tests/function_modifiers/test_validation.py
@@ -1,15 +1,23 @@
+from typing import Any, Dict
+
 import numpy as np
 import pandas as pd
 import pytest
 
 from hamilton import node
-from hamilton.data_quality.base import DataValidationError, ValidationResult
+from hamilton.data_quality.base import (
+    DataValidationError,
+    DataValidationLevel,
+    DataValidator,
+    ValidationResult,
+)
 from hamilton.function_modifiers import (
     DATA_VALIDATOR_ORIGINAL_OUTPUT_TAG,
     IS_DATA_VALIDATOR_TAG,
     check_output,
     check_output_custom,
 )
+from hamilton.function_modifiers.validation import ValidatorConfig
 from hamilton.node import DependencyType
 from tests.resources.dq_dummy_examples import (
     DUMMY_VALIDATORS_FOR_TESTING,
@@ -154,3 +162,59 @@ def test_data_quality_constants_for_api_consistency():
     # simple tests to test data quality constants remain the same
     assert IS_DATA_VALIDATOR_TAG == "hamilton.data_quality.contains_dq_results"
     assert DATA_VALIDATOR_ORIGINAL_OUTPUT_TAG == "hamilton.data_quality.source_node"
+
+
+@pytest.mark.parametrize(  # cases for how ValidatorConfig.from_validator resolves the config
+    "validator,config,node_name,expected_result",
+    [
+        (  # empty config: the check runs with the importance expected from the validator (warn)
+            SampleDataValidator2(0, "warn"),
+            {},
+            "test",
+            ValidatorConfig(True, DataValidationLevel.WARN),
+        ),
+        (  # empty config: the check runs with the importance expected from the validator (fail)
+            SampleDataValidator2(0, "fail"),
+            {},
+            "test",
+            ValidatorConfig(True, DataValidationLevel.FAIL),
+        ),
+        (  # node-level "enabled": False turns the check off
+            SampleDataValidator2(0, "warn"),
+            {"data_quality.test": {"enabled": False}},
+            "test",
+            ValidatorConfig(False, DataValidationLevel.WARN),
+        ),
+        (  # node-level "enabled": True keeps the check on
+            SampleDataValidator2(0, "warn"),
+            {"data_quality.test": {"enabled": True}},
+            "test",
+            ValidatorConfig(True, DataValidationLevel.WARN),
+        ),
+        (  # node-level config disables the check and overrides the importance to warn
+            SampleDataValidator2(0, "fail"),
+            {"data_quality.test": {"enabled": False, "importance": "warn"}},
+            "test",
+            ValidatorConfig(False, DataValidationLevel.WARN),
+        ),
+        (  # global "enabled": False turns the check off when the node has no config of its own
+            SampleDataValidator2(0, "warn"),
+            {"data_quality.global": {"enabled": False}},
+            "test",
+            ValidatorConfig(False, DataValidationLevel.WARN),
+        ),
+        (  # global config sets both keys; no node-level config is present
+            SampleDataValidator2(0, "warn"),
+            {"data_quality.global": {"enabled": False, "importance": "warn"}},
+            "test",
+            ValidatorConfig(False, DataValidationLevel.WARN),
+        ),
+    ],
+)
+def test_validator_config_derive(
+    validator: DataValidator,
+    config: Dict[str, Any],
+    node_name: str,
+    expected_result: ValidatorConfig,
+):
+    assert ValidatorConfig.from_validator(validator, config, node_name) == expected_result