Misc todos #183

Open
wants to merge 3 commits into main
30 changes: 24 additions & 6 deletions hamilton/data_quality/base.py
@@ -2,7 +2,7 @@
import dataclasses
import enum
import logging
from typing import Any, Dict, List, Tuple, Type
from typing import Any, Dict, List, Tuple

logger = logging.getLogger(__name__)

@@ -25,6 +25,13 @@ class ValidationResult:
) # Any extra diagnostics information needed, free-form


def matches_any_type(datatype: type, applicable_types: List[type]) -> bool:
for type_ in applicable_types:
if type_ == Any or issubclass(datatype, type_):
return True
return False


class DataValidator(abc.ABC):
"""Base class for a data quality operator. This will be used by the `data_quality` operator"""

@@ -35,13 +42,24 @@ def __init__(self, importance: str):
def importance(self) -> DataValidationLevel:
return self._importance

@abc.abstractmethod
def applies_to(self, datatype: Type[Type]) -> bool:
"""Whether or not this data validator can apply to the specified dataset
@classmethod
def applies_to(cls, datatype: type) -> bool:
"""Whether or not this data validator can apply to the specified dataset.
Note that overriding this is not the intended API (it was the old one),
but this will be a stable part of the API moving forward, at least until
Hamilton 2.0.

:param datatype:
:param datatype: Datatype to validate.
:return: True if it can be run on the specified type, false otherwise
"""
return matches_any_type(datatype, cls.applicable_types())

@classmethod
def applicable_types(cls) -> List[type]:
"""Returns the list of classes for which this is valid.

:return: List of classes
"""
pass

@abc.abstractmethod
@@ -118,7 +136,7 @@ def __init__(self, importance: str):

@classmethod
@abc.abstractmethod
def applies_to(cls, datatype: Type[Type]) -> bool:
def applicable_types(cls) -> List[type]:
pass

@abc.abstractmethod
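To illustrate the new helper, a quick sketch of how matches_any_type behaves, based on the implementation above (typing.Any acts as a wildcard; otherwise a subclass check is applied):

import numbers
from typing import Any

from hamilton.data_quality.base import matches_any_type

assert matches_any_type(int, [numbers.Real, str])        # int is a registered subclass of numbers.Real
assert matches_any_type(dict, [Any])                     # Any matches every datatype
assert not matches_any_type(dict, [numbers.Real, str])  # no subclass relationship holds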
42 changes: 20 additions & 22 deletions hamilton/data_quality/default_validators.py
@@ -24,8 +24,8 @@ def arg(cls) -> str:
return "range"

@classmethod
def applies_to(cls, datatype: Type[Type]) -> bool:
return issubclass(datatype, pd.Series) # TODO -- handle dataframes?
def applicable_types(cls) -> List[type]:
return [pd.Series]

def description(self) -> str:
return f"Validates that the datapoint falls within the range ({self.range[0]}, {self.range[1]})"
@@ -69,8 +69,8 @@ def arg(cls) -> str:
return "values_in"

@classmethod
def applies_to(cls, datatype: Type[Type]) -> bool:
return issubclass(datatype, pd.Series) # TODO -- handle dataframes?
def applicable_types(cls) -> List[type]:
return [pd.Series]

def description(self) -> str:
return f"Validates that all data points are from a fixed set of values: ({self.values}), ignoring NA values."
@@ -113,8 +113,8 @@ def __init__(self, range: Tuple[numbers.Real, numbers.Real], importance: str):
self.range = range

@classmethod
def applies_to(cls, datatype: Type[Type]) -> bool:
return issubclass(datatype, numbers.Real)
def applicable_types(cls) -> List[type]:
return [numbers.Real]

def description(self) -> str:
return f"Validates that the datapoint falls within the range ({self.range[0]}, {self.range[1]})"
@@ -151,10 +151,8 @@ def arg(cls) -> str:
return "values_in"

@classmethod
def applies_to(cls, datatype: Type[Type]) -> bool:
return issubclass(datatype, numbers.Real) or issubclass(
datatype, str
) # TODO support list, dict and typing.* variants
def applicable_types(cls) -> List[type]:
return [numbers.Real, str]

def description(self) -> str:
return f"Validates that python values are from a fixed set of values: ({self.values})."
@@ -189,8 +187,8 @@ def _to_percent(fraction: float):
return "{0:.2%}".format(fraction)

@classmethod
def applies_to(cls, datatype: Type[Type]) -> bool:
return issubclass(datatype, pd.Series)
def applicable_types(cls) -> List[type]:
return [pd.Series]

def description(self) -> str:
return f"Validates that no more than {MaxFractionNansValidatorPandasSeries._to_percent(self.max_fraction_nans)} of the data is Nan."
@@ -251,8 +249,8 @@ def __init__(self, data_type: Type[Type], importance: str):
self.datatype = data_type

@classmethod
def applies_to(cls, datatype: Type[Type]) -> bool:
return issubclass(datatype, pd.Series)
def applicable_types(cls) -> List[type]:
return [pd.Series]

def description(self) -> str:
return f"Validates that the datatype of the pandas series is a subclass of: {self.datatype}"
@@ -282,8 +280,8 @@ def __init__(self, data_type: Type[Type], importance: str):
self.datatype = data_type

@classmethod
def applies_to(cls, datatype: Type[Type]) -> bool:
return issubclass(datatype, numbers.Real) or datatype in (str, bool)
def applicable_types(cls) -> List[type]:
return [numbers.Real, str, bool, int, float, list, dict]

def description(self) -> str:
return f"Validates that the datatype of the pandas series is a subclass of: {self.datatype}"
@@ -312,8 +310,8 @@ def __init__(self, max_standard_dev: float, importance: str):
self.max_standard_dev = max_standard_dev

@classmethod
def applies_to(cls, datatype: Type[Type]) -> bool:
return issubclass(datatype, pd.Series)
def applicable_types(cls) -> List[type]:
return [pd.Series]

def description(self) -> str:
return f"Validates that the standard deviation of a pandas series is no greater than : {self.max_standard_dev}"
@@ -340,8 +338,8 @@ def __init__(self, mean_in_range: Tuple[float, float], importance: str):
self.mean_in_range = mean_in_range

@classmethod
def applies_to(cls, datatype: Type[Type]) -> bool:
return issubclass(datatype, pd.Series)
def applicable_types(cls) -> List[type]:
return [pd.Series]

def description(self) -> str:
return f"Validates that a pandas series has mean in range [{self.mean_in_range[0]}, {self.mean_in_range[1]}]"
@@ -368,8 +366,8 @@ def __init__(self, allow_none: bool, importance: str):
self.allow_none = allow_none

@classmethod
def applies_to(cls, datatype: Type[Type]) -> bool:
return True
def applicable_types(cls) -> List[type]:
return [Any]

def description(self) -> str:
if self.allow_none:
14 changes: 5 additions & 9 deletions hamilton/data_quality/pandera_validators.py
@@ -1,4 +1,4 @@
from typing import Type
from typing import List

import pandas as pd
import pandera as pa
@@ -14,10 +14,8 @@ def __init__(self, schema: pa.DataFrameSchema, importance: str):
self.schema = schema

@classmethod
def applies_to(cls, datatype: Type[Type]) -> bool:
return issubclass(
datatype, pd.DataFrame
) # TODO -- allow for modin, etc. as they come for free with pandera
def applicable_types(cls) -> List[type]:
return [pd.DataFrame]

def description(self) -> str:
return "Validates that the returned dataframe matches the pander"
@@ -54,10 +52,8 @@ def __init__(self, schema: pa.SeriesSchema, importance: str):
self.schema = schema

@classmethod
def applies_to(cls, datatype: Type[Type]) -> bool:
return issubclass(
datatype, pd.Series
) # TODO -- allow for modin, etc. as they come for free with pandera
def applicable_types(cls) -> List[type]:
return [pd.Series]

def description(self) -> str:
pass
34 changes: 34 additions & 0 deletions hamilton/function_modifiers/base.py
@@ -3,6 +3,7 @@
import functools
import itertools
import logging
import uuid
from abc import ABC

try:
@@ -704,6 +705,7 @@ def resolve_nodes(fn: Callable, config: Dict[str, Any]) -> Collection[node.Node]
which configuration they need.
:return: A list of nodes into which this function transforms.
"""

try:
function_decorators = get_node_decorators(fn, config)
node_resolvers = function_decorators[NodeResolver.get_lifecycle_name()]
@@ -734,3 +736,35 @@ class InvalidDecoratorException(Exception):

class MissingConfigParametersException(Exception):
pass


def create_anonymous_node_name(original_node_name: str, *suffixes: str) -> str:
"""Creates an anonymous node name. This is specifically for decorators that
rely on temporary/intermediate nodes. Note that these are not part of the contract, and might
change at any given point.

The algorithm that this follows is simple:

1. Start with the original node name
2. Append each suffix, preceded by an underscore
3. Append a short (8-character) UUID fragment to make the name unique

Because the UUID fragment is random, these names are unique regardless of the order
in which function_modifiers_base processes nodes (by node lifecycle, then by decorator
application order), but they are not stable across runs.

The only likely conflicts come from multiple similar decorators (e.g. check_output)
decorating the same node, which the UUID fragment disambiguates.

:param original_node_name: Name of the original node that this is related to.
:param suffixes: Suffixes to append to the original node.
:return: new node name
"""
uid = str(uuid.uuid4())[0:8]
name = original_node_name
for suffix in suffixes:
name += f"_{suffix}"
name += f"_{uid}"
return name
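A quick sketch of the names this produces (the node name here is hypothetical, and the UUID value illustrative):

from hamilton.function_modifiers.base import create_anonymous_node_name

name = create_anonymous_node_name("my_node", "raw")
# Suffixes are appended in order, then a random 8-character UUID
# fragment, e.g. "my_node_raw_3f2a9c1b".
assert name.startswith("my_node_raw_")
assert len(name) == len("my_node_raw_") + 8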
2 changes: 1 addition & 1 deletion hamilton/function_modifiers/expanders.py
@@ -588,7 +588,7 @@ def validate_return_type(fn: Callable):
registry.get_column_type_from_df_type(output_type)
except NotImplementedError:
raise base.InvalidDecoratorException(
# TODO: capture was dataframe libraries are supported and print here.
# TODO: capture what dataframe libraries are supported and print here.
f"Error {fn} does not output a type we know about. Is it a dataframe type we "
f"support? "
)
3 changes: 1 addition & 2 deletions hamilton/function_modifiers/validation.py
@@ -26,8 +26,7 @@ def transform_node(
self, node_: node.Node, config: Dict[str, Any], fn: Callable
) -> Collection[node.Node]:
raw_node = node.Node(
name=node_.name
+ "_raw", # TODO -- make this unique -- this will break with multiple validation decorators, which we *don't* want
name=base.create_anonymous_node_name(node_.name, "raw"),
typ=node_.type,
doc_string=node_.documentation,
callabl=node_.callable,
40 changes: 26 additions & 14 deletions tests/function_modifiers/test_validation.py
@@ -33,23 +33,27 @@ def fn(input: pd.Series) -> pd.Series:
subdag = decorator.transform_node(node_, config={}, fn=fn)
assert 4 == len(subdag)
subdag_as_dict = {node_.name: node_ for node_ in subdag}
assert sorted(subdag_as_dict.keys()) == [
prefixes = [
"fn",
"fn_dummy_data_validator_2",
"fn_dummy_data_validator_3",
"fn_raw",
]
# TODO -- change when we change the naming scheme
assert subdag_as_dict["fn_raw"].input_types["input"][1] == DependencyType.REQUIRED
sorted_keys = sorted(subdag_as_dict)
assert all([node_name.startswith(prefix) for node_name, prefix in zip(sorted_keys, prefixes)])
assert subdag_as_dict[sorted_keys[-1]].input_types["input"][1] == DependencyType.REQUIRED
assert 3 == len(
subdag_as_dict["fn"].input_types
) # Three dependencies -- the two with DQ + the original
# The final function should take in everything but only use the raw results
raw_node_name = sorted_keys[-1]
assert (
subdag_as_dict["fn"].callable(
fn_raw="test",
fn_dummy_data_validator_2=ValidationResult(True, "", {}),
fn_dummy_data_validator_3=ValidationResult(True, "", {}),
**{
raw_node_name: "test",
"fn_dummy_data_validator_2": ValidationResult(True, "", {}),
"fn_dummy_data_validator_3": ValidationResult(True, "", {}),
}
)
== "test"
)
@@ -68,14 +72,17 @@ def fn(input: pd.Series) -> pd.Series:
subdag = decorator.transform_node(node_, config={}, fn=fn)
assert 4 == len(subdag)
subdag_as_dict = {node_.name: node_ for node_ in subdag}
assert sorted(subdag_as_dict.keys()) == [
prefixes = [
"fn",
"fn_dummy_data_validator_2",
"fn_dummy_data_validator_3",
"fn_raw",
]
sorted_keys = sorted(subdag_as_dict)
assert all([node_name.startswith(prefix) for node_name, prefix in zip(sorted_keys, prefixes)])
raw_node_name = sorted_keys[-1]
# TODO -- change when we change the naming scheme
assert subdag_as_dict["fn_raw"].input_types["input"][1] == DependencyType.REQUIRED
assert subdag_as_dict[raw_node_name].input_types["input"][1] == DependencyType.REQUIRED
assert 3 == len(
subdag_as_dict["fn"].input_types
) # Three dependencies -- the two with DQ + the original
@@ -98,9 +105,11 @@ def fn(input: pd.Series) -> pd.Series:
# The final function should take in everything but only use the raw results
assert (
subdag_as_dict["fn"].callable(
fn_raw="test",
fn_dummy_data_validator_2=ValidationResult(True, "", {}),
fn_dummy_data_validator_3=ValidationResult(True, "", {}),
**{
raw_node_name: "test",
"fn_dummy_data_validator_2": ValidationResult(True, "", {}),
"fn_dummy_data_validator_3": ValidationResult(True, "", {}),
}
)
== "test"
)
@@ -119,12 +128,15 @@ def fn(input: pd.Series) -> pd.Series:
subdag = decorator.transform_node(node_, config={}, fn=fn)
assert 4 == len(subdag)
subdag_as_dict = {node_.name: node_ for node_ in subdag}
(raw_node_name,) = [item for item in subdag_as_dict if item.startswith("fn_raw_")]

with pytest.raises(DataValidationError):
subdag_as_dict["fn"].callable(
fn_raw=pd.Series([1.0, 2.0, 3.0]),
fn_dummy_data_validator_2=ValidationResult(False, "", {}),
fn_dummy_data_validator_3=ValidationResult(False, "", {}),
**{
raw_node_name: pd.Series([1.0, 2.0, 3.0]),
"fn_dummy_data_validator_2": ValidationResult(False, "", {}),
"fn_dummy_data_validator_3": ValidationResult(False, "", {}),
}
)


14 changes: 7 additions & 7 deletions tests/resources/data_quality.py
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd

from hamilton.function_modifiers import check_output
@@ -11,10 +12,9 @@ def data_might_be_in_range(data_quality_should_fail: bool) -> pd.Series:
return pd.Series([0.5])


# TODO -- enable this once we fix the double-data-quality decorators with the same name bug
# @check_output(data_type=np.float)
# @check_output(range=(0, 1))
# def multi_layered_validator(data_quality_should_fail: bool) -> pd.Series:
# if data_quality_should_fail:
# return pd.Series([10.0])
# return pd.Series([0.5])
@check_output(data_type=np.float64)
@check_output(range=(0, 1))
def multi_layered_validator(data_quality_should_fail: bool) -> pd.Series:
if data_quality_should_fail:
return pd.Series([10.0])
return pd.Series([0.5])
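With unique anonymous node names in place, stacking multiple @check_output decorators on the same function no longer collides: each decorator generates its own fn_raw_<uuid> intermediate node, which is exactly what this re-enabled resource exercises.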